Batch Transform with strategy='MultiRecord' returns only one line

Hi,
I have a simple piece of code that runs batch transform on the imdb dataset using Llama-2. The code runs fine; however, out of 200 records in the input .jsonl file, Batch Transform with strategy='MultiRecord' returns only one line (the inference for my first input record) and then exits with success. If I choose strategy='SingleRecord' instead, it returns results for all lines, but it is extremely slow (about 4 hours for 200 prompts).

I have attached the code and would really appreciate your feedback. I have tried many things and searched the forums exhaustively, with no luck. Looking forward to hearing from you. Thanks
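For reference, each line of the input .jsonl file is a standalone JSON object of this shape (the real value is the full Llama-2 prompt for one review, abbreviated here):

{"inputs": "<full Llama-2 prompt for one review>"}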


import json

from datasets import load_dataset
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

imdb = load_dataset("imdb")
small_test_dataset = imdb["test"].shuffle(seed=42).select(range(200))
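
# build_llama2_prompt isn't shown here; mine follows the standard Llama-2
# chat format. A minimal sketch of what it does (single system + user turn):
def build_llama2_prompt(messages):
    # Wrap the system turn in <<SYS>> tags and the user turn in [INST] tags,
    # per the Llama-2 chat convention.
    system = messages[0]["content"]
    user = messages[-1]["content"]
    return f"<s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{user} [/INST]"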

with open(batchtransform_data_file_path, mode="w+") as outfile, \
     open(batchtransform_data_with_label_file_path, mode="w+") as outfile_with_label:

    # Iterate over the sampled reviews, writing one JSON object per line
    for i, obj in enumerate(small_test_dataset):
       
        messages1 = [
            {
                "role": "system",
                "content": (
                    "You are an intelligent Review data analyst, your goal is to rate "
                    "the user reviews as Positive, Neutral, or Negative. Instruction: "
                    "Based on the review provided, classify the review with ONLY a "
                    "single word, as either Positive, or Neutral, or Negative."
                ),
            }
        ]

        instruction = template1["prompt"].format(data=obj["text"])

        messages1.append({"role": "user", "content": instruction})

        prompt = build_llama2_prompt(messages1)


        # Format the data according to the template
        formatted_data = {
            # "id": i,
            "inputs": prompt,
            # "max_new_tokens": Max_New_Tokens,
            # "truncation": Input_Truncation,
        }
        formatted_data_with_label = {
            "id": i,
            "inputs": prompt,
            "label": obj["label"],
            # "max_new_tokens": Max_New_Tokens,
            # "truncation": Input_Truncation,
        }
        # Strip stray "@" characters and write the formatted data, one JSON object per line
        formatted_data["inputs"] = formatted_data["inputs"].replace("@", "")
        json.dump(formatted_data, outfile)
        outfile.write('\n')

        formatted_data_with_label["inputs"] = formatted_data_with_label["inputs"].replace("@", "")
        json.dump(formatted_data_with_label, outfile_with_label)
        outfile_with_label.write('\n')
  
  
  
llm_image = get_huggingface_llm_image_uri(
    "huggingface",  # huggingface or lmi
    version=llm_image_uri_ver,
    session=Sagemaker_Session,
    region=region_name,
)
  
config = {
    'HF_MODEL_ID': HF_model_name,  # model_id from hf.co/models
    'SM_NUM_GPUS': json.dumps(number_of_gpu),  # number of GPUs used per replica
    'MAX_INPUT_LENGTH': json.dumps(MAX_INPUT_LENGTH),  # max length of input text
    'MAX_TOTAL_TOKENS': json.dumps(MAX_TOTAL_TOKENS),  # max length of the generation (including input text)
    'MAX_BATCH_TOTAL_TOKENS': json.dumps(MAX_BATCH_TOTAL_TOKENS),  # limits tokens processed in parallel during generation
    'HUGGING_FACE_HUB_TOKEN': HUGGING_FACE_HUB_TOKEN,
}
  
  
  
llm_model = HuggingFaceModel(
    role=my_role,
    image_uri=llm_image,
    env=config,
)

# Generation parameters, passed to the transform job as environment variables
hyper_params = {
    "max_new_tokens": str(Max_New_Tokens),
    "return_full_text": str(False),
}
  
  
batch_job = llm_model.transformer(
    instance_count=Instance_Count,
    instance_type=InstanceType,
    strategy='MultiRecord',
    # strategy='SingleRecord',  # returns all results, but extremely slow
    assemble_with='Line',
    output_path=s3_output_data_path,
    env=hyper_params,
    accept='application/json',
)
  
batch_job.transform(
    data=f"{s3_input_data_path}/{batchtransform_data_file_name}",
    content_type='application/json',  # also tried 'application/jsonlines'
    split_type='Line',
    wait=True,
)
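
In case it helps with diagnosing, this is roughly how I count the records that come back, assuming batch transform's usual <input-file-name>.out naming under output_path (bucket/key parsing simplified):

import boto3

s3 = boto3.client("s3")
bucket, _, prefix = s3_output_data_path.removeprefix("s3://").partition("/")
out_key = f"{prefix}/{batchtransform_data_file_name}.out"
body = s3.get_object(Bucket=bucket, Key=out_key)["Body"].read().decode("utf-8")
print(len(body.splitlines()))  # 200 with SingleRecord, but only 1 with MultiRecord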