I’m making a batch predict function with a model I trained. The issue is that after creating inputs with the tokenizer, moving the inputs to cuda takes an extremely long time. About 95% of the prediction function time is spent on this, and 2.5% on the actual prediction, so I feel like I must be doing something wrong. Here is the function:
class Predictor:
def __init__(self, model_name, batch_size, max_input_length, binary_threshold):
self.batch_size = batch_size
self.max_input_length = max_input_length
self.model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.binary_threshold = binary_threshold
def predict(self, csv_file_path):
start_time=datetime.datetime.now()
print(os.path.basename(csv_file_path))
df = pd.read_csv(csv_file_path)
print(len(df))
dataset = Dataset.from_pandas(df)
num_batches = math.ceil(len(dataset)/self.batch_size)
my_pb = widgets.IntProgress(
value=0,
min=0,
max=num_batches,
description='Loading:',
bar_style= 'success',
style={'bar_color': 'green'},
orientation='horizontal'
)
display(my_pb)
# Do the predictions, one batch at a time
df['yes_probability'] = pd.Series(dtype='float')
for batch in range(num_batches):
# Tokenize the entire dataset
input_df = df.loc[self.batch_size*batch:self.batch_size*(batch+1)-1, 'body'].tolist()
inputs = self.tokenizer(input_df, return_tensors="pt", truncation=True, padding=True, max_length=self.max_input_length)
inputs = inputs.to('cuda')
# Make predictions
with torch.no_grad():
logits = self.model(**inputs).logits
# Get YES probabilities and add to original dataframe
yes_probabilities = logits.softmax(dim=1)[:,1].to('cpu', non_blocking=True).tolist()
df.loc[self.batch_size*batch:self.batch_size*(batch+1)-1, 'yes_probability'] = yes_probabilities
# Update progress bar
my_pb.value = batch + 1
output_df = df[df['yes_probability'] > self.binary_threshold]
end_time = datetime.datetime.now()
total_time = (end_time - start_time).total_seconds()
time_per_item = round(total_time / (self.batch_size * num_batches), 4)
print(f"{total_time} total seconds ({time_per_item} seconds per item)")
print(f"{len(output_df)}/{len(df)} predicted as YES ({round(100*len(output_df)/len(df), 2)}%)")
print()
print()
output_filename = os.path.basename(csv_file_path).split(os.extsep)[0] + ".csv"
output_df.to_csv(f"/datasets/s3/simple_classifier_outputs/{output_filename}")
The line inputs = inputs.to('cuda')
is what takes up 95% of the time, based on the library line_profiler
.