I have a dataset class like the following, and after each epoch I refresh it like this: train_loader.dataset.load_shuffle_concat()
Would it work as expected in a multi-GPU setting? By "expected" I mean the same shuffle-and-concat operation in each process, and all batches being split across the GPUs.
Is this a good way to do what I want?
import numpy as np
import torch
from torch.utils.data import Dataset


class StressDataset(Dataset):
    def __init__(self, hp):
        self.file_path = hp.file_path
        self.max_len = hp.max_len
        self.load_shuffle_concat()

    # shuffle the lines, then concatenate strings up to the target length
    def load_shuffle_concat(self):
        with open(self.file_path, 'r') as f:
            lines = f.readlines()
        # use torch as the source of randomness
        ids = torch.randperm(len(lines)).tolist()
        lines = np.array(lines)[ids]
        self.samples = []
        current_sample = {'x': ''}
        for x in lines:
            # skip lines that are longer than the target length on their own
            if len(x) > self.max_len:
                continue
            if len(current_sample['x']) + len(x) > self.max_len:
                self.samples.append(current_sample)
                current_sample = {'x': x}
            else:
                current_sample['x'] += x
        self.samples.append(current_sample)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        out = {}
        out['x'] = self.samples[idx]['x']
        out['x'] = torch.tensor(out['x'])
        return out