1) When running:

import os

# Set a different temporary directory
os.environ['TMPDIR'] = './tmp'

torch.set_num_threads(1)  # Limit the number of threads used by PyTorch

# Tokenize the text field in our dataset without adding special tokens or padding,
# since we will do this manually.
def tokenize(element):
    return tokenizer(
        element["text"],
        truncation=True,
        max_length=1024,
        add_special_tokens=False,
    )

num_cores = max(1, os.cpu_count() // 2)  # Use half of the available CPU cores

dataset_tokenized = dataset_QA.map(
    tokenize,
    batched=True,
    num_proc=os.cpu_count(),  # use all cores (multiprocessing)
    remove_columns=["text"],  # don't need this anymore, we have the tokens from here on
    keep_in_memory=True,
)

I got this error:
#####################
Process SyncManager-196:
Traceback (most recent call last):
File "/usr/local/lib/python3.12/site-packages/multiprocess/process.py", line 314, in _bootstrap
self.run()
File "/usr/local/lib/python3.12/site-packages/multiprocess/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.12/site-packages/multiprocess/managers.py", line 591, in _run_server
server = cls._Server(registry, address, authkey, serializer)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/multiprocess/managers.py", line 156, in __init__
self.listener = Listener(address=address, backlog=16)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/multiprocess/connection.py", line 461, in __init__
address = address or arbitrary_address(family)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/multiprocess/connection.py", line 80, in arbitrary_address
return tempfile.mktemp(prefix='listener-', dir=util.get_temp_dir())
^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/multiprocess/util.py", line 146, in get_temp_dir
tempdir = tempfile.mkdtemp(prefix='pymp-')
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib64/python3.12/tempfile.py", line 384, in mkdtemp
_os.mkdir(file, 0o700)
OSError: [Errno 5] Input/output error: '/tmp/pymp-7wlx2o08'
EOFError Traceback (most recent call last)
Cell In[24], line 20
8 return tokenizer(
9 element[“text”],
10 truncation=True,
(…)
16 #add_special_tokens=True,
17 )
18 num_cores = max(1, os.cpu_count() // 2) # Use half of your available CPU cores
—> 20 dataset_tokenized = dataset_QA.map(
21 tokenize,
22 batched=True,
23 #num_proc=num_cores, # Use fewer processes
24 #num_proc=4, # Disable multiprocessing
25 num_proc=os.cpu_count(), # multithreaded
26 remove_columns=[“text”], # don’t need this anymore, we have tokens from here on
27 keep_in_memory=True
28 )
File /projetos/fl6f/lib/python3.12/site-packages/datasets/dataset_dict.py:887, in DatasetDict.map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_names, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, desc)
883 if cache_file_names is None:
884 cache_file_names = {k: None for k in self}
885 return DatasetDict(
886 {
→ 887 k: dataset.map(
888 function=function,
889 with_indices=with_indices,
890 with_rank=with_rank,
891 input_columns=input_columns,
892 batched=batched,
893 batch_size=batch_size,
894 drop_last_batch=drop_last_batch,
895 remove_columns=remove_columns,
896 keep_in_memory=keep_in_memory,
897 load_from_cache_file=load_from_cache_file,
898 cache_file_name=cache_file_names[k],
899 writer_batch_size=writer_batch_size,
900 features=features,
901 disable_nullable=disable_nullable,
902 fn_kwargs=fn_kwargs,
903 num_proc=num_proc,
904 desc=desc,
905 )
906 for k, dataset in self.items()
907 }
908 )
File /projetos/fl6f/lib/python3.12/site-packages/datasets/arrow_dataset.py:560, in transmit_format..wrapper(*args, **kwargs)
553 self_format = {
554 “type”: self._format_type,
555 “format_kwargs”: self._format_kwargs,
556 “columns”: self._format_columns,
557 “output_all_columns”: self._output_all_columns,
558 }
559 # apply actual function
→ 560 out: Union[“Dataset”, “DatasetDict”] = func(self, *args, **kwargs)
561 datasets: List[“Dataset”] = list(out.values()) if isinstance(out, dict) else [out]
562 # re-apply format to the output
File /projetos/fl6f/lib/python3.12/site-packages/datasets/arrow_dataset.py:3147, in Dataset.map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)
3141 logger.info(f"Spawning {num_proc} processes")
3142 with hf_tqdm(
3143 unit=" examples",
3144 total=pbar_total,
3145 desc=(desc or “Map”) + f" (num_proc={num_proc})",
3146 ) as pbar:
→ 3147 for rank, done, content in iflatmap_unordered(
3148 pool, Dataset._map_single, kwargs_iterable=kwargs_per_job
3149 ):
3150 if done:
3151 shards_done += 1
File /projetos/fl6f/lib/python3.12/site-packages/datasets/utils/py_utils.py:696, in iflatmap_unordered(pool, func, kwargs_iterable)
694 pool_changed = False
695 manager_cls = Manager if isinstance(pool, multiprocessing.pool.Pool) else multiprocess.Manager
→ 696 with manager_cls() as manager:
697 queue = manager.Queue()
698 async_results = [
699 pool.apply_async(_write_generator_to_queue, (queue, func, kwargs)) for kwargs in kwargs_iterable
700 ]
File /usr/local/lib/python3.12/site-packages/multiprocess/context.py:57, in BaseContext.Manager(self)
55 from .managers import SyncManager
56 m = SyncManager(ctx=self.get_context())
—> 57 m.start()
58 return m
File /usr/local/lib/python3.12/site-packages/multiprocess/managers.py:566, in BaseManager.start(self, initializer, initargs)
564 # get address of server
565 writer.close()
→ 566 self._address = reader.recv()
567 reader.close()
569 # register a finalizer
File /usr/local/lib/python3.12/site-packages/multiprocess/connection.py:253, in _ConnectionBase.recv(self)
251 self._check_closed()
252 self._check_readable()
→ 253 buf = self._recv_bytes()
254 return _ForkingPickler.loads(buf.getbuffer())
File /usr/local/lib/python3.12/site-packages/multiprocess/connection.py:433, in Connection._recv_bytes(self, maxsize)
432 def _recv_bytes(self, maxsize=None):
→ 433 buf = self._recv(4)
434 size, = struct.unpack(“!i”, buf.getvalue())
435 if size == -1:
File /usr/local/lib/python3.12/site-packages/multiprocess/connection.py:402, in Connection._recv(self, size, read)
400 if n == 0:
401 if remaining == size:
→ 402 raise EOFError
403 else:
404 raise OSError(“got end of file during message”)
EOFError:
#################################
I checked the disk space and that is not the problem. This cell was running fine before and then stopped working; I don't understand why.
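For reference, here is a small sanity check I put together (just a sketch; the './tmp' path is the one from the cell above): the Errno 5 comes from os.mkdir under /tmp, which looks like a filesystem-level problem rather than disk space, and I suspect that setting os.environ['TMPDIR'] inside the notebook may not take effect if tempfile has already cached its default directory. The snippet probes both locations and prints where tempfile currently resolves to:

import os
import tempfile

# Probe whether each candidate temp location is actually usable.
# './tmp' is the relative path used in the cell above; adjust if needed.
for candidate in ("/tmp", os.path.abspath("./tmp")):
    try:
        os.makedirs(candidate, exist_ok=True)
        probe = tempfile.mkdtemp(prefix="probe-", dir=candidate)
        os.rmdir(probe)
        print(candidate, "is writable")
    except OSError as err:
        print(candidate, "failed:", err)

# tempfile caches its choice of directory, so a TMPDIR set after the first
# use may be ignored unless tempfile.tempdir is overridden explicitly.
print("tempfile currently resolves to:", tempfile.gettempdir())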
2) The same happens when I run the Trainer:

#from dotenv import load_dotenv
#load_dotenv()

# Set a different temporary directory
os.environ['TMPDIR'] = '/home/fl6f/work/tmp'

# Clear GPU memory before training
clear_gpu_memory()
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()

checkpoint_path = "/projetos/fl6f/LAMMA_QuestionAnswering_Pira/out/checkpoint-156"

# Train the model (fine-tuning)
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["validation"],
    callbacks=[early_stopping],  # add the early stopping callback here
)

model.config.use_cache = False  # silence the warnings; please re-enable for inference!

# Train
trainer.train()

I got this error:
#######################
OSError Traceback (most recent call last)
Cell In[23], line 32
29 model.config.use_cache = False # silence the warnings. Please re-enable for inference!
31 #Train
—> 32 trainer.train()
File /projetos/fl6f/lib/python3.12/site-packages/transformers/trainer.py:1938, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1936 hf_hub_utils.enable_progress_bars()
1937 else:
→ 1938 return inner_training_loop(
1939 args=args,
1940 resume_from_checkpoint=resume_from_checkpoint,
1941 trial=trial,
1942 ignore_keys_for_eval=ignore_keys_for_eval,
1943 )
File /projetos/fl6f/lib/python3.12/site-packages/transformers/trainer.py:2075, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2071 gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs
2073 self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs)
→ 2075 model = self._wrap_model(self.model_wrapped)
2077 # as the model is wrapped, don’t use accelerator.prepare
2078 # this is for unhandled cases such as
2079 # FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX
2080 use_accelerator_prepare = True if model is self.model else False
File /projetos/fl6f/lib/python3.12/site-packages/transformers/trainer.py:1693, in Trainer._wrap_model(self, model, training, dataloader)
1690 return smp.DistributedModel(model, backward_passes_per_step=self.args.gradient_accumulation_steps)
1692 # train/eval could be run multiple-times - if already wrapped, don’t re-wrap it again
→ 1693 if self.accelerator.unwrap_model(model) is not model:
1694 return model
1696 # Mixed precision training with apex (torch < 1.6)
File /projetos/fl6f/lib/python3.12/site-packages/accelerate/accelerator.py:2583, in Accelerator.unwrap_model(self, model, keep_fp32_wrapper)
2552 def unwrap_model(self, model, keep_fp32_wrapper: bool = True):
2553 “”"
2554 Unwraps the model
from the additional layer possible added by [~Accelerator.prepare
]. Useful before saving
2555 the model.
(…)
2581 ```
2582 “”"
→ 2583 return extract_model_from_parallel(model, keep_fp32_wrapper)
File /projetos/fl6f/lib/python3.12/site-packages/accelerate/utils/other.py:85, in extract_model_from_parallel(model, keep_fp32_wrapper, recursive)
82 options += (DeepSpeedEngine,)
84 if is_torch_version(“>=”, FSDP_PYTORCH_VERSION) and is_torch_distributed_available():
—> 85 from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
87 options += (FSDP,)
89 while isinstance(model, options):
File /usr/local/lib64/python3.12/site-packages/torch/distributed/fsdp/__init__.py:1
----> 1 from ._flat_param import FlatParameter as FlatParameter
2 from .fully_sharded_data_parallel import (
3 BackwardPrefetch,
4 CPUOffload,
(…)
18 StateDictType,
19 )
21 __all__ = [
22 “BackwardPrefetch”,
23 “CPUOffload”,
(…)
37 “StateDictType”,
38 ]
File /usr/local/lib64/python3.12/site-packages/torch/distributed/fsdp/_flat_param.py:30
28 import torch.nn.functional as F
29 from torch import Tensor
—> 30 from torch.distributed.fsdp._common_utils import (
31 _FSDPDeviceHandle,
32 _named_parameters_with_duplicates,
33 _no_dispatch_record_stream,
34 _set_fsdp_flattened,
35 HandleTrainingState,
36 )
37 from torch.distributed.utils import (
38 _alloc_storage,
39 _data_ptr_allocated,
40 _free_storage,
41 _p_assert,
42 )
43 from torch.nn.parameter import _ParameterMeta # type: ignore[attr-defined]
File /usr/local/lib64/python3.12/site-packages/torch/distributed/fsdp/_common_utils.py:35
31 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
32 _CHECKPOINT_PREFIX,
33 )
34 from torch.distributed.device_mesh import DeviceMesh
—> 35 from torch.distributed.fsdp._fsdp_extensions import FSDPExtensions
36 from torch.distributed.utils import _apply_to_tensors
37 from torch.utils._mode_utils import no_dispatch
File /usr/local/lib64/python3.12/site-packages/torch/distributed/fsdp/_fsdp_extensions.py:6
4 import torch
5 import torch.distributed as dist
----> 6 from torch.distributed._shard.sharded_tensor.api import ShardedTensor
7 from torch.distributed._shard.sharded_tensor.shard import Shard
8 from torch.distributed._tensor import DeviceMesh, DTensor
File /usr/local/lib64/python3.12/site-packages/torch/distributed/_shard/__init__.py:1
----> 1 from .api import (
2 _shard_tensor,
3 load_with_process_group,
4 shard_module,
5 shard_parameter,
6 )
File /usr/local/lib64/python3.12/site-packages/torch/distributed/_shard/api.py:7
5 import torch.nn as nn
6 from torch.distributed import distributed_c10d
----> 7 from torch.distributed._shard.sharded_tensor import (
8 ShardedTensor,
9 )
10 from .sharding_spec import (
11 ShardingSpec,
12 ChunkShardingSpec
13 )
14 from .sharding_plan import (
15 ShardingPlan
16 )
File /usr/local/lib64/python3.12/site-packages/torch/distributed/_shard/sharded_tensor/__init__.py:11
8 else:
9 ShardingSpec = “ShardingSpec”
—> 11 from .api import (
12 _CUSTOM_SHARDED_OPS,
13 _SHARDED_OPS,
14 Shard,
15 ShardedTensorBase,
16 ShardedTensor,
17 ShardedTensorMetadata,
18 TensorProperties,
19 )
20 from .metadata import ShardMetadata # noqa: F401
21 from torch.distributed._shard.op_registry_utils import _decorator_func
File /usr/local/lib64/python3.12/site-packages/torch/distributed/_shard/sharded_tensor/api.py:38
36 from .metadata import TensorProperties, ShardedTensorMetadata
37 from .shard import Shard
—> 38 from .reshard import reshuffle_local_shard, reshard_local_shard
39 from .utils import (
40 _flatten_tensor_size,
41 _parse_and_validate_remote_device,
(…)
44 build_global_metadata
45 )
46 from torch.distributed.remote_device import _remote_device
File /usr/local/lib64/python3.12/site-packages/torch/distributed/_shard/sharded_tensor/reshard.py:14
9 import torch.distributed._shard.sharding_spec as shard_spec
10 from torch.distributed._shard.sharding_spec._internals import (
11 get_split_size,
12 get_chunked_dim_size,
13 )
—> 14 from torch.distributed.nn.functional import (
15 all_to_all,
16 all_to_all_single,
17 )
18 from torch.distributed._shard.metadata import ShardMetadata
20 from .shard import Shard
File /usr/local/lib64/python3.12/site-packages/torch/distributed/nn/__init__.py:3
1 import torch
2 if torch.distributed.rpc.is_available():
----> 3 from .api.remote_module import RemoteModule
4 from .functional import *
File /usr/local/lib64/python3.12/site-packages/torch/distributed/nn/api/remote_module.py:24
22 import torch.distributed.rpc as rpc
23 from torch import Tensor, device, dtype, nn
—> 24 from torch.distributed.nn.jit import instantiator
25 from torch.distributed import _remote_device
26 from torch.distributed.rpc.internal import _internal_rpc_pickler
File /usr/local/lib64/python3.12/site-packages/torch/distributed/nn/jit/instantiator.py:19
15 logger = logging.getLogger(__name__)
18 _FILE_PREFIX = "remote_module"
—> 19 _TEMP_DIR = tempfile.TemporaryDirectory()
20 INSTANTIATED_TEMPLATE_DIR_PATH = _TEMP_DIR.name
21 logger.info("Created a temporary directory at %s", INSTANTIATED_TEMPLATE_DIR_PATH)
File /usr/lib64/python3.12/tempfile.py:882, in TemporaryDirectory.__init__(self, suffix, prefix, dir, ignore_cleanup_errors, delete)
880 def __init__(self, suffix=None, prefix=None, dir=None,
881 ignore_cleanup_errors=False, *, delete=True):
→ 882 self.name = mkdtemp(suffix, prefix, dir)
883 self._ignore_cleanup_errors = ignore_cleanup_errors
884 self._delete = delete
File /usr/lib64/python3.12/tempfile.py:384, in mkdtemp(suffix, prefix, dir)
382 _sys.audit("tempfile.mkdtemp", file)
383 try:
→ 384 _os.mkdir(file, 0o700)
385 except FileExistsError:
386 continue # try again
OSError: [Errno 5] Input/output error: '/tmp/tmpfe_zflge'
This was also running fine before, but now it gives this error and I have no idea how to solve it.
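Judging by the traceback, it is again tempfile failing to create a directory under /tmp (torch.distributed.nn.jit.instantiator creates a module-level TemporaryDirectory when FSDP is first imported, which here happens inside trainer.train()). As a rough workaround sketch, assuming /home/fl6f/work/tmp really is writable, I am thinking of redirecting Python's temporary directory at the very top of the notebook, before anything caches /tmp:

import os
import tempfile

# Assumed writable scratch location (same path as in the cell above).
scratch = "/home/fl6f/work/tmp"
os.makedirs(scratch, exist_ok=True)

# Point both the environment variable and tempfile's cached default at the
# scratch directory, so later mkdtemp()/TemporaryDirectory() calls
# (e.g. from torch.distributed during trainer.train()) avoid /tmp.
os.environ["TMPDIR"] = scratch
tempfile.tempdir = scratch

print("Temporary files will go to:", tempfile.gettempdir())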