Error with the tmp directory

1) When running the following:

import os
import torch

# Set a different temporary directory
os.environ['TMPDIR'] = './tmp'
torch.set_num_threads(1)  # Limit the number of threads used by PyTorch

# Tokenize the text field in our dataset without adding special tokens or padding,
# since we will do this manually.
def tokenize(element):
    return tokenizer(
        element["text"],
        truncation=True,
        #max_length=2048,
        max_length=1024,
        #max_length=512,
        #padding=True,
        add_special_tokens=False,
        #add_special_tokens=True,
    )

num_cores = max(1, os.cpu_count() // 2)  # Use half of the available CPU cores

dataset_tokenized = dataset_QA.map(
    tokenize,
    batched=True,
    #num_proc=num_cores,        # use fewer worker processes
    #num_proc=4,
    num_proc=os.cpu_count(),    # one worker process per CPU core
    remove_columns=["text"],    # not needed anymore, we only keep the tokens from here on
    keep_in_memory=True
)
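A note on the TMPDIR assignment above: in the traceback below, multiprocess still creates its scratch directory under /tmp ('/tmp/pymp-…'), so the './tmp' setting is apparently not being picked up. Two things can defeat it: Python's tempfile module resolves its default directory the first time it is needed and then caches it, so setting TMPDIR afterwards has no effect, and a relative path like './tmp' is only used if it actually exists and is writable at that moment. A minimal sketch of forcing both the environment variable and the cached value, assuming ./tmp is just an example path on a writable filesystem:

import os
import tempfile

# Example scratch location (hypothetical); any directory on a healthy, writable filesystem works.
scratch = os.path.abspath('./tmp')
os.makedirs(scratch, exist_ok=True)

os.environ['TMPDIR'] = scratch   # inherited by worker processes spawned later
tempfile.tempdir = scratch       # overrides whatever tempfile may have already cached

print(tempfile.gettempdir())     # should now print the scratch path

Redirecting the temp directory is only a workaround, though; if the filesystem behind /tmp itself is failing (see the I/O error below), that still needs to be fixed.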

Got the error:
#####################
Process SyncManager-196:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/site-packages/multiprocess/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/local/lib/python3.12/site-packages/multiprocess/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.12/site-packages/multiprocess/managers.py", line 591, in _run_server
    server = cls._Server(registry, address, authkey, serializer)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/multiprocess/managers.py", line 156, in __init__
    self.listener = Listener(address=address, backlog=16)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/multiprocess/connection.py", line 461, in __init__
    address = address or arbitrary_address(family)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/multiprocess/connection.py", line 80, in arbitrary_address
    return tempfile.mktemp(prefix='listener-', dir=util.get_temp_dir())
                                                   ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/multiprocess/util.py", line 146, in get_temp_dir
    tempdir = tempfile.mkdtemp(prefix='pymp-')
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib64/python3.12/tempfile.py", line 384, in mkdtemp
    _os.mkdir(file, 0o700)
OSError: [Errno 5] Input/output error: '/tmp/pymp-7wlx2o08'

EOFError Traceback (most recent call last)
Cell In[24], line 20
8 return tokenizer(
9 element["text"],
10 truncation=True,
(…)
16 #add_special_tokens=True,
17 )
18 num_cores = max(1, os.cpu_count() // 2) # Use half of your available CPU cores
—> 20 dataset_tokenized = dataset_QA.map(
21 tokenize,
22 batched=True,
23 #num_proc=num_cores, # Use fewer processes
24 #num_proc=4, # Disable multiprocessing
25 num_proc=os.cpu_count(), # multithreaded
26 remove_columns=["text"], # don't need this anymore, we have tokens from here on
27 keep_in_memory=True
28 )

File /projetos/fl6f/lib/python3.12/site-packages/datasets/dataset_dict.py:887, in DatasetDict.map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_names, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, desc)
883 if cache_file_names is None:
884 cache_file_names = {k: None for k in self}
885 return DatasetDict(
886 {
→ 887 k: dataset.map(
888 function=function,
889 with_indices=with_indices,
890 with_rank=with_rank,
891 input_columns=input_columns,
892 batched=batched,
893 batch_size=batch_size,
894 drop_last_batch=drop_last_batch,
895 remove_columns=remove_columns,
896 keep_in_memory=keep_in_memory,
897 load_from_cache_file=load_from_cache_file,
898 cache_file_name=cache_file_names[k],
899 writer_batch_size=writer_batch_size,
900 features=features,
901 disable_nullable=disable_nullable,
902 fn_kwargs=fn_kwargs,
903 num_proc=num_proc,
904 desc=desc,
905 )
906 for k, dataset in self.items()
907 }
908 )

File /projetos/fl6f/lib/python3.12/site-packages/datasets/arrow_dataset.py:560, in transmit_format.<locals>.wrapper(*args, **kwargs)
553 self_format = {
554 "type": self._format_type,
555 "format_kwargs": self._format_kwargs,
556 "columns": self._format_columns,
557 "output_all_columns": self._output_all_columns,
558 }
559 # apply actual function
→ 560 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
561 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
562 # re-apply format to the output

File /projetos/fl6f/lib/python3.12/site-packages/datasets/arrow_dataset.py:3147, in Dataset.map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)
3141 logger.info(f"Spawning {num_proc} processes")
3142 with hf_tqdm(
3143 unit=" examples",
3144 total=pbar_total,
3145 desc=(desc or "Map") + f" (num_proc={num_proc})",
3146 ) as pbar:
→ 3147 for rank, done, content in iflatmap_unordered(
3148 pool, Dataset._map_single, kwargs_iterable=kwargs_per_job
3149 ):
3150 if done:
3151 shards_done += 1

File /projetos/fl6f/lib/python3.12/site-packages/datasets/utils/py_utils.py:696, in iflatmap_unordered(pool, func, kwargs_iterable)
694 pool_changed = False
695 manager_cls = Manager if isinstance(pool, multiprocessing.pool.Pool) else multiprocess.Manager
→ 696 with manager_cls() as manager:
697 queue = manager.Queue()
698 async_results = [
699 pool.apply_async(_write_generator_to_queue, (queue, func, kwargs)) for kwargs in kwargs_iterable
700 ]

File /usr/local/lib/python3.12/site-packages/multiprocess/context.py:57, in BaseContext.Manager(self)
55 from .managers import SyncManager
56 m = SyncManager(ctx=self.get_context())
—> 57 m.start()
58 return m

File /usr/local/lib/python3.12/site-packages/multiprocess/managers.py:566, in BaseManager.start(self, initializer, initargs)
564 # get address of server
565 writer.close()
→ 566 self._address = reader.recv()
567 reader.close()
569 # register a finalizer

File /usr/local/lib/python3.12/site-packages/multiprocess/connection.py:253, in _ConnectionBase.recv(self)
251 self._check_closed()
252 self._check_readable()
→ 253 buf = self._recv_bytes()
254 return _ForkingPickler.loads(buf.getbuffer())

File /usr/local/lib/python3.12/site-packages/multiprocess/connection.py:433, in Connection._recv_bytes(self, maxsize)
432 def _recv_bytes(self, maxsize=None):
→ 433 buf = self._recv(4)
434 size, = struct.unpack("!i", buf.getvalue())
435 if size == -1:

File /usr/local/lib/python3.12/site-packages/multiprocess/connection.py:402, in Connection._recv(self, size, read)
400 if n == 0:
401 if remaining == size:
→ 402 raise EOFError
403 else:
404 raise OSError("got end of file during message")

EOFError:
#################################
I checked the disk space and that is not the problem. It was running before and then stopped running; I don't understand why.
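Errno 5 (Input/output error) is raised by the operating system when the mkdir call itself fails, so it usually points at the filesystem behind /tmp (for example a failing or stale mount on that node) rather than at free space or at datasets itself. A quick check, just a sketch, to see whether /tmp can be written to at all from the same kernel:

import shutil
import tempfile

# Try to create and then remove a scratch directory directly under /tmp.
# If this also raises OSError [Errno 5], the problem is the filesystem
# behind /tmp, not datasets or multiprocess.
path = tempfile.mkdtemp(dir='/tmp')
print('mkdtemp under /tmp worked:', path)
shutil.rmtree(path)

If that fails too, the node's /tmp is the culprit and the mount (or the machine) needs attention; if it succeeds, the error is more likely intermittent or tied to the load the worker processes put on it.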


2) The same happens when I run the Trainer:


#from dotenv import load_dotenv
#load_dotenv()

# Set a different temporary directory
os.environ['TMPDIR'] = '/home/fl6f/work/tmp'

# Clear GPU memory before training
clear_gpu_memory()
#wait_until_enough_gpu_memory(min_memory_available)
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
checkpoint_path = "/projetos/fl6f/LAMMA_QuestionAnswering_Pira/out/checkpoint-156"

# Train the model (fine-tuning)
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    #data_collator=collate,
    data_collator=data_collator,
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["validation"],
    #train_dataset=subset_train,
    #eval_dataset=subset_test,
    #callbacks=[early_stopping, PlotLossCallback()]
    callbacks=[early_stopping]  # Add the early stopping callback here
)

model.config.use_cache = False  # silence the warnings; please re-enable for inference!

# Train
trainer.train()

I got this error:
#######################

OSError Traceback (most recent call last)
Cell In[23], line 32
29 model.config.use_cache = False # silence the warnings. Please re-enable for inference!
31 #Train
—> 32 trainer.train()

File /projetos/fl6f/lib/python3.12/site-packages/transformers/trainer.py:1938, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1936 hf_hub_utils.enable_progress_bars()
1937 else:
→ 1938 return inner_training_loop(
1939 args=args,
1940 resume_from_checkpoint=resume_from_checkpoint,
1941 trial=trial,
1942 ignore_keys_for_eval=ignore_keys_for_eval,
1943 )

File /projetos/fl6f/lib/python3.12/site-packages/transformers/trainer.py:2075, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2071 gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs
2073 self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs)
→ 2075 model = self._wrap_model(self.model_wrapped)
2077 # as the model is wrapped, don’t use accelerator.prepare
2078 # this is for unhandled cases such as
2079 # FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX
2080 use_accelerator_prepare = True if model is self.model else False

File /projetos/fl6f/lib/python3.12/site-packages/transformers/trainer.py:1693, in Trainer._wrap_model(self, model, training, dataloader)
1690 return smp.DistributedModel(model, backward_passes_per_step=self.args.gradient_accumulation_steps)
1692 # train/eval could be run multiple-times - if already wrapped, don’t re-wrap it again
→ 1693 if self.accelerator.unwrap_model(model) is not model:
1694 return model
1696 # Mixed precision training with apex (torch < 1.6)

File /projetos/fl6f/lib/python3.12/site-packages/accelerate/accelerator.py:2583, in Accelerator.unwrap_model(self, model, keep_fp32_wrapper)
2552 def unwrap_model(self, model, keep_fp32_wrapper: bool = True):
2553 """
2554 Unwraps the model from the additional layer possible added by [~Accelerator.prepare]. Useful before saving
2555 the model.
(…)
2581 ```
2582 """
→ 2583 return extract_model_from_parallel(model, keep_fp32_wrapper)

File /projetos/fl6f/lib/python3.12/site-packages/accelerate/utils/other.py:85, in extract_model_from_parallel(model, keep_fp32_wrapper, recursive)
82 options += (DeepSpeedEngine,)
84 if is_torch_version(">=", FSDP_PYTORCH_VERSION) and is_torch_distributed_available():
—> 85 from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
87 options += (FSDP,)
89 while isinstance(model, options):

File /usr/local/lib64/python3.12/site-packages/torch/distributed/fsdp/__init__.py:1
----> 1 from ._flat_param import FlatParameter as FlatParameter
2 from .fully_sharded_data_parallel import (
3 BackwardPrefetch,
4 CPUOffload,
(…)
18 StateDictType,
19 )
21 __all__ = [
22 "BackwardPrefetch",
23 "CPUOffload",
(…)
37 "StateDictType",
38 ]

File /usr/local/lib64/python3.12/site-packages/torch/distributed/fsdp/_flat_param.py:30
28 import torch.nn.functional as F
29 from torch import Tensor
—> 30 from torch.distributed.fsdp._common_utils import (
31 _FSDPDeviceHandle,
32 _named_parameters_with_duplicates,
33 _no_dispatch_record_stream,
34 _set_fsdp_flattened,
35 HandleTrainingState,
36 )
37 from torch.distributed.utils import (
38 _alloc_storage,
39 _data_ptr_allocated,
40 _free_storage,
41 _p_assert,
42 )
43 from torch.nn.parameter import _ParameterMeta # type: ignore[attr-defined]

File /usr/local/lib64/python3.12/site-packages/torch/distributed/fsdp/_common_utils.py:35
31 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
32 _CHECKPOINT_PREFIX,
33 )
34 from torch.distributed.device_mesh import DeviceMesh
—> 35 from torch.distributed.fsdp._fsdp_extensions import FSDPExtensions
36 from torch.distributed.utils import _apply_to_tensors
37 from torch.utils._mode_utils import no_dispatch

File /usr/local/lib64/python3.12/site-packages/torch/distributed/fsdp/_fsdp_extensions.py:6
4 import torch
5 import torch.distributed as dist
----> 6 from torch.distributed._shard.sharded_tensor.api import ShardedTensor
7 from torch.distributed._shard.sharded_tensor.shard import Shard
8 from torch.distributed._tensor import DeviceMesh, DTensor

File /usr/local/lib64/python3.12/site-packages/torch/distributed/_shard/__init__.py:1
----> 1 from .api import (
2 _shard_tensor,
3 load_with_process_group,
4 shard_module,
5 shard_parameter,
6 )

File /usr/local/lib64/python3.12/site-packages/torch/distributed/_shard/api.py:7
5 import torch.nn as nn
6 from torch.distributed import distributed_c10d
----> 7 from torch.distributed._shard.sharded_tensor import (
8 ShardedTensor,
9 )
10 from .sharding_spec import (
11 ShardingSpec,
12 ChunkShardingSpec
13 )
14 from .sharding_plan import (
15 ShardingPlan
16 )

File /usr/local/lib64/python3.12/site-packages/torch/distributed/_shard/sharded_tensor/__init__.py:11
8 else:
9 ShardingSpec = "ShardingSpec"
—> 11 from .api import (
12 _CUSTOM_SHARDED_OPS,
13 _SHARDED_OPS,
14 Shard,
15 ShardedTensorBase,
16 ShardedTensor,
17 ShardedTensorMetadata,
18 TensorProperties,
19 )
20 from .metadata import ShardMetadata # noqa: F401
21 from torch.distributed._shard.op_registry_utils import _decorator_func

File /usr/local/lib64/python3.12/site-packages/torch/distributed/_shard/sharded_tensor/api.py:38
36 from .metadata import TensorProperties, ShardedTensorMetadata
37 from .shard import Shard
—> 38 from .reshard import reshuffle_local_shard, reshard_local_shard
39 from .utils import (
40 _flatten_tensor_size,
41 _parse_and_validate_remote_device,
(…)
44 build_global_metadata
45 )
46 from torch.distributed.remote_device import _remote_device

File /usr/local/lib64/python3.12/site-packages/torch/distributed/_shard/sharded_tensor/reshard.py:14
9 import torch.distributed._shard.sharding_spec as shard_spec
10 from torch.distributed._shard.sharding_spec._internals import (
11 get_split_size,
12 get_chunked_dim_size,
13 )
—> 14 from torch.distributed.nn.functional import (
15 all_to_all,
16 all_to_all_single,
17 )
18 from torch.distributed._shard.metadata import ShardMetadata
20 from .shard import Shard

File /usr/local/lib64/python3.12/site-packages/torch/distributed/nn/__init__.py:3
1 import torch
2 if torch.distributed.rpc.is_available():
----> 3 from .api.remote_module import RemoteModule
4 from .functional import *

File /usr/local/lib64/python3.12/site-packages/torch/distributed/nn/api/remote_module.py:24
22 import torch.distributed.rpc as rpc
23 from torch import Tensor, device, dtype, nn
—> 24 from torch.distributed.nn.jit import instantiator
25 from torch.distributed import _remote_device
26 from torch.distributed.rpc.internal import _internal_rpc_pickler

File /usr/local/lib64/python3.12/site-packages/torch/distributed/nn/jit/instantiator.py:19
15 logger = logging.getLogger(__name__)
18 _FILE_PREFIX = "_remote_module_"
—> 19 _TEMP_DIR = tempfile.TemporaryDirectory()
20 INSTANTIATED_TEMPLATE_DIR_PATH = _TEMP_DIR.name
21 logger.info("Created a temporary directory at %s", INSTANTIATED_TEMPLATE_DIR_PATH)

File /usr/lib64/python3.12/tempfile.py:882, in TemporaryDirectory.__init__(self, suffix, prefix, dir, ignore_cleanup_errors, delete)
880 def __init__(self, suffix=None, prefix=None, dir=None,
881 ignore_cleanup_errors=False, *, delete=True):
→ 882 self.name = mkdtemp(suffix, prefix, dir)
883 self._ignore_cleanup_errors = ignore_cleanup_errors
884 self._delete = delete

File /usr/lib64/python3.12/tempfile.py:384, in mkdtemp(suffix, prefix, dir)
382 _sys.audit("tempfile.mkdtemp", file)
383 try:
→ 384 _os.mkdir(file, 0o700)
385 except FileExistsError:
386 continue # try again

OSError: [Errno 5] Input/output error: '/tmp/tmpfe_zflge'

This was also running before, but now it gives this error and I have no idea how to solve it.
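One observation on this second traceback: the failing mkdtemp again lands under /tmp ('/tmp/tmpfe_zflge') even though TMPDIR was set to /home/fl6f/work/tmp in the cell. The tempfile.TemporaryDirectory() call happens at import time in torch.distributed.nn.jit.instantiator (visible in the traceback), so the TMPDIR value only helps if it is already in place, and not shadowed by a previously cached tempfile directory, before torch's distributed/FSDP modules are first imported. A minimal sketch, assuming /home/fl6f/work/tmp exists and is writable, of setting it at the very top of the notebook:

import os
import tempfile

# Do this BEFORE importing torch / transformers / datasets, so that any
# import-time tempfile.TemporaryDirectory() call lands here instead of /tmp.
# The path below is the one from the post; any writable directory works.
os.environ['TMPDIR'] = '/home/fl6f/work/tmp'
tempfile.tempdir = '/home/fl6f/work/tmp'

import torch                      # imported only after the temp directory is redirected
from transformers import Trainer

Even so, if /tmp itself is returning I/O errors, anything that hard-codes /tmp may still fail, so checking the health of that filesystem (or asking the cluster admins to) is worth doing in parallel.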


The most suspicious part of this information is Python 3.12. Python 3.12 frequently causes errors in HF Spaces.


Thank you for the reply! Considering your comment, which version of Python should I use for LLM fine-tuning? Which one is recommended?


The version used by HF is 3.10, and 3.11 seems to be relatively stable in its newer releases. I'm using 3.9 in my local environment, but since it's close to end of support and more and more things stop working with it, I recommend using 3.10 or later. :sweat_smile:
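If it helps to confirm which interpreter the notebook kernel is actually running before deciding on a downgrade, a trivial check:

import sys

# The tracebacks above reference /usr/local/lib/python3.12, so this should
# print a 3.12.x version string if the kernel is the suspected one.
print(sys.version)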


It could be an access timeout error caused by a large number of files and/or subfolders, as per the following explanation:
https://research.google.com/colaboratory/faq.html#drive-timeout