What does the "AttributeError: 'NoneType' object has no attribute 'cget_managed_ptr'" mean?

I’m trying to train a model with very standard HF code I’ve used before:

import os
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
from pathlib import Path
import glob

def preprocess_function_proofnet_simple(examples: dict[str, list], tokenizer, max_length: int = 1024) -> dict[str, torch.Tensor]:
    """
    Preprocess the input data for the proofnet dataset.

    Each example becomes one training string of the form
    ``<nl_statement><eos_token><formal_statement>``, which is then tokenized
    and padded/truncated to ``max_length``.

    Args:
        examples: The examples to preprocess; a batch holding parallel
            'nl_statement' and 'formal_statement' lists.
        tokenizer: The tokenizer for encoding the texts.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        The processed model inputs, with an added "labels" entry that is a
        copy of the input ids where pad positions are replaced by -100.
    """
    inputs = [
        f"{nl_statement}{tokenizer.eos_token}{formal_statement}"
        for nl_statement, formal_statement in zip(examples['nl_statement'], examples['formal_statement'])
    ]
    model_inputs = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
    # Clone so that masking the labels does not modify the input ids.
    labels = model_inputs.input_ids.clone()
    # NOTE: when the pad token id equals the eos token id, the eos separator
    # placed between the two statements is replaced by -100 here as well.
    labels[labels == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = labels
    return model_inputs

def get_proofnet_dataset(tokenizer, preprocess_function=preprocess_function_proofnet_simple):
    """
    Load the proofnet validation and test splits and tokenize them.

    Args:
        tokenizer: The tokenizer handed to ``preprocess_function``.
        preprocess_function: Callable taking ``(examples, tokenizer)`` and
            returning the processed batch.

    Returns:
        A ``(val_dataset, test_dataset)`` pair with the raw text columns
        removed.
    """
    # Fetch both splits first, then map them, in the same order as before.
    raw_splits = [
        load_dataset("hoskinson-center/proofnet", split=split_name)
        for split_name in ('validation', 'test')
    ]
    val_dataset, test_dataset = [
        raw_split.map(
            lambda batch: preprocess_function(batch, tokenizer),
            batched=True,
            remove_columns=["nl_statement", "formal_statement"],
        )
        for raw_split in raw_splits
    ]
    return val_dataset, test_dataset



# Load Hugging Face token from file
with open(Path("~/keys/hf_file_key.txt").expanduser(), "r") as file:
    hf_token = file.read().strip()

# Set the Hugging Face token as an environment variable
os.environ["HF_TOKEN"] = hf_token

# Login using the token
from huggingface_hub import login

# The import alone does nothing; the login has to be performed explicitly.
login(token=hf_token)

# Load model and tokenizer
pretrained_model_name_or_path = "openai-community/gpt2"
if 'gpt2' in pretrained_model_name_or_path:
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
    # Fall back to the EOS token for padding when the tokenizer defines no pad token.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
    max_length: int = tokenizer.model_max_length

    # Earlier device-selection attempts, kept for reference:
    # device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu"))
    # device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")
    device = torch.device('cpu')  # force CPU
    model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path).to(device)

# Define training arguments with memory optimization tricks
training_args = TrainingArguments(
    output_dir="~/tmp/results",  # Output directory for saving model checkpoints
    per_device_train_batch_size=1,  # Training batch size per device
    per_device_eval_batch_size=1,  # Evaluation batch size per device
    max_steps=2,  # Total number of training steps
    logging_dir='~/tmp/logs',  # Directory for storing logs
    logging_steps=10,  # Frequency of logging steps
    gradient_accumulation_steps=1,  # Accumulate gradients to simulate a larger batch size
    save_steps=500,  # Save checkpoint every 500 steps
    save_total_limit=3,  # Only keep the last 3 checkpoints
    evaluation_strategy="steps",  # Evaluate model at specified steps
    eval_steps=100,  # Evaluate every 100 steps
    gradient_checkpointing=True,  # Enable gradient checkpointing to save memory
    # NOTE(review): the paged_* optimizers are provided by bitsandbytes, which
    # presumably needs a working CUDA build; on a CPU/MPS-only machine this is
    # the likely source of the "'NoneType' object has no attribute
    # 'cget_managed_ptr'" error. Confirm by switching to e.g. "adamw_torch".
    optim="paged_adamw_32bit",  # Optimizer choice with memory optimization
    learning_rate=1e-5,  # Learning rate for training
    warmup_ratio=0.01,  # Warmup ratio for learning rate schedule
    weight_decay=0.01,  # Weight decay for regularization
    lr_scheduler_type='cosine',  # Learning rate scheduler type
    report_to="none",  # Disable reporting to external tracking tools
    # bf16=torch.cuda.is_bf16_supported(),  # Use BF16 if supported by the hardware
    half_precision_backend="auto",  # Automatically select the best backend for mixed precision
    # dataloader_num_workers=4,  # TODO Number of subprocesses for data loading
    # dataloader_pin_memory=True,  # TODO periphery, Pin memory in data loaders for faster transfer to GPU
    # skip_memory_metrics=True,  # Skip memory metrics to save memory
    # dataloader_prefetch_factor=2,  # TODO periphery, Number of batches to prefetch
    # torchdynamo="nvfuser",  # TODO periphery, Use NVFuser backend for optimized torch operations
    full_determinism=True,  # TODO periphery, Ensure reproducibility
)

# get_proofnet_dataset returns the (validation, test) splits; the validation
# split is used as the training data here.
train_dataset, test_dataset = get_proofnet_dataset(tokenizer)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=eval_dataset,
)

# Start training
print(f'\n-- Start training')
trainer.train()

# Save the model and tokenizer
trainer.save_model()  # with no argument, writes to training_args.output_dir
tokenizer.save_pretrained(training_args.output_dir)

But no matter what I try, e.g.:

  1. Forced the CPU in every way I could think of, just to get it to train at all
  2. Used a HF dataset from the internet that I’ve used before
  3. Updated PyTorch with pip install --upgrade torch
  4. Disabled MPS
  5. Double-checked that the CPU was actually being used
  6. Set use_mps_device explicitly in training_args = TrainingArguments(...):

    use_mps_device=True if torch.backends.mps.is_available() else False,

  7. Followed the advice " 1. Verify data types: Ensure that your model and data are using compatible data types. MPS might have issues with certain data types." But it’s obvious this should already work, because the HF Trainer handles it on its own by fetching the device from my model. I’ve checked this code before.
  8. Yes, I did set device = torch.device("cpu")

But it still doesn’t work: I get a very cryptic error that I’ve never seen before, and nothing shows up on Google:

Exception has occurred: AttributeError
'NoneType' object has no attribute 'cget_managed_ptr'
  File "/Users/me/py_proj/py_src/train/hf_trainer_train.py", line 93, in <module>
AttributeError: 'NoneType' object has no attribute 'cget_managed_ptr'

What is going on? How do I debug this?

I also get this odd warning, and I wonder if it’s related:

'NoneType' object has no attribute 'cadam32bit_grad_fp32'

My conda env (this is my local machine; on the server I’m using venv):

