I started a GRPO training run with this script:
from datasets import load_dataset
import json
# Local JSON file holding the GRPO training examples.
dataset_id = "/data/cy/LLMlable/chat/train_data/train0513_grpo.json"
# Read that file with the generic "json" loader and take its "train" split.
train_dataset = load_dataset(
    "json",
    data_files=dataset_id,
    split="train",
)
def make_conversation(example):
    """Build the chat-style ``prompt`` column for one dataset row.

    Args:
        example: A mapping with an ``"instruction"`` entry; its value becomes
            the content of a single user message.

    Returns:
        A dict with one key, ``"prompt"``, holding a one-element list of
        ``{"role": "user", "content": ...}`` messages.

    Raises:
        KeyError: If ``example`` has no ``"instruction"`` entry.
    """
    user_message = {"role": "user", "content": example["instruction"]}
    return {"prompt": [user_message]}
# Add the "prompt" column to every row, then drop the "input" and
# "instruction" columns.
train_dataset = train_dataset.map(make_conversation).remove_columns(
    ["input", "instruction"]
)
print(train_dataset)
import torch
from transformers import AutoModelForCausalLM,AutoTokenizer
import os
# Point CUDA_HOME at the CUDA 12.5 installation.
os.environ["CUDA_HOME"] = '/usr/local/cuda-12.5'
# Put CUDA's bin directory at the front of PATH.
os.environ["PATH"] = ":".join(("/usr/local/cuda-12.5/bin", os.environ["PATH"]))
# Local checkpoint directory of the model to fine-tune.
model_id = "/data/cy/LLM/LLaMA-Factory-main/export/test0516"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto",
    # KV cache is switched off; gradient checkpointing is enabled on this
    # model further down.
    use_cache=False,
)
from peft import LoraConfig, get_peft_model
# LoRA adapter settings: rank 8, attached to the modules named "q_proj" and
# "v_proj".
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
)
# Gradient checkpointing is enabled on the base model before it is wrapped.
model.gradient_checkpointing_enable()
model = get_peft_model(model, lora_config)
# Print how many parameters the wrapped model will train.
model.print_trainable_parameters()
import re
import psycopg2
import json
import hashlib
import random
# Reward lookup table, loaded once when the script starts.
# hint_reward indexes it as rw_dict[sqlname][plan_hash].
rw_path = '/data/cy/LLMlable/grpo/grpo/reward_stack.json'
# Explicit encoding so the result does not depend on the process locale.
with open(rw_path, 'r', encoding='utf-8') as f:
    rw_dict = json.load(f)
# Connection settings for the database the reward functions query.
host = "127.0.0.1"
port = "9926"
dbname = "stack"
user = "cy"
# The password must not live in the source file: it is taken from the
# PGPASSWORD environment variable.  When the variable is unset this is None
# and authentication is left to the driver's own lookup (e.g. ~/.pgpass) --
# TODO confirm that fallback matches the deployment.
password = os.environ.get("PGPASSWORD")
# NOTE(review): hint_reward opens its own connection for every completion, so
# this module-level connection/cursor looks unused in the visible code --
# confirm before removing it.
connection = psycopg2.connect(
    host=host,
    port=port,
    dbname=dbname,
    user=user,
    password=password,
)
cursor = connection.cursor()
# Maps the hint names used in model completions to the prefix of the
# corresponding "set enable_*=" statement; the value is appended by the
# reward functions.
hintdict = {
    hint_name: f"set {setting_name}="
    for hint_name, setting_name in (
        ("hash join", "enable_hashjoin"),
        ("merge join", "enable_mergejoin"),
        ("nested loop join", "enable_nestloop"),
        ("index only scan", "enable_indexonlyscan"),
        ("sequential scan", "enable_seqscan"),
        ("index scan", "enable_indexscan"),
    )
}
# Directory prefix that hint_reward prepends to each sqlname.
sqlpath = '/data/cy/LLMlable/chat/stack/all/'
def getCostPlan(sql, cur):
    """Run ``explain (COSTS)`` on ``sql`` and return the fetched rows.

    Args:
        sql: Text of the statement to explain; it is appended verbatim to
            the ``explain (COSTS) `` prefix.
        cur: Cursor object providing ``execute`` and ``fetchall``.

    Returns:
        Whatever ``cur.fetchall()`` returns after the EXPLAIN statement ran.
    """
    cur.execute("explain (COSTS) " + sql)
    return cur.fetchall()
def read_sql_file(file_path):
    """Return the full text of the file at ``file_path``.

    The file is decoded as UTF-8 regardless of the process locale.

    Args:
        file_path: Path of the SQL file to read.

    Returns:
        The file's contents as one string.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()
# def format_reward(completions, **kwargs):
# """Reward function that checks if the completion has a specific format."""
# pattern=r"<think>[\s\S]*?<\/think>\s*hint:\s*\{\s*[\s\S]*?\s*\}\s*"
# completion_contents = [completion[0]["content"] for completion in completions]
# for i in completion_contents:
# print(i)
# matches = [re.match(pattern, content) for content in completion_contents]
# rewards_list = [1.0 if match else 0.0 for match in matches]
# # rewards_list = [1.0 if match else 0.0+random.random() for match in matches]
# print("f1:",rewards_list)
# return [1.0 if match else 0.0 for match in matches]
def _hint_terms_parse(content):
    """Return True when every ``name: value`` term after ``hint:`` is usable.

    The text between the first and second ``hint:`` markers (or to the end
    of the string) is stripped of braces and newlines and split on commas.
    Each piece must contain a ``:`` and its name part must be a key of
    ``hintdict``; the value part is not inspected.
    """
    segments = content.split('hint:')
    if len(segments) < 2:
        # No "hint:" marker at all.
        return False
    hint_body = segments[1].replace('}', '').replace('{', '').replace("\n", '')
    for term in hint_body.split(','):
        parts = term.split(':')
        if len(parts) < 2 or parts[0].strip() not in hintdict:
            return False
    return True


def format_reward2(completions, **kwargs):
    """Score each completion 1.0 if its hint block parses, otherwise 0.0.

    Args:
        completions: Sequence of completions; the text of each one is read
            from ``completion[0]["content"]``.
        **kwargs: Additional keyword arguments; not used here.

    Returns:
        A list of floats, one per completion, in input order.
    """
    rw_list = [
        1.0 if _hint_terms_parse(completion[0]["content"]) else 0.0
        for completion in completions
    ]
    print("f2:",rw_list)
    return rw_list
# Accepted right-hand side of a generated "set enable_*=" statement: a single
# alphanumeric token, optionally single-quoted.  Completions are model output,
# so anything else (spaces, ';', comment markers, ...) is refused before it is
# sent to the database.
_HINT_VALUE_RE = re.compile(r"'?[A-Za-z0-9]+'?")


def _hint_statements(content):
    """Translate a completion's hint block into ``set enable_*`` statements.

    Args:
        content: Full completion text, expected to contain ``hint:`` followed
            by comma-separated ``name: value`` terms, e.g.
            ``{hash join: True, merge join: False}``.

    Returns:
        A list of statements of the form ``set enable_xxx= <value>;``.

    Raises:
        IndexError: If there is no ``hint:`` marker or a term has no ``:``.
        KeyError: If a hint name is not a key of ``hintdict``.
        ValueError: If a value is not a single plain token.
    """
    hint_body = content.split('hint:')[1]
    hint_body = hint_body.replace('}', '').replace('{', '').replace("\n", '')
    statements = []
    for term in hint_body.split(','):
        parts = term.split(':')
        prefix = hintdict[parts[0].strip()]
        value = parts[1].strip()
        if not _HINT_VALUE_RE.fullmatch(value):
            raise ValueError(f"unsupported hint value: {value!r}")
        statements.append(prefix + ' ' + value + ';')
    return statements


def hint_reward(completions, **kwargs):
    """Score each completion by the stored reward of the plan its hints yield.

    For every completion, the hint block is turned into ``set enable_*``
    statements and executed on a new database connection.  The SQL file named
    by the matching ``sqlname`` is then explained on that connection, and the
    MD5 hex digest of the stringified plan rows is looked up in
    ``rw_dict[sqlname]``.

    Args:
        completions: Sequence of completions; the text of each one is read
            from ``completion[0]["content"]``.
        **kwargs: Must contain ``sqlname``, a sequence with one SQL file name
            (relative to ``sqlpath``) per completion.

    Returns:
        A list with one reward per completion.  The reward is 0.0 when the
        hints cannot be parsed, the database rejects a statement, or the
        resulting plan has no entry in ``rw_dict``.

    Raises:
        KeyError: If ``sqlname`` is missing from ``kwargs``.
        OSError: If an SQL file cannot be read.
    """
    completion_contents = [completion[0]["content"] for completion in completions]
    rewards_list = []
    sqlnames = kwargs['sqlname']
    # Expected hint block, e.g.:
    # {hash join: True, merge join: False, nested loop join: True,
    #  index scan: True, sequential scan: True, index only scan: False}
    for content, sqlname in zip(completion_contents, sqlnames):
        sql_txt = read_sql_file(sqlpath + sqlname)
        # One connection per completion, presumably so that settings applied
        # for one completion cannot affect the next one.
        conn = psycopg2.connect(
            host=host,
            port=port,
            dbname=dbname,
            user=user,
            password=password,
        )
        cur = conn.cursor()
        try:
            for hint_str in _hint_statements(content):
                cur.execute(hint_str)
            # MD5 is only a lookup key here; it has to match how the keys of
            # rw_dict are spelled.
            plan_hash = hashlib.md5(
                str(getCostPlan(sql_txt, cur)).encode()
            ).hexdigest()
            reward = rw_dict[sqlname][plan_hash]
        except (IndexError, KeyError, ValueError, psycopg2.Error):
            # Unparseable hints, a statement the database refused, or a plan
            # without a stored reward all score zero.
            reward = 0.0
        finally:
            # Release both the cursor and the connection; the loop opens a
            # new connection on every iteration.
            cur.close()
            conn.close()
        rewards_list.append(reward)
    print("f3:",rewards_list)
    return rewards_list
from trl import GRPOConfig
# Training arguments for the GRPO run.
training_args = GRPOConfig(
    output_dir="V2-GRPO-test",
    learning_rate=1e-5,
    # Keep the extra dataset columns: hint_reward reads "sqlname" from its
    # keyword arguments.
    remove_unused_columns=False,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    bf16=True,
    # Parameters that control the data preprocessing
    max_completion_length=1024,  # default: 256
    num_generations=4,  # default: 8
    max_prompt_length=3072,  # default: 512
    per_device_train_batch_size=1,
    # Parameters related to reporting and saving
    report_to=["wandb"],
    logging_steps=5,
    push_to_hub=False,
    save_strategy="steps",
    save_steps=2,
    gradient_checkpointing=True,
)
from trl import GRPOTrainer
# Combine the PEFT-wrapped model, both reward functions, the training
# arguments and the prepared dataset.
trainer = GRPOTrainer(
    model=model,
    reward_funcs=[format_reward2, hint_reward],
    args=training_args,
    train_dataset=train_dataset,
)
trainer.train()
# Save the trained model into the configured output directory.
trainer.save_model(training_args.output_dir)
The log shows that, although the reward and grad_norm are non-zero, the training loss is always reported as 0.0. However, when training finished, the final summary shows a non-zero train_loss. Why does this happen? Has my model actually been trained?
The log:
Dataset({
features: ['output', 'sqlname', 'prompt'],
num_rows: 961
})
[2025-05-16 20:49:53,716] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
/home/cy/anaconda3/envs/hf_trl/lib/python3.11/site-packages/torch/utils/checkpoint.py:87: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
{'loss': 0.0, 'grad_norm': 0.09833737462759018, 'learning_rate': 9.833333333333333e-06, 'completion_length': 242.15, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.686456960439682, 'reward': 1.686456948518753, 'reward_std': 0.5303836800158024, 'kl': 0.0004268963893991895, 'epoch': 0.02}
{'loss': 0.0, 'grad_norm': 0.08386103063821793, 'learning_rate': 9.625e-06, 'completion_length': 244.425, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.5837401330471039, 'reward': 1.583740133047104, 'reward_std': 0.6751966059207917, 'kl': 0.0005165024515008554, 'epoch': 0.04}
{'loss': 0.0, 'grad_norm': 0.07626724988222122, 'learning_rate': 9.416666666666667e-06, 'completion_length': 240.85, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.7732090130448341, 'reward': 1.7732090175151825, 'reward_std': 0.7709747180342674, 'kl': 0.0005288927190122194, 'epoch': 0.06}
{'loss': 0.0, 'grad_norm': 0.10117252171039581, 'learning_rate': 9.208333333333333e-06, 'completion_length': 235.5875, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.7216249376535415, 'reward': 1.7216249585151673, 'reward_std': 0.6284362055361271, 'kl': 0.0005364336524507962, 'epoch': 0.08}
{'loss': 0.0, 'grad_norm': 0.08160920441150665, 'learning_rate': 9e-06, 'completion_length': 240.0375, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.789138701558113, 'reward': 1.7891387045383453, 'reward_std': 0.6136982448399066, 'kl': 0.0005334435947588645, 'epoch': 0.1}
{'loss': 0.0, 'grad_norm': 0.08980654925107956, 'learning_rate': 8.791666666666667e-06, 'completion_length': 238.0375, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.7379942715168, 'reward': 1.7379942655563354, 'reward_std': 0.6109503719955682, 'kl': 0.0005178835795959458, 'epoch': 0.12}
{'loss': 0.0, 'grad_norm': 0.09846791625022888, 'learning_rate': 8.583333333333333e-06, 'completion_length': 238.1, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.8919083803892136, 'reward': 1.8919083714485168, 'reward_std': 0.7251449711620808, 'kl': 0.0005302477686200291, 'epoch': 0.15}
{'loss': 0.0, 'grad_norm': 0.06589101254940033, 'learning_rate': 8.375e-06, 'completion_length': 228.65, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.7236788332462311, 'reward': 1.7236788332462312, 'reward_std': 0.6145793333649635, 'kl': 0.0004996116273105145, 'epoch': 0.17}
{'loss': 0.0, 'grad_norm': 0.07223747670650482, 'learning_rate': 8.166666666666668e-06, 'completion_length': 240.2375, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.5457767516374588, 'reward': 1.545776754617691, 'reward_std': 0.5292002744972706, 'kl': 0.0005244471380137838, 'epoch': 0.19}
{'loss': 0.0, 'grad_norm': 0.09219600260257721, 'learning_rate': 7.958333333333333e-06, 'completion_length': 242.2375, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.7113161787390709, 'reward': 1.7113161981105804, 'reward_std': 0.6662813112139702, 'kl': 0.0005246120665105991, 'epoch': 0.21}
{'loss': 0.0, 'grad_norm': 0.08651315420866013, 'learning_rate': 7.75e-06, 'completion_length': 236.2625, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.7821606770157814, 'reward': 1.7821606695652008, 'reward_std': 0.699380847811699, 'kl': 0.0005228802081546746, 'epoch': 0.23}
{'loss': 0.0, 'grad_norm': 0.08899950981140137, 'learning_rate': 7.541666666666667e-06, 'completion_length': 235.2875, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.6847546353936196, 'reward': 1.6847546219825744, 'reward_std': 0.6952019453048706, 'kl': 0.0005268092500045896, 'epoch': 0.25}
{'loss': 0.0, 'grad_norm': 0.10531917959451675, 'learning_rate': 7.333333333333333e-06, 'completion_length': 244.3, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.80422792583704, 'reward': 1.8042279362678528, 'reward_std': 0.5598676132038236, 'kl': 0.0005284514045342803, 'epoch': 0.27}
{'loss': 0.0, 'grad_norm': 0.09361914545297623, 'learning_rate': 7.125e-06, 'completion_length': 239.8625, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.6599106639623642, 'reward': 1.6599106550216676, 'reward_std': 0.5838687766343356, 'kl': 0.0005130083693074994, 'epoch': 0.29}
{'loss': 0.0, 'grad_norm': 0.08656957000494003, 'learning_rate': 6.916666666666667e-06, 'completion_length': 232.825, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.7171093255281449, 'reward': 1.7171093285083772, 'reward_std': 0.5700982138514519, 'kl': 0.0005371240986278281, 'epoch': 0.31}
{'loss': 0.0, 'grad_norm': 0.09666649252176285, 'learning_rate': 6.708333333333333e-06, 'completion_length': 237.0375, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.8844376325607299, 'reward': 1.8844376266002656, 'reward_std': 0.7487143039703369, 'kl': 0.0005386197401094251, 'epoch': 0.33}
{'loss': 0.0, 'grad_norm': 0.1058727279305458, 'learning_rate': 6.5000000000000004e-06, 'completion_length': 233.45, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.6797272339463234, 'reward': 1.6797272205352782, 'reward_std': 0.6311022289562971, 'kl': 0.0005338093222235329, 'epoch': 0.35}
{'loss': 0.0, 'grad_norm': 0.10363585501909256, 'learning_rate': 6.291666666666667e-06, 'completion_length': 235.6, 'rewards/format_reward2': 0.9875, 'rewards/hint_reward': 0.9810375615954399, 'reward': 1.968537563085556, 'reward_std': 0.786603014729917, 'kl': 0.0005278794225887396, 'epoch': 0.37}
{'loss': 0.0, 'grad_norm': 0.09924133121967316, 'learning_rate': 6.083333333333333e-06, 'completion_length': 243.5625, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.6545234180986881, 'reward': 1.6545234203338623, 'reward_std': 0.6480654507875443, 'kl': 0.0005347360609448514, 'epoch': 0.4}
{'loss': 0.0, 'grad_norm': 0.09262462705373764, 'learning_rate': 5.8750000000000005e-06, 'completion_length': 235.1625, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.7519623577594757, 'reward': 1.7519623696804048, 'reward_std': 0.5781447453424334, 'kl': 0.0005310224223649129, 'epoch': 0.42}
{'loss': 0.0, 'grad_norm': 0.10632356256246567, 'learning_rate': 5.666666666666667e-06, 'completion_length': 241.025, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.9946554750204086, 'reward': 1.9946554541587829, 'reward_std': 0.6944329828023911, 'kl': 0.0005232438081293367, 'epoch': 0.44}
{'loss': 0.0, 'grad_norm': 0.0872630849480629, 'learning_rate': 5.458333333333333e-06, 'completion_length': 241.9125, 'rewards/format_reward2': 0.975, 'rewards/hint_reward': 0.875643989443779, 'reward': 1.8506439924240112, 'reward_std': 0.7810414537787438, 'kl': 0.000516696619160939, 'epoch': 0.46}
{'loss': 0.0, 'grad_norm': 0.11046557128429413, 'learning_rate': 5.2500000000000006e-06, 'completion_length': 244.7, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.6702523469924927, 'reward': 1.6702523291110993, 'reward_std': 0.6210360750555992, 'kl': 0.0005639222246827558, 'epoch': 0.48}
{'loss': 0.0, 'grad_norm': 0.10285649448633194, 'learning_rate': 5.041666666666667e-06, 'completion_length': 237.0, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.7387900158762932, 'reward': 1.7387900352478027, 'reward_std': 0.6748368714004755, 'kl': 0.000546219055831898, 'epoch': 0.5}
{'loss': 0.0, 'grad_norm': 0.09859994053840637, 'learning_rate': 4.833333333333333e-06, 'completion_length': 240.4, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.47963430136442187, 'reward': 1.4796343088150024, 'reward_std': 0.6493100047111511, 'kl': 0.0005311622546287254, 'epoch': 0.52}
{'loss': 0.0, 'grad_norm': 0.09000447392463684, 'learning_rate': 4.625000000000001e-06, 'completion_length': 247.2875, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.7482958018779755, 'reward': 1.7482958018779755, 'reward_std': 0.5870657205581665, 'kl': 0.0005441085188067519, 'epoch': 0.54}
{'loss': 0.0, 'grad_norm': 0.11419157683849335, 'learning_rate': 4.416666666666667e-06, 'completion_length': 244.825, 'rewards/format_reward2': 0.9875, 'rewards/hint_reward': 0.47554641366004946, 'reward': 1.4630464136600494, 'reward_std': 0.5691350907087326, 'kl': 0.0005496730314916931, 'epoch': 0.56}
{'loss': 0.0, 'grad_norm': 0.09484495967626572, 'learning_rate': 4.208333333333333e-06, 'completion_length': 239.05, 'rewards/format_reward2': 0.9875, 'rewards/hint_reward': 0.728359380364418, 'reward': 1.7158593893051148, 'reward_std': 0.6138381041586399, 'kl': 0.0005266572508844547, 'epoch': 0.58}
{'loss': 0.0, 'grad_norm': 0.09359879046678543, 'learning_rate': 4.000000000000001e-06, 'completion_length': 249.35, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.7664372086524963, 'reward': 1.7664372265338897, 'reward_std': 0.5865088984370231, 'kl': 0.00055030612857081, 'epoch': 0.6}
{'loss': 0.0, 'grad_norm': 0.11166156828403473, 'learning_rate': 3.7916666666666666e-06, 'completion_length': 240.1125, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.6625506669282913, 'reward': 1.662550675868988, 'reward_std': 0.5158184990286827, 'kl': 0.0005491750562214293, 'epoch': 0.62}
{'loss': 0.0, 'grad_norm': 0.08442356437444687, 'learning_rate': 3.5833333333333335e-06, 'completion_length': 244.85, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.6975038453936577, 'reward': 1.6975038468837738, 'reward_std': 0.6479316338896751, 'kl': 0.0005403956805821508, 'epoch': 0.65}
{'loss': 0.0, 'grad_norm': 0.10485571622848511, 'learning_rate': 3.3750000000000003e-06, 'completion_length': 235.9125, 'rewards/format_reward2': 0.9875, 'rewards/hint_reward': 0.6261389210820199, 'reward': 1.6136389076709747, 'reward_std': 0.7126941554248333, 'kl': 0.0005452550933114253, 'epoch': 0.67}
{'loss': 0.0, 'grad_norm': 0.09902128577232361, 'learning_rate': 3.1666666666666667e-06, 'completion_length': 242.7375, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.6681374207139015, 'reward': 1.668137401342392, 'reward_std': 0.596645655343309, 'kl': 0.000601339059357997, 'epoch': 0.69}
{'loss': 0.0, 'grad_norm': 0.09568341821432114, 'learning_rate': 2.9583333333333335e-06, 'completion_length': 237.2625, 'rewards/format_reward2': 0.9875, 'rewards/hint_reward': 0.8848576739430427, 'reward': 1.8723577022552491, 'reward_std': 0.7480486467480659, 'kl': 0.0005603199097095057, 'epoch': 0.71}
{'loss': 0.0, 'grad_norm': 0.10015729814767838, 'learning_rate': 2.7500000000000004e-06, 'completion_length': 236.7125, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.7749321073293686, 'reward': 1.7749320983886718, 'reward_std': 0.6368187621235848, 'kl': 0.0005502376181539149, 'epoch': 0.73}
{'loss': 0.0, 'grad_norm': 0.11633292585611343, 'learning_rate': 2.5416666666666668e-06, 'completion_length': 241.0125, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.7991459637880325, 'reward': 1.7991459548473359, 'reward_std': 0.7132919415831566, 'kl': 0.0005318806826835499, 'epoch': 0.75}
{'loss': 0.0, 'grad_norm': 0.10906606912612915, 'learning_rate': 2.3333333333333336e-06, 'completion_length': 242.3875, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.7020174562931061, 'reward': 1.702017456293106, 'reward_std': 0.5429980456829071, 'kl': 0.0005515233700862154, 'epoch': 0.77}
{'loss': 0.0, 'grad_norm': 0.12483209371566772, 'learning_rate': 2.125e-06, 'completion_length': 242.7625, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.6960464447736741, 'reward': 1.6960464298725129, 'reward_std': 0.5978964403271675, 'kl': 0.0005584630722296424, 'epoch': 0.79}
{'loss': 0.0, 'grad_norm': 0.11350805312395096, 'learning_rate': 1.916666666666667e-06, 'completion_length': 239.95, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.632606404274702, 'reward': 1.6326064050197602, 'reward_std': 0.6396717444062233, 'kl': 0.0005443066140287556, 'epoch': 0.81}
{'loss': 0.0, 'grad_norm': 0.10226169228553772, 'learning_rate': 1.7083333333333334e-06, 'completion_length': 242.6, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.8669334009289742, 'reward': 1.8669333696365356, 'reward_std': 0.6857013538479805, 'kl': 0.0005572879337705672, 'epoch': 0.83}
{'loss': 0.0, 'grad_norm': 0.09106634557247162, 'learning_rate': 1.5e-06, 'completion_length': 240.65, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.7581478208303452, 'reward': 1.7581478118896485, 'reward_std': 0.6972749218344688, 'kl': 0.0005479031256982126, 'epoch': 0.85}
{'loss': 0.0, 'grad_norm': 0.12189562618732452, 'learning_rate': 1.2916666666666669e-06, 'completion_length': 232.925, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.8270958885550499, 'reward': 1.827095890045166, 'reward_std': 0.7235485404729843, 'kl': 0.0006022430883604102, 'epoch': 0.87}
{'loss': 0.0, 'grad_norm': 0.10620136559009552, 'learning_rate': 1.0833333333333335e-06, 'completion_length': 232.5375, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.881186357140541, 'reward': 1.8811863839626313, 'reward_std': 0.6683604184538126, 'kl': 0.000592767032503616, 'epoch': 0.89}
{'loss': 0.0, 'grad_norm': 0.08141123503446579, 'learning_rate': 8.75e-07, 'completion_length': 234.375, 'rewards/format_reward2': 0.9875, 'rewards/hint_reward': 0.5150951027870179, 'reward': 1.5025951206684112, 'reward_std': 0.5887993931770324, 'kl': 0.0005780345440143719, 'epoch': 0.92}
{'loss': 0.0, 'grad_norm': 0.11707420647144318, 'learning_rate': 6.666666666666667e-07, 'completion_length': 234.9375, 'rewards/format_reward2': 0.9875, 'rewards/hint_reward': 0.6512034311890602, 'reward': 1.6387034356594086, 'reward_std': 0.6193146544974297, 'kl': 0.0006213093263795599, 'epoch': 0.94}
{'loss': 0.0, 'grad_norm': 0.10141977667808533, 'learning_rate': 4.583333333333333e-07, 'completion_length': 237.45, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.6260499149560929, 'reward': 1.6260499119758607, 'reward_std': 0.7117581814527512, 'kl': 0.0005733551224693656, 'epoch': 0.96}
{'loss': 0.0, 'grad_norm': 0.10804533213376999, 'learning_rate': 2.5000000000000004e-07, 'completion_length': 235.6875, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.5845954522490502, 'reward': 1.5845954477787019, 'reward_std': 0.6889735788106919, 'kl': 0.0005797993973828853, 'epoch': 0.98}
{'loss': 0.0, 'grad_norm': 0.09732513874769211, 'learning_rate': 4.166666666666667e-08, 'completion_length': 239.2375, 'rewards/format_reward2': 1.0, 'rewards/hint_reward': 0.744280219078064, 'reward': 1.7442802131175994, 'reward_std': 0.6373357579112053, 'kl': 0.0005513303462066687, 'epoch': 1.0}
{'train_runtime': 32920.2439, 'train_samples_per_second': 0.029, 'train_steps_per_second': 0.007, 'train_loss': 2.1668560384568992e-05, 'epoch': 1.0}