{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9968, "eval_steps": 200, "global_step": 267, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 584.2053571428571, "epoch": 0.0037333333333333333, "grad_norm": 0.0, "kl": 3.123922007424491e-07, "learning_rate": 1.1111111111111111e-07, "loss": -0.0051, "reward": 0.6517857142857143, "reward_std": 0.35586076974868774, "rewards/accuracy_reward": 0.6517857142857143, "rewards/format_reward": 0.0, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 590.6540178571429, "epoch": 0.018666666666666668, "grad_norm": 0.1855999380350113, "kl": 0.00017639675310679844, "learning_rate": 5.555555555555555e-07, "loss": -0.024, "reward": 0.6004464285714286, "reward_std": 0.30399111764771597, "rewards/accuracy_reward": 0.6004464285714286, "rewards/format_reward": 0.0, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 577.3446428571428, "epoch": 0.037333333333333336, "grad_norm": 0.10475708544254303, "kl": 0.00021860003471374512, "learning_rate": 1.111111111111111e-06, "loss": -0.0247, "reward": 0.6017857142857143, "reward_std": 0.32809826050485885, "rewards/accuracy_reward": 0.6017857142857143, "rewards/format_reward": 0.0, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 635.5607142857143, "epoch": 0.056, "grad_norm": 0.022834666073322296, "kl": 0.00047294582639421733, "learning_rate": 1.6666666666666669e-06, "loss": -0.0105, "reward": 0.5839285714285715, "reward_std": 0.32206040705953326, "rewards/accuracy_reward": 0.5839285714285715, "rewards/format_reward": 0.0, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 608.0660714285714, "epoch": 0.07466666666666667, "grad_norm": 1.4037145376205444, "kl": 0.003454787390572684, "learning_rate": 2.222222222222222e-06, "loss": -0.0408, "reward": 0.5660714285714286, "reward_std": 0.30615685411861965, "rewards/accuracy_reward": 0.5660714285714286, "rewards/format_reward": 0.0, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 585.6625, "epoch": 0.09333333333333334, "grad_norm": 0.04715263098478317, "kl": 0.006245636940002441, "learning_rate": 2.777777777777778e-06, "loss": -0.0257, "reward": 0.5785714285714286, "reward_std": 0.3421275573117392, "rewards/accuracy_reward": 0.5785714285714286, "rewards/format_reward": 0.0, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 592.3142857142857, "epoch": 0.112, "grad_norm": 0.022860102355480194, "kl": 0.3629974705832345, "learning_rate": 2.9988435543610844e-06, "loss": -0.0083, "reward": 0.6392857142857142, "reward_std": 0.30530826789992194, "rewards/accuracy_reward": 0.6392857142857142, "rewards/format_reward": 0.0, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 606.3267857142857, "epoch": 0.13066666666666665, "grad_norm": 0.11820275336503983, "kl": 6.243857077189854, "learning_rate": 2.99178284305241e-06, "loss": 0.3173, "reward": 0.5857142857142857, "reward_std": 0.3432325610092708, "rewards/accuracy_reward": 0.5857142857142857, "rewards/format_reward": 0.0, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 572.0892857142857, "epoch": 0.14933333333333335, "grad_norm": 21.97744369506836, "kl": 0.33153715814862933, "learning_rate": 2.978334088587117e-06, "loss": -0.0104, "reward": 0.6625, "reward_std": 0.33634612134524755, "rewards/accuracy_reward": 0.6607142857142857, "rewards/format_reward": 0.0017857142857142857, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 600.6875, "epoch": 0.168, "grad_norm": 0.17174075543880463, "kl": 0.3202719347817557, "learning_rate": 2.958554880596515e-06, "loss": 0.0072, "reward": 0.6, "reward_std": 0.3338796964713505, "rewards/accuracy_reward": 0.6, "rewards/format_reward": 0.0, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 616.7535714285714, "epoch": 0.18666666666666668, "grad_norm": 0.09551213681697845, "kl": 0.33372737339564734, "learning_rate": 2.9325299166857803e-06, "loss": 0.0081, "reward": 0.6071428571428571, "reward_std": 0.3335836121014186, "rewards/accuracy_reward": 0.6071428571428571, "rewards/format_reward": 0.0, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 564.6660714285714, "epoch": 0.20533333333333334, "grad_norm": 0.12027700990438461, "kl": 0.20207279750279017, "learning_rate": 2.9003706397458025e-06, "loss": 0.0048, "reward": 0.6446428571428572, "reward_std": 0.3487179126058306, "rewards/accuracy_reward": 0.6446428571428572, "rewards/format_reward": 0.0, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 576.2642857142857, "epoch": 0.224, "grad_norm": 0.594131588935852, "kl": 0.24056614467075893, "learning_rate": 2.862214760737622e-06, "loss": -0.0414, "reward": 0.6232142857142857, "reward_std": 0.33634612134524755, "rewards/accuracy_reward": 0.6232142857142857, "rewards/format_reward": 0.0, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 578.2357142857143, "epoch": 0.24266666666666667, "grad_norm": 0.3320864140987396, "kl": 0.17519269670758927, "learning_rate": 2.818225668992948e-06, "loss": -0.0211, "reward": 0.6125, "reward_std": 0.32176432268960137, "rewards/accuracy_reward": 0.6125, "rewards/format_reward": 0.0, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 569.9196428571429, "epoch": 0.2613333333333333, "grad_norm": 0.42868152260780334, "kl": 0.07821709769112724, "learning_rate": 2.7685917325559604e-06, "loss": -0.0073, "reward": 0.6, "reward_std": 0.35862327899251667, "rewards/accuracy_reward": 0.6, "rewards/format_reward": 0.0, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 592.0357142857143, "epoch": 0.28, "grad_norm": 0.0943068340420723, "kl": 0.06971386500767299, "learning_rate": 2.713525491562421e-06, "loss": -0.0088, "reward": 0.6053571428571428, "reward_std": 0.32537541815212795, "rewards/accuracy_reward": 0.6053571428571428, "rewards/format_reward": 0.0, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 578.4678571428572, "epoch": 0.2986666666666667, "grad_norm": 0.011618987657129765, "kl": 0.08212520054408483, "learning_rate": 2.6532627481101893e-06, "loss": -0.0353, "reward": 0.6267857142857143, "reward_std": 0.2574786084038871, "rewards/accuracy_reward": 0.6267857142857143, "rewards/format_reward": 0.0, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 622.4571428571429, "epoch": 0.31733333333333336, "grad_norm": 0.7904458045959473, "kl": 0.05238505772181919, "learning_rate": 2.5880615565184313e-06, "loss": -0.0003, "reward": 0.6267857142857143, "reward_std": 0.2849450332777841, "rewards/accuracy_reward": 0.6267857142857143, "rewards/format_reward": 0.0, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 557.3285714285714, "epoch": 0.336, "grad_norm": 0.15280544757843018, "kl": 0.17704980032784598, "learning_rate": 2.518201118299413e-06, "loss": 0.0103, "reward": 0.6214285714285714, "reward_std": 0.3481654107570648, "rewards/accuracy_reward": 0.6214285714285714, "rewards/format_reward": 0.0, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 613.6553571428572, "epoch": 0.3546666666666667, "grad_norm": 0.21528230607509613, "kl": 0.09126150948660715, "learning_rate": 2.4439805865747562e-06, "loss": -0.0161, "reward": 0.6196428571428572, "reward_std": 0.3066696890762874, "rewards/accuracy_reward": 0.6196428571428572, "rewards/format_reward": 0.0, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 579.3107142857143, "epoch": 0.37333333333333335, "grad_norm": 0.20065245032310486, "kl": 0.17014105660574777, "learning_rate": 2.3657177850558505e-06, "loss": -0.0211, "reward": 0.6392857142857142, "reward_std": 0.3195939821856362, "rewards/accuracy_reward": 0.6392857142857142, "rewards/format_reward": 0.0, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 617.9732142857143, "epoch": 0.392, "grad_norm": 0.010423385538160801, "kl": 0.06536189488002232, "learning_rate": 2.2837478470739234e-06, "loss": -0.0072, "reward": 0.5732142857142857, "reward_std": 0.34157505546297345, "rewards/accuracy_reward": 0.5732142857142857, "rewards/format_reward": 0.0, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 587.2910714285714, "epoch": 0.4106666666666667, "grad_norm": 0.14102862775325775, "kl": 0.1574784415108817, "learning_rate": 2.198421780487667e-06, "loss": -0.0464, "reward": 0.6053571428571428, "reward_std": 0.3215475721018655, "rewards/accuracy_reward": 0.6053571428571428, "rewards/format_reward": 0.0, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 592.3178571428572, "epoch": 0.42933333333333334, "grad_norm": 0.10852164775133133, "kl": 0.11443628583635602, "learning_rate": 2.1101049646137005e-06, "loss": -0.0282, "reward": 0.5714285714285714, "reward_std": 0.3459554033620017, "rewards/accuracy_reward": 0.5714285714285714, "rewards/format_reward": 0.0, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 618.7821428571428, "epoch": 0.448, "grad_norm": 0.6055614948272705, "kl": 0.16327661786760603, "learning_rate": 2.0191755856162397e-06, "loss": -0.0015, "reward": 0.6017857142857143, "reward_std": 0.30254575865609307, "rewards/accuracy_reward": 0.6017857142857143, "rewards/format_reward": 0.0, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 586.575, "epoch": 0.4666666666666667, "grad_norm": 0.33487093448638916, "kl": 0.2866717747279576, "learning_rate": 1.9260230170558845e-06, "loss": -0.0131, "reward": 0.5660714285714286, "reward_std": 0.34650790521076746, "rewards/accuracy_reward": 0.5660714285714286, "rewards/format_reward": 0.0, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 598.3553571428571, "epoch": 0.48533333333333334, "grad_norm": 0.4094862639904022, "kl": 0.2726052965436663, "learning_rate": 1.8310461525322523e-06, "loss": -0.0308, "reward": 0.6160714285714286, "reward_std": 0.34790899327823094, "rewards/accuracy_reward": 0.6160714285714286, "rewards/format_reward": 0.0, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 615.3178571428572, "epoch": 0.504, "grad_norm": 0.5523812770843506, "kl": 0.39636241367885044, "learning_rate": 1.7346516975603465e-06, "loss": -0.0353, "reward": 0.5767857142857142, "reward_std": 0.33332719462258475, "rewards/accuracy_reward": 0.5767857142857142, "rewards/format_reward": 0.0, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 626.4767857142857, "epoch": 0.5226666666666666, "grad_norm": 0.08616114407777786, "kl": 0.4847276960100446, "learning_rate": 1.637252427995104e-06, "loss": -0.0036, "reward": 0.5392857142857143, "reward_std": 0.30309826050485883, "rewards/accuracy_reward": 0.5392857142857143, "rewards/format_reward": 0.0, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 605.8607142857143, "epoch": 0.5413333333333333, "grad_norm": 0.35899537801742554, "kl": 1.201324244907924, "learning_rate": 1.53926542246181e-06, "loss": -0.0055, "reward": 0.5928571428571429, "reward_std": 0.3091361139501844, "rewards/accuracy_reward": 0.5928571428571429, "rewards/format_reward": 0.0, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 596.8107142857143, "epoch": 0.56, "grad_norm": 0.16239966452121735, "kl": 0.4776475633893694, "learning_rate": 1.4411102763613975e-06, "loss": -0.0096, "reward": 0.5910714285714286, "reward_std": 0.36189862319401334, "rewards/accuracy_reward": 0.5910714285714286, "rewards/format_reward": 0.0, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 579.8357142857143, "epoch": 0.5786666666666667, "grad_norm": 0.2588573098182678, "kl": 0.37055963788713725, "learning_rate": 1.3432073050985201e-06, "loss": -0.0182, "reward": 0.5892857142857143, "reward_std": 0.3256318356309618, "rewards/accuracy_reward": 0.5892857142857143, "rewards/format_reward": 0.0, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 636.9660714285715, "epoch": 0.5973333333333334, "grad_norm": 0.3046307861804962, "kl": 0.18770294189453124, "learning_rate": 1.245975744226463e-06, "loss": -0.0251, "reward": 0.6071428571428571, "reward_std": 0.32452683193343024, "rewards/accuracy_reward": 0.6071428571428571, "rewards/format_reward": 0.0, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 591.6803571428571, "epoch": 0.616, "grad_norm": 0.19371181726455688, "kl": 0.3922314235142299, "learning_rate": 1.1498319542161423e-06, "loss": -0.0093, "reward": 0.5875, "reward_std": 0.3490139969757625, "rewards/accuracy_reward": 0.5875, "rewards/format_reward": 0.0, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 584.7196428571428, "epoch": 0.6346666666666667, "grad_norm": 0.10712730884552002, "kl": 0.3678185599190848, "learning_rate": 1.0551876375366437e-06, "loss": -0.0021, "reward": 0.5875, "reward_std": 0.33443219832011634, "rewards/accuracy_reward": 0.5875, "rewards/format_reward": 0.0, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 596.0017857142857, "epoch": 0.6533333333333333, "grad_norm": 1.3254014253616333, "kl": 0.6625612531389509, "learning_rate": 9.624480756820497e-07, "loss": 0.0012, "reward": 0.625, "reward_std": 0.3338796964713505, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 586.9053571428572, "epoch": 0.672, "grad_norm": 5.647360324859619, "kl": 1.4718305315290179, "learning_rate": 8.720103936938583e-07, "loss": 0.0312, "reward": 0.5928571428571429, "reward_std": 0.3011843374797276, "rewards/accuracy_reward": 0.5928571428571429, "rewards/format_reward": 0.0, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 624.6, "epoch": 0.6906666666666667, "grad_norm": 0.3304588198661804, "kl": 1.4476038251604353, "learning_rate": 7.842618596105873e-07, "loss": -0.0088, "reward": 0.5571428571428572, "reward_std": 0.3657661361353738, "rewards/accuracy_reward": 0.5571428571428572, "rewards/format_reward": 0.0, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 609.3267857142857, "epoch": 0.7093333333333334, "grad_norm": 0.0052251736633479595, "kl": 0.7994631086077009, "learning_rate": 6.995782261265828e-07, "loss": -0.0033, "reward": 0.6571428571428571, "reward_std": 0.3195939821856362, "rewards/accuracy_reward": 0.6571428571428571, "rewards/format_reward": 0.0, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 627.0696428571429, "epoch": 0.728, "grad_norm": 0.8703639507293701, "kl": 0.4258202144077846, "learning_rate": 6.183221215612905e-07, "loss": -0.0207, "reward": 0.5607142857142857, "reward_std": 0.3360897038664137, "rewards/accuracy_reward": 0.5607142857142857, "rewards/format_reward": 0.0, "step": 195 }, { "epoch": 0.7466666666666667, "grad_norm": 0.2712614834308624, "learning_rate": 5.40841497029123e-07, "loss": 0.0225, "step": 200 }, { "epoch": 0.7466666666666667, "eval_clip_ratio": 0.0, "eval_completion_length": 581.896, "eval_kl": 0.6586514877319336, "eval_loss": -0.014948751777410507, "eval_reward": 0.49185, "eval_reward_std": 0.3328778264760971, "eval_rewards/accuracy_reward": 0.49185, "eval_rewards/format_reward": 0.0, "eval_runtime": 80126.5226, "eval_samples_per_second": 0.062, "eval_steps_per_second": 0.016, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 596.6607142857143, "epoch": 0.7653333333333333, "grad_norm": 0.17807097733020782, "kl": 0.6059337615966797, "learning_rate": 4.674681364593688e-07, "loss": 0.0047, "reward": 0.5991071428571428, "reward_std": 0.30915594739573343, "rewards/accuracy_reward": 0.5991071428571428, "rewards/format_reward": 0.0, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 645.5696428571429, "epoch": 0.784, "grad_norm": 1.1000945568084717, "kl": 0.31501290457589287, "learning_rate": 3.98516235846472e-07, "loss": 0.0128, "reward": 0.6053571428571428, "reward_std": 0.3121470332145691, "rewards/accuracy_reward": 0.6035714285714285, "rewards/format_reward": 0.0017857142857142857, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 614.0910714285715, "epoch": 0.8026666666666666, "grad_norm": 0.3984079360961914, "kl": 0.5104325430733817, "learning_rate": 3.3428105781454364e-07, "loss": 0.0147, "reward": 0.575, "reward_std": 0.3561171872275216, "rewards/accuracy_reward": 0.575, "rewards/format_reward": 0.0, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 577.9607142857143, "epoch": 0.8213333333333334, "grad_norm": 0.9381574392318726, "kl": 0.5813296726771764, "learning_rate": 2.750376672574816e-07, "loss": -0.0266, "reward": 0.6232142857142857, "reward_std": 0.32235649142946515, "rewards/accuracy_reward": 0.6232142857142857, "rewards/format_reward": 0.0, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 625.3107142857143, "epoch": 0.84, "grad_norm": 0.2192346155643463, "kl": 0.5947923932756697, "learning_rate": 2.2103975346886175e-07, "loss": 0.0184, "reward": 0.6017857142857143, "reward_std": 0.3096886157989502, "rewards/accuracy_reward": 0.6017857142857143, "rewards/format_reward": 0.0, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 609.7071428571429, "epoch": 0.8586666666666667, "grad_norm": 1.8841651678085327, "kl": 0.9176664079938616, "learning_rate": 1.7251854380543735e-07, "loss": -0.0191, "reward": 0.5982142857142857, "reward_std": 0.3261843374797276, "rewards/accuracy_reward": 0.5982142857142857, "rewards/format_reward": 0.0, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 614.4732142857143, "epoch": 0.8773333333333333, "grad_norm": 0.18197353184223175, "kl": 0.8367650713239397, "learning_rate": 1.2968181353609853e-07, "loss": 0.0089, "reward": 0.6214285714285714, "reward_std": 0.28277469277381895, "rewards/accuracy_reward": 0.6214285714285714, "rewards/format_reward": 0.0, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 587.1607142857143, "epoch": 0.896, "grad_norm": 0.9800770282745361, "kl": 1.4477369035993304, "learning_rate": 9.271299611627392e-08, "loss": -0.0131, "reward": 0.6142857142857143, "reward_std": 0.3237179126058306, "rewards/accuracy_reward": 0.6142857142857143, "rewards/format_reward": 0.0, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 598.5571428571428, "epoch": 0.9146666666666666, "grad_norm": 1.9400585889816284, "kl": 1.4993116106305804, "learning_rate": 6.177039769771042e-08, "loss": -0.0213, "reward": 0.5803571428571429, "reward_std": 0.36108970386641365, "rewards/accuracy_reward": 0.5803571428571429, "rewards/format_reward": 0.0, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 625.3357142857143, "epoch": 0.9333333333333333, "grad_norm": 1.5598750114440918, "kl": 0.9994749886648996, "learning_rate": 3.698651923723101e-08, "loss": -0.0048, "reward": 0.6071428571428571, "reward_std": 0.39846149512699675, "rewards/accuracy_reward": 0.6071428571428571, "rewards/format_reward": 0.0, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 589.4857142857143, "epoch": 0.952, "grad_norm": 1.0319033861160278, "kl": 1.174580601283482, "learning_rate": 1.846748910729351e-08, "loss": -0.0229, "reward": 0.6053571428571428, "reward_std": 0.3490139969757625, "rewards/accuracy_reward": 0.6053571428571428, "rewards/format_reward": 0.0, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 602.9089285714285, "epoch": 0.9706666666666667, "grad_norm": 0.4708018898963928, "kl": 0.8759209769112724, "learning_rate": 6.292608638007513e-09, "loss": 0.0268, "reward": 0.6160714285714286, "reward_std": 0.32809826050485885, "rewards/accuracy_reward": 0.6160714285714286, "rewards/format_reward": 0.0, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 609.9160714285714, "epoch": 0.9893333333333333, "grad_norm": 0.7122711539268494, "kl": 1.1589093889508928, "learning_rate": 5.140125366641102e-10, "loss": 0.008, "reward": 0.6017857142857143, "reward_std": 0.3146214655467442, "rewards/accuracy_reward": 0.6017857142857143, "rewards/format_reward": 0.0, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 585.875, "epoch": 0.9968, "kl": 1.7515335083007812, "reward": 0.5803571428571429, "reward_std": 0.327693800841059, "rewards/accuracy_reward": 0.5803571428571429, "rewards/format_reward": 0.0, "step": 267, "total_flos": 0.0, "train_loss": -0.0028305855572698046, "train_runtime": 255598.0178, "train_samples_per_second": 0.029, "train_steps_per_second": 0.001 } ], "logging_steps": 5, "max_steps": 267, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }