diff --git "a/checkpoint-245080/trainer_state.json" "b/checkpoint-245080/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-245080/trainer_state.json" @@ -0,0 +1,69059 @@ +{ + "best_metric": 6.8204345703125, + "best_model_checkpoint": "bert-base-german-cased-gnd/checkpoint-245080", + "epoch": 22.0, + "eval_steps": 500, + "global_step": 245080, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002244165170556553, + "grad_norm": 5.923706531524658, + "learning_rate": 2.244165170556553e-09, + "loss": 10.0185, + "step": 25 + }, + { + "epoch": 0.004488330341113106, + "grad_norm": 6.282773971557617, + "learning_rate": 4.488330341113106e-09, + "loss": 10.0223, + "step": 50 + }, + { + "epoch": 0.006732495511669659, + "grad_norm": 6.057834148406982, + "learning_rate": 6.73249551166966e-09, + "loss": 10.0137, + "step": 75 + }, + { + "epoch": 0.008976660682226212, + "grad_norm": 6.063293933868408, + "learning_rate": 8.976660682226212e-09, + "loss": 10.0215, + "step": 100 + }, + { + "epoch": 0.011220825852782765, + "grad_norm": 6.16914176940918, + "learning_rate": 1.1220825852782766e-08, + "loss": 10.012, + "step": 125 + }, + { + "epoch": 0.013464991023339317, + "grad_norm": 6.403384208679199, + "learning_rate": 1.346499102333932e-08, + "loss": 10.0257, + "step": 150 + }, + { + "epoch": 0.01570915619389587, + "grad_norm": 6.198572158813477, + "learning_rate": 1.5709156193895872e-08, + "loss": 10.0156, + "step": 175 + }, + { + "epoch": 0.017953321364452424, + "grad_norm": 6.348753452301025, + "learning_rate": 1.7953321364452425e-08, + "loss": 10.0045, + "step": 200 + }, + { + "epoch": 0.020197486535008975, + "grad_norm": 6.081040382385254, + "learning_rate": 2.0197486535008977e-08, + "loss": 10.0105, + "step": 225 + }, + { + "epoch": 0.02244165170556553, + "grad_norm": 5.912536144256592, + "learning_rate": 2.2441651705565532e-08, + "loss": 10.031, + "step": 250 + }, + { + "epoch": 0.024685816876122084, + "grad_norm": 6.130837440490723, + "learning_rate": 2.4685816876122085e-08, + "loss": 10.0259, + "step": 275 + }, + { + "epoch": 0.026929982046678635, + "grad_norm": 6.424361228942871, + "learning_rate": 2.6840215439856374e-08, + "loss": 10.0203, + "step": 300 + }, + { + "epoch": 0.02917414721723519, + "grad_norm": 6.063850402832031, + "learning_rate": 2.908438061041293e-08, + "loss": 10.0253, + "step": 325 + }, + { + "epoch": 0.03141831238779174, + "grad_norm": 6.256524562835693, + "learning_rate": 3.132854578096948e-08, + "loss": 9.999, + "step": 350 + }, + { + "epoch": 0.033662477558348294, + "grad_norm": 5.996226787567139, + "learning_rate": 3.3572710951526034e-08, + "loss": 10.0271, + "step": 375 + }, + { + "epoch": 0.03590664272890485, + "grad_norm": 6.051399230957031, + "learning_rate": 3.581687612208259e-08, + "loss": 9.9981, + "step": 400 + }, + { + "epoch": 0.0381508078994614, + "grad_norm": 5.770738124847412, + "learning_rate": 3.806104129263914e-08, + "loss": 10.0307, + "step": 425 + }, + { + "epoch": 0.04039497307001795, + "grad_norm": 6.201465606689453, + "learning_rate": 4.0305206463195694e-08, + "loss": 10.0247, + "step": 450 + }, + { + "epoch": 0.042639138240574505, + "grad_norm": 6.215156555175781, + "learning_rate": 4.2549371633752243e-08, + "loss": 10.0235, + "step": 475 + }, + { + "epoch": 0.04488330341113106, + "grad_norm": 6.287331581115723, + "learning_rate": 4.47935368043088e-08, + "loss": 10.0212, + "step": 500 + }, + { + "epoch": 0.047127468581687613, + "grad_norm": 6.0642547607421875, + "learning_rate": 4.7037701974865354e-08, + "loss": 9.9893, + "step": 525 + }, + { + "epoch": 0.04937163375224417, + "grad_norm": 5.901621341705322, + "learning_rate": 4.928186714542191e-08, + "loss": 9.9799, + "step": 550 + }, + { + "epoch": 0.051615798922800715, + "grad_norm": 6.366001129150391, + "learning_rate": 5.152603231597846e-08, + "loss": 10.0289, + "step": 575 + }, + { + "epoch": 0.05385996409335727, + "grad_norm": 6.202953338623047, + "learning_rate": 5.377019748653501e-08, + "loss": 10.0241, + "step": 600 + }, + { + "epoch": 0.056104129263913824, + "grad_norm": 5.897172451019287, + "learning_rate": 5.601436265709157e-08, + "loss": 10.032, + "step": 625 + }, + { + "epoch": 0.05834829443447038, + "grad_norm": 6.226415634155273, + "learning_rate": 5.825852782764812e-08, + "loss": 10.0143, + "step": 650 + }, + { + "epoch": 0.06059245960502693, + "grad_norm": 6.0532121658325195, + "learning_rate": 6.050269299820467e-08, + "loss": 10.048, + "step": 675 + }, + { + "epoch": 0.06283662477558348, + "grad_norm": 6.0996527671813965, + "learning_rate": 6.274685816876122e-08, + "loss": 10.0021, + "step": 700 + }, + { + "epoch": 0.06508078994614004, + "grad_norm": 6.2961883544921875, + "learning_rate": 6.499102333931778e-08, + "loss": 10.0316, + "step": 725 + }, + { + "epoch": 0.06732495511669659, + "grad_norm": 6.12375020980835, + "learning_rate": 6.723518850987433e-08, + "loss": 10.0282, + "step": 750 + }, + { + "epoch": 0.06956912028725314, + "grad_norm": 5.9284539222717285, + "learning_rate": 6.947935368043089e-08, + "loss": 10.0476, + "step": 775 + }, + { + "epoch": 0.0718132854578097, + "grad_norm": 6.208143711090088, + "learning_rate": 7.172351885098744e-08, + "loss": 10.0107, + "step": 800 + }, + { + "epoch": 0.07405745062836624, + "grad_norm": 6.047924518585205, + "learning_rate": 7.3967684021544e-08, + "loss": 10.0128, + "step": 825 + }, + { + "epoch": 0.0763016157989228, + "grad_norm": 6.0715413093566895, + "learning_rate": 7.621184919210055e-08, + "loss": 10.0229, + "step": 850 + }, + { + "epoch": 0.07854578096947935, + "grad_norm": 6.317643642425537, + "learning_rate": 7.845601436265711e-08, + "loss": 10.0107, + "step": 875 + }, + { + "epoch": 0.0807899461400359, + "grad_norm": 6.043159008026123, + "learning_rate": 8.070017953321366e-08, + "loss": 9.9908, + "step": 900 + }, + { + "epoch": 0.08303411131059246, + "grad_norm": 6.077661514282227, + "learning_rate": 8.294434470377021e-08, + "loss": 10.0414, + "step": 925 + }, + { + "epoch": 0.08527827648114901, + "grad_norm": 6.282474040985107, + "learning_rate": 8.518850987432676e-08, + "loss": 10.0293, + "step": 950 + }, + { + "epoch": 0.08752244165170557, + "grad_norm": 6.013352870941162, + "learning_rate": 8.743267504488331e-08, + "loss": 10.0342, + "step": 975 + }, + { + "epoch": 0.08976660682226212, + "grad_norm": 6.024497032165527, + "learning_rate": 8.967684021543986e-08, + "loss": 10.0159, + "step": 1000 + }, + { + "epoch": 0.09201077199281867, + "grad_norm": 6.050700664520264, + "learning_rate": 9.19210053859964e-08, + "loss": 10.0194, + "step": 1025 + }, + { + "epoch": 0.09425493716337523, + "grad_norm": 6.135870933532715, + "learning_rate": 9.416517055655297e-08, + "loss": 10.0258, + "step": 1050 + }, + { + "epoch": 0.09649910233393177, + "grad_norm": 5.829977512359619, + "learning_rate": 9.640933572710952e-08, + "loss": 10.0309, + "step": 1075 + }, + { + "epoch": 0.09874326750448834, + "grad_norm": 6.11236047744751, + "learning_rate": 9.865350089766608e-08, + "loss": 10.0216, + "step": 1100 + }, + { + "epoch": 0.10098743267504488, + "grad_norm": 5.899176120758057, + "learning_rate": 1.0089766606822263e-07, + "loss": 10.0044, + "step": 1125 + }, + { + "epoch": 0.10323159784560143, + "grad_norm": 6.165717601776123, + "learning_rate": 1.0314183123877919e-07, + "loss": 10.0055, + "step": 1150 + }, + { + "epoch": 0.10547576301615799, + "grad_norm": 6.240515232086182, + "learning_rate": 1.0538599640933574e-07, + "loss": 10.0127, + "step": 1175 + }, + { + "epoch": 0.10771992818671454, + "grad_norm": 6.178211212158203, + "learning_rate": 1.0763016157989229e-07, + "loss": 10.0029, + "step": 1200 + }, + { + "epoch": 0.1099640933572711, + "grad_norm": 5.895815372467041, + "learning_rate": 1.0987432675044884e-07, + "loss": 9.9836, + "step": 1225 + }, + { + "epoch": 0.11220825852782765, + "grad_norm": 5.8588409423828125, + "learning_rate": 1.1211849192100539e-07, + "loss": 10.0157, + "step": 1250 + }, + { + "epoch": 0.1144524236983842, + "grad_norm": 6.171236038208008, + "learning_rate": 1.1436265709156194e-07, + "loss": 10.0131, + "step": 1275 + }, + { + "epoch": 0.11669658886894076, + "grad_norm": 5.868578910827637, + "learning_rate": 1.1660682226211851e-07, + "loss": 9.9925, + "step": 1300 + }, + { + "epoch": 0.1189407540394973, + "grad_norm": 6.244411468505859, + "learning_rate": 1.1885098743267506e-07, + "loss": 9.9832, + "step": 1325 + }, + { + "epoch": 0.12118491921005387, + "grad_norm": 6.19016170501709, + "learning_rate": 1.2109515260323162e-07, + "loss": 10.0084, + "step": 1350 + }, + { + "epoch": 0.12342908438061041, + "grad_norm": 6.01399564743042, + "learning_rate": 1.2333931777378816e-07, + "loss": 10.0077, + "step": 1375 + }, + { + "epoch": 0.12567324955116696, + "grad_norm": 5.9070868492126465, + "learning_rate": 1.2558348294434472e-07, + "loss": 10.0294, + "step": 1400 + }, + { + "epoch": 0.12791741472172352, + "grad_norm": 6.282022953033447, + "learning_rate": 1.2782764811490126e-07, + "loss": 10.028, + "step": 1425 + }, + { + "epoch": 0.13016157989228008, + "grad_norm": 6.023783206939697, + "learning_rate": 1.3007181328545782e-07, + "loss": 9.9979, + "step": 1450 + }, + { + "epoch": 0.13240574506283662, + "grad_norm": 6.3142499923706055, + "learning_rate": 1.3231597845601435e-07, + "loss": 10.012, + "step": 1475 + }, + { + "epoch": 0.13464991023339318, + "grad_norm": 6.287275791168213, + "learning_rate": 1.3456014362657094e-07, + "loss": 10.0167, + "step": 1500 + }, + { + "epoch": 0.13689407540394974, + "grad_norm": 5.928888320922852, + "learning_rate": 1.3680430879712748e-07, + "loss": 10.006, + "step": 1525 + }, + { + "epoch": 0.13913824057450627, + "grad_norm": 5.886402606964111, + "learning_rate": 1.3904847396768404e-07, + "loss": 10.026, + "step": 1550 + }, + { + "epoch": 0.14138240574506283, + "grad_norm": 5.959338665008545, + "learning_rate": 1.4129263913824058e-07, + "loss": 9.9924, + "step": 1575 + }, + { + "epoch": 0.1436265709156194, + "grad_norm": 6.209696292877197, + "learning_rate": 1.4353680430879714e-07, + "loss": 10.0025, + "step": 1600 + }, + { + "epoch": 0.14587073608617596, + "grad_norm": 6.068072319030762, + "learning_rate": 1.457809694793537e-07, + "loss": 9.9976, + "step": 1625 + }, + { + "epoch": 0.1481149012567325, + "grad_norm": 5.792809963226318, + "learning_rate": 1.4802513464991024e-07, + "loss": 10.0061, + "step": 1650 + }, + { + "epoch": 0.15035906642728905, + "grad_norm": 6.076018810272217, + "learning_rate": 1.502692998204668e-07, + "loss": 9.9943, + "step": 1675 + }, + { + "epoch": 0.1526032315978456, + "grad_norm": 5.938835144042969, + "learning_rate": 1.5251346499102333e-07, + "loss": 9.9916, + "step": 1700 + }, + { + "epoch": 0.15484739676840215, + "grad_norm": 6.042789459228516, + "learning_rate": 1.5475763016157992e-07, + "loss": 9.9887, + "step": 1725 + }, + { + "epoch": 0.1570915619389587, + "grad_norm": 5.899285793304443, + "learning_rate": 1.5700179533213646e-07, + "loss": 9.9834, + "step": 1750 + }, + { + "epoch": 0.15933572710951527, + "grad_norm": 6.056965351104736, + "learning_rate": 1.59245960502693e-07, + "loss": 10.0148, + "step": 1775 + }, + { + "epoch": 0.1615798922800718, + "grad_norm": 5.979471206665039, + "learning_rate": 1.6149012567324956e-07, + "loss": 9.9696, + "step": 1800 + }, + { + "epoch": 0.16382405745062836, + "grad_norm": 6.4153032302856445, + "learning_rate": 1.6373429084380614e-07, + "loss": 10.0043, + "step": 1825 + }, + { + "epoch": 0.16606822262118492, + "grad_norm": 5.971315860748291, + "learning_rate": 1.6597845601436268e-07, + "loss": 9.992, + "step": 1850 + }, + { + "epoch": 0.16831238779174149, + "grad_norm": 6.3580241203308105, + "learning_rate": 1.6822262118491924e-07, + "loss": 10.0151, + "step": 1875 + }, + { + "epoch": 0.17055655296229802, + "grad_norm": 6.025638103485107, + "learning_rate": 1.7046678635547578e-07, + "loss": 10.0325, + "step": 1900 + }, + { + "epoch": 0.17280071813285458, + "grad_norm": 6.2231597900390625, + "learning_rate": 1.7271095152603234e-07, + "loss": 9.982, + "step": 1925 + }, + { + "epoch": 0.17504488330341114, + "grad_norm": 5.875751972198486, + "learning_rate": 1.7495511669658888e-07, + "loss": 9.9942, + "step": 1950 + }, + { + "epoch": 0.17728904847396768, + "grad_norm": 5.888873100280762, + "learning_rate": 1.7719928186714544e-07, + "loss": 9.9949, + "step": 1975 + }, + { + "epoch": 0.17953321364452424, + "grad_norm": 5.935683250427246, + "learning_rate": 1.7944344703770197e-07, + "loss": 9.9667, + "step": 2000 + }, + { + "epoch": 0.1817773788150808, + "grad_norm": 5.915865421295166, + "learning_rate": 1.8168761220825854e-07, + "loss": 10.012, + "step": 2025 + }, + { + "epoch": 0.18402154398563733, + "grad_norm": 6.366013050079346, + "learning_rate": 1.839317773788151e-07, + "loss": 9.9996, + "step": 2050 + }, + { + "epoch": 0.1862657091561939, + "grad_norm": 6.270355224609375, + "learning_rate": 1.8617594254937163e-07, + "loss": 10.0075, + "step": 2075 + }, + { + "epoch": 0.18850987432675045, + "grad_norm": 6.0307769775390625, + "learning_rate": 1.8842010771992822e-07, + "loss": 10.0237, + "step": 2100 + }, + { + "epoch": 0.19075403949730702, + "grad_norm": 6.084704875946045, + "learning_rate": 1.9066427289048476e-07, + "loss": 10.0021, + "step": 2125 + }, + { + "epoch": 0.19299820466786355, + "grad_norm": 6.118374824523926, + "learning_rate": 1.9290843806104132e-07, + "loss": 10.0191, + "step": 2150 + }, + { + "epoch": 0.1952423698384201, + "grad_norm": 5.96092414855957, + "learning_rate": 1.9515260323159786e-07, + "loss": 9.9847, + "step": 2175 + }, + { + "epoch": 0.19748653500897667, + "grad_norm": 6.254398345947266, + "learning_rate": 1.9739676840215442e-07, + "loss": 10.0109, + "step": 2200 + }, + { + "epoch": 0.1997307001795332, + "grad_norm": 5.920108318328857, + "learning_rate": 1.9964093357271098e-07, + "loss": 9.9892, + "step": 2225 + }, + { + "epoch": 0.20197486535008977, + "grad_norm": 5.904831886291504, + "learning_rate": 2.0188509874326752e-07, + "loss": 9.9907, + "step": 2250 + }, + { + "epoch": 0.20421903052064633, + "grad_norm": 5.9788312911987305, + "learning_rate": 2.0412926391382408e-07, + "loss": 9.9926, + "step": 2275 + }, + { + "epoch": 0.20646319569120286, + "grad_norm": 5.8715009689331055, + "learning_rate": 2.0637342908438061e-07, + "loss": 9.9849, + "step": 2300 + }, + { + "epoch": 0.20870736086175942, + "grad_norm": 6.098932266235352, + "learning_rate": 2.0861759425493718e-07, + "loss": 10.0027, + "step": 2325 + }, + { + "epoch": 0.21095152603231598, + "grad_norm": 6.1760969161987305, + "learning_rate": 2.108617594254937e-07, + "loss": 9.9932, + "step": 2350 + }, + { + "epoch": 0.21319569120287254, + "grad_norm": 5.990290641784668, + "learning_rate": 2.131059245960503e-07, + "loss": 9.9967, + "step": 2375 + }, + { + "epoch": 0.21543985637342908, + "grad_norm": 5.740996360778809, + "learning_rate": 2.1535008976660684e-07, + "loss": 9.9766, + "step": 2400 + }, + { + "epoch": 0.21768402154398564, + "grad_norm": 5.989614963531494, + "learning_rate": 2.175942549371634e-07, + "loss": 9.984, + "step": 2425 + }, + { + "epoch": 0.2199281867145422, + "grad_norm": 5.987428188323975, + "learning_rate": 2.1983842010771996e-07, + "loss": 9.9985, + "step": 2450 + }, + { + "epoch": 0.22217235188509873, + "grad_norm": 5.979262351989746, + "learning_rate": 2.220825852782765e-07, + "loss": 9.9938, + "step": 2475 + }, + { + "epoch": 0.2244165170556553, + "grad_norm": 5.745146751403809, + "learning_rate": 2.2432675044883306e-07, + "loss": 9.9723, + "step": 2500 + }, + { + "epoch": 0.22666068222621186, + "grad_norm": 5.93808126449585, + "learning_rate": 2.265709156193896e-07, + "loss": 9.9843, + "step": 2525 + }, + { + "epoch": 0.2289048473967684, + "grad_norm": 5.86356782913208, + "learning_rate": 2.2881508078994616e-07, + "loss": 10.0084, + "step": 2550 + }, + { + "epoch": 0.23114901256732495, + "grad_norm": 6.07513952255249, + "learning_rate": 2.310592459605027e-07, + "loss": 9.9789, + "step": 2575 + }, + { + "epoch": 0.2333931777378815, + "grad_norm": 6.149722099304199, + "learning_rate": 2.3330341113105928e-07, + "loss": 9.9556, + "step": 2600 + }, + { + "epoch": 0.23563734290843807, + "grad_norm": 6.315128326416016, + "learning_rate": 2.355475763016158e-07, + "loss": 9.9491, + "step": 2625 + }, + { + "epoch": 0.2378815080789946, + "grad_norm": 6.52398681640625, + "learning_rate": 2.3779174147217238e-07, + "loss": 9.9777, + "step": 2650 + }, + { + "epoch": 0.24012567324955117, + "grad_norm": 5.969620227813721, + "learning_rate": 2.400359066427289e-07, + "loss": 9.9676, + "step": 2675 + }, + { + "epoch": 0.24236983842010773, + "grad_norm": 5.887979030609131, + "learning_rate": 2.422800718132855e-07, + "loss": 9.9844, + "step": 2700 + }, + { + "epoch": 0.24461400359066426, + "grad_norm": 5.923941612243652, + "learning_rate": 2.4452423698384204e-07, + "loss": 9.9828, + "step": 2725 + }, + { + "epoch": 0.24685816876122083, + "grad_norm": 6.087390422821045, + "learning_rate": 2.467684021543986e-07, + "loss": 9.9767, + "step": 2750 + }, + { + "epoch": 0.2491023339317774, + "grad_norm": 5.919648170471191, + "learning_rate": 2.4901256732495516e-07, + "loss": 10.0055, + "step": 2775 + }, + { + "epoch": 0.2513464991023339, + "grad_norm": Infinity, + "learning_rate": 2.5116696588868944e-07, + "loss": 9.9811, + "step": 2800 + }, + { + "epoch": 0.2535906642728905, + "grad_norm": 5.885696887969971, + "learning_rate": 2.53411131059246e-07, + "loss": 9.999, + "step": 2825 + }, + { + "epoch": 0.25583482944344704, + "grad_norm": 5.643342018127441, + "learning_rate": 2.556552962298025e-07, + "loss": 9.9603, + "step": 2850 + }, + { + "epoch": 0.2580789946140036, + "grad_norm": 5.8831963539123535, + "learning_rate": 2.5789946140035907e-07, + "loss": 9.9851, + "step": 2875 + }, + { + "epoch": 0.26032315978456017, + "grad_norm": 6.071913719177246, + "learning_rate": 2.6014362657091563e-07, + "loss": 9.9898, + "step": 2900 + }, + { + "epoch": 0.2625673249551167, + "grad_norm": 5.984524726867676, + "learning_rate": 2.6229802513464996e-07, + "loss": 9.9833, + "step": 2925 + }, + { + "epoch": 0.26481149012567323, + "grad_norm": 5.861862659454346, + "learning_rate": 2.6454219030520647e-07, + "loss": 9.9579, + "step": 2950 + }, + { + "epoch": 0.2670556552962298, + "grad_norm": 5.845977306365967, + "learning_rate": 2.6678635547576304e-07, + "loss": 9.9965, + "step": 2975 + }, + { + "epoch": 0.26929982046678635, + "grad_norm": 6.00257682800293, + "learning_rate": 2.690305206463196e-07, + "loss": 9.9762, + "step": 3000 + }, + { + "epoch": 0.2715439856373429, + "grad_norm": 5.936895847320557, + "learning_rate": 2.7127468581687616e-07, + "loss": 9.9845, + "step": 3025 + }, + { + "epoch": 0.2737881508078995, + "grad_norm": 6.154696464538574, + "learning_rate": 2.7351885098743267e-07, + "loss": 9.9891, + "step": 3050 + }, + { + "epoch": 0.276032315978456, + "grad_norm": 5.889113426208496, + "learning_rate": 2.7576301615798923e-07, + "loss": 9.9687, + "step": 3075 + }, + { + "epoch": 0.27827648114901254, + "grad_norm": 5.828521728515625, + "learning_rate": 2.780071813285458e-07, + "loss": 9.969, + "step": 3100 + }, + { + "epoch": 0.28052064631956913, + "grad_norm": 5.600679874420166, + "learning_rate": 2.8025134649910236e-07, + "loss": 9.9858, + "step": 3125 + }, + { + "epoch": 0.28276481149012567, + "grad_norm": 6.024610996246338, + "learning_rate": 2.824955116696589e-07, + "loss": 9.9883, + "step": 3150 + }, + { + "epoch": 0.2850089766606822, + "grad_norm": 6.104897975921631, + "learning_rate": 2.847396768402155e-07, + "loss": 9.9924, + "step": 3175 + }, + { + "epoch": 0.2872531418312388, + "grad_norm": 6.034098148345947, + "learning_rate": 2.8698384201077204e-07, + "loss": 9.9618, + "step": 3200 + }, + { + "epoch": 0.2894973070017953, + "grad_norm": 5.980985164642334, + "learning_rate": 2.8922800718132855e-07, + "loss": 9.9852, + "step": 3225 + }, + { + "epoch": 0.2917414721723519, + "grad_norm": 5.577481269836426, + "learning_rate": 2.914721723518851e-07, + "loss": 9.9867, + "step": 3250 + }, + { + "epoch": 0.29398563734290845, + "grad_norm": 5.943061351776123, + "learning_rate": 2.937163375224417e-07, + "loss": 9.9713, + "step": 3275 + }, + { + "epoch": 0.296229802513465, + "grad_norm": 5.839987277984619, + "learning_rate": 2.9596050269299824e-07, + "loss": 9.9592, + "step": 3300 + }, + { + "epoch": 0.29847396768402157, + "grad_norm": 5.992583274841309, + "learning_rate": 2.9820466786355475e-07, + "loss": 9.9753, + "step": 3325 + }, + { + "epoch": 0.3007181328545781, + "grad_norm": 5.780086040496826, + "learning_rate": 3.004488330341113e-07, + "loss": 9.959, + "step": 3350 + }, + { + "epoch": 0.30296229802513464, + "grad_norm": 5.798867702484131, + "learning_rate": 3.026929982046679e-07, + "loss": 9.9769, + "step": 3375 + }, + { + "epoch": 0.3052064631956912, + "grad_norm": 5.91071081161499, + "learning_rate": 3.0493716337522443e-07, + "loss": 9.9572, + "step": 3400 + }, + { + "epoch": 0.30745062836624776, + "grad_norm": 5.736480712890625, + "learning_rate": 3.07181328545781e-07, + "loss": 9.938, + "step": 3425 + }, + { + "epoch": 0.3096947935368043, + "grad_norm": 6.023299217224121, + "learning_rate": 3.0942549371633756e-07, + "loss": 9.9569, + "step": 3450 + }, + { + "epoch": 0.3119389587073609, + "grad_norm": 6.0235795974731445, + "learning_rate": 3.116696588868941e-07, + "loss": 9.9641, + "step": 3475 + }, + { + "epoch": 0.3141831238779174, + "grad_norm": 6.133740425109863, + "learning_rate": 3.139138240574507e-07, + "loss": 9.9512, + "step": 3500 + }, + { + "epoch": 0.31642728904847395, + "grad_norm": 6.015284061431885, + "learning_rate": 3.161579892280072e-07, + "loss": 9.9554, + "step": 3525 + }, + { + "epoch": 0.31867145421903054, + "grad_norm": 5.938147068023682, + "learning_rate": 3.1840215439856375e-07, + "loss": 9.9609, + "step": 3550 + }, + { + "epoch": 0.32091561938958707, + "grad_norm": 5.9505815505981445, + "learning_rate": 3.2064631956912037e-07, + "loss": 9.9562, + "step": 3575 + }, + { + "epoch": 0.3231597845601436, + "grad_norm": 5.824986934661865, + "learning_rate": 3.228904847396769e-07, + "loss": 9.9724, + "step": 3600 + }, + { + "epoch": 0.3254039497307002, + "grad_norm": 5.804319381713867, + "learning_rate": 3.251346499102334e-07, + "loss": 9.9484, + "step": 3625 + }, + { + "epoch": 0.3276481149012567, + "grad_norm": 5.8378682136535645, + "learning_rate": 3.2737881508078995e-07, + "loss": 9.9615, + "step": 3650 + }, + { + "epoch": 0.32989228007181326, + "grad_norm": 6.199387550354004, + "learning_rate": 3.2962298025134656e-07, + "loss": 9.9546, + "step": 3675 + }, + { + "epoch": 0.33213644524236985, + "grad_norm": 5.7572174072265625, + "learning_rate": 3.318671454219031e-07, + "loss": 9.968, + "step": 3700 + }, + { + "epoch": 0.3343806104129264, + "grad_norm": 6.078331470489502, + "learning_rate": 3.3411131059245964e-07, + "loss": 9.9486, + "step": 3725 + }, + { + "epoch": 0.33662477558348297, + "grad_norm": 5.95357084274292, + "learning_rate": 3.3635547576301615e-07, + "loss": 9.9553, + "step": 3750 + }, + { + "epoch": 0.3388689407540395, + "grad_norm": 6.045083045959473, + "learning_rate": 3.3859964093357276e-07, + "loss": 9.9374, + "step": 3775 + }, + { + "epoch": 0.34111310592459604, + "grad_norm": 5.7407708168029785, + "learning_rate": 3.4084380610412927e-07, + "loss": 9.9524, + "step": 3800 + }, + { + "epoch": 0.3433572710951526, + "grad_norm": 5.840803146362305, + "learning_rate": 3.4308797127468583e-07, + "loss": 9.9422, + "step": 3825 + }, + { + "epoch": 0.34560143626570916, + "grad_norm": 5.81524658203125, + "learning_rate": 3.4533213644524245e-07, + "loss": 9.9597, + "step": 3850 + }, + { + "epoch": 0.3478456014362657, + "grad_norm": 6.159881114959717, + "learning_rate": 3.4757630161579896e-07, + "loss": 9.96, + "step": 3875 + }, + { + "epoch": 0.3500897666068223, + "grad_norm": 6.111410617828369, + "learning_rate": 3.498204667863555e-07, + "loss": 9.9818, + "step": 3900 + }, + { + "epoch": 0.3523339317773788, + "grad_norm": 5.967281818389893, + "learning_rate": 3.5206463195691203e-07, + "loss": 9.9281, + "step": 3925 + }, + { + "epoch": 0.35457809694793535, + "grad_norm": 5.869514465332031, + "learning_rate": 3.5430879712746864e-07, + "loss": 9.9236, + "step": 3950 + }, + { + "epoch": 0.35682226211849194, + "grad_norm": 5.893311977386475, + "learning_rate": 3.5655296229802515e-07, + "loss": 9.945, + "step": 3975 + }, + { + "epoch": 0.3590664272890485, + "grad_norm": 5.925975322723389, + "learning_rate": 3.587971274685817e-07, + "loss": 9.9225, + "step": 4000 + }, + { + "epoch": 0.361310592459605, + "grad_norm": 6.094491958618164, + "learning_rate": 3.610412926391383e-07, + "loss": 9.9504, + "step": 4025 + }, + { + "epoch": 0.3635547576301616, + "grad_norm": 6.08005952835083, + "learning_rate": 3.6328545780969484e-07, + "loss": 9.9634, + "step": 4050 + }, + { + "epoch": 0.36579892280071813, + "grad_norm": 6.147259712219238, + "learning_rate": 3.6552962298025135e-07, + "loss": 9.9599, + "step": 4075 + }, + { + "epoch": 0.36804308797127466, + "grad_norm": 5.8658342361450195, + "learning_rate": 3.677737881508079e-07, + "loss": 9.937, + "step": 4100 + }, + { + "epoch": 0.37028725314183125, + "grad_norm": 5.9528703689575195, + "learning_rate": 3.700179533213645e-07, + "loss": 9.9198, + "step": 4125 + }, + { + "epoch": 0.3725314183123878, + "grad_norm": 5.915778160095215, + "learning_rate": 3.7226211849192103e-07, + "loss": 9.9239, + "step": 4150 + }, + { + "epoch": 0.3747755834829443, + "grad_norm": 5.7966437339782715, + "learning_rate": 3.745062836624776e-07, + "loss": 9.9417, + "step": 4175 + }, + { + "epoch": 0.3770197486535009, + "grad_norm": 6.038307189941406, + "learning_rate": 3.7675044883303416e-07, + "loss": 9.9659, + "step": 4200 + }, + { + "epoch": 0.37926391382405744, + "grad_norm": 6.067355155944824, + "learning_rate": 3.789946140035907e-07, + "loss": 9.8998, + "step": 4225 + }, + { + "epoch": 0.38150807899461403, + "grad_norm": 6.2918620109558105, + "learning_rate": 3.8123877917414723e-07, + "loss": 9.9199, + "step": 4250 + }, + { + "epoch": 0.38375224416517056, + "grad_norm": 5.965016841888428, + "learning_rate": 3.834829443447038e-07, + "loss": 9.9575, + "step": 4275 + }, + { + "epoch": 0.3859964093357271, + "grad_norm": 5.65086030960083, + "learning_rate": 3.8572710951526035e-07, + "loss": 9.9687, + "step": 4300 + }, + { + "epoch": 0.3882405745062837, + "grad_norm": 5.9582061767578125, + "learning_rate": 3.879712746858169e-07, + "loss": 9.9449, + "step": 4325 + }, + { + "epoch": 0.3904847396768402, + "grad_norm": 5.828031539916992, + "learning_rate": 3.902154398563734e-07, + "loss": 9.9137, + "step": 4350 + }, + { + "epoch": 0.39272890484739675, + "grad_norm": 5.918435573577881, + "learning_rate": 3.9245960502693e-07, + "loss": 9.9372, + "step": 4375 + }, + { + "epoch": 0.39497307001795334, + "grad_norm": 5.983017921447754, + "learning_rate": 3.947037701974866e-07, + "loss": 9.9529, + "step": 4400 + }, + { + "epoch": 0.3972172351885099, + "grad_norm": 5.753073692321777, + "learning_rate": 3.969479353680431e-07, + "loss": 9.9278, + "step": 4425 + }, + { + "epoch": 0.3994614003590664, + "grad_norm": 6.010031223297119, + "learning_rate": 3.991921005385997e-07, + "loss": 9.9274, + "step": 4450 + }, + { + "epoch": 0.401705565529623, + "grad_norm": 6.072628498077393, + "learning_rate": 4.0143626570915624e-07, + "loss": 9.9242, + "step": 4475 + }, + { + "epoch": 0.40394973070017953, + "grad_norm": 6.020074844360352, + "learning_rate": 4.036804308797128e-07, + "loss": 9.9811, + "step": 4500 + }, + { + "epoch": 0.40619389587073607, + "grad_norm": 6.062161922454834, + "learning_rate": 4.059245960502693e-07, + "loss": 9.937, + "step": 4525 + }, + { + "epoch": 0.40843806104129265, + "grad_norm": 5.988836288452148, + "learning_rate": 4.0816876122082587e-07, + "loss": 9.9288, + "step": 4550 + }, + { + "epoch": 0.4106822262118492, + "grad_norm": 6.4043869972229, + "learning_rate": 4.1041292639138243e-07, + "loss": 9.9115, + "step": 4575 + }, + { + "epoch": 0.4129263913824057, + "grad_norm": 6.181560516357422, + "learning_rate": 4.12657091561939e-07, + "loss": 9.9413, + "step": 4600 + }, + { + "epoch": 0.4151705565529623, + "grad_norm": 5.755512237548828, + "learning_rate": 4.149012567324955e-07, + "loss": 9.8958, + "step": 4625 + }, + { + "epoch": 0.41741472172351884, + "grad_norm": 6.008756160736084, + "learning_rate": 4.171454219030521e-07, + "loss": 9.9347, + "step": 4650 + }, + { + "epoch": 0.4196588868940754, + "grad_norm": 5.89402961730957, + "learning_rate": 4.193895870736087e-07, + "loss": 9.9103, + "step": 4675 + }, + { + "epoch": 0.42190305206463197, + "grad_norm": 5.98569393157959, + "learning_rate": 4.216337522441652e-07, + "loss": 9.8914, + "step": 4700 + }, + { + "epoch": 0.4241472172351885, + "grad_norm": 5.9297943115234375, + "learning_rate": 4.2387791741472175e-07, + "loss": 9.9084, + "step": 4725 + }, + { + "epoch": 0.4263913824057451, + "grad_norm": 5.802469253540039, + "learning_rate": 4.261220825852783e-07, + "loss": 9.9314, + "step": 4750 + }, + { + "epoch": 0.4286355475763016, + "grad_norm": 6.0606536865234375, + "learning_rate": 4.283662477558349e-07, + "loss": 9.9331, + "step": 4775 + }, + { + "epoch": 0.43087971274685816, + "grad_norm": 5.885489463806152, + "learning_rate": 4.306104129263914e-07, + "loss": 9.8946, + "step": 4800 + }, + { + "epoch": 0.43312387791741475, + "grad_norm": 5.712328910827637, + "learning_rate": 4.32854578096948e-07, + "loss": 9.897, + "step": 4825 + }, + { + "epoch": 0.4353680430879713, + "grad_norm": 5.857424736022949, + "learning_rate": 4.350987432675045e-07, + "loss": 9.9363, + "step": 4850 + }, + { + "epoch": 0.4376122082585278, + "grad_norm": 5.935864448547363, + "learning_rate": 4.3734290843806107e-07, + "loss": 9.9032, + "step": 4875 + }, + { + "epoch": 0.4398563734290844, + "grad_norm": 5.951303005218506, + "learning_rate": 4.395870736086176e-07, + "loss": 9.9228, + "step": 4900 + }, + { + "epoch": 0.44210053859964094, + "grad_norm": 5.86503791809082, + "learning_rate": 4.418312387791742e-07, + "loss": 9.9041, + "step": 4925 + }, + { + "epoch": 0.44434470377019747, + "grad_norm": 6.115828514099121, + "learning_rate": 4.4407540394973076e-07, + "loss": 9.9344, + "step": 4950 + }, + { + "epoch": 0.44658886894075406, + "grad_norm": 6.035009384155273, + "learning_rate": 4.4631956912028727e-07, + "loss": 9.8607, + "step": 4975 + }, + { + "epoch": 0.4488330341113106, + "grad_norm": 5.931192874908447, + "learning_rate": 4.485637342908439e-07, + "loss": 9.9228, + "step": 5000 + }, + { + "epoch": 0.4510771992818671, + "grad_norm": 6.100157737731934, + "learning_rate": 4.508078994614004e-07, + "loss": 9.9015, + "step": 5025 + }, + { + "epoch": 0.4533213644524237, + "grad_norm": 5.860504627227783, + "learning_rate": 4.5305206463195696e-07, + "loss": 9.9078, + "step": 5050 + }, + { + "epoch": 0.45556552962298025, + "grad_norm": 6.34818696975708, + "learning_rate": 4.5529622980251346e-07, + "loss": 9.9102, + "step": 5075 + }, + { + "epoch": 0.4578096947935368, + "grad_norm": 5.985219955444336, + "learning_rate": 4.575403949730701e-07, + "loss": 9.9118, + "step": 5100 + }, + { + "epoch": 0.46005385996409337, + "grad_norm": 5.924988269805908, + "learning_rate": 4.5978456014362664e-07, + "loss": 9.917, + "step": 5125 + }, + { + "epoch": 0.4622980251346499, + "grad_norm": 5.892970561981201, + "learning_rate": 4.6202872531418315e-07, + "loss": 9.8854, + "step": 5150 + }, + { + "epoch": 0.46454219030520644, + "grad_norm": 6.0755767822265625, + "learning_rate": 4.6427289048473966e-07, + "loss": 9.8664, + "step": 5175 + }, + { + "epoch": 0.466786355475763, + "grad_norm": 5.894937992095947, + "learning_rate": 4.665170556552963e-07, + "loss": 9.8794, + "step": 5200 + }, + { + "epoch": 0.46903052064631956, + "grad_norm": 6.054007530212402, + "learning_rate": 4.6876122082585284e-07, + "loss": 9.8783, + "step": 5225 + }, + { + "epoch": 0.47127468581687615, + "grad_norm": 6.167922496795654, + "learning_rate": 4.7100538599640935e-07, + "loss": 9.8935, + "step": 5250 + }, + { + "epoch": 0.4735188509874327, + "grad_norm": 6.165359020233154, + "learning_rate": 4.7324955116696596e-07, + "loss": 9.8948, + "step": 5275 + }, + { + "epoch": 0.4757630161579892, + "grad_norm": 6.025323390960693, + "learning_rate": 4.7549371633752247e-07, + "loss": 9.8806, + "step": 5300 + }, + { + "epoch": 0.4780071813285458, + "grad_norm": 6.206519603729248, + "learning_rate": 4.77737881508079e-07, + "loss": 9.8489, + "step": 5325 + }, + { + "epoch": 0.48025134649910234, + "grad_norm": 6.024432182312012, + "learning_rate": 4.799820466786355e-07, + "loss": 9.8831, + "step": 5350 + }, + { + "epoch": 0.48249551166965887, + "grad_norm": 6.0181474685668945, + "learning_rate": 4.822262118491922e-07, + "loss": 9.8947, + "step": 5375 + }, + { + "epoch": 0.48473967684021546, + "grad_norm": 5.970649242401123, + "learning_rate": 4.844703770197487e-07, + "loss": 9.8858, + "step": 5400 + }, + { + "epoch": 0.486983842010772, + "grad_norm": 5.984105110168457, + "learning_rate": 4.867145421903052e-07, + "loss": 9.859, + "step": 5425 + }, + { + "epoch": 0.48922800718132853, + "grad_norm": 6.263362884521484, + "learning_rate": 4.889587073608618e-07, + "loss": 9.8761, + "step": 5450 + }, + { + "epoch": 0.4914721723518851, + "grad_norm": 5.972558975219727, + "learning_rate": 4.912028725314184e-07, + "loss": 9.8887, + "step": 5475 + }, + { + "epoch": 0.49371633752244165, + "grad_norm": 6.241543292999268, + "learning_rate": 4.934470377019749e-07, + "loss": 9.9094, + "step": 5500 + }, + { + "epoch": 0.4959605026929982, + "grad_norm": 6.063108921051025, + "learning_rate": 4.956912028725314e-07, + "loss": 9.8837, + "step": 5525 + }, + { + "epoch": 0.4982046678635548, + "grad_norm": 5.869775772094727, + "learning_rate": 4.97935368043088e-07, + "loss": 9.9072, + "step": 5550 + }, + { + "epoch": 0.5004488330341114, + "grad_norm": 6.1106133460998535, + "learning_rate": 5.001795332136445e-07, + "loss": 9.8676, + "step": 5575 + }, + { + "epoch": 0.5026929982046678, + "grad_norm": 6.008604049682617, + "learning_rate": 5.024236983842011e-07, + "loss": 9.8807, + "step": 5600 + }, + { + "epoch": 0.5049371633752244, + "grad_norm": 6.230731010437012, + "learning_rate": 5.046678635547577e-07, + "loss": 9.8656, + "step": 5625 + }, + { + "epoch": 0.507181328545781, + "grad_norm": 6.124911785125732, + "learning_rate": 5.069120287253143e-07, + "loss": 9.8729, + "step": 5650 + }, + { + "epoch": 0.5094254937163375, + "grad_norm": 5.867502212524414, + "learning_rate": 5.091561938958708e-07, + "loss": 9.8499, + "step": 5675 + }, + { + "epoch": 0.5116696588868941, + "grad_norm": 6.16951322555542, + "learning_rate": 5.114003590664273e-07, + "loss": 9.8958, + "step": 5700 + }, + { + "epoch": 0.5139138240574507, + "grad_norm": 6.087893486022949, + "learning_rate": 5.136445242369839e-07, + "loss": 9.8417, + "step": 5725 + }, + { + "epoch": 0.5161579892280072, + "grad_norm": 6.255253314971924, + "learning_rate": 5.158886894075404e-07, + "loss": 9.8675, + "step": 5750 + }, + { + "epoch": 0.5184021543985637, + "grad_norm": 6.064080238342285, + "learning_rate": 5.181328545780969e-07, + "loss": 9.8891, + "step": 5775 + }, + { + "epoch": 0.5206463195691203, + "grad_norm": 6.1127400398254395, + "learning_rate": 5.203770197486536e-07, + "loss": 9.8601, + "step": 5800 + }, + { + "epoch": 0.5228904847396768, + "grad_norm": 6.0353684425354, + "learning_rate": 5.226211849192101e-07, + "loss": 9.8732, + "step": 5825 + }, + { + "epoch": 0.5251346499102334, + "grad_norm": 6.334977626800537, + "learning_rate": 5.248653500897667e-07, + "loss": 9.8295, + "step": 5850 + }, + { + "epoch": 0.52737881508079, + "grad_norm": 6.238837718963623, + "learning_rate": 5.271095152603232e-07, + "loss": 9.8405, + "step": 5875 + }, + { + "epoch": 0.5296229802513465, + "grad_norm": 6.0906500816345215, + "learning_rate": 5.293536804308798e-07, + "loss": 9.8527, + "step": 5900 + }, + { + "epoch": 0.531867145421903, + "grad_norm": 6.12799596786499, + "learning_rate": 5.315978456014363e-07, + "loss": 9.9016, + "step": 5925 + }, + { + "epoch": 0.5341113105924596, + "grad_norm": 6.162265300750732, + "learning_rate": 5.338420107719928e-07, + "loss": 9.8899, + "step": 5950 + }, + { + "epoch": 0.5363554757630161, + "grad_norm": 6.299708843231201, + "learning_rate": 5.360861759425494e-07, + "loss": 9.8467, + "step": 5975 + }, + { + "epoch": 0.5385996409335727, + "grad_norm": 6.184859275817871, + "learning_rate": 5.38330341113106e-07, + "loss": 9.8255, + "step": 6000 + }, + { + "epoch": 0.5408438061041293, + "grad_norm": 6.195328712463379, + "learning_rate": 5.405745062836626e-07, + "loss": 9.8624, + "step": 6025 + }, + { + "epoch": 0.5430879712746858, + "grad_norm": 6.31873083114624, + "learning_rate": 5.428186714542191e-07, + "loss": 9.8367, + "step": 6050 + }, + { + "epoch": 0.5453321364452424, + "grad_norm": 6.040925025939941, + "learning_rate": 5.450628366247757e-07, + "loss": 9.8748, + "step": 6075 + }, + { + "epoch": 0.547576301615799, + "grad_norm": 6.643120288848877, + "learning_rate": 5.473070017953322e-07, + "loss": 9.8158, + "step": 6100 + }, + { + "epoch": 0.5498204667863554, + "grad_norm": 6.441460609436035, + "learning_rate": 5.495511669658887e-07, + "loss": 9.8444, + "step": 6125 + }, + { + "epoch": 0.552064631956912, + "grad_norm": 6.126970291137695, + "learning_rate": 5.517953321364452e-07, + "loss": 9.8607, + "step": 6150 + }, + { + "epoch": 0.5543087971274686, + "grad_norm": 6.1640825271606445, + "learning_rate": 5.540394973070018e-07, + "loss": 9.8345, + "step": 6175 + }, + { + "epoch": 0.5565529622980251, + "grad_norm": 6.127485752105713, + "learning_rate": 5.562836624775584e-07, + "loss": 9.8243, + "step": 6200 + }, + { + "epoch": 0.5587971274685817, + "grad_norm": 6.135263442993164, + "learning_rate": 5.58527827648115e-07, + "loss": 9.8175, + "step": 6225 + }, + { + "epoch": 0.5610412926391383, + "grad_norm": 6.226190567016602, + "learning_rate": 5.607719928186716e-07, + "loss": 9.8398, + "step": 6250 + }, + { + "epoch": 0.5632854578096947, + "grad_norm": 6.58268928527832, + "learning_rate": 5.630161579892281e-07, + "loss": 9.8298, + "step": 6275 + }, + { + "epoch": 0.5655296229802513, + "grad_norm": 6.1738200187683105, + "learning_rate": 5.652603231597846e-07, + "loss": 9.8079, + "step": 6300 + }, + { + "epoch": 0.5677737881508079, + "grad_norm": 6.537256240844727, + "learning_rate": 5.675044883303411e-07, + "loss": 9.8552, + "step": 6325 + }, + { + "epoch": 0.5700179533213644, + "grad_norm": 6.4177141189575195, + "learning_rate": 5.697486535008977e-07, + "loss": 9.8336, + "step": 6350 + }, + { + "epoch": 0.572262118491921, + "grad_norm": 6.533304691314697, + "learning_rate": 5.719928186714542e-07, + "loss": 9.8345, + "step": 6375 + }, + { + "epoch": 0.5745062836624776, + "grad_norm": 6.370611190795898, + "learning_rate": 5.742369838420108e-07, + "loss": 9.8641, + "step": 6400 + }, + { + "epoch": 0.5767504488330341, + "grad_norm": 6.4204583168029785, + "learning_rate": 5.764811490125673e-07, + "loss": 9.8453, + "step": 6425 + }, + { + "epoch": 0.5789946140035906, + "grad_norm": 6.180099964141846, + "learning_rate": 5.78725314183124e-07, + "loss": 9.8362, + "step": 6450 + }, + { + "epoch": 0.5812387791741472, + "grad_norm": 6.460410118103027, + "learning_rate": 5.809694793536805e-07, + "loss": 9.8346, + "step": 6475 + }, + { + "epoch": 0.5834829443447038, + "grad_norm": 6.278439998626709, + "learning_rate": 5.83213644524237e-07, + "loss": 9.8496, + "step": 6500 + }, + { + "epoch": 0.5857271095152603, + "grad_norm": 6.245810508728027, + "learning_rate": 5.854578096947936e-07, + "loss": 9.8173, + "step": 6525 + }, + { + "epoch": 0.5879712746858169, + "grad_norm": 6.253671169281006, + "learning_rate": 5.877019748653501e-07, + "loss": 9.8382, + "step": 6550 + }, + { + "epoch": 0.5902154398563735, + "grad_norm": 6.102259635925293, + "learning_rate": 5.899461400359067e-07, + "loss": 9.7895, + "step": 6575 + }, + { + "epoch": 0.59245960502693, + "grad_norm": 6.340899467468262, + "learning_rate": 5.921903052064632e-07, + "loss": 9.7829, + "step": 6600 + }, + { + "epoch": 0.5947037701974865, + "grad_norm": 6.426835536956787, + "learning_rate": 5.944344703770198e-07, + "loss": 9.8195, + "step": 6625 + }, + { + "epoch": 0.5969479353680431, + "grad_norm": 6.461434841156006, + "learning_rate": 5.966786355475764e-07, + "loss": 9.8001, + "step": 6650 + }, + { + "epoch": 0.5991921005385996, + "grad_norm": 6.494462013244629, + "learning_rate": 5.989228007181329e-07, + "loss": 9.8142, + "step": 6675 + }, + { + "epoch": 0.6014362657091562, + "grad_norm": 6.175943851470947, + "learning_rate": 6.011669658886895e-07, + "loss": 9.8263, + "step": 6700 + }, + { + "epoch": 0.6036804308797128, + "grad_norm": 6.314114570617676, + "learning_rate": 6.03411131059246e-07, + "loss": 9.8178, + "step": 6725 + }, + { + "epoch": 0.6059245960502693, + "grad_norm": 6.163882255554199, + "learning_rate": 6.056552962298026e-07, + "loss": 9.7813, + "step": 6750 + }, + { + "epoch": 0.6081687612208259, + "grad_norm": 6.536901473999023, + "learning_rate": 6.078994614003591e-07, + "loss": 9.8156, + "step": 6775 + }, + { + "epoch": 0.6104129263913824, + "grad_norm": 6.392378330230713, + "learning_rate": 6.101436265709157e-07, + "loss": 9.8064, + "step": 6800 + }, + { + "epoch": 0.6126570915619389, + "grad_norm": 6.232139587402344, + "learning_rate": 6.123877917414722e-07, + "loss": 9.7677, + "step": 6825 + }, + { + "epoch": 0.6149012567324955, + "grad_norm": 6.232960224151611, + "learning_rate": 6.146319569120287e-07, + "loss": 9.8107, + "step": 6850 + }, + { + "epoch": 0.6171454219030521, + "grad_norm": 6.34416389465332, + "learning_rate": 6.168761220825854e-07, + "loss": 9.8069, + "step": 6875 + }, + { + "epoch": 0.6193895870736086, + "grad_norm": 6.536038875579834, + "learning_rate": 6.191202872531419e-07, + "loss": 9.7812, + "step": 6900 + }, + { + "epoch": 0.6216337522441652, + "grad_norm": 6.877230167388916, + "learning_rate": 6.213644524236985e-07, + "loss": 9.8014, + "step": 6925 + }, + { + "epoch": 0.6238779174147218, + "grad_norm": 6.281793594360352, + "learning_rate": 6.23608617594255e-07, + "loss": 9.7538, + "step": 6950 + }, + { + "epoch": 0.6261220825852782, + "grad_norm": 6.582910537719727, + "learning_rate": 6.258527827648116e-07, + "loss": 9.811, + "step": 6975 + }, + { + "epoch": 0.6283662477558348, + "grad_norm": 6.26833963394165, + "learning_rate": 6.280969479353681e-07, + "loss": 9.7519, + "step": 7000 + }, + { + "epoch": 0.6306104129263914, + "grad_norm": 6.657871246337891, + "learning_rate": 6.303411131059246e-07, + "loss": 9.7885, + "step": 7025 + }, + { + "epoch": 0.6328545780969479, + "grad_norm": 6.513658046722412, + "learning_rate": 6.325852782764811e-07, + "loss": 9.767, + "step": 7050 + }, + { + "epoch": 0.6350987432675045, + "grad_norm": 6.234604358673096, + "learning_rate": 6.348294434470378e-07, + "loss": 9.8074, + "step": 7075 + }, + { + "epoch": 0.6373429084380611, + "grad_norm": 6.443291187286377, + "learning_rate": 6.370736086175944e-07, + "loss": 9.7726, + "step": 7100 + }, + { + "epoch": 0.6395870736086176, + "grad_norm": 6.330977439880371, + "learning_rate": 6.393177737881509e-07, + "loss": 9.8158, + "step": 7125 + }, + { + "epoch": 0.6418312387791741, + "grad_norm": 6.572497844696045, + "learning_rate": 6.415619389587075e-07, + "loss": 9.7053, + "step": 7150 + }, + { + "epoch": 0.6440754039497307, + "grad_norm": 6.3431010246276855, + "learning_rate": 6.43806104129264e-07, + "loss": 9.7544, + "step": 7175 + }, + { + "epoch": 0.6463195691202872, + "grad_norm": 6.267871379852295, + "learning_rate": 6.460502692998205e-07, + "loss": 9.7431, + "step": 7200 + }, + { + "epoch": 0.6485637342908438, + "grad_norm": 6.496402263641357, + "learning_rate": 6.48294434470377e-07, + "loss": 9.7929, + "step": 7225 + }, + { + "epoch": 0.6508078994614004, + "grad_norm": 6.4353132247924805, + "learning_rate": 6.505385996409335e-07, + "loss": 9.7546, + "step": 7250 + }, + { + "epoch": 0.6530520646319569, + "grad_norm": 6.385329723358154, + "learning_rate": 6.527827648114901e-07, + "loss": 9.7529, + "step": 7275 + }, + { + "epoch": 0.6552962298025135, + "grad_norm": 6.381908416748047, + "learning_rate": 6.550269299820468e-07, + "loss": 9.7604, + "step": 7300 + }, + { + "epoch": 0.65754039497307, + "grad_norm": 6.5878376960754395, + "learning_rate": 6.572710951526033e-07, + "loss": 9.7904, + "step": 7325 + }, + { + "epoch": 0.6597845601436265, + "grad_norm": 6.402446746826172, + "learning_rate": 6.595152603231599e-07, + "loss": 9.774, + "step": 7350 + }, + { + "epoch": 0.6620287253141831, + "grad_norm": 6.583916664123535, + "learning_rate": 6.617594254937164e-07, + "loss": 9.7779, + "step": 7375 + }, + { + "epoch": 0.6642728904847397, + "grad_norm": 6.5942463874816895, + "learning_rate": 6.640035906642729e-07, + "loss": 9.7881, + "step": 7400 + }, + { + "epoch": 0.6665170556552962, + "grad_norm": 6.421679496765137, + "learning_rate": 6.662477558348294e-07, + "loss": 9.7386, + "step": 7425 + }, + { + "epoch": 0.6687612208258528, + "grad_norm": 6.596854209899902, + "learning_rate": 6.68491921005386e-07, + "loss": 9.7465, + "step": 7450 + }, + { + "epoch": 0.6710053859964094, + "grad_norm": 6.4215312004089355, + "learning_rate": 6.707360861759426e-07, + "loss": 9.7338, + "step": 7475 + }, + { + "epoch": 0.6732495511669659, + "grad_norm": 6.582616329193115, + "learning_rate": 6.729802513464991e-07, + "loss": 9.7455, + "step": 7500 + }, + { + "epoch": 0.6754937163375224, + "grad_norm": 6.506743431091309, + "learning_rate": 6.752244165170558e-07, + "loss": 9.7264, + "step": 7525 + }, + { + "epoch": 0.677737881508079, + "grad_norm": 6.476115703582764, + "learning_rate": 6.774685816876123e-07, + "loss": 9.7686, + "step": 7550 + }, + { + "epoch": 0.6799820466786356, + "grad_norm": 6.730452537536621, + "learning_rate": 6.797127468581688e-07, + "loss": 9.7574, + "step": 7575 + }, + { + "epoch": 0.6822262118491921, + "grad_norm": 6.6246538162231445, + "learning_rate": 6.819569120287253e-07, + "loss": 9.7271, + "step": 7600 + }, + { + "epoch": 0.6844703770197487, + "grad_norm": 6.707385540008545, + "learning_rate": 6.842010771992819e-07, + "loss": 9.7412, + "step": 7625 + }, + { + "epoch": 0.6867145421903053, + "grad_norm": 6.579505443572998, + "learning_rate": 6.864452423698385e-07, + "loss": 9.7519, + "step": 7650 + }, + { + "epoch": 0.6889587073608617, + "grad_norm": 6.510263442993164, + "learning_rate": 6.88689407540395e-07, + "loss": 9.7704, + "step": 7675 + }, + { + "epoch": 0.6912028725314183, + "grad_norm": 6.399744033813477, + "learning_rate": 6.909335727109516e-07, + "loss": 9.7577, + "step": 7700 + }, + { + "epoch": 0.6934470377019749, + "grad_norm": 6.4463982582092285, + "learning_rate": 6.931777378815082e-07, + "loss": 9.771, + "step": 7725 + }, + { + "epoch": 0.6956912028725314, + "grad_norm": 6.656363010406494, + "learning_rate": 6.954219030520647e-07, + "loss": 9.7687, + "step": 7750 + }, + { + "epoch": 0.697935368043088, + "grad_norm": 6.470757007598877, + "learning_rate": 6.976660682226212e-07, + "loss": 9.7137, + "step": 7775 + }, + { + "epoch": 0.7001795332136446, + "grad_norm": 6.443357467651367, + "learning_rate": 6.999102333931777e-07, + "loss": 9.7442, + "step": 7800 + }, + { + "epoch": 0.702423698384201, + "grad_norm": 6.682624340057373, + "learning_rate": 7.021543985637344e-07, + "loss": 9.7429, + "step": 7825 + }, + { + "epoch": 0.7046678635547576, + "grad_norm": 6.4991631507873535, + "learning_rate": 7.043985637342909e-07, + "loss": 9.788, + "step": 7850 + }, + { + "epoch": 0.7069120287253142, + "grad_norm": 6.607325077056885, + "learning_rate": 7.066427289048474e-07, + "loss": 9.7432, + "step": 7875 + }, + { + "epoch": 0.7091561938958707, + "grad_norm": 6.530834197998047, + "learning_rate": 7.08886894075404e-07, + "loss": 9.7233, + "step": 7900 + }, + { + "epoch": 0.7114003590664273, + "grad_norm": 6.620058536529541, + "learning_rate": 7.111310592459605e-07, + "loss": 9.7109, + "step": 7925 + }, + { + "epoch": 0.7136445242369839, + "grad_norm": 6.32672643661499, + "learning_rate": 7.133752244165171e-07, + "loss": 9.705, + "step": 7950 + }, + { + "epoch": 0.7158886894075404, + "grad_norm": 6.551999568939209, + "learning_rate": 7.156193895870736e-07, + "loss": 9.6905, + "step": 7975 + }, + { + "epoch": 0.718132854578097, + "grad_norm": 6.484992504119873, + "learning_rate": 7.178635547576303e-07, + "loss": 9.7129, + "step": 8000 + }, + { + "epoch": 0.7203770197486535, + "grad_norm": 6.518681526184082, + "learning_rate": 7.201077199281868e-07, + "loss": 9.6946, + "step": 8025 + }, + { + "epoch": 0.72262118491921, + "grad_norm": 6.932231903076172, + "learning_rate": 7.223518850987433e-07, + "loss": 9.6983, + "step": 8050 + }, + { + "epoch": 0.7248653500897666, + "grad_norm": 6.489604949951172, + "learning_rate": 7.245960502692999e-07, + "loss": 9.692, + "step": 8075 + }, + { + "epoch": 0.7271095152603232, + "grad_norm": 6.9130377769470215, + "learning_rate": 7.268402154398564e-07, + "loss": 9.6878, + "step": 8100 + }, + { + "epoch": 0.7293536804308797, + "grad_norm": 6.524890899658203, + "learning_rate": 7.290843806104129e-07, + "loss": 9.7109, + "step": 8125 + }, + { + "epoch": 0.7315978456014363, + "grad_norm": 6.6267571449279785, + "learning_rate": 7.313285457809694e-07, + "loss": 9.7249, + "step": 8150 + }, + { + "epoch": 0.7338420107719928, + "grad_norm": 6.640625953674316, + "learning_rate": 7.335727109515262e-07, + "loss": 9.7084, + "step": 8175 + }, + { + "epoch": 0.7360861759425493, + "grad_norm": 6.492373943328857, + "learning_rate": 7.358168761220827e-07, + "loss": 9.7063, + "step": 8200 + }, + { + "epoch": 0.7383303411131059, + "grad_norm": 6.693154811859131, + "learning_rate": 7.380610412926392e-07, + "loss": 9.7058, + "step": 8225 + }, + { + "epoch": 0.7405745062836625, + "grad_norm": 6.537960052490234, + "learning_rate": 7.403052064631958e-07, + "loss": 9.7211, + "step": 8250 + }, + { + "epoch": 0.742818671454219, + "grad_norm": 6.8210577964782715, + "learning_rate": 7.425493716337523e-07, + "loss": 9.709, + "step": 8275 + }, + { + "epoch": 0.7450628366247756, + "grad_norm": 6.640403747558594, + "learning_rate": 7.447935368043088e-07, + "loss": 9.7265, + "step": 8300 + }, + { + "epoch": 0.7473070017953322, + "grad_norm": 6.648167610168457, + "learning_rate": 7.470377019748653e-07, + "loss": 9.6869, + "step": 8325 + }, + { + "epoch": 0.7495511669658886, + "grad_norm": 6.7162861824035645, + "learning_rate": 7.49281867145422e-07, + "loss": 9.7295, + "step": 8350 + }, + { + "epoch": 0.7517953321364452, + "grad_norm": 6.757798194885254, + "learning_rate": 7.515260323159786e-07, + "loss": 9.652, + "step": 8375 + }, + { + "epoch": 0.7540394973070018, + "grad_norm": 6.5583062171936035, + "learning_rate": 7.537701974865351e-07, + "loss": 9.6755, + "step": 8400 + }, + { + "epoch": 0.7562836624775583, + "grad_norm": 6.95444917678833, + "learning_rate": 7.560143626570917e-07, + "loss": 9.7442, + "step": 8425 + }, + { + "epoch": 0.7585278276481149, + "grad_norm": 6.543112277984619, + "learning_rate": 7.582585278276482e-07, + "loss": 9.6824, + "step": 8450 + }, + { + "epoch": 0.7607719928186715, + "grad_norm": 6.50526762008667, + "learning_rate": 7.605026929982047e-07, + "loss": 9.6612, + "step": 8475 + }, + { + "epoch": 0.7630161579892281, + "grad_norm": 6.608448505401611, + "learning_rate": 7.627468581687612e-07, + "loss": 9.705, + "step": 8500 + }, + { + "epoch": 0.7652603231597845, + "grad_norm": 6.698200702667236, + "learning_rate": 7.649910233393177e-07, + "loss": 9.6809, + "step": 8525 + }, + { + "epoch": 0.7675044883303411, + "grad_norm": 6.701222896575928, + "learning_rate": 7.672351885098744e-07, + "loss": 9.6524, + "step": 8550 + }, + { + "epoch": 0.7697486535008977, + "grad_norm": 6.648589611053467, + "learning_rate": 7.69479353680431e-07, + "loss": 9.6711, + "step": 8575 + }, + { + "epoch": 0.7719928186714542, + "grad_norm": 6.808827877044678, + "learning_rate": 7.717235188509875e-07, + "loss": 9.6996, + "step": 8600 + }, + { + "epoch": 0.7742369838420108, + "grad_norm": 6.843456745147705, + "learning_rate": 7.739676840215441e-07, + "loss": 9.6514, + "step": 8625 + }, + { + "epoch": 0.7764811490125674, + "grad_norm": 6.576251029968262, + "learning_rate": 7.762118491921006e-07, + "loss": 9.6731, + "step": 8650 + }, + { + "epoch": 0.7787253141831239, + "grad_norm": 6.645464897155762, + "learning_rate": 7.784560143626571e-07, + "loss": 9.6855, + "step": 8675 + }, + { + "epoch": 0.7809694793536804, + "grad_norm": 6.524102687835693, + "learning_rate": 7.807001795332136e-07, + "loss": 9.6995, + "step": 8700 + }, + { + "epoch": 0.783213644524237, + "grad_norm": 6.703819751739502, + "learning_rate": 7.829443447037703e-07, + "loss": 9.6451, + "step": 8725 + }, + { + "epoch": 0.7854578096947935, + "grad_norm": 6.577558517456055, + "learning_rate": 7.851885098743268e-07, + "loss": 9.6749, + "step": 8750 + }, + { + "epoch": 0.7877019748653501, + "grad_norm": 6.574929714202881, + "learning_rate": 7.874326750448833e-07, + "loss": 9.6959, + "step": 8775 + }, + { + "epoch": 0.7899461400359067, + "grad_norm": 6.392107009887695, + "learning_rate": 7.8967684021544e-07, + "loss": 9.6654, + "step": 8800 + }, + { + "epoch": 0.7921903052064632, + "grad_norm": 6.591090679168701, + "learning_rate": 7.919210053859965e-07, + "loss": 9.711, + "step": 8825 + }, + { + "epoch": 0.7944344703770198, + "grad_norm": 6.599772930145264, + "learning_rate": 7.94165170556553e-07, + "loss": 9.6253, + "step": 8850 + }, + { + "epoch": 0.7966786355475763, + "grad_norm": 6.830121040344238, + "learning_rate": 7.964093357271095e-07, + "loss": 9.6579, + "step": 8875 + }, + { + "epoch": 0.7989228007181328, + "grad_norm": 6.605536937713623, + "learning_rate": 7.986535008976662e-07, + "loss": 9.6733, + "step": 8900 + }, + { + "epoch": 0.8011669658886894, + "grad_norm": 6.831969261169434, + "learning_rate": 8.008976660682227e-07, + "loss": 9.6662, + "step": 8925 + }, + { + "epoch": 0.803411131059246, + "grad_norm": 6.739194869995117, + "learning_rate": 8.031418312387792e-07, + "loss": 9.6172, + "step": 8950 + }, + { + "epoch": 0.8056552962298025, + "grad_norm": 6.6600799560546875, + "learning_rate": 8.053859964093358e-07, + "loss": 9.6384, + "step": 8975 + }, + { + "epoch": 0.8078994614003591, + "grad_norm": 6.610813140869141, + "learning_rate": 8.076301615798923e-07, + "loss": 9.6262, + "step": 9000 + }, + { + "epoch": 0.8101436265709157, + "grad_norm": 6.8304762840271, + "learning_rate": 8.098743267504489e-07, + "loss": 9.6263, + "step": 9025 + }, + { + "epoch": 0.8123877917414721, + "grad_norm": 6.513163089752197, + "learning_rate": 8.121184919210054e-07, + "loss": 9.61, + "step": 9050 + }, + { + "epoch": 0.8146319569120287, + "grad_norm": 6.7347636222839355, + "learning_rate": 8.143626570915621e-07, + "loss": 9.6111, + "step": 9075 + }, + { + "epoch": 0.8168761220825853, + "grad_norm": 6.566047668457031, + "learning_rate": 8.165170556552963e-07, + "loss": 9.6378, + "step": 9100 + }, + { + "epoch": 0.8191202872531418, + "grad_norm": 6.570896625518799, + "learning_rate": 8.187612208258529e-07, + "loss": 9.6309, + "step": 9125 + }, + { + "epoch": 0.8213644524236984, + "grad_norm": 6.67928409576416, + "learning_rate": 8.210053859964094e-07, + "loss": 9.6992, + "step": 9150 + }, + { + "epoch": 0.823608617594255, + "grad_norm": 6.7185869216918945, + "learning_rate": 8.232495511669659e-07, + "loss": 9.6712, + "step": 9175 + }, + { + "epoch": 0.8258527827648114, + "grad_norm": 6.759024143218994, + "learning_rate": 8.254937163375225e-07, + "loss": 9.6474, + "step": 9200 + }, + { + "epoch": 0.828096947935368, + "grad_norm": 6.8713202476501465, + "learning_rate": 8.27737881508079e-07, + "loss": 9.6971, + "step": 9225 + }, + { + "epoch": 0.8303411131059246, + "grad_norm": 6.858035564422607, + "learning_rate": 8.299820466786356e-07, + "loss": 9.5647, + "step": 9250 + }, + { + "epoch": 0.8325852782764811, + "grad_norm": 6.5416259765625, + "learning_rate": 8.322262118491922e-07, + "loss": 9.6577, + "step": 9275 + }, + { + "epoch": 0.8348294434470377, + "grad_norm": 6.5720014572143555, + "learning_rate": 8.344703770197487e-07, + "loss": 9.6451, + "step": 9300 + }, + { + "epoch": 0.8370736086175943, + "grad_norm": 6.990091323852539, + "learning_rate": 8.367145421903053e-07, + "loss": 9.5883, + "step": 9325 + }, + { + "epoch": 0.8393177737881508, + "grad_norm": 6.798624038696289, + "learning_rate": 8.389587073608618e-07, + "loss": 9.64, + "step": 9350 + }, + { + "epoch": 0.8415619389587073, + "grad_norm": 6.7217698097229, + "learning_rate": 8.412028725314183e-07, + "loss": 9.571, + "step": 9375 + }, + { + "epoch": 0.8438061041292639, + "grad_norm": 6.7467122077941895, + "learning_rate": 8.434470377019748e-07, + "loss": 9.6305, + "step": 9400 + }, + { + "epoch": 0.8460502692998204, + "grad_norm": 6.783092021942139, + "learning_rate": 8.456912028725315e-07, + "loss": 9.6658, + "step": 9425 + }, + { + "epoch": 0.848294434470377, + "grad_norm": 6.904580593109131, + "learning_rate": 8.479353680430881e-07, + "loss": 9.577, + "step": 9450 + }, + { + "epoch": 0.8505385996409336, + "grad_norm": 6.618059158325195, + "learning_rate": 8.501795332136446e-07, + "loss": 9.6226, + "step": 9475 + }, + { + "epoch": 0.8527827648114902, + "grad_norm": 6.675761699676514, + "learning_rate": 8.524236983842012e-07, + "loss": 9.6094, + "step": 9500 + }, + { + "epoch": 0.8550269299820467, + "grad_norm": 6.517864227294922, + "learning_rate": 8.546678635547577e-07, + "loss": 9.62, + "step": 9525 + }, + { + "epoch": 0.8572710951526032, + "grad_norm": 6.633366107940674, + "learning_rate": 8.569120287253142e-07, + "loss": 9.6113, + "step": 9550 + }, + { + "epoch": 0.8595152603231598, + "grad_norm": 6.443725109100342, + "learning_rate": 8.591561938958707e-07, + "loss": 9.6039, + "step": 9575 + }, + { + "epoch": 0.8617594254937163, + "grad_norm": 6.683950901031494, + "learning_rate": 8.614003590664273e-07, + "loss": 9.5757, + "step": 9600 + }, + { + "epoch": 0.8640035906642729, + "grad_norm": 6.572634696960449, + "learning_rate": 8.63644524236984e-07, + "loss": 9.6166, + "step": 9625 + }, + { + "epoch": 0.8662477558348295, + "grad_norm": 6.4774250984191895, + "learning_rate": 8.658886894075405e-07, + "loss": 9.6098, + "step": 9650 + }, + { + "epoch": 0.868491921005386, + "grad_norm": 6.587173938751221, + "learning_rate": 8.681328545780971e-07, + "loss": 9.6081, + "step": 9675 + }, + { + "epoch": 0.8707360861759426, + "grad_norm": 6.576179504394531, + "learning_rate": 8.703770197486536e-07, + "loss": 9.5843, + "step": 9700 + }, + { + "epoch": 0.8729802513464991, + "grad_norm": 6.83200216293335, + "learning_rate": 8.726211849192101e-07, + "loss": 9.5871, + "step": 9725 + }, + { + "epoch": 0.8752244165170556, + "grad_norm": 6.724090099334717, + "learning_rate": 8.748653500897666e-07, + "loss": 9.5872, + "step": 9750 + }, + { + "epoch": 0.8774685816876122, + "grad_norm": 6.857002258300781, + "learning_rate": 8.771095152603232e-07, + "loss": 9.5075, + "step": 9775 + }, + { + "epoch": 0.8797127468581688, + "grad_norm": 6.725832939147949, + "learning_rate": 8.793536804308798e-07, + "loss": 9.5402, + "step": 9800 + }, + { + "epoch": 0.8819569120287253, + "grad_norm": 6.8039231300354, + "learning_rate": 8.815978456014364e-07, + "loss": 9.6141, + "step": 9825 + }, + { + "epoch": 0.8842010771992819, + "grad_norm": 7.061015605926514, + "learning_rate": 8.83842010771993e-07, + "loss": 9.646, + "step": 9850 + }, + { + "epoch": 0.8864452423698385, + "grad_norm": 6.679086208343506, + "learning_rate": 8.860861759425495e-07, + "loss": 9.5858, + "step": 9875 + }, + { + "epoch": 0.8886894075403949, + "grad_norm": 6.614816665649414, + "learning_rate": 8.88330341113106e-07, + "loss": 9.5673, + "step": 9900 + }, + { + "epoch": 0.8909335727109515, + "grad_norm": 6.665872097015381, + "learning_rate": 8.905745062836625e-07, + "loss": 9.5974, + "step": 9925 + }, + { + "epoch": 0.8931777378815081, + "grad_norm": 6.642302513122559, + "learning_rate": 8.92818671454219e-07, + "loss": 9.5565, + "step": 9950 + }, + { + "epoch": 0.8954219030520646, + "grad_norm": 6.644009590148926, + "learning_rate": 8.950628366247757e-07, + "loss": 9.5608, + "step": 9975 + }, + { + "epoch": 0.8976660682226212, + "grad_norm": 6.566183567047119, + "learning_rate": 8.973070017953322e-07, + "loss": 9.6164, + "step": 10000 + }, + { + "epoch": 0.8999102333931778, + "grad_norm": 6.818757057189941, + "learning_rate": 8.995511669658887e-07, + "loss": 9.6037, + "step": 10025 + }, + { + "epoch": 0.9021543985637342, + "grad_norm": 6.4886579513549805, + "learning_rate": 9.017953321364454e-07, + "loss": 9.6225, + "step": 10050 + }, + { + "epoch": 0.9043985637342908, + "grad_norm": 6.608550071716309, + "learning_rate": 9.040394973070019e-07, + "loss": 9.5652, + "step": 10075 + }, + { + "epoch": 0.9066427289048474, + "grad_norm": 6.644575595855713, + "learning_rate": 9.062836624775584e-07, + "loss": 9.5447, + "step": 10100 + }, + { + "epoch": 0.9088868940754039, + "grad_norm": 6.653059005737305, + "learning_rate": 9.085278276481149e-07, + "loss": 9.5552, + "step": 10125 + }, + { + "epoch": 0.9111310592459605, + "grad_norm": 6.638279438018799, + "learning_rate": 9.107719928186716e-07, + "loss": 9.5782, + "step": 10150 + }, + { + "epoch": 0.9133752244165171, + "grad_norm": 6.537282466888428, + "learning_rate": 9.130161579892281e-07, + "loss": 9.5621, + "step": 10175 + }, + { + "epoch": 0.9156193895870736, + "grad_norm": 6.457062721252441, + "learning_rate": 9.152603231597846e-07, + "loss": 9.5814, + "step": 10200 + }, + { + "epoch": 0.9178635547576302, + "grad_norm": 6.600307464599609, + "learning_rate": 9.175044883303412e-07, + "loss": 9.5897, + "step": 10225 + }, + { + "epoch": 0.9201077199281867, + "grad_norm": 6.65377950668335, + "learning_rate": 9.197486535008977e-07, + "loss": 9.4858, + "step": 10250 + }, + { + "epoch": 0.9223518850987432, + "grad_norm": 6.505385875701904, + "learning_rate": 9.219928186714543e-07, + "loss": 9.5597, + "step": 10275 + }, + { + "epoch": 0.9245960502692998, + "grad_norm": 6.669316291809082, + "learning_rate": 9.242369838420108e-07, + "loss": 9.565, + "step": 10300 + }, + { + "epoch": 0.9268402154398564, + "grad_norm": 6.537199020385742, + "learning_rate": 9.264811490125674e-07, + "loss": 9.5954, + "step": 10325 + }, + { + "epoch": 0.9290843806104129, + "grad_norm": 6.7069478034973145, + "learning_rate": 9.28725314183124e-07, + "loss": 9.5126, + "step": 10350 + }, + { + "epoch": 0.9313285457809695, + "grad_norm": 6.647909164428711, + "learning_rate": 9.309694793536805e-07, + "loss": 9.5604, + "step": 10375 + }, + { + "epoch": 0.933572710951526, + "grad_norm": 6.536131381988525, + "learning_rate": 9.332136445242371e-07, + "loss": 9.6215, + "step": 10400 + }, + { + "epoch": 0.9358168761220825, + "grad_norm": 6.791843891143799, + "learning_rate": 9.354578096947936e-07, + "loss": 9.5316, + "step": 10425 + }, + { + "epoch": 0.9380610412926391, + "grad_norm": 6.546873569488525, + "learning_rate": 9.377019748653501e-07, + "loss": 9.576, + "step": 10450 + }, + { + "epoch": 0.9403052064631957, + "grad_norm": 6.5235443115234375, + "learning_rate": 9.399461400359066e-07, + "loss": 9.564, + "step": 10475 + }, + { + "epoch": 0.9425493716337523, + "grad_norm": 6.7889227867126465, + "learning_rate": 9.421903052064632e-07, + "loss": 9.5808, + "step": 10500 + }, + { + "epoch": 0.9447935368043088, + "grad_norm": 7.0180158615112305, + "learning_rate": 9.444344703770199e-07, + "loss": 9.5614, + "step": 10525 + }, + { + "epoch": 0.9470377019748654, + "grad_norm": 6.567283630371094, + "learning_rate": 9.466786355475764e-07, + "loss": 9.479, + "step": 10550 + }, + { + "epoch": 0.949281867145422, + "grad_norm": 6.511399269104004, + "learning_rate": 9.489228007181329e-07, + "loss": 9.4943, + "step": 10575 + }, + { + "epoch": 0.9515260323159784, + "grad_norm": 6.671716213226318, + "learning_rate": 9.511669658886895e-07, + "loss": 9.5117, + "step": 10600 + }, + { + "epoch": 0.953770197486535, + "grad_norm": 6.869779109954834, + "learning_rate": 9.53411131059246e-07, + "loss": 9.52, + "step": 10625 + }, + { + "epoch": 0.9560143626570916, + "grad_norm": 6.943051338195801, + "learning_rate": 9.556552962298026e-07, + "loss": 9.4921, + "step": 10650 + }, + { + "epoch": 0.9582585278276481, + "grad_norm": 6.556875228881836, + "learning_rate": 9.57899461400359e-07, + "loss": 9.4758, + "step": 10675 + }, + { + "epoch": 0.9605026929982047, + "grad_norm": 6.704715251922607, + "learning_rate": 9.601436265709157e-07, + "loss": 9.5437, + "step": 10700 + }, + { + "epoch": 0.9627468581687613, + "grad_norm": 6.527096748352051, + "learning_rate": 9.623877917414723e-07, + "loss": 9.5658, + "step": 10725 + }, + { + "epoch": 0.9649910233393177, + "grad_norm": 6.99722146987915, + "learning_rate": 9.646319569120289e-07, + "loss": 9.5959, + "step": 10750 + }, + { + "epoch": 0.9672351885098743, + "grad_norm": 6.613394260406494, + "learning_rate": 9.668761220825853e-07, + "loss": 9.4603, + "step": 10775 + }, + { + "epoch": 0.9694793536804309, + "grad_norm": 6.865056037902832, + "learning_rate": 9.69120287253142e-07, + "loss": 9.4917, + "step": 10800 + }, + { + "epoch": 0.9717235188509874, + "grad_norm": 6.746171951293945, + "learning_rate": 9.713644524236985e-07, + "loss": 9.4867, + "step": 10825 + }, + { + "epoch": 0.973967684021544, + "grad_norm": 6.493651866912842, + "learning_rate": 9.73608617594255e-07, + "loss": 9.5313, + "step": 10850 + }, + { + "epoch": 0.9762118491921006, + "grad_norm": 6.676388263702393, + "learning_rate": 9.758527827648115e-07, + "loss": 9.5361, + "step": 10875 + }, + { + "epoch": 0.9784560143626571, + "grad_norm": 6.758974552154541, + "learning_rate": 9.780969479353682e-07, + "loss": 9.55, + "step": 10900 + }, + { + "epoch": 0.9807001795332136, + "grad_norm": 6.738333702087402, + "learning_rate": 9.803411131059248e-07, + "loss": 9.5448, + "step": 10925 + }, + { + "epoch": 0.9829443447037702, + "grad_norm": 6.664250373840332, + "learning_rate": 9.825852782764812e-07, + "loss": 9.4364, + "step": 10950 + }, + { + "epoch": 0.9851885098743267, + "grad_norm": 6.6077880859375, + "learning_rate": 9.848294434470378e-07, + "loss": 9.5792, + "step": 10975 + }, + { + "epoch": 0.9874326750448833, + "grad_norm": 6.73434591293335, + "learning_rate": 9.870736086175944e-07, + "loss": 9.5065, + "step": 11000 + }, + { + "epoch": 0.9896768402154399, + "grad_norm": 7.210285663604736, + "learning_rate": 9.893177737881508e-07, + "loss": 9.5199, + "step": 11025 + }, + { + "epoch": 0.9919210053859964, + "grad_norm": 6.501571178436279, + "learning_rate": 9.915619389587074e-07, + "loss": 9.4552, + "step": 11050 + }, + { + "epoch": 0.994165170556553, + "grad_norm": 6.541090488433838, + "learning_rate": 9.93806104129264e-07, + "loss": 9.4833, + "step": 11075 + }, + { + "epoch": 0.9964093357271095, + "grad_norm": 6.919015407562256, + "learning_rate": 9.960502692998207e-07, + "loss": 9.548, + "step": 11100 + }, + { + "epoch": 0.998653500897666, + "grad_norm": 6.714727401733398, + "learning_rate": 9.98294434470377e-07, + "loss": 9.4631, + "step": 11125 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.0036278234968399746, + "eval_f1_macro": 3.688275739749409e-07, + "eval_f1_micro": 0.0036278234968399746, + "eval_f1_weighted": 2.899267973667565e-05, + "eval_loss": 9.575965881347656, + "eval_precision_macro": 1.851536383257086e-07, + "eval_precision_micro": 0.0036278234968399746, + "eval_precision_weighted": 1.4554497865233542e-05, + "eval_recall_macro": 4.6151006091932804e-05, + "eval_recall_micro": 0.0036278234968399746, + "eval_recall_weighted": 0.0036278234968399746, + "eval_runtime": 169.1745, + "eval_samples_per_second": 309.58, + "eval_steps_per_second": 9.676, + "step": 11140 + }, + { + "epoch": 1.0008976660682227, + "grad_norm": 6.855932235717773, + "learning_rate": 1.0005385996409337e-06, + "loss": 9.5159, + "step": 11150 + }, + { + "epoch": 1.0031418312387792, + "grad_norm": 6.5915327072143555, + "learning_rate": 1.00278276481149e-06, + "loss": 9.421, + "step": 11175 + }, + { + "epoch": 1.0053859964093357, + "grad_norm": 6.733715534210205, + "learning_rate": 1.0050269299820467e-06, + "loss": 9.5062, + "step": 11200 + }, + { + "epoch": 1.0076301615798924, + "grad_norm": 6.543283939361572, + "learning_rate": 1.0072710951526033e-06, + "loss": 9.4137, + "step": 11225 + }, + { + "epoch": 1.0098743267504489, + "grad_norm": 6.6234846115112305, + "learning_rate": 1.00951526032316e-06, + "loss": 9.4265, + "step": 11250 + }, + { + "epoch": 1.0121184919210053, + "grad_norm": 6.619158744812012, + "learning_rate": 1.0116696588868941e-06, + "loss": 9.5018, + "step": 11275 + }, + { + "epoch": 1.014362657091562, + "grad_norm": 6.625874996185303, + "learning_rate": 1.0139138240574508e-06, + "loss": 9.4057, + "step": 11300 + }, + { + "epoch": 1.0166068222621185, + "grad_norm": 6.660024166107178, + "learning_rate": 1.0161579892280072e-06, + "loss": 9.4391, + "step": 11325 + }, + { + "epoch": 1.018850987432675, + "grad_norm": 6.64752721786499, + "learning_rate": 1.0184021543985638e-06, + "loss": 9.4303, + "step": 11350 + }, + { + "epoch": 1.0210951526032317, + "grad_norm": 6.644859790802002, + "learning_rate": 1.0206463195691204e-06, + "loss": 9.4021, + "step": 11375 + }, + { + "epoch": 1.0233393177737882, + "grad_norm": 6.73024845123291, + "learning_rate": 1.0228904847396768e-06, + "loss": 9.4876, + "step": 11400 + }, + { + "epoch": 1.0255834829443446, + "grad_norm": 6.684884071350098, + "learning_rate": 1.0251346499102336e-06, + "loss": 9.4082, + "step": 11425 + }, + { + "epoch": 1.0278276481149013, + "grad_norm": 6.579803466796875, + "learning_rate": 1.02737881508079e-06, + "loss": 9.3798, + "step": 11450 + }, + { + "epoch": 1.0300718132854578, + "grad_norm": 6.818235874176025, + "learning_rate": 1.0296229802513466e-06, + "loss": 9.3946, + "step": 11475 + }, + { + "epoch": 1.0323159784560143, + "grad_norm": 6.609393119812012, + "learning_rate": 1.031867145421903e-06, + "loss": 9.4249, + "step": 11500 + }, + { + "epoch": 1.034560143626571, + "grad_norm": 6.752560138702393, + "learning_rate": 1.0341113105924597e-06, + "loss": 9.3993, + "step": 11525 + }, + { + "epoch": 1.0368043087971275, + "grad_norm": 6.647876739501953, + "learning_rate": 1.0363554757630163e-06, + "loss": 9.3816, + "step": 11550 + }, + { + "epoch": 1.039048473967684, + "grad_norm": 6.626875400543213, + "learning_rate": 1.0385996409335727e-06, + "loss": 9.3939, + "step": 11575 + }, + { + "epoch": 1.0412926391382407, + "grad_norm": 6.733286380767822, + "learning_rate": 1.0408438061041295e-06, + "loss": 9.3502, + "step": 11600 + }, + { + "epoch": 1.0435368043087971, + "grad_norm": 7.111309051513672, + "learning_rate": 1.043087971274686e-06, + "loss": 9.3699, + "step": 11625 + }, + { + "epoch": 1.0457809694793536, + "grad_norm": 6.565243721008301, + "learning_rate": 1.0453321364452425e-06, + "loss": 9.3854, + "step": 11650 + }, + { + "epoch": 1.0480251346499103, + "grad_norm": 6.684092044830322, + "learning_rate": 1.047576301615799e-06, + "loss": 9.4151, + "step": 11675 + }, + { + "epoch": 1.0502692998204668, + "grad_norm": 6.6230669021606445, + "learning_rate": 1.0498204667863555e-06, + "loss": 9.3976, + "step": 11700 + }, + { + "epoch": 1.0525134649910233, + "grad_norm": 6.699564456939697, + "learning_rate": 1.0520646319569122e-06, + "loss": 9.4085, + "step": 11725 + }, + { + "epoch": 1.05475763016158, + "grad_norm": 6.764492034912109, + "learning_rate": 1.0543087971274686e-06, + "loss": 9.3584, + "step": 11750 + }, + { + "epoch": 1.0570017953321365, + "grad_norm": 6.79047966003418, + "learning_rate": 1.0565529622980254e-06, + "loss": 9.3618, + "step": 11775 + }, + { + "epoch": 1.059245960502693, + "grad_norm": 6.831668853759766, + "learning_rate": 1.0587971274685818e-06, + "loss": 9.3996, + "step": 11800 + }, + { + "epoch": 1.0614901256732496, + "grad_norm": 6.625792026519775, + "learning_rate": 1.0610412926391384e-06, + "loss": 9.3954, + "step": 11825 + }, + { + "epoch": 1.063734290843806, + "grad_norm": 6.516231536865234, + "learning_rate": 1.0632854578096948e-06, + "loss": 9.396, + "step": 11850 + }, + { + "epoch": 1.0659784560143626, + "grad_norm": 6.703080177307129, + "learning_rate": 1.0655296229802514e-06, + "loss": 9.3191, + "step": 11875 + }, + { + "epoch": 1.0682226211849193, + "grad_norm": 6.582396507263184, + "learning_rate": 1.067773788150808e-06, + "loss": 9.3405, + "step": 11900 + }, + { + "epoch": 1.0704667863554758, + "grad_norm": 6.474344730377197, + "learning_rate": 1.0700179533213644e-06, + "loss": 9.3236, + "step": 11925 + }, + { + "epoch": 1.0727109515260322, + "grad_norm": 6.491992473602295, + "learning_rate": 1.072262118491921e-06, + "loss": 9.3177, + "step": 11950 + }, + { + "epoch": 1.074955116696589, + "grad_norm": 6.729642391204834, + "learning_rate": 1.0745062836624777e-06, + "loss": 9.3234, + "step": 11975 + }, + { + "epoch": 1.0771992818671454, + "grad_norm": 6.6687116622924805, + "learning_rate": 1.0767504488330343e-06, + "loss": 9.3649, + "step": 12000 + }, + { + "epoch": 1.079443447037702, + "grad_norm": 6.5983991622924805, + "learning_rate": 1.0789946140035907e-06, + "loss": 9.4177, + "step": 12025 + }, + { + "epoch": 1.0816876122082586, + "grad_norm": 6.712021827697754, + "learning_rate": 1.0812387791741473e-06, + "loss": 9.4009, + "step": 12050 + }, + { + "epoch": 1.083931777378815, + "grad_norm": 6.695026397705078, + "learning_rate": 1.083482944344704e-06, + "loss": 9.3464, + "step": 12075 + }, + { + "epoch": 1.0861759425493716, + "grad_norm": 6.67159366607666, + "learning_rate": 1.0857271095152603e-06, + "loss": 9.3276, + "step": 12100 + }, + { + "epoch": 1.0884201077199283, + "grad_norm": 6.658183574676514, + "learning_rate": 1.087971274685817e-06, + "loss": 9.3897, + "step": 12125 + }, + { + "epoch": 1.0906642728904847, + "grad_norm": 6.725042343139648, + "learning_rate": 1.0902154398563736e-06, + "loss": 9.3238, + "step": 12150 + }, + { + "epoch": 1.0929084380610412, + "grad_norm": 6.6801581382751465, + "learning_rate": 1.0924596050269302e-06, + "loss": 9.35, + "step": 12175 + }, + { + "epoch": 1.095152603231598, + "grad_norm": 6.431133270263672, + "learning_rate": 1.0947037701974866e-06, + "loss": 9.4199, + "step": 12200 + }, + { + "epoch": 1.0973967684021544, + "grad_norm": 6.7489118576049805, + "learning_rate": 1.0969479353680432e-06, + "loss": 9.3812, + "step": 12225 + }, + { + "epoch": 1.0996409335727109, + "grad_norm": 6.833267688751221, + "learning_rate": 1.0991921005385998e-06, + "loss": 9.3569, + "step": 12250 + }, + { + "epoch": 1.1018850987432676, + "grad_norm": 6.725024223327637, + "learning_rate": 1.1014362657091562e-06, + "loss": 9.3644, + "step": 12275 + }, + { + "epoch": 1.104129263913824, + "grad_norm": 6.703476428985596, + "learning_rate": 1.1036804308797128e-06, + "loss": 9.3913, + "step": 12300 + }, + { + "epoch": 1.1063734290843805, + "grad_norm": 6.6171040534973145, + "learning_rate": 1.1059245960502694e-06, + "loss": 9.3362, + "step": 12325 + }, + { + "epoch": 1.1086175942549372, + "grad_norm": 6.581846714019775, + "learning_rate": 1.108168761220826e-06, + "loss": 9.3379, + "step": 12350 + }, + { + "epoch": 1.1108617594254937, + "grad_norm": 6.598025321960449, + "learning_rate": 1.1104129263913825e-06, + "loss": 9.3778, + "step": 12375 + }, + { + "epoch": 1.1131059245960502, + "grad_norm": 6.618262767791748, + "learning_rate": 1.112657091561939e-06, + "loss": 9.3254, + "step": 12400 + }, + { + "epoch": 1.1153500897666069, + "grad_norm": 6.71485710144043, + "learning_rate": 1.1149012567324957e-06, + "loss": 9.3503, + "step": 12425 + }, + { + "epoch": 1.1175942549371634, + "grad_norm": 6.575069904327393, + "learning_rate": 1.117145421903052e-06, + "loss": 9.3147, + "step": 12450 + }, + { + "epoch": 1.1198384201077198, + "grad_norm": 6.95695161819458, + "learning_rate": 1.1193895870736087e-06, + "loss": 9.3117, + "step": 12475 + }, + { + "epoch": 1.1220825852782765, + "grad_norm": 6.843395709991455, + "learning_rate": 1.1216337522441653e-06, + "loss": 9.3383, + "step": 12500 + }, + { + "epoch": 1.124326750448833, + "grad_norm": 6.527547836303711, + "learning_rate": 1.123877917414722e-06, + "loss": 9.3176, + "step": 12525 + }, + { + "epoch": 1.1265709156193895, + "grad_norm": 6.659595489501953, + "learning_rate": 1.1261220825852783e-06, + "loss": 9.3842, + "step": 12550 + }, + { + "epoch": 1.1288150807899462, + "grad_norm": 6.998452186584473, + "learning_rate": 1.128366247755835e-06, + "loss": 9.3711, + "step": 12575 + }, + { + "epoch": 1.1310592459605027, + "grad_norm": 6.734482288360596, + "learning_rate": 1.1306104129263914e-06, + "loss": 9.1948, + "step": 12600 + }, + { + "epoch": 1.1333034111310591, + "grad_norm": 6.723465442657471, + "learning_rate": 1.132854578096948e-06, + "loss": 9.3652, + "step": 12625 + }, + { + "epoch": 1.1355475763016158, + "grad_norm": 6.954098224639893, + "learning_rate": 1.1350987432675046e-06, + "loss": 9.3091, + "step": 12650 + }, + { + "epoch": 1.1377917414721723, + "grad_norm": 6.6495466232299805, + "learning_rate": 1.1373429084380612e-06, + "loss": 9.3893, + "step": 12675 + }, + { + "epoch": 1.140035906642729, + "grad_norm": 6.562039375305176, + "learning_rate": 1.1395870736086178e-06, + "loss": 9.2924, + "step": 12700 + }, + { + "epoch": 1.1422800718132855, + "grad_norm": 6.832138538360596, + "learning_rate": 1.1418312387791742e-06, + "loss": 9.2676, + "step": 12725 + }, + { + "epoch": 1.144524236983842, + "grad_norm": 6.564228057861328, + "learning_rate": 1.1440754039497308e-06, + "loss": 9.3358, + "step": 12750 + }, + { + "epoch": 1.1467684021543985, + "grad_norm": 6.829017639160156, + "learning_rate": 1.1463195691202872e-06, + "loss": 9.3425, + "step": 12775 + }, + { + "epoch": 1.1490125673249552, + "grad_norm": 6.67156982421875, + "learning_rate": 1.1485637342908439e-06, + "loss": 9.3658, + "step": 12800 + }, + { + "epoch": 1.1512567324955116, + "grad_norm": 6.588474750518799, + "learning_rate": 1.1508078994614005e-06, + "loss": 9.2879, + "step": 12825 + }, + { + "epoch": 1.1535008976660683, + "grad_norm": 6.455601215362549, + "learning_rate": 1.153052064631957e-06, + "loss": 9.2183, + "step": 12850 + }, + { + "epoch": 1.1557450628366248, + "grad_norm": 6.843434810638428, + "learning_rate": 1.1552962298025137e-06, + "loss": 9.3897, + "step": 12875 + }, + { + "epoch": 1.1579892280071813, + "grad_norm": 6.5802531242370605, + "learning_rate": 1.15754039497307e-06, + "loss": 9.2954, + "step": 12900 + }, + { + "epoch": 1.1602333931777378, + "grad_norm": 6.572861671447754, + "learning_rate": 1.1597845601436267e-06, + "loss": 9.3526, + "step": 12925 + }, + { + "epoch": 1.1624775583482945, + "grad_norm": 6.596826553344727, + "learning_rate": 1.1620287253141831e-06, + "loss": 9.2918, + "step": 12950 + }, + { + "epoch": 1.164721723518851, + "grad_norm": 6.667306423187256, + "learning_rate": 1.1642728904847397e-06, + "loss": 9.3119, + "step": 12975 + }, + { + "epoch": 1.1669658886894076, + "grad_norm": 6.645683288574219, + "learning_rate": 1.1665170556552963e-06, + "loss": 9.3093, + "step": 13000 + }, + { + "epoch": 1.1692100538599641, + "grad_norm": 6.531824111938477, + "learning_rate": 1.168761220825853e-06, + "loss": 9.3109, + "step": 13025 + }, + { + "epoch": 1.1714542190305206, + "grad_norm": 6.587417125701904, + "learning_rate": 1.1710053859964096e-06, + "loss": 9.2032, + "step": 13050 + }, + { + "epoch": 1.173698384201077, + "grad_norm": 6.769504547119141, + "learning_rate": 1.173249551166966e-06, + "loss": 9.2856, + "step": 13075 + }, + { + "epoch": 1.1759425493716338, + "grad_norm": 6.784618377685547, + "learning_rate": 1.1754937163375226e-06, + "loss": 9.3543, + "step": 13100 + }, + { + "epoch": 1.1781867145421903, + "grad_norm": 6.659923553466797, + "learning_rate": 1.177737881508079e-06, + "loss": 9.2714, + "step": 13125 + }, + { + "epoch": 1.180430879712747, + "grad_norm": 6.788387775421143, + "learning_rate": 1.1799820466786356e-06, + "loss": 9.2867, + "step": 13150 + }, + { + "epoch": 1.1826750448833034, + "grad_norm": 6.671829700469971, + "learning_rate": 1.1822262118491922e-06, + "loss": 9.2664, + "step": 13175 + }, + { + "epoch": 1.18491921005386, + "grad_norm": 6.643823623657227, + "learning_rate": 1.1844703770197488e-06, + "loss": 9.2491, + "step": 13200 + }, + { + "epoch": 1.1871633752244164, + "grad_norm": 6.64363956451416, + "learning_rate": 1.1867145421903053e-06, + "loss": 9.2207, + "step": 13225 + }, + { + "epoch": 1.189407540394973, + "grad_norm": 6.821055889129639, + "learning_rate": 1.1889587073608619e-06, + "loss": 9.3006, + "step": 13250 + }, + { + "epoch": 1.1916517055655296, + "grad_norm": 6.697867393493652, + "learning_rate": 1.1912028725314185e-06, + "loss": 9.2723, + "step": 13275 + }, + { + "epoch": 1.1938958707360863, + "grad_norm": 6.696470260620117, + "learning_rate": 1.1934470377019749e-06, + "loss": 9.2979, + "step": 13300 + }, + { + "epoch": 1.1961400359066428, + "grad_norm": 6.649637699127197, + "learning_rate": 1.1956912028725315e-06, + "loss": 9.2166, + "step": 13325 + }, + { + "epoch": 1.1983842010771992, + "grad_norm": 6.663254261016846, + "learning_rate": 1.1979353680430881e-06, + "loss": 9.2734, + "step": 13350 + }, + { + "epoch": 1.200628366247756, + "grad_norm": 6.590174198150635, + "learning_rate": 1.2001795332136445e-06, + "loss": 9.2916, + "step": 13375 + }, + { + "epoch": 1.2028725314183124, + "grad_norm": 6.673224449157715, + "learning_rate": 1.2024236983842011e-06, + "loss": 9.258, + "step": 13400 + }, + { + "epoch": 1.2051166965888689, + "grad_norm": 6.704507350921631, + "learning_rate": 1.2046678635547577e-06, + "loss": 9.2233, + "step": 13425 + }, + { + "epoch": 1.2073608617594256, + "grad_norm": 6.713016510009766, + "learning_rate": 1.2069120287253144e-06, + "loss": 9.2774, + "step": 13450 + }, + { + "epoch": 1.209605026929982, + "grad_norm": 6.677687644958496, + "learning_rate": 1.2091561938958708e-06, + "loss": 9.2234, + "step": 13475 + }, + { + "epoch": 1.2118491921005385, + "grad_norm": 6.758347511291504, + "learning_rate": 1.2114003590664274e-06, + "loss": 9.1914, + "step": 13500 + }, + { + "epoch": 1.2140933572710952, + "grad_norm": 6.66787576675415, + "learning_rate": 1.213644524236984e-06, + "loss": 9.2324, + "step": 13525 + }, + { + "epoch": 1.2163375224416517, + "grad_norm": 6.864614963531494, + "learning_rate": 1.2158886894075404e-06, + "loss": 9.2852, + "step": 13550 + }, + { + "epoch": 1.2185816876122082, + "grad_norm": 6.670363903045654, + "learning_rate": 1.218132854578097e-06, + "loss": 9.2505, + "step": 13575 + }, + { + "epoch": 1.220825852782765, + "grad_norm": 6.69810152053833, + "learning_rate": 1.2203770197486536e-06, + "loss": 9.2128, + "step": 13600 + }, + { + "epoch": 1.2230700179533214, + "grad_norm": 6.790250301361084, + "learning_rate": 1.2226211849192102e-06, + "loss": 9.3151, + "step": 13625 + }, + { + "epoch": 1.2253141831238779, + "grad_norm": 6.583069324493408, + "learning_rate": 1.2248653500897666e-06, + "loss": 9.3069, + "step": 13650 + }, + { + "epoch": 1.2275583482944346, + "grad_norm": 6.747753143310547, + "learning_rate": 1.2271095152603233e-06, + "loss": 9.3383, + "step": 13675 + }, + { + "epoch": 1.229802513464991, + "grad_norm": 6.7486419677734375, + "learning_rate": 1.2293536804308797e-06, + "loss": 9.2918, + "step": 13700 + }, + { + "epoch": 1.2320466786355475, + "grad_norm": 6.991728782653809, + "learning_rate": 1.2315978456014363e-06, + "loss": 9.2755, + "step": 13725 + }, + { + "epoch": 1.2342908438061042, + "grad_norm": 6.601487636566162, + "learning_rate": 1.233842010771993e-06, + "loss": 9.1997, + "step": 13750 + }, + { + "epoch": 1.2365350089766607, + "grad_norm": 6.819201946258545, + "learning_rate": 1.2360861759425495e-06, + "loss": 9.2522, + "step": 13775 + }, + { + "epoch": 1.2387791741472172, + "grad_norm": 6.712340354919434, + "learning_rate": 1.2383303411131061e-06, + "loss": 9.2146, + "step": 13800 + }, + { + "epoch": 1.2410233393177739, + "grad_norm": 6.709385871887207, + "learning_rate": 1.2405745062836625e-06, + "loss": 9.2521, + "step": 13825 + }, + { + "epoch": 1.2432675044883303, + "grad_norm": 6.615209579467773, + "learning_rate": 1.242728904847397e-06, + "loss": 9.2543, + "step": 13850 + }, + { + "epoch": 1.2455116696588868, + "grad_norm": 6.688724040985107, + "learning_rate": 1.2449730700179534e-06, + "loss": 9.2342, + "step": 13875 + }, + { + "epoch": 1.2477558348294435, + "grad_norm": 6.732059955596924, + "learning_rate": 1.24721723518851e-06, + "loss": 9.3339, + "step": 13900 + }, + { + "epoch": 1.25, + "grad_norm": 6.691347599029541, + "learning_rate": 1.2494614003590664e-06, + "loss": 9.265, + "step": 13925 + }, + { + "epoch": 1.2522441651705565, + "grad_norm": 6.805498123168945, + "learning_rate": 1.2517055655296232e-06, + "loss": 9.2536, + "step": 13950 + }, + { + "epoch": 1.2544883303411132, + "grad_norm": 6.618569850921631, + "learning_rate": 1.2539497307001796e-06, + "loss": 9.2416, + "step": 13975 + }, + { + "epoch": 1.2567324955116697, + "grad_norm": 6.7224297523498535, + "learning_rate": 1.2561938958707362e-06, + "loss": 9.3188, + "step": 14000 + }, + { + "epoch": 1.2589766606822261, + "grad_norm": 6.951136112213135, + "learning_rate": 1.2584380610412926e-06, + "loss": 9.1709, + "step": 14025 + }, + { + "epoch": 1.2612208258527828, + "grad_norm": 6.6700825691223145, + "learning_rate": 1.2606822262118493e-06, + "loss": 9.2742, + "step": 14050 + }, + { + "epoch": 1.2634649910233393, + "grad_norm": 6.780479431152344, + "learning_rate": 1.2629263913824059e-06, + "loss": 9.2209, + "step": 14075 + }, + { + "epoch": 1.2657091561938958, + "grad_norm": 7.106590747833252, + "learning_rate": 1.2651705565529623e-06, + "loss": 9.204, + "step": 14100 + }, + { + "epoch": 1.2679533213644525, + "grad_norm": 6.673321723937988, + "learning_rate": 1.2674147217235189e-06, + "loss": 9.2455, + "step": 14125 + }, + { + "epoch": 1.270197486535009, + "grad_norm": 6.786211967468262, + "learning_rate": 1.2696588868940755e-06, + "loss": 9.2486, + "step": 14150 + }, + { + "epoch": 1.2724416517055657, + "grad_norm": 6.831704139709473, + "learning_rate": 1.271903052064632e-06, + "loss": 9.2941, + "step": 14175 + }, + { + "epoch": 1.2746858168761221, + "grad_norm": 6.557703018188477, + "learning_rate": 1.2741472172351887e-06, + "loss": 9.2006, + "step": 14200 + }, + { + "epoch": 1.2769299820466786, + "grad_norm": 6.674654483795166, + "learning_rate": 1.2763913824057453e-06, + "loss": 9.3435, + "step": 14225 + }, + { + "epoch": 1.279174147217235, + "grad_norm": 7.0924482345581055, + "learning_rate": 1.2786355475763017e-06, + "loss": 9.2277, + "step": 14250 + }, + { + "epoch": 1.2814183123877918, + "grad_norm": 6.713856220245361, + "learning_rate": 1.2808797127468584e-06, + "loss": 9.2047, + "step": 14275 + }, + { + "epoch": 1.2836624775583483, + "grad_norm": 6.923978328704834, + "learning_rate": 1.2830341113105926e-06, + "loss": 9.2775, + "step": 14300 + }, + { + "epoch": 1.285906642728905, + "grad_norm": 6.630103588104248, + "learning_rate": 1.2852782764811492e-06, + "loss": 9.2428, + "step": 14325 + }, + { + "epoch": 1.2881508078994615, + "grad_norm": 6.848647594451904, + "learning_rate": 1.2875224416517056e-06, + "loss": 9.3011, + "step": 14350 + }, + { + "epoch": 1.290394973070018, + "grad_norm": 6.684213638305664, + "learning_rate": 1.2897666068222622e-06, + "loss": 9.257, + "step": 14375 + }, + { + "epoch": 1.2926391382405744, + "grad_norm": 6.7741851806640625, + "learning_rate": 1.2920107719928188e-06, + "loss": 9.2049, + "step": 14400 + }, + { + "epoch": 1.2948833034111311, + "grad_norm": 6.737053871154785, + "learning_rate": 1.2942549371633752e-06, + "loss": 9.2251, + "step": 14425 + }, + { + "epoch": 1.2971274685816876, + "grad_norm": 6.776785850524902, + "learning_rate": 1.2964991023339319e-06, + "loss": 9.2256, + "step": 14450 + }, + { + "epoch": 1.2993716337522443, + "grad_norm": 6.832870006561279, + "learning_rate": 1.2987432675044885e-06, + "loss": 9.2519, + "step": 14475 + }, + { + "epoch": 1.3016157989228008, + "grad_norm": 6.8159308433532715, + "learning_rate": 1.3009874326750449e-06, + "loss": 9.2137, + "step": 14500 + }, + { + "epoch": 1.3038599640933572, + "grad_norm": 6.7588605880737305, + "learning_rate": 1.3032315978456015e-06, + "loss": 9.2684, + "step": 14525 + }, + { + "epoch": 1.3061041292639137, + "grad_norm": 6.784746170043945, + "learning_rate": 1.3054757630161579e-06, + "loss": 9.1565, + "step": 14550 + }, + { + "epoch": 1.3083482944344704, + "grad_norm": 6.786811828613281, + "learning_rate": 1.3077199281867147e-06, + "loss": 9.2759, + "step": 14575 + }, + { + "epoch": 1.310592459605027, + "grad_norm": 6.794554233551025, + "learning_rate": 1.3099640933572713e-06, + "loss": 9.2511, + "step": 14600 + }, + { + "epoch": 1.3128366247755836, + "grad_norm": 6.677416801452637, + "learning_rate": 1.312208258527828e-06, + "loss": 9.2635, + "step": 14625 + }, + { + "epoch": 1.31508078994614, + "grad_norm": 6.827773094177246, + "learning_rate": 1.3144524236983844e-06, + "loss": 9.2677, + "step": 14650 + }, + { + "epoch": 1.3173249551166966, + "grad_norm": 6.725936412811279, + "learning_rate": 1.316696588868941e-06, + "loss": 9.2424, + "step": 14675 + }, + { + "epoch": 1.319569120287253, + "grad_norm": 6.7054643630981445, + "learning_rate": 1.3189407540394974e-06, + "loss": 9.188, + "step": 14700 + }, + { + "epoch": 1.3218132854578097, + "grad_norm": 6.975854873657227, + "learning_rate": 1.321184919210054e-06, + "loss": 9.2743, + "step": 14725 + }, + { + "epoch": 1.3240574506283662, + "grad_norm": 6.813758850097656, + "learning_rate": 1.3234290843806106e-06, + "loss": 9.281, + "step": 14750 + }, + { + "epoch": 1.326301615798923, + "grad_norm": 6.7720441818237305, + "learning_rate": 1.325673249551167e-06, + "loss": 9.1975, + "step": 14775 + }, + { + "epoch": 1.3285457809694794, + "grad_norm": 6.928959369659424, + "learning_rate": 1.3279174147217236e-06, + "loss": 9.2106, + "step": 14800 + }, + { + "epoch": 1.3307899461400359, + "grad_norm": 6.692811489105225, + "learning_rate": 1.33016157989228e-06, + "loss": 9.1649, + "step": 14825 + }, + { + "epoch": 1.3330341113105924, + "grad_norm": 7.046484470367432, + "learning_rate": 1.3324057450628366e-06, + "loss": 9.0425, + "step": 14850 + }, + { + "epoch": 1.335278276481149, + "grad_norm": 6.737009048461914, + "learning_rate": 1.3346499102333933e-06, + "loss": 9.251, + "step": 14875 + }, + { + "epoch": 1.3375224416517055, + "grad_norm": 7.286160945892334, + "learning_rate": 1.3368940754039497e-06, + "loss": 9.2639, + "step": 14900 + }, + { + "epoch": 1.3397666068222622, + "grad_norm": 6.654364585876465, + "learning_rate": 1.3391382405745065e-06, + "loss": 9.31, + "step": 14925 + }, + { + "epoch": 1.3420107719928187, + "grad_norm": 6.658660411834717, + "learning_rate": 1.341382405745063e-06, + "loss": 9.1945, + "step": 14950 + }, + { + "epoch": 1.3442549371633752, + "grad_norm": 6.696970462799072, + "learning_rate": 1.3436265709156195e-06, + "loss": 9.207, + "step": 14975 + }, + { + "epoch": 1.3464991023339317, + "grad_norm": 7.181731224060059, + "learning_rate": 1.3458707360861761e-06, + "loss": 9.207, + "step": 15000 + }, + { + "epoch": 1.3487432675044884, + "grad_norm": 6.768177032470703, + "learning_rate": 1.3481149012567327e-06, + "loss": 9.2022, + "step": 15025 + }, + { + "epoch": 1.3509874326750448, + "grad_norm": 7.174690246582031, + "learning_rate": 1.3503590664272891e-06, + "loss": 9.158, + "step": 15050 + }, + { + "epoch": 1.3532315978456015, + "grad_norm": 6.9935126304626465, + "learning_rate": 1.3526032315978458e-06, + "loss": 9.1351, + "step": 15075 + }, + { + "epoch": 1.355475763016158, + "grad_norm": 6.937646865844727, + "learning_rate": 1.3548473967684024e-06, + "loss": 9.1847, + "step": 15100 + }, + { + "epoch": 1.3577199281867145, + "grad_norm": 6.700376987457275, + "learning_rate": 1.3570915619389588e-06, + "loss": 9.2746, + "step": 15125 + }, + { + "epoch": 1.359964093357271, + "grad_norm": 7.399362564086914, + "learning_rate": 1.3593357271095154e-06, + "loss": 9.2682, + "step": 15150 + }, + { + "epoch": 1.3622082585278277, + "grad_norm": 6.860337734222412, + "learning_rate": 1.3615798922800718e-06, + "loss": 9.1992, + "step": 15175 + }, + { + "epoch": 1.3644524236983842, + "grad_norm": 6.66388463973999, + "learning_rate": 1.3638240574506284e-06, + "loss": 9.1776, + "step": 15200 + }, + { + "epoch": 1.3666965888689409, + "grad_norm": 6.673814296722412, + "learning_rate": 1.366068222621185e-06, + "loss": 9.1906, + "step": 15225 + }, + { + "epoch": 1.3689407540394973, + "grad_norm": 6.695856094360352, + "learning_rate": 1.3683123877917414e-06, + "loss": 9.1469, + "step": 15250 + }, + { + "epoch": 1.3711849192100538, + "grad_norm": 6.884016513824463, + "learning_rate": 1.370556552962298e-06, + "loss": 9.1008, + "step": 15275 + }, + { + "epoch": 1.3734290843806103, + "grad_norm": 6.944136142730713, + "learning_rate": 1.3728007181328549e-06, + "loss": 9.1787, + "step": 15300 + }, + { + "epoch": 1.375673249551167, + "grad_norm": 6.792248725891113, + "learning_rate": 1.3750448833034113e-06, + "loss": 9.183, + "step": 15325 + }, + { + "epoch": 1.3779174147217235, + "grad_norm": 6.7610249519348145, + "learning_rate": 1.3772890484739679e-06, + "loss": 9.1232, + "step": 15350 + }, + { + "epoch": 1.3801615798922802, + "grad_norm": 6.767807960510254, + "learning_rate": 1.3795332136445245e-06, + "loss": 9.2733, + "step": 15375 + }, + { + "epoch": 1.3824057450628366, + "grad_norm": 6.750069618225098, + "learning_rate": 1.381777378815081e-06, + "loss": 9.2063, + "step": 15400 + }, + { + "epoch": 1.3846499102333931, + "grad_norm": 6.793094635009766, + "learning_rate": 1.3840215439856375e-06, + "loss": 9.1231, + "step": 15425 + }, + { + "epoch": 1.3868940754039496, + "grad_norm": 6.7230305671691895, + "learning_rate": 1.386265709156194e-06, + "loss": 9.2548, + "step": 15450 + }, + { + "epoch": 1.3891382405745063, + "grad_norm": 6.745090961456299, + "learning_rate": 1.3885098743267505e-06, + "loss": 9.1361, + "step": 15475 + }, + { + "epoch": 1.3913824057450628, + "grad_norm": 6.730828762054443, + "learning_rate": 1.3907540394973071e-06, + "loss": 9.1422, + "step": 15500 + }, + { + "epoch": 1.3936265709156195, + "grad_norm": 6.795992851257324, + "learning_rate": 1.3929982046678636e-06, + "loss": 9.2281, + "step": 15525 + }, + { + "epoch": 1.395870736086176, + "grad_norm": 7.0249176025390625, + "learning_rate": 1.3952423698384202e-06, + "loss": 9.1943, + "step": 15550 + }, + { + "epoch": 1.3981149012567324, + "grad_norm": 7.3849568367004395, + "learning_rate": 1.3974865350089768e-06, + "loss": 9.0389, + "step": 15575 + }, + { + "epoch": 1.400359066427289, + "grad_norm": 6.960760593414307, + "learning_rate": 1.3997307001795332e-06, + "loss": 9.2601, + "step": 15600 + }, + { + "epoch": 1.4026032315978456, + "grad_norm": 6.915163040161133, + "learning_rate": 1.4019748653500898e-06, + "loss": 9.1862, + "step": 15625 + }, + { + "epoch": 1.404847396768402, + "grad_norm": 6.6947221755981445, + "learning_rate": 1.4042190305206466e-06, + "loss": 9.1065, + "step": 15650 + }, + { + "epoch": 1.4070915619389588, + "grad_norm": 6.749163627624512, + "learning_rate": 1.406463195691203e-06, + "loss": 9.1513, + "step": 15675 + }, + { + "epoch": 1.4093357271095153, + "grad_norm": 6.7271833419799805, + "learning_rate": 1.4087073608617596e-06, + "loss": 9.1579, + "step": 15700 + }, + { + "epoch": 1.4115798922800717, + "grad_norm": 7.038432598114014, + "learning_rate": 1.4109515260323163e-06, + "loss": 9.2418, + "step": 15725 + }, + { + "epoch": 1.4138240574506284, + "grad_norm": 6.927584648132324, + "learning_rate": 1.4131956912028727e-06, + "loss": 9.2484, + "step": 15750 + }, + { + "epoch": 1.416068222621185, + "grad_norm": 6.702296257019043, + "learning_rate": 1.4154398563734293e-06, + "loss": 9.1257, + "step": 15775 + }, + { + "epoch": 1.4183123877917414, + "grad_norm": 7.036154270172119, + "learning_rate": 1.4176840215439857e-06, + "loss": 9.2171, + "step": 15800 + }, + { + "epoch": 1.420556552962298, + "grad_norm": 6.7869062423706055, + "learning_rate": 1.4199281867145423e-06, + "loss": 9.1142, + "step": 15825 + }, + { + "epoch": 1.4228007181328546, + "grad_norm": 6.695822238922119, + "learning_rate": 1.422172351885099e-06, + "loss": 9.0627, + "step": 15850 + }, + { + "epoch": 1.425044883303411, + "grad_norm": 7.038686275482178, + "learning_rate": 1.4244165170556553e-06, + "loss": 9.1507, + "step": 15875 + }, + { + "epoch": 1.4272890484739678, + "grad_norm": 6.965195655822754, + "learning_rate": 1.426660682226212e-06, + "loss": 9.2031, + "step": 15900 + }, + { + "epoch": 1.4295332136445242, + "grad_norm": 6.840915679931641, + "learning_rate": 1.4289048473967683e-06, + "loss": 9.2074, + "step": 15925 + }, + { + "epoch": 1.4317773788150807, + "grad_norm": 6.70405912399292, + "learning_rate": 1.431149012567325e-06, + "loss": 9.1239, + "step": 15950 + }, + { + "epoch": 1.4340215439856374, + "grad_norm": 6.957409381866455, + "learning_rate": 1.4333931777378816e-06, + "loss": 9.1638, + "step": 15975 + }, + { + "epoch": 1.436265709156194, + "grad_norm": 7.085066795349121, + "learning_rate": 1.4356373429084384e-06, + "loss": 9.1769, + "step": 16000 + }, + { + "epoch": 1.4385098743267504, + "grad_norm": 6.870354652404785, + "learning_rate": 1.4378815080789948e-06, + "loss": 9.2726, + "step": 16025 + }, + { + "epoch": 1.440754039497307, + "grad_norm": 7.224937438964844, + "learning_rate": 1.4401256732495514e-06, + "loss": 9.2056, + "step": 16050 + }, + { + "epoch": 1.4429982046678635, + "grad_norm": 6.879683017730713, + "learning_rate": 1.4423698384201078e-06, + "loss": 9.0653, + "step": 16075 + }, + { + "epoch": 1.44524236983842, + "grad_norm": 7.403330326080322, + "learning_rate": 1.4446140035906644e-06, + "loss": 9.1226, + "step": 16100 + }, + { + "epoch": 1.4474865350089767, + "grad_norm": 6.758444309234619, + "learning_rate": 1.446858168761221e-06, + "loss": 9.1764, + "step": 16125 + }, + { + "epoch": 1.4497307001795332, + "grad_norm": 6.8561224937438965, + "learning_rate": 1.4491023339317774e-06, + "loss": 9.0817, + "step": 16150 + }, + { + "epoch": 1.4519748653500897, + "grad_norm": 6.745090484619141, + "learning_rate": 1.451346499102334e-06, + "loss": 9.0744, + "step": 16175 + }, + { + "epoch": 1.4542190305206464, + "grad_norm": 7.045586585998535, + "learning_rate": 1.4535906642728907e-06, + "loss": 9.2281, + "step": 16200 + }, + { + "epoch": 1.4564631956912029, + "grad_norm": 6.925437927246094, + "learning_rate": 1.455834829443447e-06, + "loss": 9.2313, + "step": 16225 + }, + { + "epoch": 1.4587073608617596, + "grad_norm": 6.943007469177246, + "learning_rate": 1.4580789946140037e-06, + "loss": 9.141, + "step": 16250 + }, + { + "epoch": 1.460951526032316, + "grad_norm": 7.1228928565979, + "learning_rate": 1.46032315978456e-06, + "loss": 9.2586, + "step": 16275 + }, + { + "epoch": 1.4631956912028725, + "grad_norm": 6.866269111633301, + "learning_rate": 1.4625673249551167e-06, + "loss": 9.2149, + "step": 16300 + }, + { + "epoch": 1.465439856373429, + "grad_norm": 6.759486198425293, + "learning_rate": 1.4648114901256733e-06, + "loss": 9.2308, + "step": 16325 + }, + { + "epoch": 1.4676840215439857, + "grad_norm": 6.844733715057373, + "learning_rate": 1.4670556552962297e-06, + "loss": 9.1001, + "step": 16350 + }, + { + "epoch": 1.4699281867145422, + "grad_norm": 6.991299629211426, + "learning_rate": 1.4692998204667866e-06, + "loss": 9.2698, + "step": 16375 + }, + { + "epoch": 1.4721723518850989, + "grad_norm": 7.0193305015563965, + "learning_rate": 1.4715439856373432e-06, + "loss": 9.1432, + "step": 16400 + }, + { + "epoch": 1.4744165170556554, + "grad_norm": 6.861202239990234, + "learning_rate": 1.4737881508078996e-06, + "loss": 9.1232, + "step": 16425 + }, + { + "epoch": 1.4766606822262118, + "grad_norm": 6.838179588317871, + "learning_rate": 1.4760323159784562e-06, + "loss": 9.1691, + "step": 16450 + }, + { + "epoch": 1.4789048473967683, + "grad_norm": 7.073694705963135, + "learning_rate": 1.4782764811490128e-06, + "loss": 9.1259, + "step": 16475 + }, + { + "epoch": 1.481149012567325, + "grad_norm": 7.145512580871582, + "learning_rate": 1.4805206463195692e-06, + "loss": 9.1298, + "step": 16500 + }, + { + "epoch": 1.4833931777378815, + "grad_norm": 6.9032883644104, + "learning_rate": 1.4827648114901258e-06, + "loss": 9.1707, + "step": 16525 + }, + { + "epoch": 1.4856373429084382, + "grad_norm": 6.798129558563232, + "learning_rate": 1.4850089766606822e-06, + "loss": 9.0581, + "step": 16550 + }, + { + "epoch": 1.4878815080789947, + "grad_norm": 6.956537246704102, + "learning_rate": 1.4872531418312388e-06, + "loss": 9.1561, + "step": 16575 + }, + { + "epoch": 1.4901256732495511, + "grad_norm": 6.775937080383301, + "learning_rate": 1.4894973070017955e-06, + "loss": 9.2438, + "step": 16600 + }, + { + "epoch": 1.4923698384201076, + "grad_norm": 7.881014347076416, + "learning_rate": 1.4917414721723519e-06, + "loss": 9.1854, + "step": 16625 + }, + { + "epoch": 1.4946140035906643, + "grad_norm": 6.877419471740723, + "learning_rate": 1.4939856373429085e-06, + "loss": 9.1591, + "step": 16650 + }, + { + "epoch": 1.4968581687612208, + "grad_norm": 6.831208229064941, + "learning_rate": 1.496229802513465e-06, + "loss": 9.0855, + "step": 16675 + }, + { + "epoch": 1.4991023339317775, + "grad_norm": 7.217222213745117, + "learning_rate": 1.4984739676840215e-06, + "loss": 9.155, + "step": 16700 + }, + { + "epoch": 1.501346499102334, + "grad_norm": 6.781536102294922, + "learning_rate": 1.5007181328545783e-06, + "loss": 9.117, + "step": 16725 + }, + { + "epoch": 1.5035906642728905, + "grad_norm": 7.1302103996276855, + "learning_rate": 1.502962298025135e-06, + "loss": 9.2171, + "step": 16750 + }, + { + "epoch": 1.505834829443447, + "grad_norm": 7.111241340637207, + "learning_rate": 1.5052064631956913e-06, + "loss": 9.0963, + "step": 16775 + }, + { + "epoch": 1.5080789946140036, + "grad_norm": 7.4777140617370605, + "learning_rate": 1.507450628366248e-06, + "loss": 9.1631, + "step": 16800 + }, + { + "epoch": 1.51032315978456, + "grad_norm": 7.035587310791016, + "learning_rate": 1.5096947935368046e-06, + "loss": 9.1863, + "step": 16825 + }, + { + "epoch": 1.5125673249551168, + "grad_norm": 7.688345909118652, + "learning_rate": 1.511938958707361e-06, + "loss": 9.1453, + "step": 16850 + }, + { + "epoch": 1.5148114901256733, + "grad_norm": 7.140257835388184, + "learning_rate": 1.5141831238779176e-06, + "loss": 9.0673, + "step": 16875 + }, + { + "epoch": 1.5170556552962298, + "grad_norm": 7.144179344177246, + "learning_rate": 1.516427289048474e-06, + "loss": 9.0738, + "step": 16900 + }, + { + "epoch": 1.5192998204667862, + "grad_norm": 7.797633171081543, + "learning_rate": 1.5186714542190306e-06, + "loss": 9.2341, + "step": 16925 + }, + { + "epoch": 1.521543985637343, + "grad_norm": 7.116386890411377, + "learning_rate": 1.5209156193895872e-06, + "loss": 9.0829, + "step": 16950 + }, + { + "epoch": 1.5237881508078994, + "grad_norm": 6.921915531158447, + "learning_rate": 1.5231597845601436e-06, + "loss": 9.1386, + "step": 16975 + }, + { + "epoch": 1.5260323159784561, + "grad_norm": 7.2082977294921875, + "learning_rate": 1.525314183123878e-06, + "loss": 9.1321, + "step": 17000 + }, + { + "epoch": 1.5282764811490126, + "grad_norm": 7.269525051116943, + "learning_rate": 1.5275583482944345e-06, + "loss": 9.1381, + "step": 17025 + }, + { + "epoch": 1.530520646319569, + "grad_norm": 7.292977333068848, + "learning_rate": 1.529802513464991e-06, + "loss": 9.0976, + "step": 17050 + }, + { + "epoch": 1.5327648114901256, + "grad_norm": 8.262571334838867, + "learning_rate": 1.5320466786355475e-06, + "loss": 9.086, + "step": 17075 + }, + { + "epoch": 1.5350089766606823, + "grad_norm": 7.157303333282471, + "learning_rate": 1.5342908438061043e-06, + "loss": 9.1012, + "step": 17100 + }, + { + "epoch": 1.5372531418312387, + "grad_norm": 8.166177749633789, + "learning_rate": 1.536535008976661e-06, + "loss": 9.1696, + "step": 17125 + }, + { + "epoch": 1.5394973070017954, + "grad_norm": 6.989890098571777, + "learning_rate": 1.5387791741472175e-06, + "loss": 9.1843, + "step": 17150 + }, + { + "epoch": 1.541741472172352, + "grad_norm": 7.38519287109375, + "learning_rate": 1.541023339317774e-06, + "loss": 9.136, + "step": 17175 + }, + { + "epoch": 1.5439856373429084, + "grad_norm": 7.2663798332214355, + "learning_rate": 1.5432675044883306e-06, + "loss": 9.0079, + "step": 17200 + }, + { + "epoch": 1.5462298025134649, + "grad_norm": 7.167383193969727, + "learning_rate": 1.545511669658887e-06, + "loss": 9.0868, + "step": 17225 + }, + { + "epoch": 1.5484739676840216, + "grad_norm": 7.3597846031188965, + "learning_rate": 1.5477558348294436e-06, + "loss": 9.0755, + "step": 17250 + }, + { + "epoch": 1.550718132854578, + "grad_norm": 7.1178202629089355, + "learning_rate": 1.5500000000000002e-06, + "loss": 9.1726, + "step": 17275 + }, + { + "epoch": 1.5529622980251347, + "grad_norm": 8.176761627197266, + "learning_rate": 1.5522441651705566e-06, + "loss": 9.1722, + "step": 17300 + }, + { + "epoch": 1.5552064631956912, + "grad_norm": 7.130929946899414, + "learning_rate": 1.5544883303411132e-06, + "loss": 9.1042, + "step": 17325 + }, + { + "epoch": 1.5574506283662477, + "grad_norm": 8.494580268859863, + "learning_rate": 1.5567324955116696e-06, + "loss": 9.095, + "step": 17350 + }, + { + "epoch": 1.5596947935368042, + "grad_norm": 7.1536455154418945, + "learning_rate": 1.5589766606822262e-06, + "loss": 9.1798, + "step": 17375 + }, + { + "epoch": 1.5619389587073609, + "grad_norm": 7.011881351470947, + "learning_rate": 1.5612208258527828e-06, + "loss": 9.1363, + "step": 17400 + }, + { + "epoch": 1.5641831238779176, + "grad_norm": 7.863648414611816, + "learning_rate": 1.5634649910233392e-06, + "loss": 9.1411, + "step": 17425 + }, + { + "epoch": 1.566427289048474, + "grad_norm": 7.2745747566223145, + "learning_rate": 1.565709156193896e-06, + "loss": 9.1469, + "step": 17450 + }, + { + "epoch": 1.5686714542190305, + "grad_norm": 8.812067985534668, + "learning_rate": 1.5679533213644527e-06, + "loss": 9.2349, + "step": 17475 + }, + { + "epoch": 1.570915619389587, + "grad_norm": 7.326807022094727, + "learning_rate": 1.570197486535009e-06, + "loss": 9.0204, + "step": 17500 + }, + { + "epoch": 1.5731597845601435, + "grad_norm": 8.075957298278809, + "learning_rate": 1.5724416517055657e-06, + "loss": 9.0742, + "step": 17525 + }, + { + "epoch": 1.5754039497307002, + "grad_norm": 7.171205997467041, + "learning_rate": 1.5746858168761223e-06, + "loss": 9.0406, + "step": 17550 + }, + { + "epoch": 1.577648114901257, + "grad_norm": 7.672084808349609, + "learning_rate": 1.5769299820466787e-06, + "loss": 9.2087, + "step": 17575 + }, + { + "epoch": 1.5798922800718134, + "grad_norm": 6.952700614929199, + "learning_rate": 1.5791741472172353e-06, + "loss": 9.1774, + "step": 17600 + }, + { + "epoch": 1.5821364452423698, + "grad_norm": 7.422615051269531, + "learning_rate": 1.581418312387792e-06, + "loss": 9.1305, + "step": 17625 + }, + { + "epoch": 1.5843806104129263, + "grad_norm": 7.115475177764893, + "learning_rate": 1.5836624775583484e-06, + "loss": 8.9769, + "step": 17650 + }, + { + "epoch": 1.5866247755834828, + "grad_norm": 7.72716760635376, + "learning_rate": 1.585906642728905e-06, + "loss": 9.0958, + "step": 17675 + }, + { + "epoch": 1.5888689407540395, + "grad_norm": 6.884526252746582, + "learning_rate": 1.5881508078994614e-06, + "loss": 9.0868, + "step": 17700 + }, + { + "epoch": 1.5911131059245962, + "grad_norm": 6.856712341308594, + "learning_rate": 1.590394973070018e-06, + "loss": 9.0567, + "step": 17725 + }, + { + "epoch": 1.5933572710951527, + "grad_norm": 7.402328968048096, + "learning_rate": 1.5926391382405746e-06, + "loss": 9.1582, + "step": 17750 + }, + { + "epoch": 1.5956014362657092, + "grad_norm": 7.359009742736816, + "learning_rate": 1.594883303411131e-06, + "loss": 9.1636, + "step": 17775 + }, + { + "epoch": 1.5978456014362656, + "grad_norm": 7.108728885650635, + "learning_rate": 1.5971274685816878e-06, + "loss": 9.1183, + "step": 17800 + }, + { + "epoch": 1.6000897666068221, + "grad_norm": 6.927455425262451, + "learning_rate": 1.5993716337522445e-06, + "loss": 9.0502, + "step": 17825 + }, + { + "epoch": 1.6023339317773788, + "grad_norm": 6.953423023223877, + "learning_rate": 1.6016157989228009e-06, + "loss": 9.1397, + "step": 17850 + }, + { + "epoch": 1.6045780969479355, + "grad_norm": 6.904997825622559, + "learning_rate": 1.6038599640933575e-06, + "loss": 9.1033, + "step": 17875 + }, + { + "epoch": 1.606822262118492, + "grad_norm": 7.867033958435059, + "learning_rate": 1.606104129263914e-06, + "loss": 9.1194, + "step": 17900 + }, + { + "epoch": 1.6090664272890485, + "grad_norm": 7.304041862487793, + "learning_rate": 1.6083482944344705e-06, + "loss": 9.15, + "step": 17925 + }, + { + "epoch": 1.611310592459605, + "grad_norm": 6.849384784698486, + "learning_rate": 1.6105924596050271e-06, + "loss": 9.1029, + "step": 17950 + }, + { + "epoch": 1.6135547576301614, + "grad_norm": 7.167654991149902, + "learning_rate": 1.6128366247755835e-06, + "loss": 9.2295, + "step": 17975 + }, + { + "epoch": 1.6157989228007181, + "grad_norm": 7.664891719818115, + "learning_rate": 1.6150807899461401e-06, + "loss": 9.0903, + "step": 18000 + }, + { + "epoch": 1.6180430879712748, + "grad_norm": 9.109430313110352, + "learning_rate": 1.6173249551166967e-06, + "loss": 9.0286, + "step": 18025 + }, + { + "epoch": 1.6202872531418313, + "grad_norm": 7.463655948638916, + "learning_rate": 1.6195691202872531e-06, + "loss": 9.1194, + "step": 18050 + }, + { + "epoch": 1.6225314183123878, + "grad_norm": 6.884872913360596, + "learning_rate": 1.6218132854578098e-06, + "loss": 9.0124, + "step": 18075 + }, + { + "epoch": 1.6247755834829443, + "grad_norm": 7.6332173347473145, + "learning_rate": 1.6240574506283664e-06, + "loss": 9.0224, + "step": 18100 + }, + { + "epoch": 1.6270197486535007, + "grad_norm": 7.3337626457214355, + "learning_rate": 1.6263016157989228e-06, + "loss": 9.0495, + "step": 18125 + }, + { + "epoch": 1.6292639138240574, + "grad_norm": 6.854399681091309, + "learning_rate": 1.6285457809694794e-06, + "loss": 8.9964, + "step": 18150 + }, + { + "epoch": 1.6315080789946141, + "grad_norm": 6.828502178192139, + "learning_rate": 1.6307899461400362e-06, + "loss": 9.1474, + "step": 18175 + }, + { + "epoch": 1.6337522441651706, + "grad_norm": 7.385233402252197, + "learning_rate": 1.6330341113105926e-06, + "loss": 9.1314, + "step": 18200 + }, + { + "epoch": 1.635996409335727, + "grad_norm": 7.449166297912598, + "learning_rate": 1.6352782764811492e-06, + "loss": 9.0901, + "step": 18225 + }, + { + "epoch": 1.6382405745062836, + "grad_norm": 7.079118728637695, + "learning_rate": 1.6375224416517059e-06, + "loss": 9.1155, + "step": 18250 + }, + { + "epoch": 1.64048473967684, + "grad_norm": 6.931335926055908, + "learning_rate": 1.6397666068222623e-06, + "loss": 9.1125, + "step": 18275 + }, + { + "epoch": 1.6427289048473968, + "grad_norm": 8.835485458374023, + "learning_rate": 1.6420107719928189e-06, + "loss": 9.1213, + "step": 18300 + }, + { + "epoch": 1.6449730700179535, + "grad_norm": 7.15498161315918, + "learning_rate": 1.6442549371633753e-06, + "loss": 9.2129, + "step": 18325 + }, + { + "epoch": 1.64721723518851, + "grad_norm": 6.941155433654785, + "learning_rate": 1.6464991023339319e-06, + "loss": 8.9941, + "step": 18350 + }, + { + "epoch": 1.6494614003590664, + "grad_norm": 7.118162155151367, + "learning_rate": 1.6487432675044885e-06, + "loss": 9.0882, + "step": 18375 + }, + { + "epoch": 1.6517055655296229, + "grad_norm": 6.910582542419434, + "learning_rate": 1.650987432675045e-06, + "loss": 9.1522, + "step": 18400 + }, + { + "epoch": 1.6539497307001796, + "grad_norm": 7.144911289215088, + "learning_rate": 1.6532315978456015e-06, + "loss": 9.0967, + "step": 18425 + }, + { + "epoch": 1.656193895870736, + "grad_norm": 7.461808681488037, + "learning_rate": 1.655475763016158e-06, + "loss": 9.006, + "step": 18450 + }, + { + "epoch": 1.6584380610412928, + "grad_norm": 8.240639686584473, + "learning_rate": 1.6577199281867145e-06, + "loss": 9.1865, + "step": 18475 + }, + { + "epoch": 1.6606822262118492, + "grad_norm": 7.095661640167236, + "learning_rate": 1.6599640933572712e-06, + "loss": 9.1158, + "step": 18500 + }, + { + "epoch": 1.6629263913824057, + "grad_norm": 7.586414813995361, + "learning_rate": 1.662208258527828e-06, + "loss": 9.1357, + "step": 18525 + }, + { + "epoch": 1.6651705565529622, + "grad_norm": 7.122671127319336, + "learning_rate": 1.6644524236983844e-06, + "loss": 9.1771, + "step": 18550 + }, + { + "epoch": 1.667414721723519, + "grad_norm": 7.491666793823242, + "learning_rate": 1.666696588868941e-06, + "loss": 9.1491, + "step": 18575 + }, + { + "epoch": 1.6696588868940754, + "grad_norm": 7.726658821105957, + "learning_rate": 1.6689407540394974e-06, + "loss": 9.1609, + "step": 18600 + }, + { + "epoch": 1.671903052064632, + "grad_norm": 7.043160915374756, + "learning_rate": 1.671184919210054e-06, + "loss": 9.1614, + "step": 18625 + }, + { + "epoch": 1.6741472172351886, + "grad_norm": 8.435595512390137, + "learning_rate": 1.6734290843806106e-06, + "loss": 9.1209, + "step": 18650 + }, + { + "epoch": 1.676391382405745, + "grad_norm": 7.032945156097412, + "learning_rate": 1.675673249551167e-06, + "loss": 8.9823, + "step": 18675 + }, + { + "epoch": 1.6786355475763015, + "grad_norm": 7.0244646072387695, + "learning_rate": 1.6779174147217237e-06, + "loss": 9.0226, + "step": 18700 + }, + { + "epoch": 1.6808797127468582, + "grad_norm": 7.046318531036377, + "learning_rate": 1.6801615798922803e-06, + "loss": 9.059, + "step": 18725 + }, + { + "epoch": 1.6831238779174147, + "grad_norm": 7.426521301269531, + "learning_rate": 1.6824057450628367e-06, + "loss": 9.0805, + "step": 18750 + }, + { + "epoch": 1.6853680430879714, + "grad_norm": 6.910311222076416, + "learning_rate": 1.6846499102333933e-06, + "loss": 9.0448, + "step": 18775 + }, + { + "epoch": 1.6876122082585279, + "grad_norm": 6.876137733459473, + "learning_rate": 1.6868940754039497e-06, + "loss": 9.023, + "step": 18800 + }, + { + "epoch": 1.6898563734290843, + "grad_norm": 6.854559898376465, + "learning_rate": 1.6891382405745063e-06, + "loss": 9.1081, + "step": 18825 + }, + { + "epoch": 1.6921005385996408, + "grad_norm": 7.152273178100586, + "learning_rate": 1.691382405745063e-06, + "loss": 9.0247, + "step": 18850 + }, + { + "epoch": 1.6943447037701975, + "grad_norm": 7.081953525543213, + "learning_rate": 1.6936265709156198e-06, + "loss": 9.0062, + "step": 18875 + }, + { + "epoch": 1.696588868940754, + "grad_norm": 7.447522163391113, + "learning_rate": 1.6958707360861762e-06, + "loss": 9.1038, + "step": 18900 + }, + { + "epoch": 1.6988330341113107, + "grad_norm": 7.250172138214111, + "learning_rate": 1.6981149012567328e-06, + "loss": 9.0375, + "step": 18925 + }, + { + "epoch": 1.7010771992818672, + "grad_norm": 6.8722662925720215, + "learning_rate": 1.7003590664272892e-06, + "loss": 9.0984, + "step": 18950 + }, + { + "epoch": 1.7033213644524237, + "grad_norm": 6.918059349060059, + "learning_rate": 1.7026032315978458e-06, + "loss": 9.0345, + "step": 18975 + }, + { + "epoch": 1.7055655296229801, + "grad_norm": 6.98055362701416, + "learning_rate": 1.7048473967684024e-06, + "loss": 9.0441, + "step": 19000 + }, + { + "epoch": 1.7078096947935368, + "grad_norm": 7.597086429595947, + "learning_rate": 1.7070915619389588e-06, + "loss": 9.0768, + "step": 19025 + }, + { + "epoch": 1.7100538599640933, + "grad_norm": 6.971393585205078, + "learning_rate": 1.7093357271095154e-06, + "loss": 9.0524, + "step": 19050 + }, + { + "epoch": 1.71229802513465, + "grad_norm": 6.766976356506348, + "learning_rate": 1.711579892280072e-06, + "loss": 9.0771, + "step": 19075 + }, + { + "epoch": 1.7145421903052065, + "grad_norm": 7.781848907470703, + "learning_rate": 1.7137342908438063e-06, + "loss": 9.1021, + "step": 19100 + }, + { + "epoch": 1.716786355475763, + "grad_norm": 7.976457118988037, + "learning_rate": 1.7159784560143627e-06, + "loss": 9.0502, + "step": 19125 + }, + { + "epoch": 1.7190305206463194, + "grad_norm": 7.481279373168945, + "learning_rate": 1.7182226211849193e-06, + "loss": 9.0945, + "step": 19150 + }, + { + "epoch": 1.7212746858168761, + "grad_norm": 6.99531364440918, + "learning_rate": 1.7204667863554759e-06, + "loss": 9.0613, + "step": 19175 + }, + { + "epoch": 1.7235188509874326, + "grad_norm": 6.85428524017334, + "learning_rate": 1.7227109515260323e-06, + "loss": 9.0114, + "step": 19200 + }, + { + "epoch": 1.7257630161579893, + "grad_norm": 7.071933269500732, + "learning_rate": 1.724955116696589e-06, + "loss": 8.954, + "step": 19225 + }, + { + "epoch": 1.7280071813285458, + "grad_norm": 7.556234836578369, + "learning_rate": 1.7271992818671457e-06, + "loss": 9.0757, + "step": 19250 + }, + { + "epoch": 1.7302513464991023, + "grad_norm": 6.788029670715332, + "learning_rate": 1.7294434470377021e-06, + "loss": 9.0704, + "step": 19275 + }, + { + "epoch": 1.7324955116696588, + "grad_norm": 10.391080856323242, + "learning_rate": 1.7316876122082588e-06, + "loss": 9.1639, + "step": 19300 + }, + { + "epoch": 1.7347396768402155, + "grad_norm": 7.341396331787109, + "learning_rate": 1.7339317773788154e-06, + "loss": 9.0053, + "step": 19325 + }, + { + "epoch": 1.736983842010772, + "grad_norm": 6.906222343444824, + "learning_rate": 1.7361759425493718e-06, + "loss": 9.0022, + "step": 19350 + }, + { + "epoch": 1.7392280071813286, + "grad_norm": 8.779803276062012, + "learning_rate": 1.7384201077199284e-06, + "loss": 8.9923, + "step": 19375 + }, + { + "epoch": 1.7414721723518851, + "grad_norm": 8.150513648986816, + "learning_rate": 1.7406642728904848e-06, + "loss": 9.0517, + "step": 19400 + }, + { + "epoch": 1.7437163375224416, + "grad_norm": 7.372367858886719, + "learning_rate": 1.7429084380610414e-06, + "loss": 9.1359, + "step": 19425 + }, + { + "epoch": 1.745960502692998, + "grad_norm": 7.951119422912598, + "learning_rate": 1.745152603231598e-06, + "loss": 8.9793, + "step": 19450 + }, + { + "epoch": 1.7482046678635548, + "grad_norm": 7.0099382400512695, + "learning_rate": 1.7473967684021544e-06, + "loss": 9.054, + "step": 19475 + }, + { + "epoch": 1.7504488330341115, + "grad_norm": 8.093155860900879, + "learning_rate": 1.749640933572711e-06, + "loss": 9.0424, + "step": 19500 + }, + { + "epoch": 1.752692998204668, + "grad_norm": 8.381257057189941, + "learning_rate": 1.7518850987432677e-06, + "loss": 9.0173, + "step": 19525 + }, + { + "epoch": 1.7549371633752244, + "grad_norm": 7.351244926452637, + "learning_rate": 1.754129263913824e-06, + "loss": 9.0885, + "step": 19550 + }, + { + "epoch": 1.757181328545781, + "grad_norm": 7.210597038269043, + "learning_rate": 1.7563734290843807e-06, + "loss": 9.1536, + "step": 19575 + }, + { + "epoch": 1.7594254937163374, + "grad_norm": 7.884551525115967, + "learning_rate": 1.7586175942549375e-06, + "loss": 9.1455, + "step": 19600 + }, + { + "epoch": 1.761669658886894, + "grad_norm": 6.980470180511475, + "learning_rate": 1.760861759425494e-06, + "loss": 9.2116, + "step": 19625 + }, + { + "epoch": 1.7639138240574508, + "grad_norm": 7.674100875854492, + "learning_rate": 1.7631059245960505e-06, + "loss": 8.9907, + "step": 19650 + }, + { + "epoch": 1.7661579892280073, + "grad_norm": 6.879805088043213, + "learning_rate": 1.7653500897666071e-06, + "loss": 9.036, + "step": 19675 + }, + { + "epoch": 1.7684021543985637, + "grad_norm": 7.215080261230469, + "learning_rate": 1.7675942549371635e-06, + "loss": 9.1057, + "step": 19700 + }, + { + "epoch": 1.7706463195691202, + "grad_norm": 9.486248970031738, + "learning_rate": 1.7698384201077202e-06, + "loss": 9.057, + "step": 19725 + }, + { + "epoch": 1.7728904847396767, + "grad_norm": 7.2087602615356445, + "learning_rate": 1.7720825852782766e-06, + "loss": 9.0203, + "step": 19750 + }, + { + "epoch": 1.7751346499102334, + "grad_norm": 7.54832649230957, + "learning_rate": 1.7743267504488332e-06, + "loss": 9.0005, + "step": 19775 + }, + { + "epoch": 1.77737881508079, + "grad_norm": 7.186557292938232, + "learning_rate": 1.7765709156193898e-06, + "loss": 9.026, + "step": 19800 + }, + { + "epoch": 1.7796229802513466, + "grad_norm": 6.9838385581970215, + "learning_rate": 1.7788150807899462e-06, + "loss": 8.9645, + "step": 19825 + }, + { + "epoch": 1.781867145421903, + "grad_norm": 7.390361309051514, + "learning_rate": 1.7810592459605028e-06, + "loss": 8.9586, + "step": 19850 + }, + { + "epoch": 1.7841113105924595, + "grad_norm": 6.915621280670166, + "learning_rate": 1.7833034111310592e-06, + "loss": 9.071, + "step": 19875 + }, + { + "epoch": 1.786355475763016, + "grad_norm": 7.404318332672119, + "learning_rate": 1.7855475763016158e-06, + "loss": 9.0758, + "step": 19900 + }, + { + "epoch": 1.7885996409335727, + "grad_norm": 7.457082748413086, + "learning_rate": 1.7877917414721724e-06, + "loss": 9.0744, + "step": 19925 + }, + { + "epoch": 1.7908438061041294, + "grad_norm": 6.92787504196167, + "learning_rate": 1.7900359066427288e-06, + "loss": 8.9625, + "step": 19950 + }, + { + "epoch": 1.7930879712746859, + "grad_norm": 7.45485782623291, + "learning_rate": 1.7922800718132857e-06, + "loss": 9.1141, + "step": 19975 + }, + { + "epoch": 1.7953321364452424, + "grad_norm": 7.1208977699279785, + "learning_rate": 1.7945242369838423e-06, + "loss": 9.0402, + "step": 20000 + }, + { + "epoch": 1.7975763016157988, + "grad_norm": 6.975611209869385, + "learning_rate": 1.7967684021543987e-06, + "loss": 8.9991, + "step": 20025 + }, + { + "epoch": 1.7998204667863553, + "grad_norm": 7.2470622062683105, + "learning_rate": 1.7990125673249553e-06, + "loss": 8.9047, + "step": 20050 + }, + { + "epoch": 1.802064631956912, + "grad_norm": 7.47197961807251, + "learning_rate": 1.801256732495512e-06, + "loss": 9.1506, + "step": 20075 + }, + { + "epoch": 1.8043087971274687, + "grad_norm": 7.089389324188232, + "learning_rate": 1.8035008976660683e-06, + "loss": 8.9132, + "step": 20100 + }, + { + "epoch": 1.8065529622980252, + "grad_norm": 9.267487525939941, + "learning_rate": 1.805745062836625e-06, + "loss": 9.0816, + "step": 20125 + }, + { + "epoch": 1.8087971274685817, + "grad_norm": 7.735820293426514, + "learning_rate": 1.8079892280071816e-06, + "loss": 9.1713, + "step": 20150 + }, + { + "epoch": 1.8110412926391382, + "grad_norm": 7.322841644287109, + "learning_rate": 1.810233393177738e-06, + "loss": 9.083, + "step": 20175 + }, + { + "epoch": 1.8132854578096946, + "grad_norm": 7.130434036254883, + "learning_rate": 1.8124775583482946e-06, + "loss": 9.0807, + "step": 20200 + }, + { + "epoch": 1.8155296229802513, + "grad_norm": 8.170587539672852, + "learning_rate": 1.814721723518851e-06, + "loss": 9.0983, + "step": 20225 + }, + { + "epoch": 1.817773788150808, + "grad_norm": 7.228168964385986, + "learning_rate": 1.8169658886894076e-06, + "loss": 8.9539, + "step": 20250 + }, + { + "epoch": 1.8200179533213645, + "grad_norm": 7.1788105964660645, + "learning_rate": 1.8192100538599642e-06, + "loss": 8.9251, + "step": 20275 + }, + { + "epoch": 1.822262118491921, + "grad_norm": 7.122203826904297, + "learning_rate": 1.8214542190305206e-06, + "loss": 8.9978, + "step": 20300 + }, + { + "epoch": 1.8245062836624775, + "grad_norm": 7.1117072105407715, + "learning_rate": 1.8236983842010774e-06, + "loss": 9.0843, + "step": 20325 + }, + { + "epoch": 1.826750448833034, + "grad_norm": 9.220500946044922, + "learning_rate": 1.825942549371634e-06, + "loss": 8.9578, + "step": 20350 + }, + { + "epoch": 1.8289946140035906, + "grad_norm": 7.019988059997559, + "learning_rate": 1.8281867145421905e-06, + "loss": 9.0386, + "step": 20375 + }, + { + "epoch": 1.8312387791741473, + "grad_norm": 6.955723285675049, + "learning_rate": 1.830430879712747e-06, + "loss": 9.0862, + "step": 20400 + }, + { + "epoch": 1.8334829443447038, + "grad_norm": 7.384273529052734, + "learning_rate": 1.8326750448833037e-06, + "loss": 9.1144, + "step": 20425 + }, + { + "epoch": 1.8357271095152603, + "grad_norm": 7.46206521987915, + "learning_rate": 1.83491921005386e-06, + "loss": 8.9541, + "step": 20450 + }, + { + "epoch": 1.8379712746858168, + "grad_norm": 7.3150858879089355, + "learning_rate": 1.8371633752244167e-06, + "loss": 8.9992, + "step": 20475 + }, + { + "epoch": 1.8402154398563735, + "grad_norm": 7.019959926605225, + "learning_rate": 1.8394075403949733e-06, + "loss": 8.9519, + "step": 20500 + }, + { + "epoch": 1.84245960502693, + "grad_norm": 8.136028289794922, + "learning_rate": 1.8416517055655297e-06, + "loss": 8.9674, + "step": 20525 + }, + { + "epoch": 1.8447037701974867, + "grad_norm": 8.02564525604248, + "learning_rate": 1.8438958707360863e-06, + "loss": 9.0895, + "step": 20550 + }, + { + "epoch": 1.8469479353680431, + "grad_norm": 6.911016464233398, + "learning_rate": 1.8461400359066427e-06, + "loss": 8.8656, + "step": 20575 + }, + { + "epoch": 1.8491921005385996, + "grad_norm": 7.440893650054932, + "learning_rate": 1.8483842010771994e-06, + "loss": 9.0054, + "step": 20600 + }, + { + "epoch": 1.851436265709156, + "grad_norm": 7.4717936515808105, + "learning_rate": 1.850628366247756e-06, + "loss": 9.0112, + "step": 20625 + }, + { + "epoch": 1.8536804308797128, + "grad_norm": 7.268368244171143, + "learning_rate": 1.8528725314183124e-06, + "loss": 9.0188, + "step": 20650 + }, + { + "epoch": 1.8559245960502693, + "grad_norm": 7.021681308746338, + "learning_rate": 1.8551166965888692e-06, + "loss": 8.9357, + "step": 20675 + }, + { + "epoch": 1.858168761220826, + "grad_norm": 6.938802719116211, + "learning_rate": 1.8573608617594258e-06, + "loss": 8.9487, + "step": 20700 + }, + { + "epoch": 1.8604129263913824, + "grad_norm": 7.612166404724121, + "learning_rate": 1.8596050269299822e-06, + "loss": 9.0017, + "step": 20725 + }, + { + "epoch": 1.862657091561939, + "grad_norm": 8.019668579101562, + "learning_rate": 1.8618491921005388e-06, + "loss": 8.9667, + "step": 20750 + }, + { + "epoch": 1.8649012567324954, + "grad_norm": 8.401179313659668, + "learning_rate": 1.8640933572710954e-06, + "loss": 9.0597, + "step": 20775 + }, + { + "epoch": 1.867145421903052, + "grad_norm": 7.179293155670166, + "learning_rate": 1.8663375224416519e-06, + "loss": 8.9776, + "step": 20800 + }, + { + "epoch": 1.8693895870736086, + "grad_norm": 7.444736003875732, + "learning_rate": 1.8685816876122085e-06, + "loss": 9.0271, + "step": 20825 + }, + { + "epoch": 1.8716337522441653, + "grad_norm": 8.325627326965332, + "learning_rate": 1.8708258527827649e-06, + "loss": 9.0333, + "step": 20850 + }, + { + "epoch": 1.8738779174147218, + "grad_norm": 7.502315044403076, + "learning_rate": 1.8730700179533215e-06, + "loss": 9.0506, + "step": 20875 + }, + { + "epoch": 1.8761220825852782, + "grad_norm": 6.927270412445068, + "learning_rate": 1.875314183123878e-06, + "loss": 8.9878, + "step": 20900 + }, + { + "epoch": 1.8783662477558347, + "grad_norm": 7.186232089996338, + "learning_rate": 1.8775583482944345e-06, + "loss": 9.1363, + "step": 20925 + }, + { + "epoch": 1.8806104129263914, + "grad_norm": 7.969644546508789, + "learning_rate": 1.8798025134649911e-06, + "loss": 8.953, + "step": 20950 + }, + { + "epoch": 1.882854578096948, + "grad_norm": 8.549609184265137, + "learning_rate": 1.8820466786355477e-06, + "loss": 8.9943, + "step": 20975 + }, + { + "epoch": 1.8850987432675046, + "grad_norm": 13.235190391540527, + "learning_rate": 1.8842908438061041e-06, + "loss": 8.9756, + "step": 21000 + }, + { + "epoch": 1.887342908438061, + "grad_norm": 8.559106826782227, + "learning_rate": 1.8865350089766608e-06, + "loss": 9.05, + "step": 21025 + }, + { + "epoch": 1.8895870736086176, + "grad_norm": 7.463354110717773, + "learning_rate": 1.8887791741472176e-06, + "loss": 9.1184, + "step": 21050 + }, + { + "epoch": 1.891831238779174, + "grad_norm": 8.517889976501465, + "learning_rate": 1.891023339317774e-06, + "loss": 8.9419, + "step": 21075 + }, + { + "epoch": 1.8940754039497307, + "grad_norm": 7.019028663635254, + "learning_rate": 1.8932675044883306e-06, + "loss": 8.9718, + "step": 21100 + }, + { + "epoch": 1.8963195691202872, + "grad_norm": 7.159873962402344, + "learning_rate": 1.8955116696588872e-06, + "loss": 8.9786, + "step": 21125 + }, + { + "epoch": 1.898563734290844, + "grad_norm": 7.690567493438721, + "learning_rate": 1.8977558348294436e-06, + "loss": 9.0619, + "step": 21150 + }, + { + "epoch": 1.9008078994614004, + "grad_norm": 8.199015617370605, + "learning_rate": 1.9000000000000002e-06, + "loss": 8.9768, + "step": 21175 + }, + { + "epoch": 1.9030520646319569, + "grad_norm": 9.621110916137695, + "learning_rate": 1.9022441651705566e-06, + "loss": 8.9531, + "step": 21200 + }, + { + "epoch": 1.9052962298025133, + "grad_norm": 11.752854347229004, + "learning_rate": 1.9044883303411133e-06, + "loss": 8.8727, + "step": 21225 + }, + { + "epoch": 1.90754039497307, + "grad_norm": 7.573029518127441, + "learning_rate": 1.9066427289048475e-06, + "loss": 9.0321, + "step": 21250 + }, + { + "epoch": 1.9097845601436265, + "grad_norm": 7.374513149261475, + "learning_rate": 1.908886894075404e-06, + "loss": 9.1287, + "step": 21275 + }, + { + "epoch": 1.9120287253141832, + "grad_norm": 8.096179962158203, + "learning_rate": 1.9111310592459605e-06, + "loss": 9.0808, + "step": 21300 + }, + { + "epoch": 1.9142728904847397, + "grad_norm": 7.705260276794434, + "learning_rate": 1.913375224416517e-06, + "loss": 9.0604, + "step": 21325 + }, + { + "epoch": 1.9165170556552962, + "grad_norm": 7.652899742126465, + "learning_rate": 1.9156193895870737e-06, + "loss": 9.0787, + "step": 21350 + }, + { + "epoch": 1.9187612208258527, + "grad_norm": 8.3776216506958, + "learning_rate": 1.9178635547576303e-06, + "loss": 9.0479, + "step": 21375 + }, + { + "epoch": 1.9210053859964094, + "grad_norm": 7.085367679595947, + "learning_rate": 1.920107719928187e-06, + "loss": 8.845, + "step": 21400 + }, + { + "epoch": 1.923249551166966, + "grad_norm": 7.316383361816406, + "learning_rate": 1.9223518850987436e-06, + "loss": 8.9046, + "step": 21425 + }, + { + "epoch": 1.9254937163375225, + "grad_norm": 7.530087471008301, + "learning_rate": 1.9245960502693e-06, + "loss": 9.039, + "step": 21450 + }, + { + "epoch": 1.927737881508079, + "grad_norm": 7.262918949127197, + "learning_rate": 1.926840215439857e-06, + "loss": 8.9296, + "step": 21475 + }, + { + "epoch": 1.9299820466786355, + "grad_norm": 7.530022144317627, + "learning_rate": 1.929084380610413e-06, + "loss": 8.8256, + "step": 21500 + }, + { + "epoch": 1.932226211849192, + "grad_norm": 7.510108947753906, + "learning_rate": 1.9313285457809696e-06, + "loss": 9.0433, + "step": 21525 + }, + { + "epoch": 1.9344703770197487, + "grad_norm": 9.776082038879395, + "learning_rate": 1.9335727109515262e-06, + "loss": 8.9738, + "step": 21550 + }, + { + "epoch": 1.9367145421903054, + "grad_norm": 7.685164928436279, + "learning_rate": 1.935816876122083e-06, + "loss": 8.8697, + "step": 21575 + }, + { + "epoch": 1.9389587073608618, + "grad_norm": 8.856735229492188, + "learning_rate": 1.9380610412926394e-06, + "loss": 9.0602, + "step": 21600 + }, + { + "epoch": 1.9412028725314183, + "grad_norm": 7.369348526000977, + "learning_rate": 1.9403052064631956e-06, + "loss": 8.9946, + "step": 21625 + }, + { + "epoch": 1.9434470377019748, + "grad_norm": 7.337221622467041, + "learning_rate": 1.9425493716337523e-06, + "loss": 9.0091, + "step": 21650 + }, + { + "epoch": 1.9456912028725313, + "grad_norm": 7.637331962585449, + "learning_rate": 1.944793536804309e-06, + "loss": 8.9953, + "step": 21675 + }, + { + "epoch": 1.947935368043088, + "grad_norm": 7.094986915588379, + "learning_rate": 1.9470377019748655e-06, + "loss": 9.0224, + "step": 21700 + }, + { + "epoch": 1.9501795332136447, + "grad_norm": 6.886270523071289, + "learning_rate": 1.949281867145422e-06, + "loss": 8.9176, + "step": 21725 + }, + { + "epoch": 1.9524236983842012, + "grad_norm": 7.362891674041748, + "learning_rate": 1.9515260323159783e-06, + "loss": 8.9999, + "step": 21750 + }, + { + "epoch": 1.9546678635547576, + "grad_norm": 7.412165641784668, + "learning_rate": 1.9537701974865353e-06, + "loss": 8.914, + "step": 21775 + }, + { + "epoch": 1.9569120287253141, + "grad_norm": 9.59965705871582, + "learning_rate": 1.956014362657092e-06, + "loss": 8.9246, + "step": 21800 + }, + { + "epoch": 1.9591561938958706, + "grad_norm": 6.962210655212402, + "learning_rate": 1.9582585278276486e-06, + "loss": 9.0026, + "step": 21825 + }, + { + "epoch": 1.9614003590664273, + "grad_norm": 8.056952476501465, + "learning_rate": 1.9605026929982048e-06, + "loss": 8.9568, + "step": 21850 + }, + { + "epoch": 1.963644524236984, + "grad_norm": 8.13674259185791, + "learning_rate": 1.9627468581687614e-06, + "loss": 8.9279, + "step": 21875 + }, + { + "epoch": 1.9658886894075405, + "grad_norm": 8.591360092163086, + "learning_rate": 1.964991023339318e-06, + "loss": 8.89, + "step": 21900 + }, + { + "epoch": 1.968132854578097, + "grad_norm": 7.277829647064209, + "learning_rate": 1.9672351885098746e-06, + "loss": 9.0732, + "step": 21925 + }, + { + "epoch": 1.9703770197486534, + "grad_norm": 7.004604816436768, + "learning_rate": 1.9694793536804312e-06, + "loss": 8.9017, + "step": 21950 + }, + { + "epoch": 1.97262118491921, + "grad_norm": 7.330372333526611, + "learning_rate": 1.9717235188509874e-06, + "loss": 9.0722, + "step": 21975 + }, + { + "epoch": 1.9748653500897666, + "grad_norm": 10.022676467895508, + "learning_rate": 1.973967684021544e-06, + "loss": 8.9168, + "step": 22000 + }, + { + "epoch": 1.9771095152603233, + "grad_norm": 7.762650489807129, + "learning_rate": 1.9762118491921006e-06, + "loss": 8.9556, + "step": 22025 + }, + { + "epoch": 1.9793536804308798, + "grad_norm": 7.78145694732666, + "learning_rate": 1.9784560143626573e-06, + "loss": 8.9558, + "step": 22050 + }, + { + "epoch": 1.9815978456014363, + "grad_norm": 7.390235424041748, + "learning_rate": 1.980700179533214e-06, + "loss": 9.0061, + "step": 22075 + }, + { + "epoch": 1.9838420107719927, + "grad_norm": 7.044656753540039, + "learning_rate": 1.98294434470377e-06, + "loss": 8.9241, + "step": 22100 + }, + { + "epoch": 1.9860861759425492, + "grad_norm": 7.232170581817627, + "learning_rate": 1.985188509874327e-06, + "loss": 9.0535, + "step": 22125 + }, + { + "epoch": 1.988330341113106, + "grad_norm": 7.15648078918457, + "learning_rate": 1.9874326750448837e-06, + "loss": 8.9757, + "step": 22150 + }, + { + "epoch": 1.9905745062836626, + "grad_norm": 7.836061477661133, + "learning_rate": 1.98967684021544e-06, + "loss": 8.9116, + "step": 22175 + }, + { + "epoch": 1.992818671454219, + "grad_norm": 7.8744707107543945, + "learning_rate": 1.9919210053859965e-06, + "loss": 8.8796, + "step": 22200 + }, + { + "epoch": 1.9950628366247756, + "grad_norm": 7.078216552734375, + "learning_rate": 1.994165170556553e-06, + "loss": 9.0138, + "step": 22225 + }, + { + "epoch": 1.997307001795332, + "grad_norm": 9.103804588317871, + "learning_rate": 1.9964093357271097e-06, + "loss": 9.1001, + "step": 22250 + }, + { + "epoch": 1.9995511669658885, + "grad_norm": 7.067415237426758, + "learning_rate": 1.9986535008976664e-06, + "loss": 9.0602, + "step": 22275 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.0096805605941993, + "eval_f1_macro": 1.1922980960845238e-05, + "eval_f1_micro": 0.0096805605941993, + "eval_f1_weighted": 0.00034795413171588563, + "eval_loss": 9.184748649597168, + "eval_precision_macro": 1.0199374619906893e-05, + "eval_precision_micro": 0.0096805605941993, + "eval_precision_weighted": 0.0002129889332437326, + "eval_recall_macro": 0.0001341977041583746, + "eval_recall_micro": 0.0096805605941993, + "eval_recall_weighted": 0.0096805605941993, + "eval_runtime": 129.6326, + "eval_samples_per_second": 404.011, + "eval_steps_per_second": 12.628, + "step": 22280 + }, + { + "epoch": 2.0017953321364454, + "grad_norm": 7.29226016998291, + "learning_rate": 2.000897666068223e-06, + "loss": 8.8659, + "step": 22300 + }, + { + "epoch": 2.004039497307002, + "grad_norm": 6.886551856994629, + "learning_rate": 2.003141831238779e-06, + "loss": 8.7196, + "step": 22325 + }, + { + "epoch": 2.0062836624775584, + "grad_norm": 6.867776870727539, + "learning_rate": 2.0053859964093358e-06, + "loss": 8.8533, + "step": 22350 + }, + { + "epoch": 2.008527827648115, + "grad_norm": 7.165907859802246, + "learning_rate": 2.0076301615798924e-06, + "loss": 8.9433, + "step": 22375 + }, + { + "epoch": 2.0107719928186714, + "grad_norm": 8.750335693359375, + "learning_rate": 2.009874326750449e-06, + "loss": 8.8281, + "step": 22400 + }, + { + "epoch": 2.013016157989228, + "grad_norm": 6.866158485412598, + "learning_rate": 2.0121184919210056e-06, + "loss": 8.8402, + "step": 22425 + }, + { + "epoch": 2.0152603231597848, + "grad_norm": 7.568830490112305, + "learning_rate": 2.014362657091562e-06, + "loss": 8.8799, + "step": 22450 + }, + { + "epoch": 2.0175044883303412, + "grad_norm": 7.008769989013672, + "learning_rate": 2.016606822262119e-06, + "loss": 8.8512, + "step": 22475 + }, + { + "epoch": 2.0197486535008977, + "grad_norm": 6.936007976531982, + "learning_rate": 2.0188509874326755e-06, + "loss": 8.951, + "step": 22500 + }, + { + "epoch": 2.021992818671454, + "grad_norm": 7.676693439483643, + "learning_rate": 2.0210951526032317e-06, + "loss": 8.8804, + "step": 22525 + }, + { + "epoch": 2.0242369838420107, + "grad_norm": 7.325889587402344, + "learning_rate": 2.0233393177737883e-06, + "loss": 8.8046, + "step": 22550 + }, + { + "epoch": 2.026481149012567, + "grad_norm": 8.209090232849121, + "learning_rate": 2.025583482944345e-06, + "loss": 8.8623, + "step": 22575 + }, + { + "epoch": 2.028725314183124, + "grad_norm": 7.93186092376709, + "learning_rate": 2.0278276481149015e-06, + "loss": 8.954, + "step": 22600 + }, + { + "epoch": 2.0309694793536806, + "grad_norm": 7.302985668182373, + "learning_rate": 2.030071813285458e-06, + "loss": 8.6809, + "step": 22625 + }, + { + "epoch": 2.033213644524237, + "grad_norm": 7.331601142883301, + "learning_rate": 2.0323159784560143e-06, + "loss": 8.9309, + "step": 22650 + }, + { + "epoch": 2.0354578096947935, + "grad_norm": 8.061921119689941, + "learning_rate": 2.034560143626571e-06, + "loss": 8.788, + "step": 22675 + }, + { + "epoch": 2.03770197486535, + "grad_norm": 8.044675827026367, + "learning_rate": 2.0368043087971276e-06, + "loss": 8.7932, + "step": 22700 + }, + { + "epoch": 2.0399461400359065, + "grad_norm": 7.544517517089844, + "learning_rate": 2.039048473967684e-06, + "loss": 8.7378, + "step": 22725 + }, + { + "epoch": 2.0421903052064634, + "grad_norm": 7.6586503982543945, + "learning_rate": 2.0412926391382408e-06, + "loss": 8.9424, + "step": 22750 + }, + { + "epoch": 2.04443447037702, + "grad_norm": 7.603306770324707, + "learning_rate": 2.0435368043087974e-06, + "loss": 8.7396, + "step": 22775 + }, + { + "epoch": 2.0466786355475763, + "grad_norm": 7.09197473526001, + "learning_rate": 2.0457809694793536e-06, + "loss": 8.7842, + "step": 22800 + }, + { + "epoch": 2.048922800718133, + "grad_norm": 7.707315444946289, + "learning_rate": 2.04802513464991e-06, + "loss": 8.7565, + "step": 22825 + }, + { + "epoch": 2.0511669658886893, + "grad_norm": 7.207848072052002, + "learning_rate": 2.0502692998204672e-06, + "loss": 8.7031, + "step": 22850 + }, + { + "epoch": 2.0534111310592458, + "grad_norm": 7.5227952003479, + "learning_rate": 2.0525134649910234e-06, + "loss": 8.8683, + "step": 22875 + }, + { + "epoch": 2.0556552962298027, + "grad_norm": 7.464354038238525, + "learning_rate": 2.05475763016158e-06, + "loss": 8.8908, + "step": 22900 + }, + { + "epoch": 2.057899461400359, + "grad_norm": 7.204601764678955, + "learning_rate": 2.0570017953321367e-06, + "loss": 8.8558, + "step": 22925 + }, + { + "epoch": 2.0601436265709157, + "grad_norm": 7.234342098236084, + "learning_rate": 2.0592459605026933e-06, + "loss": 8.9191, + "step": 22950 + }, + { + "epoch": 2.062387791741472, + "grad_norm": 7.399554252624512, + "learning_rate": 2.06149012567325e-06, + "loss": 8.867, + "step": 22975 + }, + { + "epoch": 2.0646319569120286, + "grad_norm": 7.271561622619629, + "learning_rate": 2.063734290843806e-06, + "loss": 8.8202, + "step": 23000 + }, + { + "epoch": 2.066876122082585, + "grad_norm": 7.215842247009277, + "learning_rate": 2.0659784560143627e-06, + "loss": 8.7973, + "step": 23025 + }, + { + "epoch": 2.069120287253142, + "grad_norm": 7.80983829498291, + "learning_rate": 2.0682226211849193e-06, + "loss": 8.9157, + "step": 23050 + }, + { + "epoch": 2.0713644524236985, + "grad_norm": 7.20431661605835, + "learning_rate": 2.070466786355476e-06, + "loss": 8.8725, + "step": 23075 + }, + { + "epoch": 2.073608617594255, + "grad_norm": 7.396937847137451, + "learning_rate": 2.0727109515260325e-06, + "loss": 8.9506, + "step": 23100 + }, + { + "epoch": 2.0758527827648114, + "grad_norm": 7.493486404418945, + "learning_rate": 2.0749551166965887e-06, + "loss": 8.9116, + "step": 23125 + }, + { + "epoch": 2.078096947935368, + "grad_norm": 7.700948715209961, + "learning_rate": 2.0771992818671454e-06, + "loss": 8.8398, + "step": 23150 + }, + { + "epoch": 2.0803411131059244, + "grad_norm": 8.914631843566895, + "learning_rate": 2.079443447037702e-06, + "loss": 8.8081, + "step": 23175 + }, + { + "epoch": 2.0825852782764813, + "grad_norm": 7.802060604095459, + "learning_rate": 2.081687612208259e-06, + "loss": 8.918, + "step": 23200 + }, + { + "epoch": 2.084829443447038, + "grad_norm": 7.239523410797119, + "learning_rate": 2.083931777378815e-06, + "loss": 8.9321, + "step": 23225 + }, + { + "epoch": 2.0870736086175943, + "grad_norm": 7.07887601852417, + "learning_rate": 2.086175942549372e-06, + "loss": 8.8539, + "step": 23250 + }, + { + "epoch": 2.0893177737881508, + "grad_norm": 7.132719039916992, + "learning_rate": 2.0884201077199284e-06, + "loss": 8.7587, + "step": 23275 + }, + { + "epoch": 2.0915619389587072, + "grad_norm": 7.083888053894043, + "learning_rate": 2.090664272890485e-06, + "loss": 8.7778, + "step": 23300 + }, + { + "epoch": 2.0938061041292637, + "grad_norm": 7.229637622833252, + "learning_rate": 2.0929084380610417e-06, + "loss": 8.7765, + "step": 23325 + }, + { + "epoch": 2.0960502692998206, + "grad_norm": 7.854532718658447, + "learning_rate": 2.095152603231598e-06, + "loss": 8.8521, + "step": 23350 + }, + { + "epoch": 2.098294434470377, + "grad_norm": 7.168841361999512, + "learning_rate": 2.0973967684021545e-06, + "loss": 8.695, + "step": 23375 + }, + { + "epoch": 2.1005385996409336, + "grad_norm": 7.180956840515137, + "learning_rate": 2.099640933572711e-06, + "loss": 8.8926, + "step": 23400 + }, + { + "epoch": 2.10278276481149, + "grad_norm": 8.000730514526367, + "learning_rate": 2.1018850987432677e-06, + "loss": 8.8571, + "step": 23425 + }, + { + "epoch": 2.1050269299820465, + "grad_norm": 7.267016887664795, + "learning_rate": 2.1041292639138243e-06, + "loss": 8.818, + "step": 23450 + }, + { + "epoch": 2.107271095152603, + "grad_norm": 7.822959899902344, + "learning_rate": 2.1063734290843805e-06, + "loss": 8.812, + "step": 23475 + }, + { + "epoch": 2.10951526032316, + "grad_norm": 7.1946306228637695, + "learning_rate": 2.108617594254937e-06, + "loss": 8.8035, + "step": 23500 + }, + { + "epoch": 2.1117594254937164, + "grad_norm": 7.709937572479248, + "learning_rate": 2.1108617594254937e-06, + "loss": 8.7891, + "step": 23525 + }, + { + "epoch": 2.114003590664273, + "grad_norm": 7.955183506011963, + "learning_rate": 2.1131059245960508e-06, + "loss": 8.8248, + "step": 23550 + }, + { + "epoch": 2.1162477558348294, + "grad_norm": 7.925631046295166, + "learning_rate": 2.115350089766607e-06, + "loss": 8.7932, + "step": 23575 + }, + { + "epoch": 2.118491921005386, + "grad_norm": 7.785841941833496, + "learning_rate": 2.1175942549371636e-06, + "loss": 8.7956, + "step": 23600 + }, + { + "epoch": 2.1207360861759423, + "grad_norm": 7.678538799285889, + "learning_rate": 2.11983842010772e-06, + "loss": 8.8361, + "step": 23625 + }, + { + "epoch": 2.1229802513464993, + "grad_norm": 7.594573974609375, + "learning_rate": 2.122082585278277e-06, + "loss": 8.6861, + "step": 23650 + }, + { + "epoch": 2.1252244165170557, + "grad_norm": 8.875910758972168, + "learning_rate": 2.1243267504488334e-06, + "loss": 8.875, + "step": 23675 + }, + { + "epoch": 2.127468581687612, + "grad_norm": 7.733530521392822, + "learning_rate": 2.1265709156193896e-06, + "loss": 8.84, + "step": 23700 + }, + { + "epoch": 2.1297127468581687, + "grad_norm": 8.726116180419922, + "learning_rate": 2.1288150807899462e-06, + "loss": 9.018, + "step": 23725 + }, + { + "epoch": 2.131956912028725, + "grad_norm": 7.615342617034912, + "learning_rate": 2.131059245960503e-06, + "loss": 8.8977, + "step": 23750 + }, + { + "epoch": 2.1342010771992816, + "grad_norm": 7.242298603057861, + "learning_rate": 2.1333034111310595e-06, + "loss": 8.7259, + "step": 23775 + }, + { + "epoch": 2.1364452423698386, + "grad_norm": 7.277146816253662, + "learning_rate": 2.135547576301616e-06, + "loss": 8.6672, + "step": 23800 + }, + { + "epoch": 2.138689407540395, + "grad_norm": 7.293113708496094, + "learning_rate": 2.1377917414721723e-06, + "loss": 8.861, + "step": 23825 + }, + { + "epoch": 2.1409335727109515, + "grad_norm": 7.128895282745361, + "learning_rate": 2.140035906642729e-06, + "loss": 8.7703, + "step": 23850 + }, + { + "epoch": 2.143177737881508, + "grad_norm": 7.226113796234131, + "learning_rate": 2.1422800718132855e-06, + "loss": 8.7641, + "step": 23875 + }, + { + "epoch": 2.1454219030520645, + "grad_norm": 6.98193359375, + "learning_rate": 2.144524236983842e-06, + "loss": 9.0389, + "step": 23900 + }, + { + "epoch": 2.1476660682226214, + "grad_norm": 7.894299030303955, + "learning_rate": 2.1467684021543987e-06, + "loss": 8.8446, + "step": 23925 + }, + { + "epoch": 2.149910233393178, + "grad_norm": 7.614567756652832, + "learning_rate": 2.1490125673249553e-06, + "loss": 8.8979, + "step": 23950 + }, + { + "epoch": 2.1521543985637344, + "grad_norm": 7.5310187339782715, + "learning_rate": 2.151256732495512e-06, + "loss": 8.861, + "step": 23975 + }, + { + "epoch": 2.154398563734291, + "grad_norm": 7.5816240310668945, + "learning_rate": 2.1535008976660686e-06, + "loss": 8.6867, + "step": 24000 + }, + { + "epoch": 2.1566427289048473, + "grad_norm": 7.327290058135986, + "learning_rate": 2.155745062836625e-06, + "loss": 8.8323, + "step": 24025 + }, + { + "epoch": 2.158886894075404, + "grad_norm": 7.004958629608154, + "learning_rate": 2.1579892280071814e-06, + "loss": 8.8868, + "step": 24050 + }, + { + "epoch": 2.1611310592459603, + "grad_norm": 9.341012954711914, + "learning_rate": 2.160233393177738e-06, + "loss": 8.9602, + "step": 24075 + }, + { + "epoch": 2.163375224416517, + "grad_norm": 6.971961975097656, + "learning_rate": 2.1624775583482946e-06, + "loss": 8.7872, + "step": 24100 + }, + { + "epoch": 2.1656193895870737, + "grad_norm": 7.400240421295166, + "learning_rate": 2.1647217235188512e-06, + "loss": 8.7801, + "step": 24125 + }, + { + "epoch": 2.16786355475763, + "grad_norm": 7.1770548820495605, + "learning_rate": 2.166965888689408e-06, + "loss": 8.8113, + "step": 24150 + }, + { + "epoch": 2.1701077199281866, + "grad_norm": 8.157402038574219, + "learning_rate": 2.169210053859964e-06, + "loss": 8.6716, + "step": 24175 + }, + { + "epoch": 2.172351885098743, + "grad_norm": 6.922234535217285, + "learning_rate": 2.1714542190305206e-06, + "loss": 8.91, + "step": 24200 + }, + { + "epoch": 2.1745960502693, + "grad_norm": 7.267353057861328, + "learning_rate": 2.1736983842010773e-06, + "loss": 8.9162, + "step": 24225 + }, + { + "epoch": 2.1768402154398565, + "grad_norm": 7.9742608070373535, + "learning_rate": 2.175942549371634e-06, + "loss": 8.8758, + "step": 24250 + }, + { + "epoch": 2.179084380610413, + "grad_norm": 8.382390022277832, + "learning_rate": 2.1781867145421905e-06, + "loss": 8.8123, + "step": 24275 + }, + { + "epoch": 2.1813285457809695, + "grad_norm": 7.471012592315674, + "learning_rate": 2.180430879712747e-06, + "loss": 8.7635, + "step": 24300 + }, + { + "epoch": 2.183572710951526, + "grad_norm": 7.24432373046875, + "learning_rate": 2.1826750448833037e-06, + "loss": 8.8401, + "step": 24325 + }, + { + "epoch": 2.1858168761220824, + "grad_norm": 7.422149181365967, + "learning_rate": 2.1849192100538603e-06, + "loss": 8.8114, + "step": 24350 + }, + { + "epoch": 2.1880610412926393, + "grad_norm": 7.190101146697998, + "learning_rate": 2.187163375224417e-06, + "loss": 8.8197, + "step": 24375 + }, + { + "epoch": 2.190305206463196, + "grad_norm": 7.27094030380249, + "learning_rate": 2.189407540394973e-06, + "loss": 8.8944, + "step": 24400 + }, + { + "epoch": 2.1925493716337523, + "grad_norm": 7.311267852783203, + "learning_rate": 2.1916517055655298e-06, + "loss": 8.7699, + "step": 24425 + }, + { + "epoch": 2.1947935368043088, + "grad_norm": 7.5937418937683105, + "learning_rate": 2.1938958707360864e-06, + "loss": 9.0016, + "step": 24450 + }, + { + "epoch": 2.1970377019748653, + "grad_norm": 7.6533966064453125, + "learning_rate": 2.196140035906643e-06, + "loss": 8.6193, + "step": 24475 + }, + { + "epoch": 2.1992818671454217, + "grad_norm": 7.5346293449401855, + "learning_rate": 2.1983842010771996e-06, + "loss": 8.8344, + "step": 24500 + }, + { + "epoch": 2.2015260323159787, + "grad_norm": 7.9907355308532715, + "learning_rate": 2.200628366247756e-06, + "loss": 8.6726, + "step": 24525 + }, + { + "epoch": 2.203770197486535, + "grad_norm": 7.448516845703125, + "learning_rate": 2.2028725314183124e-06, + "loss": 8.7464, + "step": 24550 + }, + { + "epoch": 2.2060143626570916, + "grad_norm": 7.70904541015625, + "learning_rate": 2.205116696588869e-06, + "loss": 8.8304, + "step": 24575 + }, + { + "epoch": 2.208258527827648, + "grad_norm": 7.239698886871338, + "learning_rate": 2.2073608617594256e-06, + "loss": 8.7993, + "step": 24600 + }, + { + "epoch": 2.2105026929982046, + "grad_norm": 7.88021183013916, + "learning_rate": 2.2096050269299823e-06, + "loss": 8.8355, + "step": 24625 + }, + { + "epoch": 2.212746858168761, + "grad_norm": 7.877081871032715, + "learning_rate": 2.211849192100539e-06, + "loss": 8.8869, + "step": 24650 + }, + { + "epoch": 2.214991023339318, + "grad_norm": 7.454009056091309, + "learning_rate": 2.2140933572710955e-06, + "loss": 8.9655, + "step": 24675 + }, + { + "epoch": 2.2172351885098744, + "grad_norm": 7.194870471954346, + "learning_rate": 2.216337522441652e-06, + "loss": 8.7537, + "step": 24700 + }, + { + "epoch": 2.219479353680431, + "grad_norm": 7.033070087432861, + "learning_rate": 2.2185816876122083e-06, + "loss": 8.824, + "step": 24725 + }, + { + "epoch": 2.2217235188509874, + "grad_norm": 7.292501926422119, + "learning_rate": 2.220825852782765e-06, + "loss": 8.8007, + "step": 24750 + }, + { + "epoch": 2.223967684021544, + "grad_norm": 8.56209659576416, + "learning_rate": 2.2230700179533215e-06, + "loss": 8.7942, + "step": 24775 + }, + { + "epoch": 2.2262118491921004, + "grad_norm": 7.3621416091918945, + "learning_rate": 2.225314183123878e-06, + "loss": 8.8131, + "step": 24800 + }, + { + "epoch": 2.2284560143626573, + "grad_norm": 8.457122802734375, + "learning_rate": 2.2275583482944348e-06, + "loss": 8.798, + "step": 24825 + }, + { + "epoch": 2.2307001795332138, + "grad_norm": 7.17905855178833, + "learning_rate": 2.2298025134649914e-06, + "loss": 8.935, + "step": 24850 + }, + { + "epoch": 2.2329443447037702, + "grad_norm": 7.406349182128906, + "learning_rate": 2.2320466786355476e-06, + "loss": 8.855, + "step": 24875 + }, + { + "epoch": 2.2351885098743267, + "grad_norm": 12.396289825439453, + "learning_rate": 2.234290843806104e-06, + "loss": 8.8576, + "step": 24900 + }, + { + "epoch": 2.237432675044883, + "grad_norm": 8.904563903808594, + "learning_rate": 2.236535008976661e-06, + "loss": 8.9442, + "step": 24925 + }, + { + "epoch": 2.2396768402154397, + "grad_norm": 7.610739707946777, + "learning_rate": 2.2387791741472174e-06, + "loss": 8.8273, + "step": 24950 + }, + { + "epoch": 2.2419210053859966, + "grad_norm": 7.567747592926025, + "learning_rate": 2.241023339317774e-06, + "loss": 8.8686, + "step": 24975 + }, + { + "epoch": 2.244165170556553, + "grad_norm": 7.35021448135376, + "learning_rate": 2.2432675044883306e-06, + "loss": 8.8307, + "step": 25000 + }, + { + "epoch": 2.2464093357271095, + "grad_norm": 7.4047465324401855, + "learning_rate": 2.2455116696588873e-06, + "loss": 8.8661, + "step": 25025 + }, + { + "epoch": 2.248653500897666, + "grad_norm": 7.400894641876221, + "learning_rate": 2.247755834829444e-06, + "loss": 8.7244, + "step": 25050 + }, + { + "epoch": 2.2508976660682225, + "grad_norm": 7.48524284362793, + "learning_rate": 2.25e-06, + "loss": 8.9498, + "step": 25075 + }, + { + "epoch": 2.253141831238779, + "grad_norm": 8.501863479614258, + "learning_rate": 2.2522441651705567e-06, + "loss": 8.7296, + "step": 25100 + }, + { + "epoch": 2.255385996409336, + "grad_norm": 7.253287315368652, + "learning_rate": 2.2544883303411133e-06, + "loss": 8.8225, + "step": 25125 + }, + { + "epoch": 2.2576301615798924, + "grad_norm": 7.26355504989624, + "learning_rate": 2.25673249551167e-06, + "loss": 8.7717, + "step": 25150 + }, + { + "epoch": 2.259874326750449, + "grad_norm": 7.9984211921691895, + "learning_rate": 2.2589766606822265e-06, + "loss": 8.7341, + "step": 25175 + }, + { + "epoch": 2.2621184919210053, + "grad_norm": 8.28126049041748, + "learning_rate": 2.2612208258527827e-06, + "loss": 8.7761, + "step": 25200 + }, + { + "epoch": 2.264362657091562, + "grad_norm": 7.688032627105713, + "learning_rate": 2.2634649910233393e-06, + "loss": 8.8193, + "step": 25225 + }, + { + "epoch": 2.2666068222621183, + "grad_norm": 7.558252811431885, + "learning_rate": 2.265709156193896e-06, + "loss": 8.5769, + "step": 25250 + }, + { + "epoch": 2.268850987432675, + "grad_norm": 7.616209506988525, + "learning_rate": 2.2679533213644526e-06, + "loss": 8.6565, + "step": 25275 + }, + { + "epoch": 2.2710951526032317, + "grad_norm": 7.196427345275879, + "learning_rate": 2.270197486535009e-06, + "loss": 8.7877, + "step": 25300 + }, + { + "epoch": 2.273339317773788, + "grad_norm": 7.885478973388672, + "learning_rate": 2.2724416517055658e-06, + "loss": 8.7729, + "step": 25325 + }, + { + "epoch": 2.2755834829443446, + "grad_norm": 7.20514440536499, + "learning_rate": 2.2746858168761224e-06, + "loss": 8.7239, + "step": 25350 + }, + { + "epoch": 2.277827648114901, + "grad_norm": 7.694345951080322, + "learning_rate": 2.276929982046679e-06, + "loss": 8.6629, + "step": 25375 + }, + { + "epoch": 2.280071813285458, + "grad_norm": 7.348526477813721, + "learning_rate": 2.2791741472172356e-06, + "loss": 8.7812, + "step": 25400 + }, + { + "epoch": 2.2823159784560145, + "grad_norm": 8.017631530761719, + "learning_rate": 2.281418312387792e-06, + "loss": 8.7645, + "step": 25425 + }, + { + "epoch": 2.284560143626571, + "grad_norm": 7.618819236755371, + "learning_rate": 2.2836624775583484e-06, + "loss": 8.7893, + "step": 25450 + }, + { + "epoch": 2.2868043087971275, + "grad_norm": 8.782280921936035, + "learning_rate": 2.285906642728905e-06, + "loss": 8.82, + "step": 25475 + }, + { + "epoch": 2.289048473967684, + "grad_norm": 7.871521472930908, + "learning_rate": 2.2881508078994617e-06, + "loss": 8.9025, + "step": 25500 + }, + { + "epoch": 2.2912926391382404, + "grad_norm": 7.4189772605896, + "learning_rate": 2.2903052064631957e-06, + "loss": 8.7594, + "step": 25525 + }, + { + "epoch": 2.293536804308797, + "grad_norm": 7.727238655090332, + "learning_rate": 2.2925493716337523e-06, + "loss": 8.8142, + "step": 25550 + }, + { + "epoch": 2.295780969479354, + "grad_norm": 8.022933006286621, + "learning_rate": 2.294793536804309e-06, + "loss": 8.7116, + "step": 25575 + }, + { + "epoch": 2.2980251346499103, + "grad_norm": 9.168560028076172, + "learning_rate": 2.2970377019748655e-06, + "loss": 8.7845, + "step": 25600 + }, + { + "epoch": 2.300269299820467, + "grad_norm": 8.401240348815918, + "learning_rate": 2.299281867145422e-06, + "loss": 8.7938, + "step": 25625 + }, + { + "epoch": 2.3025134649910233, + "grad_norm": 7.115867614746094, + "learning_rate": 2.3015260323159783e-06, + "loss": 8.7368, + "step": 25650 + }, + { + "epoch": 2.3047576301615798, + "grad_norm": 7.929617404937744, + "learning_rate": 2.303770197486535e-06, + "loss": 8.7669, + "step": 25675 + }, + { + "epoch": 2.3070017953321367, + "grad_norm": 8.211640357971191, + "learning_rate": 2.306014362657092e-06, + "loss": 8.7814, + "step": 25700 + }, + { + "epoch": 2.309245960502693, + "grad_norm": 7.102311134338379, + "learning_rate": 2.3082585278276486e-06, + "loss": 8.7941, + "step": 25725 + }, + { + "epoch": 2.3114901256732496, + "grad_norm": 8.46108627319336, + "learning_rate": 2.310502692998205e-06, + "loss": 8.8023, + "step": 25750 + }, + { + "epoch": 2.313734290843806, + "grad_norm": 7.594084739685059, + "learning_rate": 2.3127468581687614e-06, + "loss": 8.7973, + "step": 25775 + }, + { + "epoch": 2.3159784560143626, + "grad_norm": 7.955173492431641, + "learning_rate": 2.314991023339318e-06, + "loss": 8.8974, + "step": 25800 + }, + { + "epoch": 2.318222621184919, + "grad_norm": 8.09561824798584, + "learning_rate": 2.3172351885098746e-06, + "loss": 8.685, + "step": 25825 + }, + { + "epoch": 2.3204667863554755, + "grad_norm": 6.9278178215026855, + "learning_rate": 2.3194793536804313e-06, + "loss": 8.7322, + "step": 25850 + }, + { + "epoch": 2.3227109515260325, + "grad_norm": 7.593615531921387, + "learning_rate": 2.3217235188509874e-06, + "loss": 8.9316, + "step": 25875 + }, + { + "epoch": 2.324955116696589, + "grad_norm": 7.166286945343018, + "learning_rate": 2.323967684021544e-06, + "loss": 8.7705, + "step": 25900 + }, + { + "epoch": 2.3271992818671454, + "grad_norm": 8.285543441772461, + "learning_rate": 2.3262118491921007e-06, + "loss": 8.8277, + "step": 25925 + }, + { + "epoch": 2.329443447037702, + "grad_norm": 7.943617343902588, + "learning_rate": 2.3284560143626573e-06, + "loss": 8.8701, + "step": 25950 + }, + { + "epoch": 2.3316876122082584, + "grad_norm": 8.49682331085205, + "learning_rate": 2.330700179533214e-06, + "loss": 8.7329, + "step": 25975 + }, + { + "epoch": 2.3339317773788153, + "grad_norm": 8.894546508789062, + "learning_rate": 2.33294434470377e-06, + "loss": 8.8172, + "step": 26000 + }, + { + "epoch": 2.3361759425493718, + "grad_norm": 8.180078506469727, + "learning_rate": 2.3351885098743267e-06, + "loss": 8.6854, + "step": 26025 + }, + { + "epoch": 2.3384201077199283, + "grad_norm": 7.521679401397705, + "learning_rate": 2.3374326750448833e-06, + "loss": 8.637, + "step": 26050 + }, + { + "epoch": 2.3406642728904847, + "grad_norm": 7.810539722442627, + "learning_rate": 2.3396768402154404e-06, + "loss": 8.7591, + "step": 26075 + }, + { + "epoch": 2.342908438061041, + "grad_norm": 8.644916534423828, + "learning_rate": 2.3419210053859966e-06, + "loss": 8.9011, + "step": 26100 + }, + { + "epoch": 2.3451526032315977, + "grad_norm": 7.750823020935059, + "learning_rate": 2.344165170556553e-06, + "loss": 8.7627, + "step": 26125 + }, + { + "epoch": 2.347396768402154, + "grad_norm": 7.633558750152588, + "learning_rate": 2.3464093357271098e-06, + "loss": 8.7114, + "step": 26150 + }, + { + "epoch": 2.349640933572711, + "grad_norm": 7.837903022766113, + "learning_rate": 2.3486535008976664e-06, + "loss": 8.7302, + "step": 26175 + }, + { + "epoch": 2.3518850987432676, + "grad_norm": 8.567168235778809, + "learning_rate": 2.350897666068223e-06, + "loss": 8.7633, + "step": 26200 + }, + { + "epoch": 2.354129263913824, + "grad_norm": 7.285449028015137, + "learning_rate": 2.353141831238779e-06, + "loss": 8.6705, + "step": 26225 + }, + { + "epoch": 2.3563734290843805, + "grad_norm": 9.980071067810059, + "learning_rate": 2.355385996409336e-06, + "loss": 8.7135, + "step": 26250 + }, + { + "epoch": 2.358617594254937, + "grad_norm": 9.108396530151367, + "learning_rate": 2.3576301615798924e-06, + "loss": 8.8951, + "step": 26275 + }, + { + "epoch": 2.360861759425494, + "grad_norm": 7.207018852233887, + "learning_rate": 2.359874326750449e-06, + "loss": 8.7512, + "step": 26300 + }, + { + "epoch": 2.3631059245960504, + "grad_norm": 7.324895858764648, + "learning_rate": 2.3621184919210057e-06, + "loss": 8.8925, + "step": 26325 + }, + { + "epoch": 2.365350089766607, + "grad_norm": 7.087808609008789, + "learning_rate": 2.364362657091562e-06, + "loss": 8.606, + "step": 26350 + }, + { + "epoch": 2.3675942549371634, + "grad_norm": 8.780818939208984, + "learning_rate": 2.3666068222621185e-06, + "loss": 8.7034, + "step": 26375 + }, + { + "epoch": 2.36983842010772, + "grad_norm": 7.479258060455322, + "learning_rate": 2.368850987432675e-06, + "loss": 8.7742, + "step": 26400 + }, + { + "epoch": 2.3720825852782763, + "grad_norm": 8.018939971923828, + "learning_rate": 2.371095152603232e-06, + "loss": 8.7791, + "step": 26425 + }, + { + "epoch": 2.374326750448833, + "grad_norm": 7.280079364776611, + "learning_rate": 2.3733393177737883e-06, + "loss": 8.6998, + "step": 26450 + }, + { + "epoch": 2.3765709156193897, + "grad_norm": 8.136499404907227, + "learning_rate": 2.375583482944345e-06, + "loss": 8.6984, + "step": 26475 + }, + { + "epoch": 2.378815080789946, + "grad_norm": 8.004971504211426, + "learning_rate": 2.3778276481149016e-06, + "loss": 8.7939, + "step": 26500 + }, + { + "epoch": 2.3810592459605027, + "grad_norm": 7.429125785827637, + "learning_rate": 2.380071813285458e-06, + "loss": 8.6948, + "step": 26525 + }, + { + "epoch": 2.383303411131059, + "grad_norm": 7.33009147644043, + "learning_rate": 2.3823159784560148e-06, + "loss": 8.8223, + "step": 26550 + }, + { + "epoch": 2.3855475763016156, + "grad_norm": 7.463535785675049, + "learning_rate": 2.384560143626571e-06, + "loss": 8.7437, + "step": 26575 + }, + { + "epoch": 2.3877917414721725, + "grad_norm": 8.249847412109375, + "learning_rate": 2.3868043087971276e-06, + "loss": 8.7584, + "step": 26600 + }, + { + "epoch": 2.390035906642729, + "grad_norm": 7.530778884887695, + "learning_rate": 2.389048473967684e-06, + "loss": 8.7426, + "step": 26625 + }, + { + "epoch": 2.3922800718132855, + "grad_norm": 8.669591903686523, + "learning_rate": 2.391292639138241e-06, + "loss": 8.7287, + "step": 26650 + }, + { + "epoch": 2.394524236983842, + "grad_norm": 7.346036911010742, + "learning_rate": 2.3935368043087974e-06, + "loss": 8.8438, + "step": 26675 + }, + { + "epoch": 2.3967684021543985, + "grad_norm": 8.349026679992676, + "learning_rate": 2.3957809694793536e-06, + "loss": 8.6535, + "step": 26700 + }, + { + "epoch": 2.399012567324955, + "grad_norm": 7.508020877838135, + "learning_rate": 2.3980251346499102e-06, + "loss": 8.6569, + "step": 26725 + }, + { + "epoch": 2.401256732495512, + "grad_norm": 8.472759246826172, + "learning_rate": 2.400269299820467e-06, + "loss": 8.7648, + "step": 26750 + }, + { + "epoch": 2.4035008976660683, + "grad_norm": 7.45506477355957, + "learning_rate": 2.4025134649910235e-06, + "loss": 8.7551, + "step": 26775 + }, + { + "epoch": 2.405745062836625, + "grad_norm": 9.62768268585205, + "learning_rate": 2.40475763016158e-06, + "loss": 8.7978, + "step": 26800 + }, + { + "epoch": 2.4079892280071813, + "grad_norm": 8.320019721984863, + "learning_rate": 2.4070017953321367e-06, + "loss": 8.7357, + "step": 26825 + }, + { + "epoch": 2.4102333931777378, + "grad_norm": 9.005629539489746, + "learning_rate": 2.4092459605026933e-06, + "loss": 8.7217, + "step": 26850 + }, + { + "epoch": 2.4124775583482942, + "grad_norm": 8.3873929977417, + "learning_rate": 2.41149012567325e-06, + "loss": 8.7439, + "step": 26875 + }, + { + "epoch": 2.414721723518851, + "grad_norm": 7.470830917358398, + "learning_rate": 2.4137342908438065e-06, + "loss": 8.7946, + "step": 26900 + }, + { + "epoch": 2.4169658886894076, + "grad_norm": 8.457619667053223, + "learning_rate": 2.4158886894075406e-06, + "loss": 8.806, + "step": 26925 + }, + { + "epoch": 2.419210053859964, + "grad_norm": 7.005653381347656, + "learning_rate": 2.418132854578097e-06, + "loss": 8.7611, + "step": 26950 + }, + { + "epoch": 2.4214542190305206, + "grad_norm": 7.697506904602051, + "learning_rate": 2.4203770197486538e-06, + "loss": 8.7287, + "step": 26975 + }, + { + "epoch": 2.423698384201077, + "grad_norm": 8.318944931030273, + "learning_rate": 2.4226211849192104e-06, + "loss": 8.7416, + "step": 27000 + }, + { + "epoch": 2.4259425493716336, + "grad_norm": 7.3691534996032715, + "learning_rate": 2.4248653500897666e-06, + "loss": 8.7216, + "step": 27025 + }, + { + "epoch": 2.4281867145421905, + "grad_norm": 7.898078441619873, + "learning_rate": 2.427109515260323e-06, + "loss": 8.6635, + "step": 27050 + }, + { + "epoch": 2.430430879712747, + "grad_norm": 7.721663475036621, + "learning_rate": 2.42935368043088e-06, + "loss": 8.6098, + "step": 27075 + }, + { + "epoch": 2.4326750448833034, + "grad_norm": 8.64586353302002, + "learning_rate": 2.4315978456014364e-06, + "loss": 8.8915, + "step": 27100 + }, + { + "epoch": 2.43491921005386, + "grad_norm": 8.140568733215332, + "learning_rate": 2.433842010771993e-06, + "loss": 8.6583, + "step": 27125 + }, + { + "epoch": 2.4371633752244164, + "grad_norm": 8.600632667541504, + "learning_rate": 2.4360861759425497e-06, + "loss": 8.7622, + "step": 27150 + }, + { + "epoch": 2.4394075403949733, + "grad_norm": 7.156821250915527, + "learning_rate": 2.4383303411131063e-06, + "loss": 8.7931, + "step": 27175 + }, + { + "epoch": 2.44165170556553, + "grad_norm": 9.11503791809082, + "learning_rate": 2.440574506283663e-06, + "loss": 8.6709, + "step": 27200 + }, + { + "epoch": 2.4438958707360863, + "grad_norm": 8.274863243103027, + "learning_rate": 2.4428186714542195e-06, + "loss": 8.5685, + "step": 27225 + }, + { + "epoch": 2.4461400359066428, + "grad_norm": 7.280328750610352, + "learning_rate": 2.4450628366247757e-06, + "loss": 8.696, + "step": 27250 + }, + { + "epoch": 2.4483842010771992, + "grad_norm": 8.908411026000977, + "learning_rate": 2.4473070017953323e-06, + "loss": 8.6888, + "step": 27275 + }, + { + "epoch": 2.4506283662477557, + "grad_norm": 7.128604888916016, + "learning_rate": 2.449551166965889e-06, + "loss": 8.8328, + "step": 27300 + }, + { + "epoch": 2.452872531418312, + "grad_norm": 7.357241153717041, + "learning_rate": 2.4517953321364456e-06, + "loss": 8.7651, + "step": 27325 + }, + { + "epoch": 2.455116696588869, + "grad_norm": 7.760249137878418, + "learning_rate": 2.454039497307002e-06, + "loss": 8.6849, + "step": 27350 + }, + { + "epoch": 2.4573608617594256, + "grad_norm": 7.426972389221191, + "learning_rate": 2.4562836624775584e-06, + "loss": 8.749, + "step": 27375 + }, + { + "epoch": 2.459605026929982, + "grad_norm": 7.388211250305176, + "learning_rate": 2.458527827648115e-06, + "loss": 8.7413, + "step": 27400 + }, + { + "epoch": 2.4618491921005385, + "grad_norm": 7.254780292510986, + "learning_rate": 2.4607719928186716e-06, + "loss": 8.5119, + "step": 27425 + }, + { + "epoch": 2.464093357271095, + "grad_norm": 8.710436820983887, + "learning_rate": 2.463016157989228e-06, + "loss": 8.7262, + "step": 27450 + }, + { + "epoch": 2.466337522441652, + "grad_norm": 7.722415447235107, + "learning_rate": 2.465260323159785e-06, + "loss": 8.7216, + "step": 27475 + }, + { + "epoch": 2.4685816876122084, + "grad_norm": 7.141268730163574, + "learning_rate": 2.4675044883303414e-06, + "loss": 8.7211, + "step": 27500 + }, + { + "epoch": 2.470825852782765, + "grad_norm": 8.538675308227539, + "learning_rate": 2.469748653500898e-06, + "loss": 8.8226, + "step": 27525 + }, + { + "epoch": 2.4730700179533214, + "grad_norm": 7.637808799743652, + "learning_rate": 2.4719928186714547e-06, + "loss": 8.8038, + "step": 27550 + }, + { + "epoch": 2.475314183123878, + "grad_norm": 8.500548362731934, + "learning_rate": 2.474236983842011e-06, + "loss": 8.7195, + "step": 27575 + }, + { + "epoch": 2.4775583482944343, + "grad_norm": 7.445677757263184, + "learning_rate": 2.4764811490125675e-06, + "loss": 8.6764, + "step": 27600 + }, + { + "epoch": 2.479802513464991, + "grad_norm": 7.514459609985352, + "learning_rate": 2.478725314183124e-06, + "loss": 8.9272, + "step": 27625 + }, + { + "epoch": 2.4820466786355477, + "grad_norm": 7.744480133056641, + "learning_rate": 2.4809694793536807e-06, + "loss": 8.6633, + "step": 27650 + }, + { + "epoch": 2.484290843806104, + "grad_norm": 8.13674259185791, + "learning_rate": 2.4832136445242373e-06, + "loss": 8.7194, + "step": 27675 + }, + { + "epoch": 2.4865350089766607, + "grad_norm": 8.998900413513184, + "learning_rate": 2.485457809694794e-06, + "loss": 8.768, + "step": 27700 + }, + { + "epoch": 2.488779174147217, + "grad_norm": 8.522173881530762, + "learning_rate": 2.48770197486535e-06, + "loss": 8.5635, + "step": 27725 + }, + { + "epoch": 2.4910233393177736, + "grad_norm": 8.991546630859375, + "learning_rate": 2.4899461400359067e-06, + "loss": 8.7454, + "step": 27750 + }, + { + "epoch": 2.4932675044883306, + "grad_norm": 7.261856555938721, + "learning_rate": 2.4921903052064634e-06, + "loss": 8.7791, + "step": 27775 + }, + { + "epoch": 2.495511669658887, + "grad_norm": 7.915175437927246, + "learning_rate": 2.49443447037702e-06, + "loss": 8.6962, + "step": 27800 + }, + { + "epoch": 2.4977558348294435, + "grad_norm": 9.081327438354492, + "learning_rate": 2.4966786355475766e-06, + "loss": 8.6789, + "step": 27825 + }, + { + "epoch": 2.5, + "grad_norm": 8.06051254272461, + "learning_rate": 2.4989228007181328e-06, + "loss": 8.7663, + "step": 27850 + }, + { + "epoch": 2.5022441651705565, + "grad_norm": 9.632922172546387, + "learning_rate": 2.5011669658886894e-06, + "loss": 8.6413, + "step": 27875 + }, + { + "epoch": 2.504488330341113, + "grad_norm": 8.354493141174316, + "learning_rate": 2.5034111310592464e-06, + "loss": 8.8378, + "step": 27900 + }, + { + "epoch": 2.5067324955116694, + "grad_norm": 9.47565746307373, + "learning_rate": 2.5056552962298026e-06, + "loss": 8.7008, + "step": 27925 + }, + { + "epoch": 2.5089766606822264, + "grad_norm": 8.866034507751465, + "learning_rate": 2.5078994614003592e-06, + "loss": 8.7112, + "step": 27950 + }, + { + "epoch": 2.511220825852783, + "grad_norm": 8.529930114746094, + "learning_rate": 2.5101436265709154e-06, + "loss": 8.6432, + "step": 27975 + }, + { + "epoch": 2.5134649910233393, + "grad_norm": 8.826647758483887, + "learning_rate": 2.5123877917414725e-06, + "loss": 8.6073, + "step": 28000 + }, + { + "epoch": 2.515709156193896, + "grad_norm": 7.719997882843018, + "learning_rate": 2.5146319569120287e-06, + "loss": 8.6901, + "step": 28025 + }, + { + "epoch": 2.5179533213644523, + "grad_norm": 8.184229850769043, + "learning_rate": 2.5168761220825853e-06, + "loss": 8.6896, + "step": 28050 + }, + { + "epoch": 2.520197486535009, + "grad_norm": 7.836431503295898, + "learning_rate": 2.5191202872531423e-06, + "loss": 8.7593, + "step": 28075 + }, + { + "epoch": 2.5224416517055657, + "grad_norm": 7.092922687530518, + "learning_rate": 2.5213644524236985e-06, + "loss": 8.6878, + "step": 28100 + }, + { + "epoch": 2.524685816876122, + "grad_norm": 7.48595666885376, + "learning_rate": 2.5236086175942555e-06, + "loss": 8.873, + "step": 28125 + }, + { + "epoch": 2.5269299820466786, + "grad_norm": 7.716548442840576, + "learning_rate": 2.5258527827648117e-06, + "loss": 8.678, + "step": 28150 + }, + { + "epoch": 2.529174147217235, + "grad_norm": 7.648696422576904, + "learning_rate": 2.5280969479353683e-06, + "loss": 8.7765, + "step": 28175 + }, + { + "epoch": 2.5314183123877916, + "grad_norm": 7.753777027130127, + "learning_rate": 2.5303411131059245e-06, + "loss": 8.7449, + "step": 28200 + }, + { + "epoch": 2.533662477558348, + "grad_norm": 7.347560882568359, + "learning_rate": 2.5325852782764816e-06, + "loss": 8.8258, + "step": 28225 + }, + { + "epoch": 2.535906642728905, + "grad_norm": 27.615747451782227, + "learning_rate": 2.5348294434470378e-06, + "loss": 8.8039, + "step": 28250 + }, + { + "epoch": 2.5381508078994615, + "grad_norm": 7.081418037414551, + "learning_rate": 2.5370736086175944e-06, + "loss": 8.6808, + "step": 28275 + }, + { + "epoch": 2.540394973070018, + "grad_norm": 9.154043197631836, + "learning_rate": 2.539317773788151e-06, + "loss": 8.52, + "step": 28300 + }, + { + "epoch": 2.5426391382405744, + "grad_norm": 8.552950859069824, + "learning_rate": 2.5415619389587076e-06, + "loss": 8.7724, + "step": 28325 + }, + { + "epoch": 2.5448833034111313, + "grad_norm": 7.476393699645996, + "learning_rate": 2.543806104129264e-06, + "loss": 8.6956, + "step": 28350 + }, + { + "epoch": 2.547127468581688, + "grad_norm": 7.98252534866333, + "learning_rate": 2.546050269299821e-06, + "loss": 8.6807, + "step": 28375 + }, + { + "epoch": 2.5493716337522443, + "grad_norm": 7.945890426635742, + "learning_rate": 2.5482944344703775e-06, + "loss": 8.7443, + "step": 28400 + }, + { + "epoch": 2.5516157989228008, + "grad_norm": 7.618307590484619, + "learning_rate": 2.5505385996409337e-06, + "loss": 8.6579, + "step": 28425 + }, + { + "epoch": 2.5538599640933572, + "grad_norm": 9.692346572875977, + "learning_rate": 2.5527827648114907e-06, + "loss": 8.6921, + "step": 28450 + }, + { + "epoch": 2.5561041292639137, + "grad_norm": 7.421929359436035, + "learning_rate": 2.555026929982047e-06, + "loss": 8.7355, + "step": 28475 + }, + { + "epoch": 2.55834829443447, + "grad_norm": 8.225549697875977, + "learning_rate": 2.5572710951526035e-06, + "loss": 8.5838, + "step": 28500 + }, + { + "epoch": 2.5605924596050267, + "grad_norm": 8.396663665771484, + "learning_rate": 2.5595152603231597e-06, + "loss": 8.6045, + "step": 28525 + }, + { + "epoch": 2.5628366247755836, + "grad_norm": 8.157426834106445, + "learning_rate": 2.5617594254937167e-06, + "loss": 8.7819, + "step": 28550 + }, + { + "epoch": 2.56508078994614, + "grad_norm": 7.967459678649902, + "learning_rate": 2.564003590664273e-06, + "loss": 8.7219, + "step": 28575 + }, + { + "epoch": 2.5673249551166966, + "grad_norm": 9.586108207702637, + "learning_rate": 2.56624775583483e-06, + "loss": 8.7125, + "step": 28600 + }, + { + "epoch": 2.569569120287253, + "grad_norm": 8.386338233947754, + "learning_rate": 2.568491921005386e-06, + "loss": 8.699, + "step": 28625 + }, + { + "epoch": 2.57181328545781, + "grad_norm": 7.507050037384033, + "learning_rate": 2.5707360861759428e-06, + "loss": 8.7859, + "step": 28650 + }, + { + "epoch": 2.5740574506283664, + "grad_norm": 7.522095680236816, + "learning_rate": 2.572980251346499e-06, + "loss": 8.6668, + "step": 28675 + }, + { + "epoch": 2.576301615798923, + "grad_norm": 7.599160671234131, + "learning_rate": 2.575224416517056e-06, + "loss": 8.7075, + "step": 28700 + }, + { + "epoch": 2.5785457809694794, + "grad_norm": 8.238758087158203, + "learning_rate": 2.577468581687612e-06, + "loss": 8.7445, + "step": 28725 + }, + { + "epoch": 2.580789946140036, + "grad_norm": 8.125377655029297, + "learning_rate": 2.579712746858169e-06, + "loss": 8.7493, + "step": 28750 + }, + { + "epoch": 2.5830341113105924, + "grad_norm": 11.882285118103027, + "learning_rate": 2.581956912028726e-06, + "loss": 8.6632, + "step": 28775 + }, + { + "epoch": 2.585278276481149, + "grad_norm": 7.676426887512207, + "learning_rate": 2.584201077199282e-06, + "loss": 8.6234, + "step": 28800 + }, + { + "epoch": 2.5875224416517053, + "grad_norm": 8.387561798095703, + "learning_rate": 2.5864452423698386e-06, + "loss": 8.6569, + "step": 28825 + }, + { + "epoch": 2.5897666068222622, + "grad_norm": 8.178343772888184, + "learning_rate": 2.5886894075403953e-06, + "loss": 8.6942, + "step": 28850 + }, + { + "epoch": 2.5920107719928187, + "grad_norm": 7.991466522216797, + "learning_rate": 2.590933572710952e-06, + "loss": 8.6398, + "step": 28875 + }, + { + "epoch": 2.594254937163375, + "grad_norm": 8.077037811279297, + "learning_rate": 2.593177737881508e-06, + "loss": 8.7689, + "step": 28900 + }, + { + "epoch": 2.5964991023339317, + "grad_norm": 7.732774257659912, + "learning_rate": 2.595421903052065e-06, + "loss": 8.5936, + "step": 28925 + }, + { + "epoch": 2.5987432675044886, + "grad_norm": 7.425034999847412, + "learning_rate": 2.5976660682226213e-06, + "loss": 8.7483, + "step": 28950 + }, + { + "epoch": 2.600987432675045, + "grad_norm": 8.172046661376953, + "learning_rate": 2.599910233393178e-06, + "loss": 8.7509, + "step": 28975 + }, + { + "epoch": 2.6032315978456015, + "grad_norm": 8.61007308959961, + "learning_rate": 2.602154398563734e-06, + "loss": 8.7169, + "step": 29000 + }, + { + "epoch": 2.605475763016158, + "grad_norm": 10.6384916305542, + "learning_rate": 2.604398563734291e-06, + "loss": 8.4811, + "step": 29025 + }, + { + "epoch": 2.6077199281867145, + "grad_norm": 8.137349128723145, + "learning_rate": 2.6066427289048473e-06, + "loss": 8.6486, + "step": 29050 + }, + { + "epoch": 2.609964093357271, + "grad_norm": 8.939285278320312, + "learning_rate": 2.6088868940754044e-06, + "loss": 8.7186, + "step": 29075 + }, + { + "epoch": 2.6122082585278275, + "grad_norm": 7.770234107971191, + "learning_rate": 2.6111310592459606e-06, + "loss": 8.6302, + "step": 29100 + }, + { + "epoch": 2.614452423698384, + "grad_norm": 7.316786289215088, + "learning_rate": 2.613375224416517e-06, + "loss": 8.6091, + "step": 29125 + }, + { + "epoch": 2.616696588868941, + "grad_norm": 12.148127555847168, + "learning_rate": 2.6156193895870742e-06, + "loss": 8.8377, + "step": 29150 + }, + { + "epoch": 2.6189407540394973, + "grad_norm": 8.556102752685547, + "learning_rate": 2.6178635547576304e-06, + "loss": 8.7811, + "step": 29175 + }, + { + "epoch": 2.621184919210054, + "grad_norm": 7.676416397094727, + "learning_rate": 2.620107719928187e-06, + "loss": 8.7366, + "step": 29200 + }, + { + "epoch": 2.6234290843806103, + "grad_norm": 8.648096084594727, + "learning_rate": 2.6223518850987432e-06, + "loss": 8.6169, + "step": 29225 + }, + { + "epoch": 2.625673249551167, + "grad_norm": 8.081331253051758, + "learning_rate": 2.6245960502693003e-06, + "loss": 8.5938, + "step": 29250 + }, + { + "epoch": 2.6279174147217237, + "grad_norm": 8.572725296020508, + "learning_rate": 2.6268402154398564e-06, + "loss": 8.6765, + "step": 29275 + }, + { + "epoch": 2.63016157989228, + "grad_norm": 7.819840908050537, + "learning_rate": 2.629084380610413e-06, + "loss": 8.6199, + "step": 29300 + }, + { + "epoch": 2.6324057450628366, + "grad_norm": 7.736175060272217, + "learning_rate": 2.631238779174147e-06, + "loss": 8.6553, + "step": 29325 + }, + { + "epoch": 2.634649910233393, + "grad_norm": 7.863946914672852, + "learning_rate": 2.633482944344704e-06, + "loss": 8.8194, + "step": 29350 + }, + { + "epoch": 2.6368940754039496, + "grad_norm": 7.36769437789917, + "learning_rate": 2.6357271095152603e-06, + "loss": 8.7629, + "step": 29375 + }, + { + "epoch": 2.639138240574506, + "grad_norm": 7.860691070556641, + "learning_rate": 2.6379712746858173e-06, + "loss": 8.7081, + "step": 29400 + }, + { + "epoch": 2.641382405745063, + "grad_norm": 8.649950981140137, + "learning_rate": 2.6402154398563735e-06, + "loss": 8.7352, + "step": 29425 + }, + { + "epoch": 2.6436265709156195, + "grad_norm": 7.941035747528076, + "learning_rate": 2.64245960502693e-06, + "loss": 8.7031, + "step": 29450 + }, + { + "epoch": 2.645870736086176, + "grad_norm": 7.959293365478516, + "learning_rate": 2.6447037701974863e-06, + "loss": 8.7212, + "step": 29475 + }, + { + "epoch": 2.6481149012567324, + "grad_norm": 8.072748184204102, + "learning_rate": 2.6469479353680434e-06, + "loss": 8.4774, + "step": 29500 + }, + { + "epoch": 2.650359066427289, + "grad_norm": 8.879371643066406, + "learning_rate": 2.6491921005386e-06, + "loss": 8.5878, + "step": 29525 + }, + { + "epoch": 2.652603231597846, + "grad_norm": 8.610868453979492, + "learning_rate": 2.651436265709156e-06, + "loss": 8.5763, + "step": 29550 + }, + { + "epoch": 2.6548473967684023, + "grad_norm": 9.243968963623047, + "learning_rate": 2.6536804308797132e-06, + "loss": 8.6015, + "step": 29575 + }, + { + "epoch": 2.657091561938959, + "grad_norm": 7.583789348602295, + "learning_rate": 2.6559245960502694e-06, + "loss": 8.7164, + "step": 29600 + }, + { + "epoch": 2.6593357271095153, + "grad_norm": 8.68832778930664, + "learning_rate": 2.658168761220826e-06, + "loss": 8.6381, + "step": 29625 + }, + { + "epoch": 2.6615798922800717, + "grad_norm": 9.179740905761719, + "learning_rate": 2.6604129263913826e-06, + "loss": 8.6787, + "step": 29650 + }, + { + "epoch": 2.6638240574506282, + "grad_norm": 8.402155876159668, + "learning_rate": 2.6626570915619393e-06, + "loss": 8.7052, + "step": 29675 + }, + { + "epoch": 2.6660682226211847, + "grad_norm": 8.042049407958984, + "learning_rate": 2.6649012567324955e-06, + "loss": 8.6915, + "step": 29700 + }, + { + "epoch": 2.6683123877917416, + "grad_norm": 9.408376693725586, + "learning_rate": 2.6671454219030525e-06, + "loss": 8.7536, + "step": 29725 + }, + { + "epoch": 2.670556552962298, + "grad_norm": 9.651875495910645, + "learning_rate": 2.6693895870736087e-06, + "loss": 8.7313, + "step": 29750 + }, + { + "epoch": 2.6728007181328546, + "grad_norm": 8.16729736328125, + "learning_rate": 2.6716337522441653e-06, + "loss": 8.713, + "step": 29775 + }, + { + "epoch": 2.675044883303411, + "grad_norm": 7.944380283355713, + "learning_rate": 2.6738779174147215e-06, + "loss": 8.7183, + "step": 29800 + }, + { + "epoch": 2.6772890484739675, + "grad_norm": 9.081042289733887, + "learning_rate": 2.6761220825852785e-06, + "loss": 8.7412, + "step": 29825 + }, + { + "epoch": 2.6795332136445245, + "grad_norm": 8.5193452835083, + "learning_rate": 2.678366247755835e-06, + "loss": 8.6991, + "step": 29850 + }, + { + "epoch": 2.681777378815081, + "grad_norm": 7.194787502288818, + "learning_rate": 2.6806104129263918e-06, + "loss": 8.7298, + "step": 29875 + }, + { + "epoch": 2.6840215439856374, + "grad_norm": 8.94596004486084, + "learning_rate": 2.6828545780969484e-06, + "loss": 8.7207, + "step": 29900 + }, + { + "epoch": 2.686265709156194, + "grad_norm": 8.24487590789795, + "learning_rate": 2.6850987432675046e-06, + "loss": 8.7302, + "step": 29925 + }, + { + "epoch": 2.6885098743267504, + "grad_norm": 8.492156982421875, + "learning_rate": 2.6873429084380616e-06, + "loss": 8.8661, + "step": 29950 + }, + { + "epoch": 2.690754039497307, + "grad_norm": 7.721090793609619, + "learning_rate": 2.689587073608618e-06, + "loss": 8.7595, + "step": 29975 + }, + { + "epoch": 2.6929982046678633, + "grad_norm": 7.786222457885742, + "learning_rate": 2.6918312387791744e-06, + "loss": 8.6681, + "step": 30000 + }, + { + "epoch": 2.6952423698384202, + "grad_norm": 9.98211669921875, + "learning_rate": 2.6940754039497306e-06, + "loss": 8.6657, + "step": 30025 + }, + { + "epoch": 2.6974865350089767, + "grad_norm": 7.634110927581787, + "learning_rate": 2.6963195691202876e-06, + "loss": 8.5742, + "step": 30050 + }, + { + "epoch": 2.699730700179533, + "grad_norm": 8.64838695526123, + "learning_rate": 2.698563734290844e-06, + "loss": 8.5088, + "step": 30075 + }, + { + "epoch": 2.7019748653500897, + "grad_norm": 8.977266311645508, + "learning_rate": 2.7008078994614004e-06, + "loss": 8.7107, + "step": 30100 + }, + { + "epoch": 2.704219030520646, + "grad_norm": 8.371148109436035, + "learning_rate": 2.703052064631957e-06, + "loss": 8.6729, + "step": 30125 + }, + { + "epoch": 2.706463195691203, + "grad_norm": 8.685636520385742, + "learning_rate": 2.7052962298025137e-06, + "loss": 8.5476, + "step": 30150 + }, + { + "epoch": 2.7087073608617596, + "grad_norm": 7.718595027923584, + "learning_rate": 2.70754039497307e-06, + "loss": 8.6851, + "step": 30175 + }, + { + "epoch": 2.710951526032316, + "grad_norm": 7.752781391143799, + "learning_rate": 2.709784560143627e-06, + "loss": 8.6136, + "step": 30200 + }, + { + "epoch": 2.7131956912028725, + "grad_norm": 9.167036056518555, + "learning_rate": 2.7120287253141835e-06, + "loss": 8.5481, + "step": 30225 + }, + { + "epoch": 2.715439856373429, + "grad_norm": 7.502840042114258, + "learning_rate": 2.7142728904847397e-06, + "loss": 8.6712, + "step": 30250 + }, + { + "epoch": 2.7176840215439855, + "grad_norm": 7.813225746154785, + "learning_rate": 2.7165170556552968e-06, + "loss": 8.6816, + "step": 30275 + }, + { + "epoch": 2.719928186714542, + "grad_norm": 8.870896339416504, + "learning_rate": 2.718761220825853e-06, + "loss": 8.7509, + "step": 30300 + }, + { + "epoch": 2.722172351885099, + "grad_norm": 8.533783912658691, + "learning_rate": 2.7210053859964096e-06, + "loss": 8.5163, + "step": 30325 + }, + { + "epoch": 2.7244165170556554, + "grad_norm": 7.305062294006348, + "learning_rate": 2.723249551166966e-06, + "loss": 8.7926, + "step": 30350 + }, + { + "epoch": 2.726660682226212, + "grad_norm": 7.588680267333984, + "learning_rate": 2.725493716337523e-06, + "loss": 8.7785, + "step": 30375 + }, + { + "epoch": 2.7289048473967683, + "grad_norm": 7.341064929962158, + "learning_rate": 2.727737881508079e-06, + "loss": 8.5815, + "step": 30400 + }, + { + "epoch": 2.7311490125673252, + "grad_norm": 9.373249053955078, + "learning_rate": 2.729982046678636e-06, + "loss": 8.6605, + "step": 30425 + }, + { + "epoch": 2.7333931777378817, + "grad_norm": 7.275130271911621, + "learning_rate": 2.7322262118491922e-06, + "loss": 8.7166, + "step": 30450 + }, + { + "epoch": 2.735637342908438, + "grad_norm": 8.94101619720459, + "learning_rate": 2.734470377019749e-06, + "loss": 8.6992, + "step": 30475 + }, + { + "epoch": 2.7378815080789947, + "grad_norm": 7.141937255859375, + "learning_rate": 2.736714542190305e-06, + "loss": 8.6336, + "step": 30500 + }, + { + "epoch": 2.740125673249551, + "grad_norm": 8.091525077819824, + "learning_rate": 2.738958707360862e-06, + "loss": 8.6129, + "step": 30525 + }, + { + "epoch": 2.7423698384201076, + "grad_norm": 8.85395622253418, + "learning_rate": 2.7412028725314187e-06, + "loss": 8.8234, + "step": 30550 + }, + { + "epoch": 2.744614003590664, + "grad_norm": 8.214533805847168, + "learning_rate": 2.743447037701975e-06, + "loss": 8.6889, + "step": 30575 + }, + { + "epoch": 2.7468581687612206, + "grad_norm": 11.49850845336914, + "learning_rate": 2.745691202872532e-06, + "loss": 8.6315, + "step": 30600 + }, + { + "epoch": 2.7491023339317775, + "grad_norm": 8.987617492675781, + "learning_rate": 2.747935368043088e-06, + "loss": 8.5921, + "step": 30625 + }, + { + "epoch": 2.751346499102334, + "grad_norm": 7.994260311126709, + "learning_rate": 2.750179533213645e-06, + "loss": 8.6589, + "step": 30650 + }, + { + "epoch": 2.7535906642728905, + "grad_norm": 9.67781925201416, + "learning_rate": 2.7524236983842013e-06, + "loss": 8.6851, + "step": 30675 + }, + { + "epoch": 2.755834829443447, + "grad_norm": 8.975408554077148, + "learning_rate": 2.754667863554758e-06, + "loss": 8.6611, + "step": 30700 + }, + { + "epoch": 2.758078994614004, + "grad_norm": 9.970959663391113, + "learning_rate": 2.756912028725314e-06, + "loss": 8.524, + "step": 30725 + }, + { + "epoch": 2.7603231597845603, + "grad_norm": 8.593131065368652, + "learning_rate": 2.759156193895871e-06, + "loss": 8.7454, + "step": 30750 + }, + { + "epoch": 2.762567324955117, + "grad_norm": 7.033371925354004, + "learning_rate": 2.7614003590664274e-06, + "loss": 8.6882, + "step": 30775 + }, + { + "epoch": 2.7648114901256733, + "grad_norm": 7.36411190032959, + "learning_rate": 2.763644524236984e-06, + "loss": 8.6535, + "step": 30800 + }, + { + "epoch": 2.7670556552962298, + "grad_norm": 10.95535659790039, + "learning_rate": 2.7658886894075406e-06, + "loss": 8.6971, + "step": 30825 + }, + { + "epoch": 2.7692998204667862, + "grad_norm": 7.885133266448975, + "learning_rate": 2.768132854578097e-06, + "loss": 8.6803, + "step": 30850 + }, + { + "epoch": 2.7715439856373427, + "grad_norm": 8.30626392364502, + "learning_rate": 2.7703770197486534e-06, + "loss": 8.7953, + "step": 30875 + }, + { + "epoch": 2.773788150807899, + "grad_norm": 8.037935256958008, + "learning_rate": 2.7726211849192104e-06, + "loss": 8.7517, + "step": 30900 + }, + { + "epoch": 2.776032315978456, + "grad_norm": 8.895528793334961, + "learning_rate": 2.774865350089767e-06, + "loss": 8.6232, + "step": 30925 + }, + { + "epoch": 2.7782764811490126, + "grad_norm": 8.01756477355957, + "learning_rate": 2.7771095152603232e-06, + "loss": 8.7031, + "step": 30950 + }, + { + "epoch": 2.780520646319569, + "grad_norm": 8.951826095581055, + "learning_rate": 2.7793536804308803e-06, + "loss": 8.7036, + "step": 30975 + }, + { + "epoch": 2.7827648114901256, + "grad_norm": 8.34976863861084, + "learning_rate": 2.7815978456014365e-06, + "loss": 8.5766, + "step": 31000 + }, + { + "epoch": 2.7850089766606825, + "grad_norm": 8.486808776855469, + "learning_rate": 2.783842010771993e-06, + "loss": 8.6439, + "step": 31025 + }, + { + "epoch": 2.787253141831239, + "grad_norm": 7.637495040893555, + "learning_rate": 2.7860861759425493e-06, + "loss": 8.7347, + "step": 31050 + }, + { + "epoch": 2.7894973070017954, + "grad_norm": 7.570237159729004, + "learning_rate": 2.7883303411131063e-06, + "loss": 8.6828, + "step": 31075 + }, + { + "epoch": 2.791741472172352, + "grad_norm": 7.560822010040283, + "learning_rate": 2.7905745062836625e-06, + "loss": 8.6941, + "step": 31100 + }, + { + "epoch": 2.7939856373429084, + "grad_norm": 7.767632007598877, + "learning_rate": 2.7928186714542196e-06, + "loss": 8.5953, + "step": 31125 + }, + { + "epoch": 2.796229802513465, + "grad_norm": 9.030695915222168, + "learning_rate": 2.7950628366247757e-06, + "loss": 8.6476, + "step": 31150 + }, + { + "epoch": 2.7984739676840213, + "grad_norm": 9.871515274047852, + "learning_rate": 2.7973070017953324e-06, + "loss": 8.8304, + "step": 31175 + }, + { + "epoch": 2.800718132854578, + "grad_norm": 8.194798469543457, + "learning_rate": 2.7995511669658886e-06, + "loss": 8.7818, + "step": 31200 + }, + { + "epoch": 2.8029622980251347, + "grad_norm": 7.809013843536377, + "learning_rate": 2.8017953321364456e-06, + "loss": 8.6354, + "step": 31225 + }, + { + "epoch": 2.8052064631956912, + "grad_norm": 9.581247329711914, + "learning_rate": 2.8040394973070018e-06, + "loss": 8.6756, + "step": 31250 + }, + { + "epoch": 2.8074506283662477, + "grad_norm": 7.447369575500488, + "learning_rate": 2.8062836624775584e-06, + "loss": 8.7057, + "step": 31275 + }, + { + "epoch": 2.809694793536804, + "grad_norm": 8.269529342651367, + "learning_rate": 2.8085278276481154e-06, + "loss": 8.7135, + "step": 31300 + }, + { + "epoch": 2.811938958707361, + "grad_norm": 9.228303909301758, + "learning_rate": 2.8107719928186716e-06, + "loss": 8.7436, + "step": 31325 + }, + { + "epoch": 2.8141831238779176, + "grad_norm": 9.966009140014648, + "learning_rate": 2.8130161579892282e-06, + "loss": 8.7345, + "step": 31350 + }, + { + "epoch": 2.816427289048474, + "grad_norm": 9.222103118896484, + "learning_rate": 2.815260323159785e-06, + "loss": 8.6647, + "step": 31375 + }, + { + "epoch": 2.8186714542190305, + "grad_norm": 7.900032997131348, + "learning_rate": 2.8175044883303415e-06, + "loss": 8.6304, + "step": 31400 + }, + { + "epoch": 2.820915619389587, + "grad_norm": 9.372703552246094, + "learning_rate": 2.8197486535008977e-06, + "loss": 8.7316, + "step": 31425 + }, + { + "epoch": 2.8231597845601435, + "grad_norm": 7.537347316741943, + "learning_rate": 2.8219928186714547e-06, + "loss": 8.6336, + "step": 31450 + }, + { + "epoch": 2.8254039497307, + "grad_norm": 7.82981014251709, + "learning_rate": 2.824236983842011e-06, + "loss": 8.5097, + "step": 31475 + }, + { + "epoch": 2.827648114901257, + "grad_norm": 8.193095207214355, + "learning_rate": 2.8264811490125675e-06, + "loss": 8.6826, + "step": 31500 + }, + { + "epoch": 2.8298922800718134, + "grad_norm": 7.850029468536377, + "learning_rate": 2.8287253141831237e-06, + "loss": 8.8174, + "step": 31525 + }, + { + "epoch": 2.83213644524237, + "grad_norm": 9.90671157836914, + "learning_rate": 2.8309694793536807e-06, + "loss": 8.6295, + "step": 31550 + }, + { + "epoch": 2.8343806104129263, + "grad_norm": 8.134076118469238, + "learning_rate": 2.8331238779174148e-06, + "loss": 8.6132, + "step": 31575 + }, + { + "epoch": 2.836624775583483, + "grad_norm": 9.088194847106934, + "learning_rate": 2.8353680430879714e-06, + "loss": 8.7699, + "step": 31600 + }, + { + "epoch": 2.8388689407540397, + "grad_norm": 8.434287071228027, + "learning_rate": 2.837612208258528e-06, + "loss": 8.7173, + "step": 31625 + }, + { + "epoch": 2.841113105924596, + "grad_norm": 7.0966267585754395, + "learning_rate": 2.8398563734290846e-06, + "loss": 8.6096, + "step": 31650 + }, + { + "epoch": 2.8433572710951527, + "grad_norm": 9.984721183776855, + "learning_rate": 2.842100538599641e-06, + "loss": 8.6086, + "step": 31675 + }, + { + "epoch": 2.845601436265709, + "grad_norm": 8.210875511169434, + "learning_rate": 2.844344703770198e-06, + "loss": 8.5808, + "step": 31700 + }, + { + "epoch": 2.8478456014362656, + "grad_norm": 9.669321060180664, + "learning_rate": 2.8465888689407544e-06, + "loss": 8.5504, + "step": 31725 + }, + { + "epoch": 2.850089766606822, + "grad_norm": 8.067395210266113, + "learning_rate": 2.8488330341113106e-06, + "loss": 8.7382, + "step": 31750 + }, + { + "epoch": 2.8523339317773786, + "grad_norm": 7.837053298950195, + "learning_rate": 2.8510771992818677e-06, + "loss": 8.7332, + "step": 31775 + }, + { + "epoch": 2.8545780969479355, + "grad_norm": 8.338728904724121, + "learning_rate": 2.853321364452424e-06, + "loss": 8.6203, + "step": 31800 + }, + { + "epoch": 2.856822262118492, + "grad_norm": 8.220911979675293, + "learning_rate": 2.8555655296229805e-06, + "loss": 8.5685, + "step": 31825 + }, + { + "epoch": 2.8590664272890485, + "grad_norm": 10.284891128540039, + "learning_rate": 2.8578096947935367e-06, + "loss": 8.5991, + "step": 31850 + }, + { + "epoch": 2.861310592459605, + "grad_norm": 13.880992889404297, + "learning_rate": 2.8600538599640937e-06, + "loss": 8.6956, + "step": 31875 + }, + { + "epoch": 2.8635547576301614, + "grad_norm": 7.42999792098999, + "learning_rate": 2.86229802513465e-06, + "loss": 8.6447, + "step": 31900 + }, + { + "epoch": 2.8657989228007184, + "grad_norm": 8.93120288848877, + "learning_rate": 2.864542190305207e-06, + "loss": 8.6042, + "step": 31925 + }, + { + "epoch": 2.868043087971275, + "grad_norm": 8.040596008300781, + "learning_rate": 2.866786355475763e-06, + "loss": 8.5775, + "step": 31950 + }, + { + "epoch": 2.8702872531418313, + "grad_norm": 8.394110679626465, + "learning_rate": 2.8690305206463197e-06, + "loss": 8.7284, + "step": 31975 + }, + { + "epoch": 2.872531418312388, + "grad_norm": 8.029304504394531, + "learning_rate": 2.8712746858168768e-06, + "loss": 8.6238, + "step": 32000 + }, + { + "epoch": 2.8747755834829443, + "grad_norm": 9.0026273727417, + "learning_rate": 2.873518850987433e-06, + "loss": 8.7403, + "step": 32025 + }, + { + "epoch": 2.8770197486535007, + "grad_norm": 7.601195335388184, + "learning_rate": 2.8757630161579896e-06, + "loss": 8.6544, + "step": 32050 + }, + { + "epoch": 2.879263913824057, + "grad_norm": 9.080717086791992, + "learning_rate": 2.8780071813285458e-06, + "loss": 8.587, + "step": 32075 + }, + { + "epoch": 2.881508078994614, + "grad_norm": 8.269723892211914, + "learning_rate": 2.880251346499103e-06, + "loss": 8.6286, + "step": 32100 + }, + { + "epoch": 2.8837522441651706, + "grad_norm": 7.488866329193115, + "learning_rate": 2.882495511669659e-06, + "loss": 8.6246, + "step": 32125 + }, + { + "epoch": 2.885996409335727, + "grad_norm": 7.298510551452637, + "learning_rate": 2.8847396768402156e-06, + "loss": 8.5167, + "step": 32150 + }, + { + "epoch": 2.8882405745062836, + "grad_norm": 9.586936950683594, + "learning_rate": 2.8869838420107722e-06, + "loss": 8.6564, + "step": 32175 + }, + { + "epoch": 2.89048473967684, + "grad_norm": 6.955339431762695, + "learning_rate": 2.889228007181329e-06, + "loss": 8.5331, + "step": 32200 + }, + { + "epoch": 2.892728904847397, + "grad_norm": 8.170775413513184, + "learning_rate": 2.891472172351885e-06, + "loss": 8.7129, + "step": 32225 + }, + { + "epoch": 2.8949730700179535, + "grad_norm": 10.114889144897461, + "learning_rate": 2.893716337522442e-06, + "loss": 8.6379, + "step": 32250 + }, + { + "epoch": 2.89721723518851, + "grad_norm": 8.216899871826172, + "learning_rate": 2.8959605026929983e-06, + "loss": 8.5812, + "step": 32275 + }, + { + "epoch": 2.8994614003590664, + "grad_norm": 7.976411819458008, + "learning_rate": 2.898204667863555e-06, + "loss": 8.6512, + "step": 32300 + }, + { + "epoch": 2.901705565529623, + "grad_norm": 8.93648624420166, + "learning_rate": 2.900448833034111e-06, + "loss": 8.6775, + "step": 32325 + }, + { + "epoch": 2.9039497307001794, + "grad_norm": 7.75168514251709, + "learning_rate": 2.902692998204668e-06, + "loss": 8.6064, + "step": 32350 + }, + { + "epoch": 2.906193895870736, + "grad_norm": 8.045312881469727, + "learning_rate": 2.9049371633752247e-06, + "loss": 8.5124, + "step": 32375 + }, + { + "epoch": 2.9084380610412928, + "grad_norm": 8.697863578796387, + "learning_rate": 2.9071813285457814e-06, + "loss": 8.7352, + "step": 32400 + }, + { + "epoch": 2.9106822262118492, + "grad_norm": 7.765064716339111, + "learning_rate": 2.909425493716338e-06, + "loss": 8.6302, + "step": 32425 + }, + { + "epoch": 2.9129263913824057, + "grad_norm": 7.798865795135498, + "learning_rate": 2.911669658886894e-06, + "loss": 8.8222, + "step": 32450 + }, + { + "epoch": 2.915170556552962, + "grad_norm": 7.709845066070557, + "learning_rate": 2.913913824057451e-06, + "loss": 8.5862, + "step": 32475 + }, + { + "epoch": 2.917414721723519, + "grad_norm": 8.490741729736328, + "learning_rate": 2.9161579892280074e-06, + "loss": 8.5869, + "step": 32500 + }, + { + "epoch": 2.9196588868940756, + "grad_norm": 9.060537338256836, + "learning_rate": 2.918402154398564e-06, + "loss": 8.6236, + "step": 32525 + }, + { + "epoch": 2.921903052064632, + "grad_norm": 9.590815544128418, + "learning_rate": 2.92064631956912e-06, + "loss": 8.5677, + "step": 32550 + }, + { + "epoch": 2.9241472172351886, + "grad_norm": 7.672825813293457, + "learning_rate": 2.9228904847396772e-06, + "loss": 8.6129, + "step": 32575 + }, + { + "epoch": 2.926391382405745, + "grad_norm": 7.768482685089111, + "learning_rate": 2.9251346499102334e-06, + "loss": 8.6697, + "step": 32600 + }, + { + "epoch": 2.9286355475763015, + "grad_norm": 7.877586841583252, + "learning_rate": 2.92737881508079e-06, + "loss": 8.5626, + "step": 32625 + }, + { + "epoch": 2.930879712746858, + "grad_norm": 7.837269306182861, + "learning_rate": 2.9296229802513467e-06, + "loss": 8.5699, + "step": 32650 + }, + { + "epoch": 2.9331238779174145, + "grad_norm": 9.103056907653809, + "learning_rate": 2.9318671454219033e-06, + "loss": 8.594, + "step": 32675 + }, + { + "epoch": 2.9353680430879714, + "grad_norm": 8.542180061340332, + "learning_rate": 2.9341113105924595e-06, + "loss": 8.619, + "step": 32700 + }, + { + "epoch": 2.937612208258528, + "grad_norm": 8.070602416992188, + "learning_rate": 2.9363554757630165e-06, + "loss": 8.6787, + "step": 32725 + }, + { + "epoch": 2.9398563734290843, + "grad_norm": 8.832462310791016, + "learning_rate": 2.938599640933573e-06, + "loss": 8.6627, + "step": 32750 + }, + { + "epoch": 2.942100538599641, + "grad_norm": 9.071767807006836, + "learning_rate": 2.9408438061041293e-06, + "loss": 8.5974, + "step": 32775 + }, + { + "epoch": 2.9443447037701977, + "grad_norm": 7.2388505935668945, + "learning_rate": 2.9430879712746863e-06, + "loss": 8.6694, + "step": 32800 + }, + { + "epoch": 2.9465888689407542, + "grad_norm": 9.545003890991211, + "learning_rate": 2.9453321364452425e-06, + "loss": 8.4908, + "step": 32825 + }, + { + "epoch": 2.9488330341113107, + "grad_norm": 7.707342147827148, + "learning_rate": 2.947576301615799e-06, + "loss": 8.6584, + "step": 32850 + }, + { + "epoch": 2.951077199281867, + "grad_norm": 9.694428443908691, + "learning_rate": 2.9498204667863558e-06, + "loss": 8.6065, + "step": 32875 + }, + { + "epoch": 2.9533213644524237, + "grad_norm": 12.867597579956055, + "learning_rate": 2.9520646319569124e-06, + "loss": 8.6678, + "step": 32900 + }, + { + "epoch": 2.95556552962298, + "grad_norm": 7.866276741027832, + "learning_rate": 2.9543087971274686e-06, + "loss": 8.6631, + "step": 32925 + }, + { + "epoch": 2.9578096947935366, + "grad_norm": 9.296459197998047, + "learning_rate": 2.9565529622980256e-06, + "loss": 8.6172, + "step": 32950 + }, + { + "epoch": 2.960053859964093, + "grad_norm": 7.605988502502441, + "learning_rate": 2.958797127468582e-06, + "loss": 8.588, + "step": 32975 + }, + { + "epoch": 2.96229802513465, + "grad_norm": 8.2241792678833, + "learning_rate": 2.9610412926391384e-06, + "loss": 8.6217, + "step": 33000 + }, + { + "epoch": 2.9645421903052065, + "grad_norm": 7.683385372161865, + "learning_rate": 2.9632854578096946e-06, + "loss": 8.6947, + "step": 33025 + }, + { + "epoch": 2.966786355475763, + "grad_norm": 10.885710716247559, + "learning_rate": 2.9655296229802517e-06, + "loss": 8.4994, + "step": 33050 + }, + { + "epoch": 2.9690305206463194, + "grad_norm": 9.710160255432129, + "learning_rate": 2.9677737881508083e-06, + "loss": 8.6395, + "step": 33075 + }, + { + "epoch": 2.9712746858168764, + "grad_norm": 7.919198513031006, + "learning_rate": 2.9700179533213645e-06, + "loss": 8.5702, + "step": 33100 + }, + { + "epoch": 2.973518850987433, + "grad_norm": 11.825087547302246, + "learning_rate": 2.9722621184919215e-06, + "loss": 8.6832, + "step": 33125 + }, + { + "epoch": 2.9757630161579893, + "grad_norm": 7.556580066680908, + "learning_rate": 2.9745062836624777e-06, + "loss": 8.6926, + "step": 33150 + }, + { + "epoch": 2.978007181328546, + "grad_norm": 10.037398338317871, + "learning_rate": 2.9767504488330347e-06, + "loss": 8.5706, + "step": 33175 + }, + { + "epoch": 2.9802513464991023, + "grad_norm": 10.152449607849121, + "learning_rate": 2.978994614003591e-06, + "loss": 8.6896, + "step": 33200 + }, + { + "epoch": 2.9824955116696588, + "grad_norm": 9.07713794708252, + "learning_rate": 2.9812387791741475e-06, + "loss": 8.4762, + "step": 33225 + }, + { + "epoch": 2.9847396768402152, + "grad_norm": 8.044647216796875, + "learning_rate": 2.9834829443447037e-06, + "loss": 8.6641, + "step": 33250 + }, + { + "epoch": 2.9869838420107717, + "grad_norm": 7.610648155212402, + "learning_rate": 2.9857271095152608e-06, + "loss": 8.7389, + "step": 33275 + }, + { + "epoch": 2.9892280071813286, + "grad_norm": 9.249563217163086, + "learning_rate": 2.987971274685817e-06, + "loss": 8.7335, + "step": 33300 + }, + { + "epoch": 2.991472172351885, + "grad_norm": 7.261866569519043, + "learning_rate": 2.9902154398563736e-06, + "loss": 8.691, + "step": 33325 + }, + { + "epoch": 2.9937163375224416, + "grad_norm": 8.85435676574707, + "learning_rate": 2.99245960502693e-06, + "loss": 8.6525, + "step": 33350 + }, + { + "epoch": 2.995960502692998, + "grad_norm": 8.012818336486816, + "learning_rate": 2.994703770197487e-06, + "loss": 8.5527, + "step": 33375 + }, + { + "epoch": 2.998204667863555, + "grad_norm": 8.620171546936035, + "learning_rate": 2.996947935368043e-06, + "loss": 8.7035, + "step": 33400 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.018463712218127662, + "eval_f1_macro": 6.495986899315121e-05, + "eval_f1_micro": 0.018463712218127662, + "eval_f1_weighted": 0.0018309211819712726, + "eval_loss": 8.882479667663574, + "eval_precision_macro": 6.062457758905633e-05, + "eval_precision_micro": 0.018463712218127662, + "eval_precision_weighted": 0.0015495939987740161, + "eval_recall_macro": 0.0004220579811287228, + "eval_recall_micro": 0.018463712218127662, + "eval_recall_weighted": 0.018463712218127662, + "eval_runtime": 129.802, + "eval_samples_per_second": 403.484, + "eval_steps_per_second": 12.612, + "step": 33420 + }, + { + "epoch": 3.0004488330341115, + "grad_norm": 8.292719841003418, + "learning_rate": 2.9991921005386e-06, + "loss": 8.5686, + "step": 33425 + }, + { + "epoch": 3.002692998204668, + "grad_norm": 10.750715255737305, + "learning_rate": 3.0014362657091566e-06, + "loss": 8.4285, + "step": 33450 + }, + { + "epoch": 3.0049371633752244, + "grad_norm": 9.18671703338623, + "learning_rate": 3.003680430879713e-06, + "loss": 8.4595, + "step": 33475 + }, + { + "epoch": 3.007181328545781, + "grad_norm": 7.254101753234863, + "learning_rate": 3.00592459605027e-06, + "loss": 8.4478, + "step": 33500 + }, + { + "epoch": 3.0094254937163374, + "grad_norm": 8.782075881958008, + "learning_rate": 3.008168761220826e-06, + "loss": 8.6012, + "step": 33525 + }, + { + "epoch": 3.011669658886894, + "grad_norm": 9.433014869689941, + "learning_rate": 3.0104129263913827e-06, + "loss": 8.581, + "step": 33550 + }, + { + "epoch": 3.013913824057451, + "grad_norm": 8.075406074523926, + "learning_rate": 3.012657091561939e-06, + "loss": 8.4754, + "step": 33575 + }, + { + "epoch": 3.0161579892280073, + "grad_norm": 8.596275329589844, + "learning_rate": 3.014901256732496e-06, + "loss": 8.4402, + "step": 33600 + }, + { + "epoch": 3.0184021543985637, + "grad_norm": 8.799968719482422, + "learning_rate": 3.017145421903052e-06, + "loss": 8.5098, + "step": 33625 + }, + { + "epoch": 3.02064631956912, + "grad_norm": 10.829291343688965, + "learning_rate": 3.019389587073609e-06, + "loss": 8.5009, + "step": 33650 + }, + { + "epoch": 3.0228904847396767, + "grad_norm": 10.22868537902832, + "learning_rate": 3.0216337522441653e-06, + "loss": 8.5343, + "step": 33675 + }, + { + "epoch": 3.025134649910233, + "grad_norm": 7.996678352355957, + "learning_rate": 3.023877917414722e-06, + "loss": 8.4411, + "step": 33700 + }, + { + "epoch": 3.02737881508079, + "grad_norm": 7.3036580085754395, + "learning_rate": 3.026122082585278e-06, + "loss": 8.4887, + "step": 33725 + }, + { + "epoch": 3.0296229802513466, + "grad_norm": 8.352195739746094, + "learning_rate": 3.028366247755835e-06, + "loss": 8.6102, + "step": 33750 + }, + { + "epoch": 3.031867145421903, + "grad_norm": 8.49329948425293, + "learning_rate": 3.0306104129263914e-06, + "loss": 8.4943, + "step": 33775 + }, + { + "epoch": 3.0341113105924595, + "grad_norm": 10.24759292602539, + "learning_rate": 3.032854578096948e-06, + "loss": 8.4997, + "step": 33800 + }, + { + "epoch": 3.036355475763016, + "grad_norm": 8.328330039978027, + "learning_rate": 3.035098743267505e-06, + "loss": 8.3654, + "step": 33825 + }, + { + "epoch": 3.0385996409335725, + "grad_norm": 10.906458854675293, + "learning_rate": 3.0373429084380612e-06, + "loss": 8.534, + "step": 33850 + }, + { + "epoch": 3.0408438061041294, + "grad_norm": 8.962233543395996, + "learning_rate": 3.039587073608618e-06, + "loss": 8.4739, + "step": 33875 + }, + { + "epoch": 3.043087971274686, + "grad_norm": 7.4496541023254395, + "learning_rate": 3.0418312387791745e-06, + "loss": 8.4426, + "step": 33900 + }, + { + "epoch": 3.0453321364452424, + "grad_norm": 8.106133460998535, + "learning_rate": 3.044075403949731e-06, + "loss": 8.3494, + "step": 33925 + }, + { + "epoch": 3.047576301615799, + "grad_norm": 8.330204010009766, + "learning_rate": 3.0463195691202873e-06, + "loss": 8.6253, + "step": 33950 + }, + { + "epoch": 3.0498204667863553, + "grad_norm": 8.969709396362305, + "learning_rate": 3.0485637342908443e-06, + "loss": 8.3349, + "step": 33975 + }, + { + "epoch": 3.0520646319569122, + "grad_norm": 8.170571327209473, + "learning_rate": 3.0508078994614005e-06, + "loss": 8.4236, + "step": 34000 + }, + { + "epoch": 3.0543087971274687, + "grad_norm": 8.868038177490234, + "learning_rate": 3.053052064631957e-06, + "loss": 8.5044, + "step": 34025 + }, + { + "epoch": 3.056552962298025, + "grad_norm": 10.547266006469727, + "learning_rate": 3.0552962298025133e-06, + "loss": 8.6112, + "step": 34050 + }, + { + "epoch": 3.0587971274685817, + "grad_norm": 9.575149536132812, + "learning_rate": 3.0575403949730703e-06, + "loss": 8.4589, + "step": 34075 + }, + { + "epoch": 3.061041292639138, + "grad_norm": 9.52155590057373, + "learning_rate": 3.0597845601436265e-06, + "loss": 8.4949, + "step": 34100 + }, + { + "epoch": 3.0632854578096946, + "grad_norm": 7.623871803283691, + "learning_rate": 3.0620287253141836e-06, + "loss": 8.4908, + "step": 34125 + }, + { + "epoch": 3.0655296229802516, + "grad_norm": 7.481366157531738, + "learning_rate": 3.06427289048474e-06, + "loss": 8.5381, + "step": 34150 + }, + { + "epoch": 3.067773788150808, + "grad_norm": 7.608867168426514, + "learning_rate": 3.0665170556552964e-06, + "loss": 8.4945, + "step": 34175 + }, + { + "epoch": 3.0700179533213645, + "grad_norm": 8.1883544921875, + "learning_rate": 3.0687612208258534e-06, + "loss": 8.5113, + "step": 34200 + }, + { + "epoch": 3.072262118491921, + "grad_norm": 7.647704601287842, + "learning_rate": 3.0710053859964096e-06, + "loss": 8.5513, + "step": 34225 + }, + { + "epoch": 3.0745062836624775, + "grad_norm": 9.042765617370605, + "learning_rate": 3.0732495511669662e-06, + "loss": 8.5365, + "step": 34250 + }, + { + "epoch": 3.076750448833034, + "grad_norm": 7.775447845458984, + "learning_rate": 3.0754937163375224e-06, + "loss": 8.5854, + "step": 34275 + }, + { + "epoch": 3.078994614003591, + "grad_norm": 7.431183338165283, + "learning_rate": 3.0777378815080794e-06, + "loss": 8.5484, + "step": 34300 + }, + { + "epoch": 3.0812387791741473, + "grad_norm": 8.711434364318848, + "learning_rate": 3.0799820466786356e-06, + "loss": 8.6021, + "step": 34325 + }, + { + "epoch": 3.083482944344704, + "grad_norm": 8.551756858825684, + "learning_rate": 3.0822262118491923e-06, + "loss": 8.655, + "step": 34350 + }, + { + "epoch": 3.0857271095152603, + "grad_norm": 7.975530624389648, + "learning_rate": 3.084470377019749e-06, + "loss": 8.3824, + "step": 34375 + }, + { + "epoch": 3.087971274685817, + "grad_norm": 8.555890083312988, + "learning_rate": 3.0867145421903055e-06, + "loss": 8.5013, + "step": 34400 + }, + { + "epoch": 3.0902154398563733, + "grad_norm": 8.289948463439941, + "learning_rate": 3.0889587073608617e-06, + "loss": 8.5546, + "step": 34425 + }, + { + "epoch": 3.09245960502693, + "grad_norm": 7.502779483795166, + "learning_rate": 3.0912028725314187e-06, + "loss": 8.3728, + "step": 34450 + }, + { + "epoch": 3.0947037701974867, + "grad_norm": 8.113394737243652, + "learning_rate": 3.093447037701975e-06, + "loss": 8.5315, + "step": 34475 + }, + { + "epoch": 3.096947935368043, + "grad_norm": 9.500271797180176, + "learning_rate": 3.0956912028725315e-06, + "loss": 8.2176, + "step": 34500 + }, + { + "epoch": 3.0991921005385996, + "grad_norm": 7.600194454193115, + "learning_rate": 3.0979353680430886e-06, + "loss": 8.3602, + "step": 34525 + }, + { + "epoch": 3.101436265709156, + "grad_norm": 7.476357936859131, + "learning_rate": 3.1001795332136447e-06, + "loss": 8.5001, + "step": 34550 + }, + { + "epoch": 3.1036804308797126, + "grad_norm": 7.860019683837891, + "learning_rate": 3.1024236983842014e-06, + "loss": 8.5632, + "step": 34575 + }, + { + "epoch": 3.1059245960502695, + "grad_norm": 7.64511775970459, + "learning_rate": 3.104667863554758e-06, + "loss": 8.3913, + "step": 34600 + }, + { + "epoch": 3.108168761220826, + "grad_norm": 7.842035293579102, + "learning_rate": 3.1069120287253146e-06, + "loss": 8.5437, + "step": 34625 + }, + { + "epoch": 3.1104129263913824, + "grad_norm": 8.0552396774292, + "learning_rate": 3.1091561938958708e-06, + "loss": 8.4996, + "step": 34650 + }, + { + "epoch": 3.112657091561939, + "grad_norm": 7.927032470703125, + "learning_rate": 3.111400359066428e-06, + "loss": 8.518, + "step": 34675 + }, + { + "epoch": 3.1149012567324954, + "grad_norm": 8.220379829406738, + "learning_rate": 3.113644524236984e-06, + "loss": 8.5604, + "step": 34700 + }, + { + "epoch": 3.117145421903052, + "grad_norm": 8.463505744934082, + "learning_rate": 3.1158886894075406e-06, + "loss": 8.5899, + "step": 34725 + }, + { + "epoch": 3.119389587073609, + "grad_norm": 8.825577735900879, + "learning_rate": 3.118132854578097e-06, + "loss": 8.4443, + "step": 34750 + }, + { + "epoch": 3.1216337522441653, + "grad_norm": 8.178020477294922, + "learning_rate": 3.120377019748654e-06, + "loss": 8.4939, + "step": 34775 + }, + { + "epoch": 3.1238779174147218, + "grad_norm": 7.776699066162109, + "learning_rate": 3.12262118491921e-06, + "loss": 8.3629, + "step": 34800 + }, + { + "epoch": 3.1261220825852782, + "grad_norm": 7.5941667556762695, + "learning_rate": 3.124865350089767e-06, + "loss": 8.5006, + "step": 34825 + }, + { + "epoch": 3.1283662477558347, + "grad_norm": 8.088007926940918, + "learning_rate": 3.1271095152603237e-06, + "loss": 8.3503, + "step": 34850 + }, + { + "epoch": 3.130610412926391, + "grad_norm": 8.164965629577637, + "learning_rate": 3.12935368043088e-06, + "loss": 8.597, + "step": 34875 + }, + { + "epoch": 3.132854578096948, + "grad_norm": 8.035935401916504, + "learning_rate": 3.131597845601437e-06, + "loss": 8.4433, + "step": 34900 + }, + { + "epoch": 3.1350987432675046, + "grad_norm": 8.142804145812988, + "learning_rate": 3.133842010771993e-06, + "loss": 8.6398, + "step": 34925 + }, + { + "epoch": 3.137342908438061, + "grad_norm": 7.328189373016357, + "learning_rate": 3.1360861759425497e-06, + "loss": 8.4566, + "step": 34950 + }, + { + "epoch": 3.1395870736086176, + "grad_norm": 7.533339023590088, + "learning_rate": 3.138330341113106e-06, + "loss": 8.3033, + "step": 34975 + }, + { + "epoch": 3.141831238779174, + "grad_norm": 10.422499656677246, + "learning_rate": 3.140574506283663e-06, + "loss": 8.3653, + "step": 35000 + }, + { + "epoch": 3.1440754039497305, + "grad_norm": 9.597655296325684, + "learning_rate": 3.142818671454219e-06, + "loss": 8.4354, + "step": 35025 + }, + { + "epoch": 3.1463195691202874, + "grad_norm": 7.642961502075195, + "learning_rate": 3.1450628366247758e-06, + "loss": 8.5016, + "step": 35050 + }, + { + "epoch": 3.148563734290844, + "grad_norm": 7.727568626403809, + "learning_rate": 3.1473070017953324e-06, + "loss": 8.4582, + "step": 35075 + }, + { + "epoch": 3.1508078994614004, + "grad_norm": 9.734370231628418, + "learning_rate": 3.149551166965889e-06, + "loss": 8.5406, + "step": 35100 + }, + { + "epoch": 3.153052064631957, + "grad_norm": 11.248257637023926, + "learning_rate": 3.151795332136445e-06, + "loss": 8.4812, + "step": 35125 + }, + { + "epoch": 3.1552962298025133, + "grad_norm": 7.235381603240967, + "learning_rate": 3.1540394973070022e-06, + "loss": 8.4374, + "step": 35150 + }, + { + "epoch": 3.15754039497307, + "grad_norm": 7.635133266448975, + "learning_rate": 3.1562836624775584e-06, + "loss": 8.4974, + "step": 35175 + }, + { + "epoch": 3.1597845601436267, + "grad_norm": 7.585856914520264, + "learning_rate": 3.158527827648115e-06, + "loss": 8.4867, + "step": 35200 + }, + { + "epoch": 3.162028725314183, + "grad_norm": 7.043858528137207, + "learning_rate": 3.160771992818672e-06, + "loss": 8.4935, + "step": 35225 + }, + { + "epoch": 3.1642728904847397, + "grad_norm": 8.270597457885742, + "learning_rate": 3.1630161579892283e-06, + "loss": 8.5087, + "step": 35250 + }, + { + "epoch": 3.166517055655296, + "grad_norm": 8.732002258300781, + "learning_rate": 3.165260323159785e-06, + "loss": 8.3131, + "step": 35275 + }, + { + "epoch": 3.1687612208258527, + "grad_norm": 9.565396308898926, + "learning_rate": 3.1675044883303415e-06, + "loss": 8.3394, + "step": 35300 + }, + { + "epoch": 3.171005385996409, + "grad_norm": 8.197178840637207, + "learning_rate": 3.169748653500898e-06, + "loss": 8.6272, + "step": 35325 + }, + { + "epoch": 3.173249551166966, + "grad_norm": 8.136893272399902, + "learning_rate": 3.1719928186714543e-06, + "loss": 8.5074, + "step": 35350 + }, + { + "epoch": 3.1754937163375225, + "grad_norm": 7.5894551277160645, + "learning_rate": 3.1742369838420114e-06, + "loss": 8.3936, + "step": 35375 + }, + { + "epoch": 3.177737881508079, + "grad_norm": 8.07244873046875, + "learning_rate": 3.1764811490125675e-06, + "loss": 8.3487, + "step": 35400 + }, + { + "epoch": 3.1799820466786355, + "grad_norm": 8.02269458770752, + "learning_rate": 3.178725314183124e-06, + "loss": 8.4897, + "step": 35425 + }, + { + "epoch": 3.182226211849192, + "grad_norm": 7.889553546905518, + "learning_rate": 3.1809694793536804e-06, + "loss": 8.4617, + "step": 35450 + }, + { + "epoch": 3.184470377019749, + "grad_norm": 8.069402694702148, + "learning_rate": 3.1832136445242374e-06, + "loss": 8.4432, + "step": 35475 + }, + { + "epoch": 3.1867145421903054, + "grad_norm": 8.514985084533691, + "learning_rate": 3.1854578096947936e-06, + "loss": 8.3813, + "step": 35500 + }, + { + "epoch": 3.188958707360862, + "grad_norm": 9.425914764404297, + "learning_rate": 3.18770197486535e-06, + "loss": 8.4648, + "step": 35525 + }, + { + "epoch": 3.1912028725314183, + "grad_norm": 10.518730163574219, + "learning_rate": 3.189946140035907e-06, + "loss": 8.4224, + "step": 35550 + }, + { + "epoch": 3.193447037701975, + "grad_norm": 8.378015518188477, + "learning_rate": 3.1921903052064634e-06, + "loss": 8.4661, + "step": 35575 + }, + { + "epoch": 3.1956912028725313, + "grad_norm": 8.31844425201416, + "learning_rate": 3.1944344703770205e-06, + "loss": 8.3442, + "step": 35600 + }, + { + "epoch": 3.1979353680430878, + "grad_norm": 8.19754695892334, + "learning_rate": 3.1966786355475767e-06, + "loss": 8.5689, + "step": 35625 + }, + { + "epoch": 3.2001795332136447, + "grad_norm": 7.8165178298950195, + "learning_rate": 3.198833034111311e-06, + "loss": 8.3984, + "step": 35650 + }, + { + "epoch": 3.202423698384201, + "grad_norm": 8.204354286193848, + "learning_rate": 3.2010771992818673e-06, + "loss": 8.4499, + "step": 35675 + }, + { + "epoch": 3.2046678635547576, + "grad_norm": 8.149467468261719, + "learning_rate": 3.2033213644524243e-06, + "loss": 8.4823, + "step": 35700 + }, + { + "epoch": 3.206912028725314, + "grad_norm": 7.872113227844238, + "learning_rate": 3.2055655296229805e-06, + "loss": 8.3313, + "step": 35725 + }, + { + "epoch": 3.2091561938958706, + "grad_norm": 9.119840621948242, + "learning_rate": 3.207809694793537e-06, + "loss": 8.3673, + "step": 35750 + }, + { + "epoch": 3.2114003590664275, + "grad_norm": 8.521638870239258, + "learning_rate": 3.2100538599640933e-06, + "loss": 8.3669, + "step": 35775 + }, + { + "epoch": 3.213644524236984, + "grad_norm": 8.552473068237305, + "learning_rate": 3.2122980251346504e-06, + "loss": 8.5235, + "step": 35800 + }, + { + "epoch": 3.2158886894075405, + "grad_norm": 9.714869499206543, + "learning_rate": 3.2145421903052066e-06, + "loss": 8.5348, + "step": 35825 + }, + { + "epoch": 3.218132854578097, + "grad_norm": 9.11538314819336, + "learning_rate": 3.216786355475763e-06, + "loss": 8.4378, + "step": 35850 + }, + { + "epoch": 3.2203770197486534, + "grad_norm": 7.535921096801758, + "learning_rate": 3.2190305206463198e-06, + "loss": 8.4479, + "step": 35875 + }, + { + "epoch": 3.22262118491921, + "grad_norm": 8.115995407104492, + "learning_rate": 3.2212746858168764e-06, + "loss": 8.4867, + "step": 35900 + }, + { + "epoch": 3.2248653500897664, + "grad_norm": 7.66502046585083, + "learning_rate": 3.2235188509874326e-06, + "loss": 8.4714, + "step": 35925 + }, + { + "epoch": 3.2271095152603233, + "grad_norm": 12.233177185058594, + "learning_rate": 3.2257630161579896e-06, + "loss": 8.5738, + "step": 35950 + }, + { + "epoch": 3.22935368043088, + "grad_norm": 8.14846134185791, + "learning_rate": 3.2280071813285462e-06, + "loss": 8.3966, + "step": 35975 + }, + { + "epoch": 3.2315978456014363, + "grad_norm": 8.151975631713867, + "learning_rate": 3.2302513464991024e-06, + "loss": 8.329, + "step": 36000 + }, + { + "epoch": 3.2338420107719927, + "grad_norm": 7.7847514152526855, + "learning_rate": 3.2324955116696595e-06, + "loss": 8.2758, + "step": 36025 + }, + { + "epoch": 3.236086175942549, + "grad_norm": 8.117721557617188, + "learning_rate": 3.2347396768402157e-06, + "loss": 8.296, + "step": 36050 + }, + { + "epoch": 3.238330341113106, + "grad_norm": 7.951766014099121, + "learning_rate": 3.2369838420107723e-06, + "loss": 8.3816, + "step": 36075 + }, + { + "epoch": 3.2405745062836626, + "grad_norm": 7.242469787597656, + "learning_rate": 3.2392280071813285e-06, + "loss": 8.3759, + "step": 36100 + }, + { + "epoch": 3.242818671454219, + "grad_norm": 7.752915859222412, + "learning_rate": 3.2414721723518855e-06, + "loss": 8.4223, + "step": 36125 + }, + { + "epoch": 3.2450628366247756, + "grad_norm": 8.411601066589355, + "learning_rate": 3.2437163375224417e-06, + "loss": 8.4214, + "step": 36150 + }, + { + "epoch": 3.247307001795332, + "grad_norm": 11.10370922088623, + "learning_rate": 3.2459605026929987e-06, + "loss": 8.4497, + "step": 36175 + }, + { + "epoch": 3.2495511669658885, + "grad_norm": 7.315251350402832, + "learning_rate": 3.248204667863555e-06, + "loss": 8.5061, + "step": 36200 + }, + { + "epoch": 3.251795332136445, + "grad_norm": 7.242710113525391, + "learning_rate": 3.2504488330341115e-06, + "loss": 8.3792, + "step": 36225 + }, + { + "epoch": 3.254039497307002, + "grad_norm": 9.700970649719238, + "learning_rate": 3.2526929982046677e-06, + "loss": 8.4411, + "step": 36250 + }, + { + "epoch": 3.2562836624775584, + "grad_norm": 8.13416576385498, + "learning_rate": 3.2549371633752248e-06, + "loss": 8.292, + "step": 36275 + }, + { + "epoch": 3.258527827648115, + "grad_norm": 8.240909576416016, + "learning_rate": 3.2571813285457814e-06, + "loss": 8.3121, + "step": 36300 + }, + { + "epoch": 3.2607719928186714, + "grad_norm": 7.815669536590576, + "learning_rate": 3.2594254937163376e-06, + "loss": 8.3587, + "step": 36325 + }, + { + "epoch": 3.263016157989228, + "grad_norm": 8.55628776550293, + "learning_rate": 3.2616696588868946e-06, + "loss": 8.5587, + "step": 36350 + }, + { + "epoch": 3.2652603231597848, + "grad_norm": 10.08407974243164, + "learning_rate": 3.263913824057451e-06, + "loss": 8.5534, + "step": 36375 + }, + { + "epoch": 3.2675044883303412, + "grad_norm": 7.9110636711120605, + "learning_rate": 3.266157989228008e-06, + "loss": 8.3507, + "step": 36400 + }, + { + "epoch": 3.2697486535008977, + "grad_norm": 7.238288402557373, + "learning_rate": 3.268402154398564e-06, + "loss": 8.3944, + "step": 36425 + }, + { + "epoch": 3.271992818671454, + "grad_norm": 7.47892427444458, + "learning_rate": 3.2706463195691207e-06, + "loss": 8.4551, + "step": 36450 + }, + { + "epoch": 3.2742369838420107, + "grad_norm": 10.366768836975098, + "learning_rate": 3.272890484739677e-06, + "loss": 8.515, + "step": 36475 + }, + { + "epoch": 3.276481149012567, + "grad_norm": 9.37814998626709, + "learning_rate": 3.275134649910234e-06, + "loss": 8.4355, + "step": 36500 + }, + { + "epoch": 3.2787253141831236, + "grad_norm": 8.539063453674316, + "learning_rate": 3.27737881508079e-06, + "loss": 8.3952, + "step": 36525 + }, + { + "epoch": 3.2809694793536806, + "grad_norm": 8.930289268493652, + "learning_rate": 3.2796229802513467e-06, + "loss": 8.4275, + "step": 36550 + }, + { + "epoch": 3.283213644524237, + "grad_norm": 9.20190143585205, + "learning_rate": 3.2818671454219033e-06, + "loss": 8.3396, + "step": 36575 + }, + { + "epoch": 3.2854578096947935, + "grad_norm": 8.489038467407227, + "learning_rate": 3.28411131059246e-06, + "loss": 8.4126, + "step": 36600 + }, + { + "epoch": 3.28770197486535, + "grad_norm": 9.229331016540527, + "learning_rate": 3.286355475763016e-06, + "loss": 8.4651, + "step": 36625 + }, + { + "epoch": 3.2899461400359065, + "grad_norm": 8.418839454650879, + "learning_rate": 3.288599640933573e-06, + "loss": 8.5337, + "step": 36650 + }, + { + "epoch": 3.2921903052064634, + "grad_norm": 9.218766212463379, + "learning_rate": 3.2908438061041298e-06, + "loss": 8.324, + "step": 36675 + }, + { + "epoch": 3.29443447037702, + "grad_norm": 8.576160430908203, + "learning_rate": 3.293087971274686e-06, + "loss": 8.3034, + "step": 36700 + }, + { + "epoch": 3.2966786355475763, + "grad_norm": 7.77107572555542, + "learning_rate": 3.295332136445243e-06, + "loss": 8.414, + "step": 36725 + }, + { + "epoch": 3.298922800718133, + "grad_norm": 7.66124153137207, + "learning_rate": 3.297576301615799e-06, + "loss": 8.5343, + "step": 36750 + }, + { + "epoch": 3.3011669658886893, + "grad_norm": 7.683452606201172, + "learning_rate": 3.299820466786356e-06, + "loss": 8.4067, + "step": 36775 + }, + { + "epoch": 3.3034111310592458, + "grad_norm": 10.209945678710938, + "learning_rate": 3.302064631956912e-06, + "loss": 8.4302, + "step": 36800 + }, + { + "epoch": 3.3056552962298027, + "grad_norm": 8.97936725616455, + "learning_rate": 3.304308797127469e-06, + "loss": 8.4157, + "step": 36825 + }, + { + "epoch": 3.307899461400359, + "grad_norm": 10.166036605834961, + "learning_rate": 3.3065529622980252e-06, + "loss": 8.5126, + "step": 36850 + }, + { + "epoch": 3.3101436265709157, + "grad_norm": 9.039531707763672, + "learning_rate": 3.3087971274685823e-06, + "loss": 8.5609, + "step": 36875 + }, + { + "epoch": 3.312387791741472, + "grad_norm": 8.270082473754883, + "learning_rate": 3.3110412926391385e-06, + "loss": 8.4202, + "step": 36900 + }, + { + "epoch": 3.3146319569120286, + "grad_norm": 8.72864818572998, + "learning_rate": 3.313285457809695e-06, + "loss": 8.3667, + "step": 36925 + }, + { + "epoch": 3.316876122082585, + "grad_norm": 8.038674354553223, + "learning_rate": 3.3155296229802513e-06, + "loss": 8.3572, + "step": 36950 + }, + { + "epoch": 3.319120287253142, + "grad_norm": 7.671535491943359, + "learning_rate": 3.3177737881508083e-06, + "loss": 8.4122, + "step": 36975 + }, + { + "epoch": 3.3213644524236985, + "grad_norm": 7.990935325622559, + "learning_rate": 3.3200179533213645e-06, + "loss": 8.3449, + "step": 37000 + }, + { + "epoch": 3.323608617594255, + "grad_norm": 10.295793533325195, + "learning_rate": 3.322262118491921e-06, + "loss": 8.3424, + "step": 37025 + }, + { + "epoch": 3.3258527827648114, + "grad_norm": 8.74478816986084, + "learning_rate": 3.324416517055656e-06, + "loss": 8.5671, + "step": 37050 + }, + { + "epoch": 3.328096947935368, + "grad_norm": 10.626089096069336, + "learning_rate": 3.326660682226212e-06, + "loss": 8.4187, + "step": 37075 + }, + { + "epoch": 3.3303411131059244, + "grad_norm": 9.782660484313965, + "learning_rate": 3.3289048473967688e-06, + "loss": 8.253, + "step": 37100 + }, + { + "epoch": 3.3325852782764813, + "grad_norm": 8.335285186767578, + "learning_rate": 3.331149012567325e-06, + "loss": 8.3998, + "step": 37125 + }, + { + "epoch": 3.334829443447038, + "grad_norm": 8.266969680786133, + "learning_rate": 3.333393177737882e-06, + "loss": 8.464, + "step": 37150 + }, + { + "epoch": 3.3370736086175943, + "grad_norm": 8.957066535949707, + "learning_rate": 3.335637342908438e-06, + "loss": 8.3639, + "step": 37175 + }, + { + "epoch": 3.3393177737881508, + "grad_norm": 7.22792387008667, + "learning_rate": 3.337881508078995e-06, + "loss": 8.2225, + "step": 37200 + }, + { + "epoch": 3.3415619389587072, + "grad_norm": 9.82030200958252, + "learning_rate": 3.3401256732495514e-06, + "loss": 8.3309, + "step": 37225 + }, + { + "epoch": 3.343806104129264, + "grad_norm": 9.448009490966797, + "learning_rate": 3.342369838420108e-06, + "loss": 8.4286, + "step": 37250 + }, + { + "epoch": 3.3460502692998206, + "grad_norm": 7.922127723693848, + "learning_rate": 3.3446140035906642e-06, + "loss": 8.5702, + "step": 37275 + }, + { + "epoch": 3.348294434470377, + "grad_norm": 8.696995735168457, + "learning_rate": 3.3468581687612213e-06, + "loss": 8.4796, + "step": 37300 + }, + { + "epoch": 3.3505385996409336, + "grad_norm": 11.76653003692627, + "learning_rate": 3.3491023339317775e-06, + "loss": 8.4974, + "step": 37325 + }, + { + "epoch": 3.35278276481149, + "grad_norm": 7.903279781341553, + "learning_rate": 3.351346499102334e-06, + "loss": 8.406, + "step": 37350 + }, + { + "epoch": 3.3550269299820465, + "grad_norm": 7.927674770355225, + "learning_rate": 3.3535906642728903e-06, + "loss": 8.4985, + "step": 37375 + }, + { + "epoch": 3.357271095152603, + "grad_norm": 7.448356628417969, + "learning_rate": 3.3558348294434473e-06, + "loss": 8.4775, + "step": 37400 + }, + { + "epoch": 3.35951526032316, + "grad_norm": 7.497574329376221, + "learning_rate": 3.358078994614004e-06, + "loss": 8.4748, + "step": 37425 + }, + { + "epoch": 3.3617594254937164, + "grad_norm": 7.709659576416016, + "learning_rate": 3.3603231597845605e-06, + "loss": 8.304, + "step": 37450 + }, + { + "epoch": 3.364003590664273, + "grad_norm": 8.069513320922852, + "learning_rate": 3.362567324955117e-06, + "loss": 8.4978, + "step": 37475 + }, + { + "epoch": 3.3662477558348294, + "grad_norm": 8.335525512695312, + "learning_rate": 3.3648114901256733e-06, + "loss": 8.4201, + "step": 37500 + }, + { + "epoch": 3.368491921005386, + "grad_norm": 8.664959907531738, + "learning_rate": 3.3670556552962304e-06, + "loss": 8.3631, + "step": 37525 + }, + { + "epoch": 3.370736086175943, + "grad_norm": 7.609121322631836, + "learning_rate": 3.3692998204667866e-06, + "loss": 8.3829, + "step": 37550 + }, + { + "epoch": 3.3729802513464993, + "grad_norm": 8.073299407958984, + "learning_rate": 3.371543985637343e-06, + "loss": 8.4229, + "step": 37575 + }, + { + "epoch": 3.3752244165170557, + "grad_norm": 9.45817756652832, + "learning_rate": 3.3737881508078994e-06, + "loss": 8.4575, + "step": 37600 + }, + { + "epoch": 3.377468581687612, + "grad_norm": 8.173770904541016, + "learning_rate": 3.3760323159784564e-06, + "loss": 8.2937, + "step": 37625 + }, + { + "epoch": 3.3797127468581687, + "grad_norm": 7.90732479095459, + "learning_rate": 3.3782764811490126e-06, + "loss": 8.379, + "step": 37650 + }, + { + "epoch": 3.381956912028725, + "grad_norm": 8.80076789855957, + "learning_rate": 3.3805206463195692e-06, + "loss": 8.4254, + "step": 37675 + }, + { + "epoch": 3.3842010771992816, + "grad_norm": 10.41962718963623, + "learning_rate": 3.382764811490126e-06, + "loss": 8.4798, + "step": 37700 + }, + { + "epoch": 3.3864452423698386, + "grad_norm": 8.043856620788574, + "learning_rate": 3.3850089766606825e-06, + "loss": 8.3809, + "step": 37725 + }, + { + "epoch": 3.388689407540395, + "grad_norm": 8.62320613861084, + "learning_rate": 3.3872531418312395e-06, + "loss": 8.6544, + "step": 37750 + }, + { + "epoch": 3.3909335727109515, + "grad_norm": 9.336959838867188, + "learning_rate": 3.3894973070017957e-06, + "loss": 8.3482, + "step": 37775 + }, + { + "epoch": 3.393177737881508, + "grad_norm": 8.948089599609375, + "learning_rate": 3.3917414721723523e-06, + "loss": 8.3915, + "step": 37800 + }, + { + "epoch": 3.3954219030520645, + "grad_norm": 7.9565887451171875, + "learning_rate": 3.3939856373429085e-06, + "loss": 8.4193, + "step": 37825 + }, + { + "epoch": 3.3976660682226214, + "grad_norm": 8.176352500915527, + "learning_rate": 3.3962298025134655e-06, + "loss": 8.4367, + "step": 37850 + }, + { + "epoch": 3.399910233393178, + "grad_norm": 7.6172590255737305, + "learning_rate": 3.3984739676840217e-06, + "loss": 8.4176, + "step": 37875 + }, + { + "epoch": 3.4021543985637344, + "grad_norm": 8.203996658325195, + "learning_rate": 3.4007181328545783e-06, + "loss": 8.4151, + "step": 37900 + }, + { + "epoch": 3.404398563734291, + "grad_norm": 14.062883377075195, + "learning_rate": 3.402962298025135e-06, + "loss": 8.5583, + "step": 37925 + }, + { + "epoch": 3.4066427289048473, + "grad_norm": 8.612116813659668, + "learning_rate": 3.4052064631956916e-06, + "loss": 8.438, + "step": 37950 + }, + { + "epoch": 3.408886894075404, + "grad_norm": 7.211400032043457, + "learning_rate": 3.4074506283662478e-06, + "loss": 8.2943, + "step": 37975 + }, + { + "epoch": 3.4111310592459603, + "grad_norm": 9.707756042480469, + "learning_rate": 3.409694793536805e-06, + "loss": 8.3937, + "step": 38000 + }, + { + "epoch": 3.413375224416517, + "grad_norm": 8.058649063110352, + "learning_rate": 3.411938958707361e-06, + "loss": 8.1801, + "step": 38025 + }, + { + "epoch": 3.4156193895870737, + "grad_norm": 10.202410697937012, + "learning_rate": 3.4141831238779176e-06, + "loss": 8.4447, + "step": 38050 + }, + { + "epoch": 3.41786355475763, + "grad_norm": 9.956732749938965, + "learning_rate": 3.416427289048474e-06, + "loss": 8.2736, + "step": 38075 + }, + { + "epoch": 3.4201077199281866, + "grad_norm": 9.814412117004395, + "learning_rate": 3.418671454219031e-06, + "loss": 8.3767, + "step": 38100 + }, + { + "epoch": 3.422351885098743, + "grad_norm": 7.969823360443115, + "learning_rate": 3.4209156193895875e-06, + "loss": 8.3472, + "step": 38125 + }, + { + "epoch": 3.4245960502693, + "grad_norm": 8.322891235351562, + "learning_rate": 3.423159784560144e-06, + "loss": 8.3207, + "step": 38150 + }, + { + "epoch": 3.4268402154398565, + "grad_norm": 8.447792053222656, + "learning_rate": 3.4254039497307007e-06, + "loss": 8.2706, + "step": 38175 + }, + { + "epoch": 3.429084380610413, + "grad_norm": 9.597269058227539, + "learning_rate": 3.427648114901257e-06, + "loss": 8.4176, + "step": 38200 + }, + { + "epoch": 3.4313285457809695, + "grad_norm": 8.275882720947266, + "learning_rate": 3.429892280071814e-06, + "loss": 8.3751, + "step": 38225 + }, + { + "epoch": 3.433572710951526, + "grad_norm": 8.543193817138672, + "learning_rate": 3.43213644524237e-06, + "loss": 8.3924, + "step": 38250 + }, + { + "epoch": 3.4358168761220824, + "grad_norm": 7.5396528244018555, + "learning_rate": 3.4343806104129267e-06, + "loss": 8.3582, + "step": 38275 + }, + { + "epoch": 3.438061041292639, + "grad_norm": 8.364615440368652, + "learning_rate": 3.436624775583483e-06, + "loss": 8.2275, + "step": 38300 + }, + { + "epoch": 3.440305206463196, + "grad_norm": 8.275908470153809, + "learning_rate": 3.43886894075404e-06, + "loss": 8.2396, + "step": 38325 + }, + { + "epoch": 3.4425493716337523, + "grad_norm": 8.147153854370117, + "learning_rate": 3.441113105924596e-06, + "loss": 8.3141, + "step": 38350 + }, + { + "epoch": 3.4447935368043088, + "grad_norm": 8.078544616699219, + "learning_rate": 3.4433572710951528e-06, + "loss": 8.4764, + "step": 38375 + }, + { + "epoch": 3.4470377019748653, + "grad_norm": 8.273439407348633, + "learning_rate": 3.4456014362657094e-06, + "loss": 8.453, + "step": 38400 + }, + { + "epoch": 3.4492818671454217, + "grad_norm": 7.937435626983643, + "learning_rate": 3.447845601436266e-06, + "loss": 8.3727, + "step": 38425 + }, + { + "epoch": 3.4515260323159787, + "grad_norm": 8.82056999206543, + "learning_rate": 3.450089766606823e-06, + "loss": 8.3704, + "step": 38450 + }, + { + "epoch": 3.453770197486535, + "grad_norm": 8.843513488769531, + "learning_rate": 3.4523339317773792e-06, + "loss": 8.4283, + "step": 38475 + }, + { + "epoch": 3.4560143626570916, + "grad_norm": 8.835134506225586, + "learning_rate": 3.454578096947936e-06, + "loss": 8.2301, + "step": 38500 + }, + { + "epoch": 3.458258527827648, + "grad_norm": 9.37270736694336, + "learning_rate": 3.456822262118492e-06, + "loss": 8.5165, + "step": 38525 + }, + { + "epoch": 3.4605026929982046, + "grad_norm": 8.626322746276855, + "learning_rate": 3.459066427289049e-06, + "loss": 8.4157, + "step": 38550 + }, + { + "epoch": 3.462746858168761, + "grad_norm": 8.391489028930664, + "learning_rate": 3.4613105924596053e-06, + "loss": 8.3315, + "step": 38575 + }, + { + "epoch": 3.464991023339318, + "grad_norm": 7.129566669464111, + "learning_rate": 3.463554757630162e-06, + "loss": 8.4684, + "step": 38600 + }, + { + "epoch": 3.4672351885098744, + "grad_norm": 8.662847518920898, + "learning_rate": 3.4657989228007185e-06, + "loss": 8.4936, + "step": 38625 + }, + { + "epoch": 3.469479353680431, + "grad_norm": 7.6930084228515625, + "learning_rate": 3.468043087971275e-06, + "loss": 8.3807, + "step": 38650 + }, + { + "epoch": 3.4717235188509874, + "grad_norm": 7.133084774017334, + "learning_rate": 3.4702872531418313e-06, + "loss": 8.2246, + "step": 38675 + }, + { + "epoch": 3.473967684021544, + "grad_norm": 8.896027565002441, + "learning_rate": 3.4725314183123883e-06, + "loss": 8.3377, + "step": 38700 + }, + { + "epoch": 3.4762118491921004, + "grad_norm": 8.791261672973633, + "learning_rate": 3.4747755834829445e-06, + "loss": 8.2833, + "step": 38725 + }, + { + "epoch": 3.4784560143626573, + "grad_norm": 7.869853973388672, + "learning_rate": 3.477019748653501e-06, + "loss": 8.2976, + "step": 38750 + }, + { + "epoch": 3.4807001795332138, + "grad_norm": 7.689734935760498, + "learning_rate": 3.4792639138240573e-06, + "loss": 8.4935, + "step": 38775 + }, + { + "epoch": 3.4829443447037702, + "grad_norm": 9.368339538574219, + "learning_rate": 3.4815080789946144e-06, + "loss": 8.3081, + "step": 38800 + }, + { + "epoch": 3.4851885098743267, + "grad_norm": 8.7246675491333, + "learning_rate": 3.483752244165171e-06, + "loss": 8.2867, + "step": 38825 + }, + { + "epoch": 3.487432675044883, + "grad_norm": 8.037129402160645, + "learning_rate": 3.485996409335727e-06, + "loss": 8.3382, + "step": 38850 + }, + { + "epoch": 3.4896768402154397, + "grad_norm": 10.068944931030273, + "learning_rate": 3.4882405745062842e-06, + "loss": 8.5522, + "step": 38875 + }, + { + "epoch": 3.4919210053859966, + "grad_norm": 7.594559669494629, + "learning_rate": 3.4904847396768404e-06, + "loss": 8.4655, + "step": 38900 + }, + { + "epoch": 3.494165170556553, + "grad_norm": 8.878684997558594, + "learning_rate": 3.4927289048473974e-06, + "loss": 8.4719, + "step": 38925 + }, + { + "epoch": 3.4964093357271095, + "grad_norm": 8.916712760925293, + "learning_rate": 3.4949730700179536e-06, + "loss": 8.3341, + "step": 38950 + }, + { + "epoch": 3.498653500897666, + "grad_norm": 9.142593383789062, + "learning_rate": 3.4972172351885103e-06, + "loss": 8.4506, + "step": 38975 + }, + { + "epoch": 3.5008976660682225, + "grad_norm": 9.076680183410645, + "learning_rate": 3.4994614003590664e-06, + "loss": 8.426, + "step": 39000 + }, + { + "epoch": 3.5031418312387794, + "grad_norm": 10.125992774963379, + "learning_rate": 3.5017055655296235e-06, + "loss": 8.3842, + "step": 39025 + }, + { + "epoch": 3.505385996409336, + "grad_norm": 10.073408126831055, + "learning_rate": 3.5039497307001797e-06, + "loss": 8.2752, + "step": 39050 + }, + { + "epoch": 3.5076301615798924, + "grad_norm": 7.931765079498291, + "learning_rate": 3.5061938958707363e-06, + "loss": 8.2919, + "step": 39075 + }, + { + "epoch": 3.509874326750449, + "grad_norm": 7.437147617340088, + "learning_rate": 3.508438061041293e-06, + "loss": 8.3775, + "step": 39100 + }, + { + "epoch": 3.5121184919210053, + "grad_norm": 9.626590728759766, + "learning_rate": 3.5106822262118495e-06, + "loss": 8.338, + "step": 39125 + }, + { + "epoch": 3.514362657091562, + "grad_norm": 9.271175384521484, + "learning_rate": 3.5129263913824057e-06, + "loss": 8.2566, + "step": 39150 + }, + { + "epoch": 3.5166068222621183, + "grad_norm": 11.187642097473145, + "learning_rate": 3.5151705565529628e-06, + "loss": 8.1768, + "step": 39175 + }, + { + "epoch": 3.5188509874326748, + "grad_norm": 8.678838729858398, + "learning_rate": 3.5174147217235194e-06, + "loss": 8.3375, + "step": 39200 + }, + { + "epoch": 3.5210951526032317, + "grad_norm": 7.458976745605469, + "learning_rate": 3.5196588868940756e-06, + "loss": 8.3497, + "step": 39225 + }, + { + "epoch": 3.523339317773788, + "grad_norm": 10.970870971679688, + "learning_rate": 3.5219030520646326e-06, + "loss": 8.3674, + "step": 39250 + }, + { + "epoch": 3.5255834829443446, + "grad_norm": 8.09380054473877, + "learning_rate": 3.5241472172351888e-06, + "loss": 8.3664, + "step": 39275 + }, + { + "epoch": 3.527827648114901, + "grad_norm": 10.145340919494629, + "learning_rate": 3.5263913824057454e-06, + "loss": 8.3811, + "step": 39300 + }, + { + "epoch": 3.530071813285458, + "grad_norm": 7.768328666687012, + "learning_rate": 3.5286355475763016e-06, + "loss": 8.5301, + "step": 39325 + }, + { + "epoch": 3.5323159784560145, + "grad_norm": 9.358311653137207, + "learning_rate": 3.5308797127468586e-06, + "loss": 8.4983, + "step": 39350 + }, + { + "epoch": 3.534560143626571, + "grad_norm": 8.622987747192383, + "learning_rate": 3.533123877917415e-06, + "loss": 8.3527, + "step": 39375 + }, + { + "epoch": 3.5368043087971275, + "grad_norm": 7.9121880531311035, + "learning_rate": 3.5352782764811493e-06, + "loss": 8.2657, + "step": 39400 + }, + { + "epoch": 3.539048473967684, + "grad_norm": 8.424910545349121, + "learning_rate": 3.537522441651706e-06, + "loss": 8.2119, + "step": 39425 + }, + { + "epoch": 3.5412926391382404, + "grad_norm": 8.379015922546387, + "learning_rate": 3.5397666068222625e-06, + "loss": 8.4502, + "step": 39450 + }, + { + "epoch": 3.543536804308797, + "grad_norm": 7.583930492401123, + "learning_rate": 3.5420107719928187e-06, + "loss": 8.3889, + "step": 39475 + }, + { + "epoch": 3.545780969479354, + "grad_norm": 7.948736190795898, + "learning_rate": 3.5442549371633757e-06, + "loss": 8.3238, + "step": 39500 + }, + { + "epoch": 3.5480251346499103, + "grad_norm": 7.470489025115967, + "learning_rate": 3.546499102333932e-06, + "loss": 8.4849, + "step": 39525 + }, + { + "epoch": 3.550269299820467, + "grad_norm": 7.860374927520752, + "learning_rate": 3.5487432675044885e-06, + "loss": 8.2885, + "step": 39550 + }, + { + "epoch": 3.5525134649910233, + "grad_norm": 11.302489280700684, + "learning_rate": 3.5509874326750456e-06, + "loss": 8.4908, + "step": 39575 + }, + { + "epoch": 3.5547576301615798, + "grad_norm": 9.21044921875, + "learning_rate": 3.5532315978456018e-06, + "loss": 8.5043, + "step": 39600 + }, + { + "epoch": 3.5570017953321367, + "grad_norm": 8.118090629577637, + "learning_rate": 3.5554757630161584e-06, + "loss": 8.206, + "step": 39625 + }, + { + "epoch": 3.559245960502693, + "grad_norm": 10.023296356201172, + "learning_rate": 3.5577199281867146e-06, + "loss": 8.2613, + "step": 39650 + }, + { + "epoch": 3.5614901256732496, + "grad_norm": 7.961179256439209, + "learning_rate": 3.5599640933572716e-06, + "loss": 8.2394, + "step": 39675 + }, + { + "epoch": 3.563734290843806, + "grad_norm": 8.081048965454102, + "learning_rate": 3.562208258527828e-06, + "loss": 8.3966, + "step": 39700 + }, + { + "epoch": 3.5659784560143626, + "grad_norm": 8.523356437683105, + "learning_rate": 3.564452423698385e-06, + "loss": 8.4043, + "step": 39725 + }, + { + "epoch": 3.568222621184919, + "grad_norm": 8.040804862976074, + "learning_rate": 3.566696588868941e-06, + "loss": 8.2411, + "step": 39750 + }, + { + "epoch": 3.5704667863554755, + "grad_norm": 8.990389823913574, + "learning_rate": 3.5689407540394976e-06, + "loss": 8.2419, + "step": 39775 + }, + { + "epoch": 3.5727109515260325, + "grad_norm": 10.103216171264648, + "learning_rate": 3.571184919210054e-06, + "loss": 8.4738, + "step": 39800 + }, + { + "epoch": 3.574955116696589, + "grad_norm": 11.343913078308105, + "learning_rate": 3.573429084380611e-06, + "loss": 8.3062, + "step": 39825 + }, + { + "epoch": 3.5771992818671454, + "grad_norm": 9.066844940185547, + "learning_rate": 3.575673249551167e-06, + "loss": 8.2836, + "step": 39850 + }, + { + "epoch": 3.579443447037702, + "grad_norm": 8.558988571166992, + "learning_rate": 3.5779174147217237e-06, + "loss": 8.447, + "step": 39875 + }, + { + "epoch": 3.5816876122082584, + "grad_norm": 8.114757537841797, + "learning_rate": 3.5801615798922807e-06, + "loss": 8.4537, + "step": 39900 + }, + { + "epoch": 3.5839317773788153, + "grad_norm": 9.615211486816406, + "learning_rate": 3.582405745062837e-06, + "loss": 8.3362, + "step": 39925 + }, + { + "epoch": 3.5861759425493718, + "grad_norm": 8.440463066101074, + "learning_rate": 3.5846499102333935e-06, + "loss": 8.4116, + "step": 39950 + }, + { + "epoch": 3.5884201077199283, + "grad_norm": 8.265457153320312, + "learning_rate": 3.58689407540395e-06, + "loss": 8.1181, + "step": 39975 + }, + { + "epoch": 3.5906642728904847, + "grad_norm": 7.263768196105957, + "learning_rate": 3.5891382405745068e-06, + "loss": 8.324, + "step": 40000 + }, + { + "epoch": 3.592908438061041, + "grad_norm": 9.831427574157715, + "learning_rate": 3.591382405745063e-06, + "loss": 8.4456, + "step": 40025 + }, + { + "epoch": 3.5951526032315977, + "grad_norm": 11.402565956115723, + "learning_rate": 3.59362657091562e-06, + "loss": 8.2855, + "step": 40050 + }, + { + "epoch": 3.597396768402154, + "grad_norm": 8.705607414245605, + "learning_rate": 3.595870736086176e-06, + "loss": 8.3264, + "step": 40075 + }, + { + "epoch": 3.599640933572711, + "grad_norm": 8.383180618286133, + "learning_rate": 3.598114901256733e-06, + "loss": 8.429, + "step": 40100 + }, + { + "epoch": 3.6018850987432676, + "grad_norm": 9.497991561889648, + "learning_rate": 3.600359066427289e-06, + "loss": 8.2379, + "step": 40125 + }, + { + "epoch": 3.604129263913824, + "grad_norm": 8.290682792663574, + "learning_rate": 3.602603231597846e-06, + "loss": 8.3201, + "step": 40150 + }, + { + "epoch": 3.6063734290843805, + "grad_norm": 11.618953704833984, + "learning_rate": 3.604847396768402e-06, + "loss": 8.3421, + "step": 40175 + }, + { + "epoch": 3.608617594254937, + "grad_norm": 7.35775899887085, + "learning_rate": 3.6070915619389592e-06, + "loss": 8.5392, + "step": 40200 + }, + { + "epoch": 3.610861759425494, + "grad_norm": 9.28359317779541, + "learning_rate": 3.6093357271095154e-06, + "loss": 8.3286, + "step": 40225 + }, + { + "epoch": 3.6131059245960504, + "grad_norm": 8.100897789001465, + "learning_rate": 3.611579892280072e-06, + "loss": 8.2477, + "step": 40250 + }, + { + "epoch": 3.615350089766607, + "grad_norm": 7.670331954956055, + "learning_rate": 3.613824057450629e-06, + "loss": 8.3222, + "step": 40275 + }, + { + "epoch": 3.6175942549371634, + "grad_norm": 7.4340291023254395, + "learning_rate": 3.6160682226211853e-06, + "loss": 8.1987, + "step": 40300 + }, + { + "epoch": 3.61983842010772, + "grad_norm": 7.888827800750732, + "learning_rate": 3.618312387791742e-06, + "loss": 8.4575, + "step": 40325 + }, + { + "epoch": 3.6220825852782763, + "grad_norm": 9.16313362121582, + "learning_rate": 3.620556552962298e-06, + "loss": 8.3848, + "step": 40350 + }, + { + "epoch": 3.624326750448833, + "grad_norm": 8.803550720214844, + "learning_rate": 3.622800718132855e-06, + "loss": 8.3429, + "step": 40375 + }, + { + "epoch": 3.6265709156193897, + "grad_norm": 7.600166320800781, + "learning_rate": 3.6250448833034113e-06, + "loss": 8.2939, + "step": 40400 + }, + { + "epoch": 3.628815080789946, + "grad_norm": 7.702960014343262, + "learning_rate": 3.627289048473968e-06, + "loss": 8.3015, + "step": 40425 + }, + { + "epoch": 3.6310592459605027, + "grad_norm": 7.661130428314209, + "learning_rate": 3.6295332136445246e-06, + "loss": 8.4575, + "step": 40450 + }, + { + "epoch": 3.633303411131059, + "grad_norm": 7.767815589904785, + "learning_rate": 3.631777378815081e-06, + "loss": 8.2538, + "step": 40475 + }, + { + "epoch": 3.635547576301616, + "grad_norm": 8.500043869018555, + "learning_rate": 3.6340215439856374e-06, + "loss": 8.3254, + "step": 40500 + }, + { + "epoch": 3.6377917414721725, + "grad_norm": 8.212271690368652, + "learning_rate": 3.6362657091561944e-06, + "loss": 8.3106, + "step": 40525 + }, + { + "epoch": 3.640035906642729, + "grad_norm": 8.240195274353027, + "learning_rate": 3.6385098743267506e-06, + "loss": 8.2548, + "step": 40550 + }, + { + "epoch": 3.6422800718132855, + "grad_norm": 10.759029388427734, + "learning_rate": 3.640754039497307e-06, + "loss": 8.3498, + "step": 40575 + }, + { + "epoch": 3.644524236983842, + "grad_norm": 8.688158988952637, + "learning_rate": 3.6429982046678634e-06, + "loss": 8.2466, + "step": 40600 + }, + { + "epoch": 3.6467684021543985, + "grad_norm": 8.226327896118164, + "learning_rate": 3.6452423698384204e-06, + "loss": 8.309, + "step": 40625 + }, + { + "epoch": 3.649012567324955, + "grad_norm": 10.450510025024414, + "learning_rate": 3.647486535008977e-06, + "loss": 8.1518, + "step": 40650 + }, + { + "epoch": 3.6512567324955114, + "grad_norm": 8.103306770324707, + "learning_rate": 3.6497307001795337e-06, + "loss": 8.3519, + "step": 40675 + }, + { + "epoch": 3.6535008976660683, + "grad_norm": 8.451921463012695, + "learning_rate": 3.6519748653500903e-06, + "loss": 8.4337, + "step": 40700 + }, + { + "epoch": 3.655745062836625, + "grad_norm": 8.693678855895996, + "learning_rate": 3.6542190305206465e-06, + "loss": 8.3626, + "step": 40725 + }, + { + "epoch": 3.6579892280071813, + "grad_norm": 9.273821830749512, + "learning_rate": 3.6564631956912035e-06, + "loss": 8.2591, + "step": 40750 + }, + { + "epoch": 3.6602333931777378, + "grad_norm": 7.892919540405273, + "learning_rate": 3.6587073608617597e-06, + "loss": 8.4588, + "step": 40775 + }, + { + "epoch": 3.6624775583482947, + "grad_norm": 11.68966007232666, + "learning_rate": 3.6609515260323163e-06, + "loss": 8.3692, + "step": 40800 + }, + { + "epoch": 3.664721723518851, + "grad_norm": 8.05774211883545, + "learning_rate": 3.6631956912028725e-06, + "loss": 8.4262, + "step": 40825 + }, + { + "epoch": 3.6669658886894076, + "grad_norm": 9.17837905883789, + "learning_rate": 3.6654398563734295e-06, + "loss": 8.3254, + "step": 40850 + }, + { + "epoch": 3.669210053859964, + "grad_norm": 9.078459739685059, + "learning_rate": 3.6676840215439857e-06, + "loss": 8.2493, + "step": 40875 + }, + { + "epoch": 3.6714542190305206, + "grad_norm": 10.644388198852539, + "learning_rate": 3.6699281867145424e-06, + "loss": 8.4128, + "step": 40900 + }, + { + "epoch": 3.673698384201077, + "grad_norm": 9.39470386505127, + "learning_rate": 3.672172351885099e-06, + "loss": 8.2797, + "step": 40925 + }, + { + "epoch": 3.6759425493716336, + "grad_norm": 9.0573091506958, + "learning_rate": 3.6744165170556556e-06, + "loss": 8.2512, + "step": 40950 + }, + { + "epoch": 3.67818671454219, + "grad_norm": 8.01559829711914, + "learning_rate": 3.6766606822262126e-06, + "loss": 8.2267, + "step": 40975 + }, + { + "epoch": 3.680430879712747, + "grad_norm": 7.755706787109375, + "learning_rate": 3.678904847396769e-06, + "loss": 8.4229, + "step": 41000 + }, + { + "epoch": 3.6826750448833034, + "grad_norm": 9.516419410705566, + "learning_rate": 3.6811490125673254e-06, + "loss": 8.2804, + "step": 41025 + }, + { + "epoch": 3.68491921005386, + "grad_norm": 7.936137676239014, + "learning_rate": 3.6833931777378816e-06, + "loss": 8.2776, + "step": 41050 + }, + { + "epoch": 3.6871633752244164, + "grad_norm": 7.868906021118164, + "learning_rate": 3.6856373429084387e-06, + "loss": 8.1772, + "step": 41075 + }, + { + "epoch": 3.6894075403949733, + "grad_norm": 7.604074954986572, + "learning_rate": 3.687881508078995e-06, + "loss": 8.3965, + "step": 41100 + }, + { + "epoch": 3.69165170556553, + "grad_norm": 10.996535301208496, + "learning_rate": 3.6901256732495515e-06, + "loss": 8.5092, + "step": 41125 + }, + { + "epoch": 3.6938958707360863, + "grad_norm": 9.287117004394531, + "learning_rate": 3.692369838420108e-06, + "loss": 8.3517, + "step": 41150 + }, + { + "epoch": 3.6961400359066428, + "grad_norm": 8.979403495788574, + "learning_rate": 3.6946140035906647e-06, + "loss": 8.3489, + "step": 41175 + }, + { + "epoch": 3.6983842010771992, + "grad_norm": 8.088530540466309, + "learning_rate": 3.696858168761221e-06, + "loss": 8.336, + "step": 41200 + }, + { + "epoch": 3.7006283662477557, + "grad_norm": 9.082919120788574, + "learning_rate": 3.699102333931778e-06, + "loss": 8.4385, + "step": 41225 + }, + { + "epoch": 3.702872531418312, + "grad_norm": 8.980606079101562, + "learning_rate": 3.701346499102334e-06, + "loss": 8.3228, + "step": 41250 + }, + { + "epoch": 3.7051166965888687, + "grad_norm": 8.932125091552734, + "learning_rate": 3.7035906642728907e-06, + "loss": 8.4496, + "step": 41275 + }, + { + "epoch": 3.7073608617594256, + "grad_norm": 7.995636463165283, + "learning_rate": 3.705834829443447e-06, + "loss": 8.4033, + "step": 41300 + }, + { + "epoch": 3.709605026929982, + "grad_norm": 8.3015718460083, + "learning_rate": 3.708078994614004e-06, + "loss": 8.233, + "step": 41325 + }, + { + "epoch": 3.7118491921005385, + "grad_norm": 9.889580726623535, + "learning_rate": 3.7103231597845606e-06, + "loss": 8.2445, + "step": 41350 + }, + { + "epoch": 3.714093357271095, + "grad_norm": 9.983036994934082, + "learning_rate": 3.7125673249551168e-06, + "loss": 8.326, + "step": 41375 + }, + { + "epoch": 3.716337522441652, + "grad_norm": 8.230652809143066, + "learning_rate": 3.714811490125674e-06, + "loss": 8.198, + "step": 41400 + }, + { + "epoch": 3.7185816876122084, + "grad_norm": 8.414843559265137, + "learning_rate": 3.71705565529623e-06, + "loss": 8.4113, + "step": 41425 + }, + { + "epoch": 3.720825852782765, + "grad_norm": 9.34349536895752, + "learning_rate": 3.7192100538599644e-06, + "loss": 8.2484, + "step": 41450 + }, + { + "epoch": 3.7230700179533214, + "grad_norm": 8.631826400756836, + "learning_rate": 3.721454219030521e-06, + "loss": 8.2508, + "step": 41475 + }, + { + "epoch": 3.725314183123878, + "grad_norm": 7.413135528564453, + "learning_rate": 3.7236983842010777e-06, + "loss": 8.2593, + "step": 41500 + }, + { + "epoch": 3.7275583482944343, + "grad_norm": 8.66684341430664, + "learning_rate": 3.725942549371634e-06, + "loss": 8.3996, + "step": 41525 + }, + { + "epoch": 3.729802513464991, + "grad_norm": 9.377883911132812, + "learning_rate": 3.728186714542191e-06, + "loss": 8.2055, + "step": 41550 + }, + { + "epoch": 3.7320466786355477, + "grad_norm": 8.098823547363281, + "learning_rate": 3.730430879712747e-06, + "loss": 8.2578, + "step": 41575 + }, + { + "epoch": 3.734290843806104, + "grad_norm": 7.389435291290283, + "learning_rate": 3.7326750448833037e-06, + "loss": 8.3937, + "step": 41600 + }, + { + "epoch": 3.7365350089766607, + "grad_norm": 12.930161476135254, + "learning_rate": 3.73491921005386e-06, + "loss": 8.3227, + "step": 41625 + }, + { + "epoch": 3.738779174147217, + "grad_norm": 10.382100105285645, + "learning_rate": 3.737163375224417e-06, + "loss": 8.1333, + "step": 41650 + }, + { + "epoch": 3.7410233393177736, + "grad_norm": 8.136540412902832, + "learning_rate": 3.739407540394973e-06, + "loss": 8.3401, + "step": 41675 + }, + { + "epoch": 3.7432675044883306, + "grad_norm": 7.8093037605285645, + "learning_rate": 3.7416517055655297e-06, + "loss": 8.3896, + "step": 41700 + }, + { + "epoch": 3.745511669658887, + "grad_norm": 9.72205924987793, + "learning_rate": 3.7438958707360868e-06, + "loss": 8.4024, + "step": 41725 + }, + { + "epoch": 3.7477558348294435, + "grad_norm": 9.21755599975586, + "learning_rate": 3.746140035906643e-06, + "loss": 8.2202, + "step": 41750 + }, + { + "epoch": 3.75, + "grad_norm": 8.624565124511719, + "learning_rate": 3.7483842010772e-06, + "loss": 8.325, + "step": 41775 + }, + { + "epoch": 3.7522441651705565, + "grad_norm": 8.489578247070312, + "learning_rate": 3.750628366247756e-06, + "loss": 8.4487, + "step": 41800 + }, + { + "epoch": 3.754488330341113, + "grad_norm": 8.510927200317383, + "learning_rate": 3.752872531418313e-06, + "loss": 8.3763, + "step": 41825 + }, + { + "epoch": 3.7567324955116694, + "grad_norm": 9.157064437866211, + "learning_rate": 3.755116696588869e-06, + "loss": 8.1804, + "step": 41850 + }, + { + "epoch": 3.7589766606822264, + "grad_norm": 7.9568705558776855, + "learning_rate": 3.757360861759426e-06, + "loss": 8.3108, + "step": 41875 + }, + { + "epoch": 3.761220825852783, + "grad_norm": 7.945227146148682, + "learning_rate": 3.7596050269299822e-06, + "loss": 8.1683, + "step": 41900 + }, + { + "epoch": 3.7634649910233393, + "grad_norm": 9.791983604431152, + "learning_rate": 3.761849192100539e-06, + "loss": 8.2882, + "step": 41925 + }, + { + "epoch": 3.765709156193896, + "grad_norm": 8.377614974975586, + "learning_rate": 3.7640933572710955e-06, + "loss": 8.2522, + "step": 41950 + }, + { + "epoch": 3.7679533213644523, + "grad_norm": 8.209891319274902, + "learning_rate": 3.766337522441652e-06, + "loss": 8.2394, + "step": 41975 + }, + { + "epoch": 3.770197486535009, + "grad_norm": 8.042098045349121, + "learning_rate": 3.7685816876122083e-06, + "loss": 8.3602, + "step": 42000 + }, + { + "epoch": 3.7724416517055657, + "grad_norm": 8.398853302001953, + "learning_rate": 3.7708258527827653e-06, + "loss": 8.4373, + "step": 42025 + }, + { + "epoch": 3.774685816876122, + "grad_norm": 7.7743239402771, + "learning_rate": 3.7730700179533215e-06, + "loss": 8.3038, + "step": 42050 + }, + { + "epoch": 3.7769299820466786, + "grad_norm": 7.780357837677002, + "learning_rate": 3.775314183123878e-06, + "loss": 8.2861, + "step": 42075 + }, + { + "epoch": 3.779174147217235, + "grad_norm": 12.501626014709473, + "learning_rate": 3.777558348294435e-06, + "loss": 8.31, + "step": 42100 + }, + { + "epoch": 3.7814183123877916, + "grad_norm": 9.456903457641602, + "learning_rate": 3.7798025134649914e-06, + "loss": 8.3087, + "step": 42125 + }, + { + "epoch": 3.783662477558348, + "grad_norm": 9.534124374389648, + "learning_rate": 3.782046678635548e-06, + "loss": 8.3802, + "step": 42150 + }, + { + "epoch": 3.785906642728905, + "grad_norm": 7.823215484619141, + "learning_rate": 3.784290843806104e-06, + "loss": 8.2951, + "step": 42175 + }, + { + "epoch": 3.7881508078994615, + "grad_norm": 14.413287162780762, + "learning_rate": 3.786535008976661e-06, + "loss": 8.4091, + "step": 42200 + }, + { + "epoch": 3.790394973070018, + "grad_norm": 10.951684951782227, + "learning_rate": 3.7887791741472174e-06, + "loss": 8.3899, + "step": 42225 + }, + { + "epoch": 3.7926391382405744, + "grad_norm": 9.607998847961426, + "learning_rate": 3.7910233393177744e-06, + "loss": 8.2123, + "step": 42250 + }, + { + "epoch": 3.7948833034111313, + "grad_norm": 9.62059497833252, + "learning_rate": 3.7932675044883306e-06, + "loss": 8.2479, + "step": 42275 + }, + { + "epoch": 3.797127468581688, + "grad_norm": 8.516510009765625, + "learning_rate": 3.7955116696588872e-06, + "loss": 8.4056, + "step": 42300 + }, + { + "epoch": 3.7993716337522443, + "grad_norm": 8.543976783752441, + "learning_rate": 3.7977558348294434e-06, + "loss": 8.2546, + "step": 42325 + }, + { + "epoch": 3.8016157989228008, + "grad_norm": 8.27404499053955, + "learning_rate": 3.8000000000000005e-06, + "loss": 8.3844, + "step": 42350 + }, + { + "epoch": 3.8038599640933572, + "grad_norm": 8.638191223144531, + "learning_rate": 3.8022441651705567e-06, + "loss": 8.2951, + "step": 42375 + }, + { + "epoch": 3.8061041292639137, + "grad_norm": 8.855729103088379, + "learning_rate": 3.8044883303411133e-06, + "loss": 8.2777, + "step": 42400 + }, + { + "epoch": 3.80834829443447, + "grad_norm": 9.143380165100098, + "learning_rate": 3.8067324955116703e-06, + "loss": 8.2771, + "step": 42425 + }, + { + "epoch": 3.8105924596050267, + "grad_norm": 9.00294303894043, + "learning_rate": 3.8089766606822265e-06, + "loss": 8.3551, + "step": 42450 + }, + { + "epoch": 3.8128366247755836, + "grad_norm": 8.608232498168945, + "learning_rate": 3.811220825852783e-06, + "loss": 8.2806, + "step": 42475 + }, + { + "epoch": 3.81508078994614, + "grad_norm": 9.770371437072754, + "learning_rate": 3.8134649910233397e-06, + "loss": 8.1132, + "step": 42500 + }, + { + "epoch": 3.8173249551166966, + "grad_norm": 9.981130599975586, + "learning_rate": 3.815709156193896e-06, + "loss": 8.2193, + "step": 42525 + }, + { + "epoch": 3.819569120287253, + "grad_norm": 8.655659675598145, + "learning_rate": 3.817953321364453e-06, + "loss": 8.0342, + "step": 42550 + }, + { + "epoch": 3.82181328545781, + "grad_norm": 9.751129150390625, + "learning_rate": 3.8201974865350096e-06, + "loss": 8.3176, + "step": 42575 + }, + { + "epoch": 3.8240574506283664, + "grad_norm": 10.087747573852539, + "learning_rate": 3.822441651705565e-06, + "loss": 8.1655, + "step": 42600 + }, + { + "epoch": 3.826301615798923, + "grad_norm": 8.403882026672363, + "learning_rate": 3.824685816876123e-06, + "loss": 8.4748, + "step": 42625 + }, + { + "epoch": 3.8285457809694794, + "grad_norm": 8.020122528076172, + "learning_rate": 3.8269299820466786e-06, + "loss": 8.4702, + "step": 42650 + }, + { + "epoch": 3.830789946140036, + "grad_norm": 7.550715923309326, + "learning_rate": 3.829174147217235e-06, + "loss": 8.2511, + "step": 42675 + }, + { + "epoch": 3.8330341113105924, + "grad_norm": 8.016020774841309, + "learning_rate": 3.831418312387792e-06, + "loss": 8.2295, + "step": 42700 + }, + { + "epoch": 3.835278276481149, + "grad_norm": 9.088480949401855, + "learning_rate": 3.833662477558348e-06, + "loss": 8.379, + "step": 42725 + }, + { + "epoch": 3.8375224416517053, + "grad_norm": 9.801566123962402, + "learning_rate": 3.835906642728905e-06, + "loss": 8.2105, + "step": 42750 + }, + { + "epoch": 3.8397666068222622, + "grad_norm": 9.2017240524292, + "learning_rate": 3.838150807899462e-06, + "loss": 8.3351, + "step": 42775 + }, + { + "epoch": 3.8420107719928187, + "grad_norm": 9.83630084991455, + "learning_rate": 3.840394973070018e-06, + "loss": 8.2609, + "step": 42800 + }, + { + "epoch": 3.844254937163375, + "grad_norm": 9.469968795776367, + "learning_rate": 3.842639138240575e-06, + "loss": 8.2198, + "step": 42825 + }, + { + "epoch": 3.8464991023339317, + "grad_norm": 8.29370403289795, + "learning_rate": 3.8448833034111315e-06, + "loss": 8.4149, + "step": 42850 + }, + { + "epoch": 3.8487432675044886, + "grad_norm": 8.2186279296875, + "learning_rate": 3.847127468581688e-06, + "loss": 8.2193, + "step": 42875 + }, + { + "epoch": 3.850987432675045, + "grad_norm": 7.934107780456543, + "learning_rate": 3.849371633752245e-06, + "loss": 8.392, + "step": 42900 + }, + { + "epoch": 3.8532315978456015, + "grad_norm": 11.870473861694336, + "learning_rate": 3.8516157989228005e-06, + "loss": 8.405, + "step": 42925 + }, + { + "epoch": 3.855475763016158, + "grad_norm": 9.461506843566895, + "learning_rate": 3.853859964093358e-06, + "loss": 8.2856, + "step": 42950 + }, + { + "epoch": 3.8577199281867145, + "grad_norm": 9.932682037353516, + "learning_rate": 3.856104129263914e-06, + "loss": 8.2825, + "step": 42975 + }, + { + "epoch": 3.859964093357271, + "grad_norm": 9.45221996307373, + "learning_rate": 3.858348294434471e-06, + "loss": 8.2453, + "step": 43000 + }, + { + "epoch": 3.8622082585278275, + "grad_norm": 8.130779266357422, + "learning_rate": 3.860592459605027e-06, + "loss": 8.2091, + "step": 43025 + }, + { + "epoch": 3.864452423698384, + "grad_norm": 9.362173080444336, + "learning_rate": 3.8628366247755836e-06, + "loss": 8.2232, + "step": 43050 + }, + { + "epoch": 3.866696588868941, + "grad_norm": 9.031453132629395, + "learning_rate": 3.86508078994614e-06, + "loss": 8.3107, + "step": 43075 + }, + { + "epoch": 3.8689407540394973, + "grad_norm": 8.614235877990723, + "learning_rate": 3.867324955116697e-06, + "loss": 8.2445, + "step": 43100 + }, + { + "epoch": 3.871184919210054, + "grad_norm": 9.952296257019043, + "learning_rate": 3.869569120287253e-06, + "loss": 8.236, + "step": 43125 + }, + { + "epoch": 3.8734290843806103, + "grad_norm": 9.180778503417969, + "learning_rate": 3.87181328545781e-06, + "loss": 8.1633, + "step": 43150 + }, + { + "epoch": 3.875673249551167, + "grad_norm": 8.67719841003418, + "learning_rate": 3.874057450628367e-06, + "loss": 8.394, + "step": 43175 + }, + { + "epoch": 3.8779174147217237, + "grad_norm": 8.350943565368652, + "learning_rate": 3.876301615798923e-06, + "loss": 8.1501, + "step": 43200 + }, + { + "epoch": 3.88016157989228, + "grad_norm": 9.305023193359375, + "learning_rate": 3.87854578096948e-06, + "loss": 8.2212, + "step": 43225 + }, + { + "epoch": 3.8824057450628366, + "grad_norm": 9.525959014892578, + "learning_rate": 3.8807899461400365e-06, + "loss": 8.2643, + "step": 43250 + }, + { + "epoch": 3.884649910233393, + "grad_norm": 8.021195411682129, + "learning_rate": 3.883034111310593e-06, + "loss": 8.3715, + "step": 43275 + }, + { + "epoch": 3.8868940754039496, + "grad_norm": 7.951926231384277, + "learning_rate": 3.885278276481149e-06, + "loss": 8.1772, + "step": 43300 + }, + { + "epoch": 3.889138240574506, + "grad_norm": 8.531570434570312, + "learning_rate": 3.887522441651706e-06, + "loss": 8.0553, + "step": 43325 + }, + { + "epoch": 3.891382405745063, + "grad_norm": 8.690767288208008, + "learning_rate": 3.889766606822262e-06, + "loss": 8.3126, + "step": 43350 + }, + { + "epoch": 3.8936265709156195, + "grad_norm": 7.7401275634765625, + "learning_rate": 3.892010771992819e-06, + "loss": 8.125, + "step": 43375 + }, + { + "epoch": 3.895870736086176, + "grad_norm": 8.41762924194336, + "learning_rate": 3.894254937163375e-06, + "loss": 8.2249, + "step": 43400 + }, + { + "epoch": 3.8981149012567324, + "grad_norm": 9.897038459777832, + "learning_rate": 3.896499102333932e-06, + "loss": 8.2931, + "step": 43425 + }, + { + "epoch": 3.900359066427289, + "grad_norm": 8.979853630065918, + "learning_rate": 3.8987432675044886e-06, + "loss": 8.216, + "step": 43450 + }, + { + "epoch": 3.902603231597846, + "grad_norm": 9.334939002990723, + "learning_rate": 3.900987432675045e-06, + "loss": 8.2271, + "step": 43475 + }, + { + "epoch": 3.9048473967684023, + "grad_norm": 8.0659818649292, + "learning_rate": 3.903231597845602e-06, + "loss": 8.2379, + "step": 43500 + }, + { + "epoch": 3.907091561938959, + "grad_norm": 8.367196083068848, + "learning_rate": 3.905475763016158e-06, + "loss": 8.2075, + "step": 43525 + }, + { + "epoch": 3.9093357271095153, + "grad_norm": 8.432387351989746, + "learning_rate": 3.907719928186715e-06, + "loss": 8.3094, + "step": 43550 + }, + { + "epoch": 3.9115798922800717, + "grad_norm": 7.76008415222168, + "learning_rate": 3.909964093357272e-06, + "loss": 8.3808, + "step": 43575 + }, + { + "epoch": 3.9138240574506282, + "grad_norm": 9.557611465454102, + "learning_rate": 3.912208258527828e-06, + "loss": 8.3258, + "step": 43600 + }, + { + "epoch": 3.9160682226211847, + "grad_norm": 11.03093433380127, + "learning_rate": 3.914452423698384e-06, + "loss": 8.2946, + "step": 43625 + }, + { + "epoch": 3.9183123877917416, + "grad_norm": 11.794731140136719, + "learning_rate": 3.9166965888689415e-06, + "loss": 8.1212, + "step": 43650 + }, + { + "epoch": 3.920556552962298, + "grad_norm": 7.832529067993164, + "learning_rate": 3.918940754039497e-06, + "loss": 8.2728, + "step": 43675 + }, + { + "epoch": 3.9228007181328546, + "grad_norm": 9.66477108001709, + "learning_rate": 3.921184919210054e-06, + "loss": 8.2414, + "step": 43700 + }, + { + "epoch": 3.925044883303411, + "grad_norm": 12.403948783874512, + "learning_rate": 3.9234290843806105e-06, + "loss": 8.2584, + "step": 43725 + }, + { + "epoch": 3.9272890484739675, + "grad_norm": 7.900536060333252, + "learning_rate": 3.925673249551167e-06, + "loss": 8.3383, + "step": 43750 + }, + { + "epoch": 3.9295332136445245, + "grad_norm": 8.702131271362305, + "learning_rate": 3.927917414721724e-06, + "loss": 8.2107, + "step": 43775 + }, + { + "epoch": 3.931777378815081, + "grad_norm": 11.369967460632324, + "learning_rate": 3.93016157989228e-06, + "loss": 8.0641, + "step": 43800 + }, + { + "epoch": 3.9340215439856374, + "grad_norm": 8.39797592163086, + "learning_rate": 3.932405745062837e-06, + "loss": 8.0224, + "step": 43825 + }, + { + "epoch": 3.936265709156194, + "grad_norm": 8.747532844543457, + "learning_rate": 3.9346499102333936e-06, + "loss": 8.3029, + "step": 43850 + }, + { + "epoch": 3.9385098743267504, + "grad_norm": 9.551450729370117, + "learning_rate": 3.93689407540395e-06, + "loss": 8.2999, + "step": 43875 + }, + { + "epoch": 3.940754039497307, + "grad_norm": 8.531974792480469, + "learning_rate": 3.939138240574507e-06, + "loss": 8.3029, + "step": 43900 + }, + { + "epoch": 3.9429982046678633, + "grad_norm": 8.987381935119629, + "learning_rate": 3.941382405745063e-06, + "loss": 8.2327, + "step": 43925 + }, + { + "epoch": 3.9452423698384202, + "grad_norm": 8.836443901062012, + "learning_rate": 3.94362657091562e-06, + "loss": 8.2755, + "step": 43950 + }, + { + "epoch": 3.9474865350089767, + "grad_norm": 10.0486421585083, + "learning_rate": 3.945870736086177e-06, + "loss": 8.3628, + "step": 43975 + }, + { + "epoch": 3.949730700179533, + "grad_norm": 9.03572940826416, + "learning_rate": 3.948025134649911e-06, + "loss": 8.2788, + "step": 44000 + }, + { + "epoch": 3.9519748653500897, + "grad_norm": 10.578063011169434, + "learning_rate": 3.950269299820467e-06, + "loss": 8.1578, + "step": 44025 + }, + { + "epoch": 3.954219030520646, + "grad_norm": 7.520603179931641, + "learning_rate": 3.952513464991024e-06, + "loss": 8.3085, + "step": 44050 + }, + { + "epoch": 3.956463195691203, + "grad_norm": 9.331971168518066, + "learning_rate": 3.9547576301615805e-06, + "loss": 8.2214, + "step": 44075 + }, + { + "epoch": 3.9587073608617596, + "grad_norm": 9.498579978942871, + "learning_rate": 3.957001795332136e-06, + "loss": 8.1985, + "step": 44100 + }, + { + "epoch": 3.960951526032316, + "grad_norm": 10.716114044189453, + "learning_rate": 3.959245960502694e-06, + "loss": 8.1206, + "step": 44125 + }, + { + "epoch": 3.9631956912028725, + "grad_norm": 8.65717601776123, + "learning_rate": 3.9614901256732495e-06, + "loss": 8.2506, + "step": 44150 + }, + { + "epoch": 3.965439856373429, + "grad_norm": 7.821218967437744, + "learning_rate": 3.963734290843806e-06, + "loss": 8.2797, + "step": 44175 + }, + { + "epoch": 3.9676840215439855, + "grad_norm": 11.484996795654297, + "learning_rate": 3.965978456014363e-06, + "loss": 8.2465, + "step": 44200 + }, + { + "epoch": 3.969928186714542, + "grad_norm": 12.3637056350708, + "learning_rate": 3.968222621184919e-06, + "loss": 8.1147, + "step": 44225 + }, + { + "epoch": 3.972172351885099, + "grad_norm": 9.280914306640625, + "learning_rate": 3.970466786355476e-06, + "loss": 8.1095, + "step": 44250 + }, + { + "epoch": 3.9744165170556554, + "grad_norm": 8.594542503356934, + "learning_rate": 3.9727109515260326e-06, + "loss": 8.1224, + "step": 44275 + }, + { + "epoch": 3.976660682226212, + "grad_norm": 8.143704414367676, + "learning_rate": 3.974955116696589e-06, + "loss": 8.349, + "step": 44300 + }, + { + "epoch": 3.9789048473967683, + "grad_norm": 9.197824478149414, + "learning_rate": 3.977199281867146e-06, + "loss": 8.217, + "step": 44325 + }, + { + "epoch": 3.9811490125673252, + "grad_norm": 9.907508850097656, + "learning_rate": 3.979443447037702e-06, + "loss": 8.3232, + "step": 44350 + }, + { + "epoch": 3.9833931777378817, + "grad_norm": 7.831602573394775, + "learning_rate": 3.981687612208259e-06, + "loss": 8.1862, + "step": 44375 + }, + { + "epoch": 3.985637342908438, + "grad_norm": 9.781538963317871, + "learning_rate": 3.983931777378816e-06, + "loss": 8.3364, + "step": 44400 + }, + { + "epoch": 3.9878815080789947, + "grad_norm": 8.211779594421387, + "learning_rate": 3.986175942549371e-06, + "loss": 8.2245, + "step": 44425 + }, + { + "epoch": 3.990125673249551, + "grad_norm": 8.28576946258545, + "learning_rate": 3.988420107719929e-06, + "loss": 8.0935, + "step": 44450 + }, + { + "epoch": 3.9923698384201076, + "grad_norm": 9.723760604858398, + "learning_rate": 3.990664272890485e-06, + "loss": 8.2031, + "step": 44475 + }, + { + "epoch": 3.994614003590664, + "grad_norm": 8.631284713745117, + "learning_rate": 3.992908438061041e-06, + "loss": 8.32, + "step": 44500 + }, + { + "epoch": 3.9968581687612206, + "grad_norm": 11.13822078704834, + "learning_rate": 3.995152603231598e-06, + "loss": 8.3534, + "step": 44525 + }, + { + "epoch": 3.9991023339317775, + "grad_norm": 8.318763732910156, + "learning_rate": 3.9973967684021545e-06, + "loss": 8.2052, + "step": 44550 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.03003455979225937, + "eval_f1_macro": 0.0001761674872119062, + "eval_f1_micro": 0.03003455979225937, + "eval_f1_weighted": 0.0051314235338266664, + "eval_loss": 8.536274909973145, + "eval_precision_macro": 0.00013117423221155152, + "eval_precision_micro": 0.03003455979225937, + "eval_precision_weighted": 0.003437965617655943, + "eval_recall_macro": 0.0008720015473292466, + "eval_recall_micro": 0.03003455979225937, + "eval_recall_weighted": 0.03003455979225937, + "eval_runtime": 132.3085, + "eval_samples_per_second": 395.84, + "eval_steps_per_second": 12.373, + "step": 44560 + }, + { + "epoch": 4.0013464991023335, + "grad_norm": 8.784418106079102, + "learning_rate": 3.999640933572712e-06, + "loss": 8.1789, + "step": 44575 + }, + { + "epoch": 4.003590664272891, + "grad_norm": 9.5264892578125, + "learning_rate": 4.001885098743268e-06, + "loss": 8.1409, + "step": 44600 + }, + { + "epoch": 4.005834829443447, + "grad_norm": 9.171372413635254, + "learning_rate": 4.004129263913824e-06, + "loss": 8.0667, + "step": 44625 + }, + { + "epoch": 4.008078994614004, + "grad_norm": 7.8141560554504395, + "learning_rate": 4.006373429084381e-06, + "loss": 8.0465, + "step": 44650 + }, + { + "epoch": 4.01032315978456, + "grad_norm": 7.2488179206848145, + "learning_rate": 4.0086175942549376e-06, + "loss": 8.2195, + "step": 44675 + }, + { + "epoch": 4.012567324955117, + "grad_norm": 10.660920143127441, + "learning_rate": 4.010861759425494e-06, + "loss": 8.137, + "step": 44700 + }, + { + "epoch": 4.014811490125673, + "grad_norm": 8.105512619018555, + "learning_rate": 4.013105924596051e-06, + "loss": 7.9276, + "step": 44725 + }, + { + "epoch": 4.01705565529623, + "grad_norm": 7.903000354766846, + "learning_rate": 4.015350089766607e-06, + "loss": 8.0534, + "step": 44750 + }, + { + "epoch": 4.019299820466786, + "grad_norm": 10.23942756652832, + "learning_rate": 4.017594254937164e-06, + "loss": 8.0853, + "step": 44775 + }, + { + "epoch": 4.021543985637343, + "grad_norm": 9.815914154052734, + "learning_rate": 4.01983842010772e-06, + "loss": 8.1471, + "step": 44800 + }, + { + "epoch": 4.023788150807899, + "grad_norm": 8.272306442260742, + "learning_rate": 4.022082585278277e-06, + "loss": 8.3152, + "step": 44825 + }, + { + "epoch": 4.026032315978456, + "grad_norm": 12.039443016052246, + "learning_rate": 4.024326750448833e-06, + "loss": 8.0939, + "step": 44850 + }, + { + "epoch": 4.028276481149012, + "grad_norm": 8.230140686035156, + "learning_rate": 4.02657091561939e-06, + "loss": 8.19, + "step": 44875 + }, + { + "epoch": 4.0305206463195695, + "grad_norm": 9.657529830932617, + "learning_rate": 4.028815080789946e-06, + "loss": 8.0444, + "step": 44900 + }, + { + "epoch": 4.032764811490126, + "grad_norm": 10.48708438873291, + "learning_rate": 4.031059245960503e-06, + "loss": 8.1514, + "step": 44925 + }, + { + "epoch": 4.0350089766606825, + "grad_norm": 8.202802658081055, + "learning_rate": 4.0333034111310595e-06, + "loss": 8.0464, + "step": 44950 + }, + { + "epoch": 4.037253141831239, + "grad_norm": 9.004851341247559, + "learning_rate": 4.035547576301616e-06, + "loss": 8.1634, + "step": 44975 + }, + { + "epoch": 4.039497307001795, + "grad_norm": 9.984739303588867, + "learning_rate": 4.037791741472173e-06, + "loss": 8.0701, + "step": 45000 + }, + { + "epoch": 4.041741472172352, + "grad_norm": 8.944620132446289, + "learning_rate": 4.040035906642729e-06, + "loss": 8.1532, + "step": 45025 + }, + { + "epoch": 4.043985637342908, + "grad_norm": 8.686067581176758, + "learning_rate": 4.042280071813286e-06, + "loss": 8.1306, + "step": 45050 + }, + { + "epoch": 4.046229802513465, + "grad_norm": 9.351990699768066, + "learning_rate": 4.0445242369838426e-06, + "loss": 8.1477, + "step": 45075 + }, + { + "epoch": 4.048473967684021, + "grad_norm": 8.635560035705566, + "learning_rate": 4.046768402154399e-06, + "loss": 8.1233, + "step": 45100 + }, + { + "epoch": 4.050718132854578, + "grad_norm": 9.196134567260742, + "learning_rate": 4.049012567324955e-06, + "loss": 8.1306, + "step": 45125 + }, + { + "epoch": 4.052962298025134, + "grad_norm": 8.416738510131836, + "learning_rate": 4.051256732495512e-06, + "loss": 7.9753, + "step": 45150 + }, + { + "epoch": 4.055206463195691, + "grad_norm": 8.684197425842285, + "learning_rate": 4.053500897666068e-06, + "loss": 8.1513, + "step": 45175 + }, + { + "epoch": 4.057450628366248, + "grad_norm": 8.100091934204102, + "learning_rate": 4.055745062836625e-06, + "loss": 8.0974, + "step": 45200 + }, + { + "epoch": 4.059694793536805, + "grad_norm": 8.44166374206543, + "learning_rate": 4.057989228007181e-06, + "loss": 8.1357, + "step": 45225 + }, + { + "epoch": 4.061938958707361, + "grad_norm": 8.326380729675293, + "learning_rate": 4.060233393177738e-06, + "loss": 8.1153, + "step": 45250 + }, + { + "epoch": 4.064183123877918, + "grad_norm": 9.202635765075684, + "learning_rate": 4.062477558348295e-06, + "loss": 8.0502, + "step": 45275 + }, + { + "epoch": 4.066427289048474, + "grad_norm": 9.037820816040039, + "learning_rate": 4.064721723518851e-06, + "loss": 7.9608, + "step": 45300 + }, + { + "epoch": 4.0686714542190305, + "grad_norm": 8.001424789428711, + "learning_rate": 4.066965888689408e-06, + "loss": 8.1224, + "step": 45325 + }, + { + "epoch": 4.070915619389587, + "grad_norm": 8.16597843170166, + "learning_rate": 4.0692100538599645e-06, + "loss": 8.0994, + "step": 45350 + }, + { + "epoch": 4.0731597845601435, + "grad_norm": 8.458403587341309, + "learning_rate": 4.071454219030521e-06, + "loss": 8.1865, + "step": 45375 + }, + { + "epoch": 4.0754039497307, + "grad_norm": 12.866374015808105, + "learning_rate": 4.073698384201078e-06, + "loss": 8.0368, + "step": 45400 + }, + { + "epoch": 4.0776481149012564, + "grad_norm": 8.366022109985352, + "learning_rate": 4.075942549371634e-06, + "loss": 7.8735, + "step": 45425 + }, + { + "epoch": 4.079892280071813, + "grad_norm": 9.208216667175293, + "learning_rate": 4.07818671454219e-06, + "loss": 8.1555, + "step": 45450 + }, + { + "epoch": 4.082136445242369, + "grad_norm": 9.917923927307129, + "learning_rate": 4.0804308797127475e-06, + "loss": 8.0653, + "step": 45475 + }, + { + "epoch": 4.084380610412927, + "grad_norm": 8.583492279052734, + "learning_rate": 4.082675044883303e-06, + "loss": 8.246, + "step": 45500 + }, + { + "epoch": 4.086624775583483, + "grad_norm": 9.626354217529297, + "learning_rate": 4.084919210053861e-06, + "loss": 8.0383, + "step": 45525 + }, + { + "epoch": 4.08886894075404, + "grad_norm": 9.990242004394531, + "learning_rate": 4.0871633752244165e-06, + "loss": 7.9255, + "step": 45550 + }, + { + "epoch": 4.091113105924596, + "grad_norm": 8.687834739685059, + "learning_rate": 4.089407540394973e-06, + "loss": 8.1112, + "step": 45575 + }, + { + "epoch": 4.093357271095153, + "grad_norm": 7.773278713226318, + "learning_rate": 4.09165170556553e-06, + "loss": 8.0942, + "step": 45600 + }, + { + "epoch": 4.095601436265709, + "grad_norm": 10.99388599395752, + "learning_rate": 4.093895870736086e-06, + "loss": 8.1249, + "step": 45625 + }, + { + "epoch": 4.097845601436266, + "grad_norm": 8.59395694732666, + "learning_rate": 4.096140035906643e-06, + "loss": 8.1996, + "step": 45650 + }, + { + "epoch": 4.100089766606822, + "grad_norm": 9.409692764282227, + "learning_rate": 4.0983842010772e-06, + "loss": 8.0168, + "step": 45675 + }, + { + "epoch": 4.102333931777379, + "grad_norm": 12.084576606750488, + "learning_rate": 4.100628366247756e-06, + "loss": 8.1216, + "step": 45700 + }, + { + "epoch": 4.104578096947935, + "grad_norm": 8.440125465393066, + "learning_rate": 4.102872531418313e-06, + "loss": 8.1341, + "step": 45725 + }, + { + "epoch": 4.1068222621184916, + "grad_norm": 11.279613494873047, + "learning_rate": 4.1051166965888695e-06, + "loss": 8.0998, + "step": 45750 + }, + { + "epoch": 4.109066427289049, + "grad_norm": 8.040456771850586, + "learning_rate": 4.107360861759426e-06, + "loss": 7.8948, + "step": 45775 + }, + { + "epoch": 4.111310592459605, + "grad_norm": 9.031113624572754, + "learning_rate": 4.109605026929983e-06, + "loss": 8.1201, + "step": 45800 + }, + { + "epoch": 4.113554757630162, + "grad_norm": 8.932721138000488, + "learning_rate": 4.1118491921005385e-06, + "loss": 8.1405, + "step": 45825 + }, + { + "epoch": 4.115798922800718, + "grad_norm": 11.453323364257812, + "learning_rate": 4.114093357271096e-06, + "loss": 8.2039, + "step": 45850 + }, + { + "epoch": 4.118043087971275, + "grad_norm": 8.722036361694336, + "learning_rate": 4.116337522441652e-06, + "loss": 7.927, + "step": 45875 + }, + { + "epoch": 4.120287253141831, + "grad_norm": 9.37661075592041, + "learning_rate": 4.118581687612208e-06, + "loss": 7.9846, + "step": 45900 + }, + { + "epoch": 4.122531418312388, + "grad_norm": 9.085951805114746, + "learning_rate": 4.120825852782765e-06, + "loss": 8.1419, + "step": 45925 + }, + { + "epoch": 4.124775583482944, + "grad_norm": 10.209589004516602, + "learning_rate": 4.1230700179533215e-06, + "loss": 8.1516, + "step": 45950 + }, + { + "epoch": 4.127019748653501, + "grad_norm": 10.745260238647461, + "learning_rate": 4.125314183123878e-06, + "loss": 8.0876, + "step": 45975 + }, + { + "epoch": 4.129263913824057, + "grad_norm": 9.08796215057373, + "learning_rate": 4.127558348294435e-06, + "loss": 8.0071, + "step": 46000 + }, + { + "epoch": 4.131508078994614, + "grad_norm": 10.327583312988281, + "learning_rate": 4.129802513464991e-06, + "loss": 8.1758, + "step": 46025 + }, + { + "epoch": 4.13375224416517, + "grad_norm": 8.914523124694824, + "learning_rate": 4.132046678635548e-06, + "loss": 8.0895, + "step": 46050 + }, + { + "epoch": 4.135996409335727, + "grad_norm": 8.343753814697266, + "learning_rate": 4.134290843806105e-06, + "loss": 8.1441, + "step": 46075 + }, + { + "epoch": 4.138240574506284, + "grad_norm": 8.222131729125977, + "learning_rate": 4.136535008976661e-06, + "loss": 8.1468, + "step": 46100 + }, + { + "epoch": 4.1404847396768405, + "grad_norm": 9.41507339477539, + "learning_rate": 4.138779174147218e-06, + "loss": 8.1331, + "step": 46125 + }, + { + "epoch": 4.142728904847397, + "grad_norm": 10.11640453338623, + "learning_rate": 4.141023339317774e-06, + "loss": 8.1913, + "step": 46150 + }, + { + "epoch": 4.1449730700179535, + "grad_norm": 8.190299987792969, + "learning_rate": 4.143267504488331e-06, + "loss": 7.933, + "step": 46175 + }, + { + "epoch": 4.14721723518851, + "grad_norm": 8.419903755187988, + "learning_rate": 4.145511669658887e-06, + "loss": 8.0834, + "step": 46200 + }, + { + "epoch": 4.149461400359066, + "grad_norm": 8.314464569091797, + "learning_rate": 4.1477558348294435e-06, + "loss": 8.0564, + "step": 46225 + }, + { + "epoch": 4.151705565529623, + "grad_norm": 7.522350311279297, + "learning_rate": 4.15e-06, + "loss": 8.0428, + "step": 46250 + }, + { + "epoch": 4.153949730700179, + "grad_norm": 10.02529239654541, + "learning_rate": 4.152244165170557e-06, + "loss": 8.0679, + "step": 46275 + }, + { + "epoch": 4.156193895870736, + "grad_norm": 18.154481887817383, + "learning_rate": 4.154488330341113e-06, + "loss": 8.1937, + "step": 46300 + }, + { + "epoch": 4.158438061041292, + "grad_norm": 8.102455139160156, + "learning_rate": 4.15673249551167e-06, + "loss": 8.0914, + "step": 46325 + }, + { + "epoch": 4.160682226211849, + "grad_norm": 9.936589241027832, + "learning_rate": 4.1589766606822265e-06, + "loss": 8.1751, + "step": 46350 + }, + { + "epoch": 4.162926391382406, + "grad_norm": 9.39206314086914, + "learning_rate": 4.161220825852783e-06, + "loss": 8.2797, + "step": 46375 + }, + { + "epoch": 4.165170556552963, + "grad_norm": 10.071934700012207, + "learning_rate": 4.16346499102334e-06, + "loss": 8.0286, + "step": 46400 + }, + { + "epoch": 4.167414721723519, + "grad_norm": 8.919889450073242, + "learning_rate": 4.165709156193896e-06, + "loss": 7.8973, + "step": 46425 + }, + { + "epoch": 4.169658886894076, + "grad_norm": 13.515708923339844, + "learning_rate": 4.167953321364453e-06, + "loss": 8.1225, + "step": 46450 + }, + { + "epoch": 4.171903052064632, + "grad_norm": 8.739055633544922, + "learning_rate": 4.17019748653501e-06, + "loss": 8.1468, + "step": 46475 + }, + { + "epoch": 4.174147217235189, + "grad_norm": 12.023072242736816, + "learning_rate": 4.172441651705566e-06, + "loss": 8.1804, + "step": 46500 + }, + { + "epoch": 4.176391382405745, + "grad_norm": 15.719243049621582, + "learning_rate": 4.174685816876122e-06, + "loss": 8.0761, + "step": 46525 + }, + { + "epoch": 4.1786355475763015, + "grad_norm": 10.254240036010742, + "learning_rate": 4.1769299820466795e-06, + "loss": 8.076, + "step": 46550 + }, + { + "epoch": 4.180879712746858, + "grad_norm": 9.03630542755127, + "learning_rate": 4.179174147217235e-06, + "loss": 7.9282, + "step": 46575 + }, + { + "epoch": 4.1831238779174145, + "grad_norm": 9.983465194702148, + "learning_rate": 4.181418312387792e-06, + "loss": 8.0488, + "step": 46600 + }, + { + "epoch": 4.185368043087971, + "grad_norm": 8.597188949584961, + "learning_rate": 4.1836624775583485e-06, + "loss": 8.1022, + "step": 46625 + }, + { + "epoch": 4.187612208258527, + "grad_norm": 7.557032108306885, + "learning_rate": 4.185906642728905e-06, + "loss": 7.9033, + "step": 46650 + }, + { + "epoch": 4.189856373429085, + "grad_norm": 11.996541976928711, + "learning_rate": 4.188150807899462e-06, + "loss": 8.0517, + "step": 46675 + }, + { + "epoch": 4.192100538599641, + "grad_norm": 10.468984603881836, + "learning_rate": 4.190394973070018e-06, + "loss": 8.0967, + "step": 46700 + }, + { + "epoch": 4.194344703770198, + "grad_norm": 8.843729972839355, + "learning_rate": 4.192639138240575e-06, + "loss": 8.0264, + "step": 46725 + }, + { + "epoch": 4.196588868940754, + "grad_norm": 10.766959190368652, + "learning_rate": 4.1948833034111315e-06, + "loss": 8.1703, + "step": 46750 + }, + { + "epoch": 4.198833034111311, + "grad_norm": 7.770892143249512, + "learning_rate": 4.197127468581688e-06, + "loss": 8.1228, + "step": 46775 + }, + { + "epoch": 4.201077199281867, + "grad_norm": 10.055929183959961, + "learning_rate": 4.199371633752245e-06, + "loss": 8.192, + "step": 46800 + }, + { + "epoch": 4.203321364452424, + "grad_norm": 8.94767951965332, + "learning_rate": 4.201615798922801e-06, + "loss": 7.9892, + "step": 46825 + }, + { + "epoch": 4.20556552962298, + "grad_norm": 10.090816497802734, + "learning_rate": 4.203859964093357e-06, + "loss": 8.0122, + "step": 46850 + }, + { + "epoch": 4.207809694793537, + "grad_norm": 12.820903778076172, + "learning_rate": 4.206104129263915e-06, + "loss": 7.9417, + "step": 46875 + }, + { + "epoch": 4.210053859964093, + "grad_norm": 8.45154094696045, + "learning_rate": 4.20834829443447e-06, + "loss": 8.0421, + "step": 46900 + }, + { + "epoch": 4.21229802513465, + "grad_norm": 10.874791145324707, + "learning_rate": 4.210592459605027e-06, + "loss": 7.9108, + "step": 46925 + }, + { + "epoch": 4.214542190305206, + "grad_norm": 9.312918663024902, + "learning_rate": 4.212836624775584e-06, + "loss": 8.0278, + "step": 46950 + }, + { + "epoch": 4.216786355475763, + "grad_norm": 8.215259552001953, + "learning_rate": 4.21508078994614e-06, + "loss": 8.1515, + "step": 46975 + }, + { + "epoch": 4.21903052064632, + "grad_norm": 9.33811092376709, + "learning_rate": 4.217324955116697e-06, + "loss": 8.1972, + "step": 47000 + }, + { + "epoch": 4.221274685816876, + "grad_norm": 10.72657585144043, + "learning_rate": 4.2195691202872535e-06, + "loss": 8.0595, + "step": 47025 + }, + { + "epoch": 4.223518850987433, + "grad_norm": 9.183622360229492, + "learning_rate": 4.22181328545781e-06, + "loss": 8.0234, + "step": 47050 + }, + { + "epoch": 4.225763016157989, + "grad_norm": 9.66406536102295, + "learning_rate": 4.224057450628367e-06, + "loss": 7.9421, + "step": 47075 + }, + { + "epoch": 4.228007181328546, + "grad_norm": 8.701910018920898, + "learning_rate": 4.226301615798923e-06, + "loss": 8.0223, + "step": 47100 + }, + { + "epoch": 4.230251346499102, + "grad_norm": 8.695223808288574, + "learning_rate": 4.22854578096948e-06, + "loss": 8.0601, + "step": 47125 + }, + { + "epoch": 4.232495511669659, + "grad_norm": 8.322623252868652, + "learning_rate": 4.2307899461400365e-06, + "loss": 8.1035, + "step": 47150 + }, + { + "epoch": 4.234739676840215, + "grad_norm": 7.9379191398620605, + "learning_rate": 4.233034111310593e-06, + "loss": 7.9565, + "step": 47175 + }, + { + "epoch": 4.236983842010772, + "grad_norm": 7.940052509307861, + "learning_rate": 4.23527827648115e-06, + "loss": 8.0966, + "step": 47200 + }, + { + "epoch": 4.239228007181328, + "grad_norm": 9.731016159057617, + "learning_rate": 4.2375224416517055e-06, + "loss": 8.0383, + "step": 47225 + }, + { + "epoch": 4.241472172351885, + "grad_norm": 10.37883472442627, + "learning_rate": 4.239766606822263e-06, + "loss": 8.3094, + "step": 47250 + }, + { + "epoch": 4.243716337522442, + "grad_norm": 8.429099082946777, + "learning_rate": 4.242010771992819e-06, + "loss": 8.0903, + "step": 47275 + }, + { + "epoch": 4.2459605026929985, + "grad_norm": 10.717753410339355, + "learning_rate": 4.244254937163375e-06, + "loss": 8.2059, + "step": 47300 + }, + { + "epoch": 4.248204667863555, + "grad_norm": 7.998884677886963, + "learning_rate": 4.246499102333932e-06, + "loss": 8.1167, + "step": 47325 + }, + { + "epoch": 4.2504488330341115, + "grad_norm": 10.844037055969238, + "learning_rate": 4.248743267504489e-06, + "loss": 7.967, + "step": 47350 + }, + { + "epoch": 4.252692998204668, + "grad_norm": 13.519461631774902, + "learning_rate": 4.250987432675045e-06, + "loss": 8.0841, + "step": 47375 + }, + { + "epoch": 4.254937163375224, + "grad_norm": 8.39925765991211, + "learning_rate": 4.253231597845602e-06, + "loss": 8.1295, + "step": 47400 + }, + { + "epoch": 4.257181328545781, + "grad_norm": 8.509821891784668, + "learning_rate": 4.2554757630161584e-06, + "loss": 8.0972, + "step": 47425 + }, + { + "epoch": 4.259425493716337, + "grad_norm": 9.478416442871094, + "learning_rate": 4.257719928186715e-06, + "loss": 8.1033, + "step": 47450 + }, + { + "epoch": 4.261669658886894, + "grad_norm": 9.039836883544922, + "learning_rate": 4.259964093357272e-06, + "loss": 8.0224, + "step": 47475 + }, + { + "epoch": 4.26391382405745, + "grad_norm": 8.69355583190918, + "learning_rate": 4.262208258527828e-06, + "loss": 8.0009, + "step": 47500 + }, + { + "epoch": 4.266157989228007, + "grad_norm": 9.460662841796875, + "learning_rate": 4.264452423698385e-06, + "loss": 7.9888, + "step": 47525 + }, + { + "epoch": 4.268402154398563, + "grad_norm": 9.148041725158691, + "learning_rate": 4.266696588868941e-06, + "loss": 8.0927, + "step": 47550 + }, + { + "epoch": 4.270646319569121, + "grad_norm": 8.72946834564209, + "learning_rate": 4.268940754039498e-06, + "loss": 8.0669, + "step": 47575 + }, + { + "epoch": 4.272890484739677, + "grad_norm": 8.246726036071777, + "learning_rate": 4.271095152603232e-06, + "loss": 8.1779, + "step": 47600 + }, + { + "epoch": 4.275134649910234, + "grad_norm": 9.315770149230957, + "learning_rate": 4.273339317773789e-06, + "loss": 8.122, + "step": 47625 + }, + { + "epoch": 4.27737881508079, + "grad_norm": 10.185583114624023, + "learning_rate": 4.2755834829443445e-06, + "loss": 7.9686, + "step": 47650 + }, + { + "epoch": 4.279622980251347, + "grad_norm": 9.626177787780762, + "learning_rate": 4.277827648114902e-06, + "loss": 7.9498, + "step": 47675 + }, + { + "epoch": 4.281867145421903, + "grad_norm": 8.164722442626953, + "learning_rate": 4.280071813285458e-06, + "loss": 8.2152, + "step": 47700 + }, + { + "epoch": 4.2841113105924595, + "grad_norm": 10.205259323120117, + "learning_rate": 4.282315978456014e-06, + "loss": 8.052, + "step": 47725 + }, + { + "epoch": 4.286355475763016, + "grad_norm": 9.348794937133789, + "learning_rate": 4.284560143626571e-06, + "loss": 8.1476, + "step": 47750 + }, + { + "epoch": 4.2885996409335725, + "grad_norm": 9.119477272033691, + "learning_rate": 4.286804308797128e-06, + "loss": 7.8755, + "step": 47775 + }, + { + "epoch": 4.290843806104129, + "grad_norm": 8.29890251159668, + "learning_rate": 4.289048473967684e-06, + "loss": 8.006, + "step": 47800 + }, + { + "epoch": 4.293087971274685, + "grad_norm": 12.28895092010498, + "learning_rate": 4.291292639138241e-06, + "loss": 8.1448, + "step": 47825 + }, + { + "epoch": 4.295332136445243, + "grad_norm": 8.218644142150879, + "learning_rate": 4.2935368043087975e-06, + "loss": 7.9366, + "step": 47850 + }, + { + "epoch": 4.297576301615799, + "grad_norm": 9.39483642578125, + "learning_rate": 4.295780969479354e-06, + "loss": 7.9044, + "step": 47875 + }, + { + "epoch": 4.299820466786356, + "grad_norm": 9.416637420654297, + "learning_rate": 4.298025134649911e-06, + "loss": 7.9714, + "step": 47900 + }, + { + "epoch": 4.302064631956912, + "grad_norm": 8.57021427154541, + "learning_rate": 4.300269299820467e-06, + "loss": 8.0873, + "step": 47925 + }, + { + "epoch": 4.304308797127469, + "grad_norm": 10.973342895507812, + "learning_rate": 4.302513464991024e-06, + "loss": 8.0939, + "step": 47950 + }, + { + "epoch": 4.306552962298025, + "grad_norm": 8.582932472229004, + "learning_rate": 4.30475763016158e-06, + "loss": 7.9946, + "step": 47975 + }, + { + "epoch": 4.308797127468582, + "grad_norm": 8.538934707641602, + "learning_rate": 4.307001795332137e-06, + "loss": 7.8542, + "step": 48000 + }, + { + "epoch": 4.311041292639138, + "grad_norm": 10.629801750183105, + "learning_rate": 4.309245960502693e-06, + "loss": 7.8119, + "step": 48025 + }, + { + "epoch": 4.313285457809695, + "grad_norm": 10.983323097229004, + "learning_rate": 4.31149012567325e-06, + "loss": 7.9503, + "step": 48050 + }, + { + "epoch": 4.315529622980251, + "grad_norm": 10.676750183105469, + "learning_rate": 4.313734290843806e-06, + "loss": 8.0544, + "step": 48075 + }, + { + "epoch": 4.317773788150808, + "grad_norm": 11.219866752624512, + "learning_rate": 4.315978456014363e-06, + "loss": 7.9595, + "step": 48100 + }, + { + "epoch": 4.320017953321364, + "grad_norm": 10.167444229125977, + "learning_rate": 4.318222621184919e-06, + "loss": 8.0356, + "step": 48125 + }, + { + "epoch": 4.3222621184919205, + "grad_norm": 9.177806854248047, + "learning_rate": 4.320466786355476e-06, + "loss": 7.9603, + "step": 48150 + }, + { + "epoch": 4.324506283662478, + "grad_norm": 8.792152404785156, + "learning_rate": 4.322710951526033e-06, + "loss": 8.1811, + "step": 48175 + }, + { + "epoch": 4.326750448833034, + "grad_norm": 9.369024276733398, + "learning_rate": 4.324955116696589e-06, + "loss": 7.8245, + "step": 48200 + }, + { + "epoch": 4.328994614003591, + "grad_norm": 8.799886703491211, + "learning_rate": 4.327199281867146e-06, + "loss": 8.0326, + "step": 48225 + }, + { + "epoch": 4.331238779174147, + "grad_norm": 8.818221092224121, + "learning_rate": 4.3294434470377024e-06, + "loss": 7.8299, + "step": 48250 + }, + { + "epoch": 4.333482944344704, + "grad_norm": 7.784689903259277, + "learning_rate": 4.331687612208259e-06, + "loss": 7.8415, + "step": 48275 + }, + { + "epoch": 4.33572710951526, + "grad_norm": 8.581801414489746, + "learning_rate": 4.333931777378816e-06, + "loss": 8.079, + "step": 48300 + }, + { + "epoch": 4.337971274685817, + "grad_norm": 8.420843124389648, + "learning_rate": 4.336175942549372e-06, + "loss": 7.8987, + "step": 48325 + }, + { + "epoch": 4.340215439856373, + "grad_norm": 9.560523986816406, + "learning_rate": 4.338420107719928e-06, + "loss": 8.0118, + "step": 48350 + }, + { + "epoch": 4.34245960502693, + "grad_norm": 9.757198333740234, + "learning_rate": 4.3406642728904855e-06, + "loss": 7.9114, + "step": 48375 + }, + { + "epoch": 4.344703770197486, + "grad_norm": 8.577818870544434, + "learning_rate": 4.342908438061041e-06, + "loss": 8.1132, + "step": 48400 + }, + { + "epoch": 4.346947935368043, + "grad_norm": 11.025150299072266, + "learning_rate": 4.345152603231598e-06, + "loss": 8.1222, + "step": 48425 + }, + { + "epoch": 4.3491921005386, + "grad_norm": 9.280269622802734, + "learning_rate": 4.3473967684021545e-06, + "loss": 7.9931, + "step": 48450 + }, + { + "epoch": 4.3514362657091565, + "grad_norm": 8.469796180725098, + "learning_rate": 4.349640933572711e-06, + "loss": 7.9924, + "step": 48475 + }, + { + "epoch": 4.353680430879713, + "grad_norm": 9.396141052246094, + "learning_rate": 4.351885098743268e-06, + "loss": 8.073, + "step": 48500 + }, + { + "epoch": 4.3559245960502695, + "grad_norm": 11.450509071350098, + "learning_rate": 4.354129263913824e-06, + "loss": 8.2017, + "step": 48525 + }, + { + "epoch": 4.358168761220826, + "grad_norm": 7.670708179473877, + "learning_rate": 4.356373429084381e-06, + "loss": 8.0796, + "step": 48550 + }, + { + "epoch": 4.3604129263913824, + "grad_norm": 13.787546157836914, + "learning_rate": 4.358617594254938e-06, + "loss": 7.945, + "step": 48575 + }, + { + "epoch": 4.362657091561939, + "grad_norm": 11.33871841430664, + "learning_rate": 4.360861759425494e-06, + "loss": 8.0991, + "step": 48600 + }, + { + "epoch": 4.364901256732495, + "grad_norm": 8.642830848693848, + "learning_rate": 4.363105924596051e-06, + "loss": 7.9735, + "step": 48625 + }, + { + "epoch": 4.367145421903052, + "grad_norm": 9.349923133850098, + "learning_rate": 4.3653500897666074e-06, + "loss": 8.0131, + "step": 48650 + }, + { + "epoch": 4.369389587073608, + "grad_norm": 12.168438911437988, + "learning_rate": 4.367594254937163e-06, + "loss": 8.142, + "step": 48675 + }, + { + "epoch": 4.371633752244165, + "grad_norm": 15.732131958007812, + "learning_rate": 4.369838420107721e-06, + "loss": 7.9398, + "step": 48700 + }, + { + "epoch": 4.373877917414721, + "grad_norm": 8.461878776550293, + "learning_rate": 4.3720825852782764e-06, + "loss": 8.0548, + "step": 48725 + }, + { + "epoch": 4.376122082585279, + "grad_norm": 9.225688934326172, + "learning_rate": 4.374326750448834e-06, + "loss": 8.1487, + "step": 48750 + }, + { + "epoch": 4.378366247755835, + "grad_norm": 9.771017074584961, + "learning_rate": 4.37657091561939e-06, + "loss": 8.0926, + "step": 48775 + }, + { + "epoch": 4.380610412926392, + "grad_norm": 9.919625282287598, + "learning_rate": 4.378815080789946e-06, + "loss": 7.981, + "step": 48800 + }, + { + "epoch": 4.382854578096948, + "grad_norm": 8.88196086883545, + "learning_rate": 4.381059245960503e-06, + "loss": 7.9319, + "step": 48825 + }, + { + "epoch": 4.385098743267505, + "grad_norm": 8.259479522705078, + "learning_rate": 4.3833034111310595e-06, + "loss": 8.0445, + "step": 48850 + }, + { + "epoch": 4.387342908438061, + "grad_norm": 11.220439910888672, + "learning_rate": 4.385547576301616e-06, + "loss": 7.9069, + "step": 48875 + }, + { + "epoch": 4.3895870736086176, + "grad_norm": 7.551411151885986, + "learning_rate": 4.387791741472173e-06, + "loss": 8.1446, + "step": 48900 + }, + { + "epoch": 4.391831238779174, + "grad_norm": 8.485369682312012, + "learning_rate": 4.390035906642729e-06, + "loss": 8.0482, + "step": 48925 + }, + { + "epoch": 4.3940754039497305, + "grad_norm": 10.339739799499512, + "learning_rate": 4.392280071813286e-06, + "loss": 8.0141, + "step": 48950 + }, + { + "epoch": 4.396319569120287, + "grad_norm": 7.703719615936279, + "learning_rate": 4.394524236983843e-06, + "loss": 7.8875, + "step": 48975 + }, + { + "epoch": 4.3985637342908435, + "grad_norm": 9.168750762939453, + "learning_rate": 4.396768402154399e-06, + "loss": 8.0133, + "step": 49000 + }, + { + "epoch": 4.4008078994614, + "grad_norm": 8.435202598571777, + "learning_rate": 4.399012567324956e-06, + "loss": 7.9055, + "step": 49025 + }, + { + "epoch": 4.403052064631957, + "grad_norm": 8.22078800201416, + "learning_rate": 4.401256732495512e-06, + "loss": 7.9756, + "step": 49050 + }, + { + "epoch": 4.405296229802514, + "grad_norm": 10.156187057495117, + "learning_rate": 4.403500897666069e-06, + "loss": 7.9767, + "step": 49075 + }, + { + "epoch": 4.40754039497307, + "grad_norm": 9.843085289001465, + "learning_rate": 4.405745062836625e-06, + "loss": 7.8715, + "step": 49100 + }, + { + "epoch": 4.409784560143627, + "grad_norm": 7.965404510498047, + "learning_rate": 4.4079892280071814e-06, + "loss": 7.859, + "step": 49125 + }, + { + "epoch": 4.412028725314183, + "grad_norm": 9.950892448425293, + "learning_rate": 4.410233393177738e-06, + "loss": 8.0324, + "step": 49150 + }, + { + "epoch": 4.41427289048474, + "grad_norm": 9.85426139831543, + "learning_rate": 4.412477558348295e-06, + "loss": 7.9697, + "step": 49175 + }, + { + "epoch": 4.416517055655296, + "grad_norm": 9.678536415100098, + "learning_rate": 4.414721723518851e-06, + "loss": 7.961, + "step": 49200 + }, + { + "epoch": 4.418761220825853, + "grad_norm": 9.318069458007812, + "learning_rate": 4.416965888689408e-06, + "loss": 8.0764, + "step": 49225 + }, + { + "epoch": 4.421005385996409, + "grad_norm": 7.682610034942627, + "learning_rate": 4.4192100538599645e-06, + "loss": 7.779, + "step": 49250 + }, + { + "epoch": 4.423249551166966, + "grad_norm": 10.471832275390625, + "learning_rate": 4.421454219030521e-06, + "loss": 7.8975, + "step": 49275 + }, + { + "epoch": 4.425493716337522, + "grad_norm": 8.507399559020996, + "learning_rate": 4.423698384201078e-06, + "loss": 8.1246, + "step": 49300 + }, + { + "epoch": 4.4277378815080795, + "grad_norm": 7.9411702156066895, + "learning_rate": 4.425942549371634e-06, + "loss": 7.9628, + "step": 49325 + }, + { + "epoch": 4.429982046678636, + "grad_norm": 10.191747665405273, + "learning_rate": 4.428186714542191e-06, + "loss": 7.8052, + "step": 49350 + }, + { + "epoch": 4.432226211849192, + "grad_norm": 8.15361499786377, + "learning_rate": 4.430430879712747e-06, + "loss": 7.8869, + "step": 49375 + }, + { + "epoch": 4.434470377019749, + "grad_norm": 12.23348617553711, + "learning_rate": 4.432675044883304e-06, + "loss": 7.9108, + "step": 49400 + }, + { + "epoch": 4.436714542190305, + "grad_norm": 12.191873550415039, + "learning_rate": 4.43491921005386e-06, + "loss": 8.0715, + "step": 49425 + }, + { + "epoch": 4.438958707360862, + "grad_norm": 8.562214851379395, + "learning_rate": 4.437163375224417e-06, + "loss": 8.0416, + "step": 49450 + }, + { + "epoch": 4.441202872531418, + "grad_norm": 8.909808158874512, + "learning_rate": 4.439407540394973e-06, + "loss": 8.0866, + "step": 49475 + }, + { + "epoch": 4.443447037701975, + "grad_norm": 10.921149253845215, + "learning_rate": 4.44165170556553e-06, + "loss": 8.0173, + "step": 49500 + }, + { + "epoch": 4.445691202872531, + "grad_norm": 9.585265159606934, + "learning_rate": 4.4438958707360864e-06, + "loss": 8.0405, + "step": 49525 + }, + { + "epoch": 4.447935368043088, + "grad_norm": 10.690500259399414, + "learning_rate": 4.446140035906643e-06, + "loss": 7.8173, + "step": 49550 + }, + { + "epoch": 4.450179533213644, + "grad_norm": 9.938990592956543, + "learning_rate": 4.4483842010772e-06, + "loss": 7.9485, + "step": 49575 + }, + { + "epoch": 4.452423698384201, + "grad_norm": 9.286126136779785, + "learning_rate": 4.450628366247756e-06, + "loss": 8.1594, + "step": 49600 + }, + { + "epoch": 4.454667863554757, + "grad_norm": 11.186763763427734, + "learning_rate": 4.452872531418313e-06, + "loss": 7.9416, + "step": 49625 + }, + { + "epoch": 4.456912028725315, + "grad_norm": 8.466522216796875, + "learning_rate": 4.4551166965888695e-06, + "loss": 7.9441, + "step": 49650 + }, + { + "epoch": 4.459156193895871, + "grad_norm": 10.537188529968262, + "learning_rate": 4.457360861759426e-06, + "loss": 7.9705, + "step": 49675 + }, + { + "epoch": 4.4614003590664275, + "grad_norm": 9.240513801574707, + "learning_rate": 4.459605026929983e-06, + "loss": 8.0457, + "step": 49700 + }, + { + "epoch": 4.463644524236984, + "grad_norm": 8.013209342956543, + "learning_rate": 4.461849192100539e-06, + "loss": 8.0712, + "step": 49725 + }, + { + "epoch": 4.4658886894075405, + "grad_norm": 12.425276756286621, + "learning_rate": 4.464093357271095e-06, + "loss": 7.9188, + "step": 49750 + }, + { + "epoch": 4.468132854578097, + "grad_norm": 8.928946495056152, + "learning_rate": 4.466337522441653e-06, + "loss": 7.9006, + "step": 49775 + }, + { + "epoch": 4.470377019748653, + "grad_norm": 8.657913208007812, + "learning_rate": 4.468581687612208e-06, + "loss": 8.0233, + "step": 49800 + }, + { + "epoch": 4.47262118491921, + "grad_norm": 8.588534355163574, + "learning_rate": 4.470825852782765e-06, + "loss": 8.0068, + "step": 49825 + }, + { + "epoch": 4.474865350089766, + "grad_norm": 9.419211387634277, + "learning_rate": 4.473070017953322e-06, + "loss": 8.1052, + "step": 49850 + }, + { + "epoch": 4.477109515260323, + "grad_norm": 8.585052490234375, + "learning_rate": 4.475314183123878e-06, + "loss": 7.9476, + "step": 49875 + }, + { + "epoch": 4.479353680430879, + "grad_norm": 8.49942398071289, + "learning_rate": 4.477558348294435e-06, + "loss": 8.0067, + "step": 49900 + }, + { + "epoch": 4.481597845601437, + "grad_norm": 10.793782234191895, + "learning_rate": 4.4798025134649914e-06, + "loss": 7.8297, + "step": 49925 + }, + { + "epoch": 4.483842010771993, + "grad_norm": 9.440011024475098, + "learning_rate": 4.482046678635548e-06, + "loss": 8.0348, + "step": 49950 + }, + { + "epoch": 4.48608617594255, + "grad_norm": 10.949712753295898, + "learning_rate": 4.484290843806105e-06, + "loss": 8.0974, + "step": 49975 + }, + { + "epoch": 4.488330341113106, + "grad_norm": 7.921511650085449, + "learning_rate": 4.486535008976661e-06, + "loss": 7.9868, + "step": 50000 + }, + { + "epoch": 4.490574506283663, + "grad_norm": 9.236543655395508, + "learning_rate": 4.488779174147218e-06, + "loss": 8.0113, + "step": 50025 + }, + { + "epoch": 4.492818671454219, + "grad_norm": 8.21011734008789, + "learning_rate": 4.4910233393177745e-06, + "loss": 8.1975, + "step": 50050 + }, + { + "epoch": 4.495062836624776, + "grad_norm": 9.360307693481445, + "learning_rate": 4.49326750448833e-06, + "loss": 7.9055, + "step": 50075 + }, + { + "epoch": 4.497307001795332, + "grad_norm": 9.915837287902832, + "learning_rate": 4.495511669658888e-06, + "loss": 8.0554, + "step": 50100 + }, + { + "epoch": 4.4995511669658885, + "grad_norm": 9.076584815979004, + "learning_rate": 4.497666068222622e-06, + "loss": 7.9466, + "step": 50125 + }, + { + "epoch": 4.501795332136445, + "grad_norm": 12.758278846740723, + "learning_rate": 4.499910233393178e-06, + "loss": 8.187, + "step": 50150 + }, + { + "epoch": 4.5040394973070015, + "grad_norm": 9.691737174987793, + "learning_rate": 4.502154398563734e-06, + "loss": 8.0145, + "step": 50175 + }, + { + "epoch": 4.506283662477558, + "grad_norm": 8.144372940063477, + "learning_rate": 4.504398563734292e-06, + "loss": 7.9941, + "step": 50200 + }, + { + "epoch": 4.508527827648114, + "grad_norm": 9.095715522766113, + "learning_rate": 4.506642728904847e-06, + "loss": 7.9845, + "step": 50225 + }, + { + "epoch": 4.510771992818672, + "grad_norm": 10.307632446289062, + "learning_rate": 4.508886894075404e-06, + "loss": 7.8837, + "step": 50250 + }, + { + "epoch": 4.513016157989228, + "grad_norm": 8.927815437316895, + "learning_rate": 4.511131059245961e-06, + "loss": 7.7589, + "step": 50275 + }, + { + "epoch": 4.515260323159785, + "grad_norm": 9.131474494934082, + "learning_rate": 4.513375224416517e-06, + "loss": 8.0399, + "step": 50300 + }, + { + "epoch": 4.517504488330341, + "grad_norm": 9.741454124450684, + "learning_rate": 4.515619389587075e-06, + "loss": 7.9206, + "step": 50325 + }, + { + "epoch": 4.519748653500898, + "grad_norm": 8.797220230102539, + "learning_rate": 4.5178635547576304e-06, + "loss": 8.151, + "step": 50350 + }, + { + "epoch": 4.521992818671454, + "grad_norm": 8.738369941711426, + "learning_rate": 4.520107719928187e-06, + "loss": 8.1301, + "step": 50375 + }, + { + "epoch": 4.524236983842011, + "grad_norm": 8.12989616394043, + "learning_rate": 4.522351885098744e-06, + "loss": 7.9859, + "step": 50400 + }, + { + "epoch": 4.526481149012567, + "grad_norm": 11.731709480285645, + "learning_rate": 4.5245960502693e-06, + "loss": 7.9297, + "step": 50425 + }, + { + "epoch": 4.528725314183124, + "grad_norm": 12.832759857177734, + "learning_rate": 4.526840215439857e-06, + "loss": 8.011, + "step": 50450 + }, + { + "epoch": 4.53096947935368, + "grad_norm": 8.662516593933105, + "learning_rate": 4.5290843806104135e-06, + "loss": 8.0305, + "step": 50475 + }, + { + "epoch": 4.533213644524237, + "grad_norm": 9.886242866516113, + "learning_rate": 4.53132854578097e-06, + "loss": 7.9623, + "step": 50500 + }, + { + "epoch": 4.535457809694794, + "grad_norm": 9.214704513549805, + "learning_rate": 4.533572710951527e-06, + "loss": 8.0784, + "step": 50525 + }, + { + "epoch": 4.53770197486535, + "grad_norm": 8.383956909179688, + "learning_rate": 4.5358168761220825e-06, + "loss": 7.963, + "step": 50550 + }, + { + "epoch": 4.539946140035907, + "grad_norm": 8.873045921325684, + "learning_rate": 4.53806104129264e-06, + "loss": 7.9126, + "step": 50575 + }, + { + "epoch": 4.542190305206463, + "grad_norm": 8.957294464111328, + "learning_rate": 4.540305206463196e-06, + "loss": 7.9541, + "step": 50600 + }, + { + "epoch": 4.54443447037702, + "grad_norm": 13.793033599853516, + "learning_rate": 4.542549371633752e-06, + "loss": 8.019, + "step": 50625 + }, + { + "epoch": 4.546678635547576, + "grad_norm": 9.007516860961914, + "learning_rate": 4.544793536804309e-06, + "loss": 7.8487, + "step": 50650 + }, + { + "epoch": 4.548922800718133, + "grad_norm": 7.779204845428467, + "learning_rate": 4.547037701974866e-06, + "loss": 8.0906, + "step": 50675 + }, + { + "epoch": 4.551166965888689, + "grad_norm": 10.847132682800293, + "learning_rate": 4.549281867145422e-06, + "loss": 7.8066, + "step": 50700 + }, + { + "epoch": 4.553411131059246, + "grad_norm": 9.901704788208008, + "learning_rate": 4.551526032315979e-06, + "loss": 8.0273, + "step": 50725 + }, + { + "epoch": 4.555655296229802, + "grad_norm": 9.053877830505371, + "learning_rate": 4.5537701974865354e-06, + "loss": 7.9525, + "step": 50750 + }, + { + "epoch": 4.557899461400359, + "grad_norm": 9.53842544555664, + "learning_rate": 4.556014362657092e-06, + "loss": 7.7171, + "step": 50775 + }, + { + "epoch": 4.560143626570916, + "grad_norm": 8.154467582702637, + "learning_rate": 4.558258527827649e-06, + "loss": 7.9243, + "step": 50800 + }, + { + "epoch": 4.562387791741472, + "grad_norm": 9.992972373962402, + "learning_rate": 4.560502692998205e-06, + "loss": 7.8966, + "step": 50825 + }, + { + "epoch": 4.564631956912029, + "grad_norm": 8.915594100952148, + "learning_rate": 4.562746858168762e-06, + "loss": 8.045, + "step": 50850 + }, + { + "epoch": 4.5668761220825855, + "grad_norm": 11.290024757385254, + "learning_rate": 4.564991023339318e-06, + "loss": 8.0486, + "step": 50875 + }, + { + "epoch": 4.569120287253142, + "grad_norm": 13.739160537719727, + "learning_rate": 4.567235188509875e-06, + "loss": 7.9946, + "step": 50900 + }, + { + "epoch": 4.5713644524236985, + "grad_norm": 10.37360668182373, + "learning_rate": 4.569479353680431e-06, + "loss": 7.9408, + "step": 50925 + }, + { + "epoch": 4.573608617594255, + "grad_norm": 9.721395492553711, + "learning_rate": 4.5717235188509875e-06, + "loss": 7.8967, + "step": 50950 + }, + { + "epoch": 4.575852782764811, + "grad_norm": 8.925392150878906, + "learning_rate": 4.573967684021544e-06, + "loss": 8.174, + "step": 50975 + }, + { + "epoch": 4.578096947935368, + "grad_norm": 8.168729782104492, + "learning_rate": 4.576211849192101e-06, + "loss": 8.0099, + "step": 51000 + }, + { + "epoch": 4.580341113105924, + "grad_norm": 8.348367691040039, + "learning_rate": 4.578456014362657e-06, + "loss": 7.9683, + "step": 51025 + }, + { + "epoch": 4.582585278276481, + "grad_norm": 11.420297622680664, + "learning_rate": 4.580700179533214e-06, + "loss": 8.1199, + "step": 51050 + }, + { + "epoch": 4.584829443447037, + "grad_norm": 11.266980171203613, + "learning_rate": 4.5829443447037706e-06, + "loss": 7.8978, + "step": 51075 + }, + { + "epoch": 4.587073608617594, + "grad_norm": 10.320151329040527, + "learning_rate": 4.585188509874327e-06, + "loss": 7.9686, + "step": 51100 + }, + { + "epoch": 4.589317773788151, + "grad_norm": 9.092639923095703, + "learning_rate": 4.587432675044884e-06, + "loss": 7.7482, + "step": 51125 + }, + { + "epoch": 4.591561938958708, + "grad_norm": 10.384039878845215, + "learning_rate": 4.58967684021544e-06, + "loss": 7.8057, + "step": 51150 + }, + { + "epoch": 4.593806104129264, + "grad_norm": 8.522717475891113, + "learning_rate": 4.591921005385997e-06, + "loss": 7.7947, + "step": 51175 + }, + { + "epoch": 4.596050269299821, + "grad_norm": 9.823525428771973, + "learning_rate": 4.594165170556553e-06, + "loss": 7.9184, + "step": 51200 + }, + { + "epoch": 4.598294434470377, + "grad_norm": 9.524829864501953, + "learning_rate": 4.59640933572711e-06, + "loss": 7.7886, + "step": 51225 + }, + { + "epoch": 4.600538599640934, + "grad_norm": 10.669671058654785, + "learning_rate": 4.598653500897666e-06, + "loss": 8.0571, + "step": 51250 + }, + { + "epoch": 4.60278276481149, + "grad_norm": 8.820032119750977, + "learning_rate": 4.6008976660682235e-06, + "loss": 7.702, + "step": 51275 + }, + { + "epoch": 4.6050269299820465, + "grad_norm": 8.450116157531738, + "learning_rate": 4.603141831238779e-06, + "loss": 7.9695, + "step": 51300 + }, + { + "epoch": 4.607271095152603, + "grad_norm": 9.144883155822754, + "learning_rate": 4.605385996409336e-06, + "loss": 7.9767, + "step": 51325 + }, + { + "epoch": 4.6095152603231595, + "grad_norm": 11.400300979614258, + "learning_rate": 4.6076301615798925e-06, + "loss": 7.85, + "step": 51350 + }, + { + "epoch": 4.611759425493716, + "grad_norm": 9.228141784667969, + "learning_rate": 4.609874326750449e-06, + "loss": 7.8772, + "step": 51375 + }, + { + "epoch": 4.614003590664273, + "grad_norm": 10.039932250976562, + "learning_rate": 4.612118491921006e-06, + "loss": 7.8703, + "step": 51400 + }, + { + "epoch": 4.61624775583483, + "grad_norm": 9.278451919555664, + "learning_rate": 4.614362657091562e-06, + "loss": 7.9058, + "step": 51425 + }, + { + "epoch": 4.618491921005386, + "grad_norm": 9.01558780670166, + "learning_rate": 4.616606822262119e-06, + "loss": 7.9986, + "step": 51450 + }, + { + "epoch": 4.620736086175943, + "grad_norm": 9.964019775390625, + "learning_rate": 4.6188509874326756e-06, + "loss": 7.9276, + "step": 51475 + }, + { + "epoch": 4.622980251346499, + "grad_norm": 9.10364055633545, + "learning_rate": 4.621095152603232e-06, + "loss": 7.7995, + "step": 51500 + }, + { + "epoch": 4.625224416517056, + "grad_norm": 11.055986404418945, + "learning_rate": 4.623339317773789e-06, + "loss": 8.1941, + "step": 51525 + }, + { + "epoch": 4.627468581687612, + "grad_norm": 9.273573875427246, + "learning_rate": 4.625583482944345e-06, + "loss": 7.7299, + "step": 51550 + }, + { + "epoch": 4.629712746858169, + "grad_norm": 9.415227890014648, + "learning_rate": 4.627827648114901e-06, + "loss": 7.9542, + "step": 51575 + }, + { + "epoch": 4.631956912028725, + "grad_norm": 10.838135719299316, + "learning_rate": 4.630071813285459e-06, + "loss": 7.9656, + "step": 51600 + }, + { + "epoch": 4.634201077199282, + "grad_norm": 8.830073356628418, + "learning_rate": 4.632315978456014e-06, + "loss": 7.8298, + "step": 51625 + }, + { + "epoch": 4.636445242369838, + "grad_norm": 10.366512298583984, + "learning_rate": 4.634560143626571e-06, + "loss": 7.932, + "step": 51650 + }, + { + "epoch": 4.638689407540395, + "grad_norm": 9.147958755493164, + "learning_rate": 4.636804308797128e-06, + "loss": 7.935, + "step": 51675 + }, + { + "epoch": 4.640933572710951, + "grad_norm": 8.862719535827637, + "learning_rate": 4.639048473967684e-06, + "loss": 7.8499, + "step": 51700 + }, + { + "epoch": 4.6431777378815084, + "grad_norm": 9.760979652404785, + "learning_rate": 4.641292639138241e-06, + "loss": 7.8587, + "step": 51725 + }, + { + "epoch": 4.645421903052065, + "grad_norm": 9.181867599487305, + "learning_rate": 4.6435368043087975e-06, + "loss": 8.1077, + "step": 51750 + }, + { + "epoch": 4.647666068222621, + "grad_norm": 9.873266220092773, + "learning_rate": 4.645780969479354e-06, + "loss": 7.9492, + "step": 51775 + }, + { + "epoch": 4.649910233393178, + "grad_norm": 8.217052459716797, + "learning_rate": 4.648025134649911e-06, + "loss": 8.0233, + "step": 51800 + }, + { + "epoch": 4.652154398563734, + "grad_norm": 7.741780757904053, + "learning_rate": 4.650269299820467e-06, + "loss": 8.0962, + "step": 51825 + }, + { + "epoch": 4.654398563734291, + "grad_norm": 10.503560066223145, + "learning_rate": 4.652513464991024e-06, + "loss": 7.9638, + "step": 51850 + }, + { + "epoch": 4.656642728904847, + "grad_norm": 8.089452743530273, + "learning_rate": 4.6547576301615806e-06, + "loss": 7.8437, + "step": 51875 + }, + { + "epoch": 4.658886894075404, + "grad_norm": 8.655327796936035, + "learning_rate": 4.657001795332136e-06, + "loss": 8.0187, + "step": 51900 + }, + { + "epoch": 4.66113105924596, + "grad_norm": 8.703110694885254, + "learning_rate": 4.659245960502694e-06, + "loss": 7.9956, + "step": 51925 + }, + { + "epoch": 4.663375224416517, + "grad_norm": 9.050973892211914, + "learning_rate": 4.6614901256732496e-06, + "loss": 8.0093, + "step": 51950 + }, + { + "epoch": 4.665619389587073, + "grad_norm": 9.741887092590332, + "learning_rate": 4.663734290843806e-06, + "loss": 7.9618, + "step": 51975 + }, + { + "epoch": 4.667863554757631, + "grad_norm": 10.791664123535156, + "learning_rate": 4.665978456014363e-06, + "loss": 7.8169, + "step": 52000 + }, + { + "epoch": 4.670107719928187, + "grad_norm": 8.933563232421875, + "learning_rate": 4.668222621184919e-06, + "loss": 7.9412, + "step": 52025 + }, + { + "epoch": 4.6723518850987436, + "grad_norm": 11.014135360717773, + "learning_rate": 4.670466786355476e-06, + "loss": 7.7357, + "step": 52050 + }, + { + "epoch": 4.6745960502693, + "grad_norm": 8.910687446594238, + "learning_rate": 4.672710951526033e-06, + "loss": 7.9486, + "step": 52075 + }, + { + "epoch": 4.6768402154398565, + "grad_norm": 11.67155933380127, + "learning_rate": 4.674955116696589e-06, + "loss": 8.0582, + "step": 52100 + }, + { + "epoch": 4.679084380610413, + "grad_norm": 7.906127452850342, + "learning_rate": 4.677199281867146e-06, + "loss": 7.8577, + "step": 52125 + }, + { + "epoch": 4.6813285457809695, + "grad_norm": 9.77098560333252, + "learning_rate": 4.6794434470377025e-06, + "loss": 8.0917, + "step": 52150 + }, + { + "epoch": 4.683572710951526, + "grad_norm": 9.788518905639648, + "learning_rate": 4.681687612208259e-06, + "loss": 8.0474, + "step": 52175 + }, + { + "epoch": 4.685816876122082, + "grad_norm": 9.188558578491211, + "learning_rate": 4.683931777378816e-06, + "loss": 7.9413, + "step": 52200 + }, + { + "epoch": 4.688061041292639, + "grad_norm": 9.117868423461914, + "learning_rate": 4.686175942549372e-06, + "loss": 8.0779, + "step": 52225 + }, + { + "epoch": 4.690305206463195, + "grad_norm": 9.527130126953125, + "learning_rate": 4.688420107719929e-06, + "loss": 8.0108, + "step": 52250 + }, + { + "epoch": 4.692549371633753, + "grad_norm": 8.716657638549805, + "learning_rate": 4.690574506283663e-06, + "loss": 7.7732, + "step": 52275 + }, + { + "epoch": 4.694793536804308, + "grad_norm": 8.856244087219238, + "learning_rate": 4.6928186714542196e-06, + "loss": 7.8248, + "step": 52300 + }, + { + "epoch": 4.697037701974866, + "grad_norm": 8.651986122131348, + "learning_rate": 4.695062836624776e-06, + "loss": 8.0142, + "step": 52325 + }, + { + "epoch": 4.699281867145422, + "grad_norm": 8.978036880493164, + "learning_rate": 4.697307001795333e-06, + "loss": 7.8864, + "step": 52350 + }, + { + "epoch": 4.701526032315979, + "grad_norm": 8.90780258178711, + "learning_rate": 4.6995511669658886e-06, + "loss": 7.8364, + "step": 52375 + }, + { + "epoch": 4.703770197486535, + "grad_norm": 9.870092391967773, + "learning_rate": 4.701795332136446e-06, + "loss": 7.9255, + "step": 52400 + }, + { + "epoch": 4.706014362657092, + "grad_norm": 13.26844596862793, + "learning_rate": 4.704039497307002e-06, + "loss": 7.7966, + "step": 52425 + }, + { + "epoch": 4.708258527827648, + "grad_norm": 8.729020118713379, + "learning_rate": 4.706283662477558e-06, + "loss": 7.915, + "step": 52450 + }, + { + "epoch": 4.710502692998205, + "grad_norm": 8.912311553955078, + "learning_rate": 4.708527827648116e-06, + "loss": 8.0384, + "step": 52475 + }, + { + "epoch": 4.712746858168761, + "grad_norm": 9.165661811828613, + "learning_rate": 4.710771992818672e-06, + "loss": 7.9067, + "step": 52500 + }, + { + "epoch": 4.7149910233393175, + "grad_norm": 9.271905899047852, + "learning_rate": 4.713016157989228e-06, + "loss": 7.9419, + "step": 52525 + }, + { + "epoch": 4.717235188509874, + "grad_norm": 15.852624893188477, + "learning_rate": 4.715260323159785e-06, + "loss": 8.0579, + "step": 52550 + }, + { + "epoch": 4.7194793536804305, + "grad_norm": 8.539238929748535, + "learning_rate": 4.7175044883303415e-06, + "loss": 8.0499, + "step": 52575 + }, + { + "epoch": 4.721723518850988, + "grad_norm": 10.799327850341797, + "learning_rate": 4.719748653500898e-06, + "loss": 8.1419, + "step": 52600 + }, + { + "epoch": 4.723967684021544, + "grad_norm": 9.183158874511719, + "learning_rate": 4.721992818671455e-06, + "loss": 7.8418, + "step": 52625 + }, + { + "epoch": 4.726211849192101, + "grad_norm": 9.723808288574219, + "learning_rate": 4.724236983842011e-06, + "loss": 7.952, + "step": 52650 + }, + { + "epoch": 4.728456014362657, + "grad_norm": 10.153637886047363, + "learning_rate": 4.726481149012568e-06, + "loss": 8.0488, + "step": 52675 + }, + { + "epoch": 4.730700179533214, + "grad_norm": 9.180317878723145, + "learning_rate": 4.728725314183124e-06, + "loss": 7.8484, + "step": 52700 + }, + { + "epoch": 4.73294434470377, + "grad_norm": 10.147676467895508, + "learning_rate": 4.730969479353681e-06, + "loss": 7.8383, + "step": 52725 + }, + { + "epoch": 4.735188509874327, + "grad_norm": 8.569506645202637, + "learning_rate": 4.733213644524237e-06, + "loss": 7.9216, + "step": 52750 + }, + { + "epoch": 4.737432675044883, + "grad_norm": 9.199727058410645, + "learning_rate": 4.7354578096947936e-06, + "loss": 7.9685, + "step": 52775 + }, + { + "epoch": 4.73967684021544, + "grad_norm": 11.453333854675293, + "learning_rate": 4.73770197486535e-06, + "loss": 7.8369, + "step": 52800 + }, + { + "epoch": 4.741921005385996, + "grad_norm": 8.012238502502441, + "learning_rate": 4.739946140035907e-06, + "loss": 7.8635, + "step": 52825 + }, + { + "epoch": 4.744165170556553, + "grad_norm": 10.123213768005371, + "learning_rate": 4.742190305206464e-06, + "loss": 7.9301, + "step": 52850 + }, + { + "epoch": 4.74640933572711, + "grad_norm": 8.365550994873047, + "learning_rate": 4.74443447037702e-06, + "loss": 7.8501, + "step": 52875 + }, + { + "epoch": 4.748653500897666, + "grad_norm": 9.200785636901855, + "learning_rate": 4.746678635547577e-06, + "loss": 7.7557, + "step": 52900 + }, + { + "epoch": 4.750897666068223, + "grad_norm": 9.746176719665527, + "learning_rate": 4.748922800718133e-06, + "loss": 8.0288, + "step": 52925 + }, + { + "epoch": 4.753141831238779, + "grad_norm": 8.890609741210938, + "learning_rate": 4.75116696588869e-06, + "loss": 7.9324, + "step": 52950 + }, + { + "epoch": 4.755385996409336, + "grad_norm": 12.122444152832031, + "learning_rate": 4.7534111310592465e-06, + "loss": 8.0719, + "step": 52975 + }, + { + "epoch": 4.757630161579892, + "grad_norm": 9.728184700012207, + "learning_rate": 4.755655296229803e-06, + "loss": 7.8893, + "step": 53000 + }, + { + "epoch": 4.759874326750449, + "grad_norm": 8.334617614746094, + "learning_rate": 4.75789946140036e-06, + "loss": 7.6828, + "step": 53025 + }, + { + "epoch": 4.762118491921005, + "grad_norm": 9.79677677154541, + "learning_rate": 4.760143626570916e-06, + "loss": 7.7819, + "step": 53050 + }, + { + "epoch": 4.764362657091562, + "grad_norm": 11.010250091552734, + "learning_rate": 4.762387791741472e-06, + "loss": 8.0262, + "step": 53075 + }, + { + "epoch": 4.766606822262118, + "grad_norm": 10.02094554901123, + "learning_rate": 4.7646319569120296e-06, + "loss": 7.8554, + "step": 53100 + }, + { + "epoch": 4.768850987432675, + "grad_norm": 8.863846778869629, + "learning_rate": 4.766876122082585e-06, + "loss": 7.9165, + "step": 53125 + }, + { + "epoch": 4.771095152603231, + "grad_norm": 9.168973922729492, + "learning_rate": 4.769120287253142e-06, + "loss": 8.0174, + "step": 53150 + }, + { + "epoch": 4.773339317773788, + "grad_norm": 9.83978271484375, + "learning_rate": 4.7713644524236986e-06, + "loss": 7.6396, + "step": 53175 + }, + { + "epoch": 4.775583482944345, + "grad_norm": 10.471001625061035, + "learning_rate": 4.773608617594255e-06, + "loss": 8.1601, + "step": 53200 + }, + { + "epoch": 4.777827648114902, + "grad_norm": 10.683746337890625, + "learning_rate": 4.775852782764812e-06, + "loss": 7.8642, + "step": 53225 + }, + { + "epoch": 4.780071813285458, + "grad_norm": 11.129725456237793, + "learning_rate": 4.778096947935368e-06, + "loss": 7.8798, + "step": 53250 + }, + { + "epoch": 4.7823159784560145, + "grad_norm": 10.615584373474121, + "learning_rate": 4.780341113105925e-06, + "loss": 7.883, + "step": 53275 + }, + { + "epoch": 4.784560143626571, + "grad_norm": 9.796239852905273, + "learning_rate": 4.782585278276482e-06, + "loss": 7.8611, + "step": 53300 + }, + { + "epoch": 4.7868043087971275, + "grad_norm": 13.156811714172363, + "learning_rate": 4.784829443447038e-06, + "loss": 7.7242, + "step": 53325 + }, + { + "epoch": 4.789048473967684, + "grad_norm": 12.646811485290527, + "learning_rate": 4.787073608617595e-06, + "loss": 7.8573, + "step": 53350 + }, + { + "epoch": 4.79129263913824, + "grad_norm": 9.519824028015137, + "learning_rate": 4.7893177737881515e-06, + "loss": 7.8755, + "step": 53375 + }, + { + "epoch": 4.793536804308797, + "grad_norm": 9.089550971984863, + "learning_rate": 4.791561938958707e-06, + "loss": 7.8026, + "step": 53400 + }, + { + "epoch": 4.795780969479353, + "grad_norm": 12.402710914611816, + "learning_rate": 4.793806104129265e-06, + "loss": 7.9953, + "step": 53425 + }, + { + "epoch": 4.79802513464991, + "grad_norm": 10.849319458007812, + "learning_rate": 4.7960502692998205e-06, + "loss": 7.7833, + "step": 53450 + }, + { + "epoch": 4.800269299820467, + "grad_norm": 8.876319885253906, + "learning_rate": 4.798294434470377e-06, + "loss": 7.9325, + "step": 53475 + }, + { + "epoch": 4.802513464991024, + "grad_norm": 11.1802339553833, + "learning_rate": 4.800538599640934e-06, + "loss": 7.9416, + "step": 53500 + }, + { + "epoch": 4.80475763016158, + "grad_norm": 12.394604682922363, + "learning_rate": 4.80278276481149e-06, + "loss": 7.8802, + "step": 53525 + }, + { + "epoch": 4.807001795332137, + "grad_norm": 10.216887474060059, + "learning_rate": 4.805026929982047e-06, + "loss": 7.9219, + "step": 53550 + }, + { + "epoch": 4.809245960502693, + "grad_norm": 8.131916046142578, + "learning_rate": 4.8072710951526036e-06, + "loss": 7.7713, + "step": 53575 + }, + { + "epoch": 4.81149012567325, + "grad_norm": 9.26923656463623, + "learning_rate": 4.80951526032316e-06, + "loss": 7.8984, + "step": 53600 + }, + { + "epoch": 4.813734290843806, + "grad_norm": 8.39031982421875, + "learning_rate": 4.811759425493717e-06, + "loss": 7.8796, + "step": 53625 + }, + { + "epoch": 4.815978456014363, + "grad_norm": 10.703878402709961, + "learning_rate": 4.814003590664273e-06, + "loss": 7.6556, + "step": 53650 + }, + { + "epoch": 4.818222621184919, + "grad_norm": 9.758881568908691, + "learning_rate": 4.81624775583483e-06, + "loss": 7.8218, + "step": 53675 + }, + { + "epoch": 4.8204667863554755, + "grad_norm": 10.85703182220459, + "learning_rate": 4.818491921005387e-06, + "loss": 7.9654, + "step": 53700 + }, + { + "epoch": 4.822710951526032, + "grad_norm": 8.654091835021973, + "learning_rate": 4.820736086175942e-06, + "loss": 7.9301, + "step": 53725 + }, + { + "epoch": 4.8249551166965885, + "grad_norm": 9.391196250915527, + "learning_rate": 4.8229802513465e-06, + "loss": 7.9773, + "step": 53750 + }, + { + "epoch": 4.827199281867145, + "grad_norm": 8.518434524536133, + "learning_rate": 4.825224416517056e-06, + "loss": 7.9053, + "step": 53775 + }, + { + "epoch": 4.829443447037702, + "grad_norm": 9.637202262878418, + "learning_rate": 4.827468581687613e-06, + "loss": 7.9447, + "step": 53800 + }, + { + "epoch": 4.831687612208259, + "grad_norm": 9.585752487182617, + "learning_rate": 4.829712746858169e-06, + "loss": 7.7518, + "step": 53825 + }, + { + "epoch": 4.833931777378815, + "grad_norm": 9.766579627990723, + "learning_rate": 4.8319569120287255e-06, + "loss": 7.9272, + "step": 53850 + }, + { + "epoch": 4.836175942549372, + "grad_norm": 10.196799278259277, + "learning_rate": 4.834201077199282e-06, + "loss": 7.9986, + "step": 53875 + }, + { + "epoch": 4.838420107719928, + "grad_norm": 9.534643173217773, + "learning_rate": 4.836445242369839e-06, + "loss": 7.8311, + "step": 53900 + }, + { + "epoch": 4.840664272890485, + "grad_norm": 11.230202674865723, + "learning_rate": 4.838689407540395e-06, + "loss": 7.9317, + "step": 53925 + }, + { + "epoch": 4.842908438061041, + "grad_norm": 9.370316505432129, + "learning_rate": 4.840933572710952e-06, + "loss": 8.038, + "step": 53950 + }, + { + "epoch": 4.845152603231598, + "grad_norm": 13.543322563171387, + "learning_rate": 4.8431777378815085e-06, + "loss": 7.7745, + "step": 53975 + }, + { + "epoch": 4.847396768402154, + "grad_norm": 12.197200775146484, + "learning_rate": 4.845421903052065e-06, + "loss": 7.8709, + "step": 54000 + }, + { + "epoch": 4.849640933572711, + "grad_norm": 8.48709487915039, + "learning_rate": 4.847666068222622e-06, + "loss": 7.6298, + "step": 54025 + }, + { + "epoch": 4.851885098743267, + "grad_norm": 11.623228073120117, + "learning_rate": 4.849910233393178e-06, + "loss": 7.9393, + "step": 54050 + }, + { + "epoch": 4.8541292639138245, + "grad_norm": 8.960615158081055, + "learning_rate": 4.852154398563735e-06, + "loss": 7.9143, + "step": 54075 + }, + { + "epoch": 4.856373429084381, + "grad_norm": 9.903751373291016, + "learning_rate": 4.854398563734291e-06, + "loss": 7.943, + "step": 54100 + }, + { + "epoch": 4.858617594254937, + "grad_norm": 11.34912109375, + "learning_rate": 4.856642728904848e-06, + "loss": 7.8111, + "step": 54125 + }, + { + "epoch": 4.860861759425494, + "grad_norm": 9.299190521240234, + "learning_rate": 4.858886894075404e-06, + "loss": 7.9068, + "step": 54150 + }, + { + "epoch": 4.86310592459605, + "grad_norm": 8.464523315429688, + "learning_rate": 4.861131059245961e-06, + "loss": 7.8569, + "step": 54175 + }, + { + "epoch": 4.865350089766607, + "grad_norm": 8.764655113220215, + "learning_rate": 4.863375224416517e-06, + "loss": 7.8055, + "step": 54200 + }, + { + "epoch": 4.867594254937163, + "grad_norm": 8.747025489807129, + "learning_rate": 4.865619389587074e-06, + "loss": 7.9327, + "step": 54225 + }, + { + "epoch": 4.86983842010772, + "grad_norm": 9.0860013961792, + "learning_rate": 4.8678635547576305e-06, + "loss": 7.8988, + "step": 54250 + }, + { + "epoch": 4.872082585278276, + "grad_norm": 9.340476989746094, + "learning_rate": 4.870107719928187e-06, + "loss": 7.8812, + "step": 54275 + }, + { + "epoch": 4.874326750448833, + "grad_norm": 9.902183532714844, + "learning_rate": 4.872351885098744e-06, + "loss": 7.7209, + "step": 54300 + }, + { + "epoch": 4.876570915619389, + "grad_norm": 9.305747985839844, + "learning_rate": 4.8745960502693e-06, + "loss": 7.9802, + "step": 54325 + }, + { + "epoch": 4.878815080789947, + "grad_norm": 10.5062894821167, + "learning_rate": 4.876840215439857e-06, + "loss": 7.7772, + "step": 54350 + }, + { + "epoch": 4.881059245960502, + "grad_norm": 8.981291770935059, + "learning_rate": 4.8790843806104135e-06, + "loss": 7.8061, + "step": 54375 + }, + { + "epoch": 4.88330341113106, + "grad_norm": 8.456402778625488, + "learning_rate": 4.88132854578097e-06, + "loss": 7.8214, + "step": 54400 + }, + { + "epoch": 4.885547576301616, + "grad_norm": 9.292280197143555, + "learning_rate": 4.883572710951526e-06, + "loss": 8.0152, + "step": 54425 + }, + { + "epoch": 4.8877917414721725, + "grad_norm": 11.67159366607666, + "learning_rate": 4.885816876122083e-06, + "loss": 7.7694, + "step": 54450 + }, + { + "epoch": 4.890035906642729, + "grad_norm": 9.769392967224121, + "learning_rate": 4.888061041292639e-06, + "loss": 7.7894, + "step": 54475 + }, + { + "epoch": 4.8922800718132855, + "grad_norm": 9.555033683776855, + "learning_rate": 4.890305206463196e-06, + "loss": 7.8548, + "step": 54500 + }, + { + "epoch": 4.894524236983842, + "grad_norm": 8.76042652130127, + "learning_rate": 4.892549371633752e-06, + "loss": 7.7458, + "step": 54525 + }, + { + "epoch": 4.8967684021543985, + "grad_norm": 10.021039962768555, + "learning_rate": 4.894793536804309e-06, + "loss": 7.7972, + "step": 54550 + }, + { + "epoch": 4.899012567324955, + "grad_norm": 11.244729042053223, + "learning_rate": 4.896947935368043e-06, + "loss": 7.8891, + "step": 54575 + }, + { + "epoch": 4.901256732495511, + "grad_norm": 9.997754096984863, + "learning_rate": 4.8991921005386005e-06, + "loss": 7.7499, + "step": 54600 + }, + { + "epoch": 4.903500897666068, + "grad_norm": 9.093688011169434, + "learning_rate": 4.901436265709157e-06, + "loss": 8.029, + "step": 54625 + }, + { + "epoch": 4.905745062836624, + "grad_norm": 9.881813049316406, + "learning_rate": 4.903680430879713e-06, + "loss": 7.9554, + "step": 54650 + }, + { + "epoch": 4.907989228007182, + "grad_norm": 12.274054527282715, + "learning_rate": 4.90592459605027e-06, + "loss": 7.793, + "step": 54675 + }, + { + "epoch": 4.910233393177738, + "grad_norm": 8.970602035522461, + "learning_rate": 4.908168761220826e-06, + "loss": 7.9625, + "step": 54700 + }, + { + "epoch": 4.912477558348295, + "grad_norm": 10.566658973693848, + "learning_rate": 4.910412926391383e-06, + "loss": 7.8054, + "step": 54725 + }, + { + "epoch": 4.914721723518851, + "grad_norm": 8.84123706817627, + "learning_rate": 4.912657091561939e-06, + "loss": 7.7916, + "step": 54750 + }, + { + "epoch": 4.916965888689408, + "grad_norm": 9.206274032592773, + "learning_rate": 4.914901256732496e-06, + "loss": 7.7528, + "step": 54775 + }, + { + "epoch": 4.919210053859964, + "grad_norm": 12.93954086303711, + "learning_rate": 4.9171454219030526e-06, + "loss": 7.9501, + "step": 54800 + }, + { + "epoch": 4.921454219030521, + "grad_norm": 8.960988998413086, + "learning_rate": 4.919389587073609e-06, + "loss": 7.8339, + "step": 54825 + }, + { + "epoch": 4.923698384201077, + "grad_norm": 11.529765129089355, + "learning_rate": 4.921633752244166e-06, + "loss": 8.0206, + "step": 54850 + }, + { + "epoch": 4.925942549371634, + "grad_norm": 8.05270767211914, + "learning_rate": 4.923877917414722e-06, + "loss": 7.9674, + "step": 54875 + }, + { + "epoch": 4.92818671454219, + "grad_norm": 8.850829124450684, + "learning_rate": 4.926122082585278e-06, + "loss": 7.7234, + "step": 54900 + }, + { + "epoch": 4.9304308797127465, + "grad_norm": 10.924898147583008, + "learning_rate": 4.928366247755836e-06, + "loss": 7.7529, + "step": 54925 + }, + { + "epoch": 4.932675044883304, + "grad_norm": 9.465418815612793, + "learning_rate": 4.930610412926391e-06, + "loss": 8.0012, + "step": 54950 + }, + { + "epoch": 4.9349192100538595, + "grad_norm": 8.299141883850098, + "learning_rate": 4.932854578096948e-06, + "loss": 7.7861, + "step": 54975 + }, + { + "epoch": 4.937163375224417, + "grad_norm": 8.899646759033203, + "learning_rate": 4.9350987432675055e-06, + "loss": 7.9318, + "step": 55000 + }, + { + "epoch": 4.939407540394973, + "grad_norm": 8.817787170410156, + "learning_rate": 4.937342908438061e-06, + "loss": 7.8289, + "step": 55025 + }, + { + "epoch": 4.94165170556553, + "grad_norm": 9.337017059326172, + "learning_rate": 4.939587073608618e-06, + "loss": 8.0151, + "step": 55050 + }, + { + "epoch": 4.943895870736086, + "grad_norm": 9.246665000915527, + "learning_rate": 4.9418312387791745e-06, + "loss": 8.0749, + "step": 55075 + }, + { + "epoch": 4.946140035906643, + "grad_norm": 9.126211166381836, + "learning_rate": 4.944075403949731e-06, + "loss": 7.8616, + "step": 55100 + }, + { + "epoch": 4.948384201077199, + "grad_norm": 9.212133407592773, + "learning_rate": 4.946319569120288e-06, + "loss": 8.0395, + "step": 55125 + }, + { + "epoch": 4.950628366247756, + "grad_norm": 9.740900993347168, + "learning_rate": 4.948563734290844e-06, + "loss": 8.0574, + "step": 55150 + }, + { + "epoch": 4.952872531418312, + "grad_norm": 9.027520179748535, + "learning_rate": 4.950807899461401e-06, + "loss": 7.8623, + "step": 55175 + }, + { + "epoch": 4.955116696588869, + "grad_norm": 9.351509094238281, + "learning_rate": 4.9530520646319575e-06, + "loss": 7.764, + "step": 55200 + }, + { + "epoch": 4.957360861759425, + "grad_norm": 11.804048538208008, + "learning_rate": 4.955296229802513e-06, + "loss": 7.7156, + "step": 55225 + }, + { + "epoch": 4.959605026929982, + "grad_norm": 12.273924827575684, + "learning_rate": 4.957540394973071e-06, + "loss": 7.8676, + "step": 55250 + }, + { + "epoch": 4.961849192100539, + "grad_norm": 8.541516304016113, + "learning_rate": 4.9597845601436265e-06, + "loss": 7.8278, + "step": 55275 + }, + { + "epoch": 4.9640933572710955, + "grad_norm": 13.012560844421387, + "learning_rate": 4.962028725314183e-06, + "loss": 7.8918, + "step": 55300 + }, + { + "epoch": 4.966337522441652, + "grad_norm": 11.291000366210938, + "learning_rate": 4.96427289048474e-06, + "loss": 7.9892, + "step": 55325 + }, + { + "epoch": 4.968581687612208, + "grad_norm": 10.618658065795898, + "learning_rate": 4.966517055655296e-06, + "loss": 7.9726, + "step": 55350 + }, + { + "epoch": 4.970825852782765, + "grad_norm": 10.494564056396484, + "learning_rate": 4.968761220825854e-06, + "loss": 7.9323, + "step": 55375 + }, + { + "epoch": 4.973070017953321, + "grad_norm": 8.749927520751953, + "learning_rate": 4.97100538599641e-06, + "loss": 7.9286, + "step": 55400 + }, + { + "epoch": 4.975314183123878, + "grad_norm": 12.694400787353516, + "learning_rate": 4.973249551166966e-06, + "loss": 7.9415, + "step": 55425 + }, + { + "epoch": 4.977558348294434, + "grad_norm": 13.65678882598877, + "learning_rate": 4.975493716337523e-06, + "loss": 7.8182, + "step": 55450 + }, + { + "epoch": 4.979802513464991, + "grad_norm": 11.809450149536133, + "learning_rate": 4.9777378815080795e-06, + "loss": 7.8525, + "step": 55475 + }, + { + "epoch": 4.982046678635547, + "grad_norm": 10.512025833129883, + "learning_rate": 4.979982046678636e-06, + "loss": 7.896, + "step": 55500 + }, + { + "epoch": 4.984290843806104, + "grad_norm": 10.29493522644043, + "learning_rate": 4.982226211849193e-06, + "loss": 8.0272, + "step": 55525 + }, + { + "epoch": 4.986535008976661, + "grad_norm": 13.322554588317871, + "learning_rate": 4.984470377019749e-06, + "loss": 7.8647, + "step": 55550 + }, + { + "epoch": 4.988779174147218, + "grad_norm": 9.131346702575684, + "learning_rate": 4.986714542190306e-06, + "loss": 8.0939, + "step": 55575 + }, + { + "epoch": 4.991023339317774, + "grad_norm": 10.359338760375977, + "learning_rate": 4.988958707360862e-06, + "loss": 7.7935, + "step": 55600 + }, + { + "epoch": 4.993267504488331, + "grad_norm": 8.76497745513916, + "learning_rate": 4.991202872531419e-06, + "loss": 7.8627, + "step": 55625 + }, + { + "epoch": 4.995511669658887, + "grad_norm": 8.178946495056152, + "learning_rate": 4.993447037701975e-06, + "loss": 7.8751, + "step": 55650 + }, + { + "epoch": 4.9977558348294435, + "grad_norm": 12.888087272644043, + "learning_rate": 4.9956912028725315e-06, + "loss": 8.0588, + "step": 55675 + }, + { + "epoch": 5.0, + "grad_norm": 8.727584838867188, + "learning_rate": 4.997935368043089e-06, + "loss": 7.8341, + "step": 55700 + }, + { + "epoch": 5.0, + "eval_accuracy": 0.04347660053844538, + "eval_f1_macro": 0.0004684588166013723, + "eval_f1_micro": 0.04347660053844538, + "eval_f1_weighted": 0.00975005258948209, + "eval_loss": 8.240588188171387, + "eval_precision_macro": 0.00044027290280604706, + "eval_precision_micro": 0.04347660053844538, + "eval_precision_weighted": 0.007641540136974549, + "eval_recall_macro": 0.0016732408437260186, + "eval_recall_micro": 0.04347660053844538, + "eval_recall_weighted": 0.04347660053844538, + "eval_runtime": 128.6182, + "eval_samples_per_second": 407.197, + "eval_steps_per_second": 12.728, + "step": 55700 + }, + { + "epoch": 5.0022441651705565, + "grad_norm": 9.022710800170898, + "learning_rate": 5.000179533213645e-06, + "loss": 7.577, + "step": 55725 + }, + { + "epoch": 5.004488330341113, + "grad_norm": 8.923656463623047, + "learning_rate": 5.002423698384201e-06, + "loss": 7.7575, + "step": 55750 + }, + { + "epoch": 5.006732495511669, + "grad_norm": 10.142394065856934, + "learning_rate": 5.004667863554759e-06, + "loss": 7.7899, + "step": 55775 + }, + { + "epoch": 5.008976660682226, + "grad_norm": 8.357888221740723, + "learning_rate": 5.006912028725315e-06, + "loss": 7.4239, + "step": 55800 + }, + { + "epoch": 5.011220825852782, + "grad_norm": 9.149526596069336, + "learning_rate": 5.009156193895871e-06, + "loss": 7.707, + "step": 55825 + }, + { + "epoch": 5.01346499102334, + "grad_norm": 10.102737426757812, + "learning_rate": 5.011400359066427e-06, + "loss": 7.6619, + "step": 55850 + }, + { + "epoch": 5.015709156193896, + "grad_norm": 8.398115158081055, + "learning_rate": 5.0136445242369845e-06, + "loss": 7.701, + "step": 55875 + }, + { + "epoch": 5.017953321364453, + "grad_norm": 9.830175399780273, + "learning_rate": 5.015888689407541e-06, + "loss": 7.6865, + "step": 55900 + }, + { + "epoch": 5.020197486535009, + "grad_norm": 9.576410293579102, + "learning_rate": 5.018043087971275e-06, + "loss": 7.5603, + "step": 55925 + }, + { + "epoch": 5.022441651705566, + "grad_norm": 7.601745128631592, + "learning_rate": 5.020287253141831e-06, + "loss": 7.6747, + "step": 55950 + }, + { + "epoch": 5.024685816876122, + "grad_norm": 14.96988582611084, + "learning_rate": 5.022531418312388e-06, + "loss": 7.8883, + "step": 55975 + }, + { + "epoch": 5.026929982046679, + "grad_norm": 9.275834083557129, + "learning_rate": 5.024775583482945e-06, + "loss": 7.6701, + "step": 56000 + }, + { + "epoch": 5.029174147217235, + "grad_norm": 9.679591178894043, + "learning_rate": 5.027019748653501e-06, + "loss": 7.8377, + "step": 56025 + }, + { + "epoch": 5.031418312387792, + "grad_norm": 11.566990852355957, + "learning_rate": 5.029263913824057e-06, + "loss": 7.6315, + "step": 56050 + }, + { + "epoch": 5.033662477558348, + "grad_norm": 11.533337593078613, + "learning_rate": 5.031508078994615e-06, + "loss": 7.8209, + "step": 56075 + }, + { + "epoch": 5.0359066427289045, + "grad_norm": 9.511127471923828, + "learning_rate": 5.0337522441651705e-06, + "loss": 7.8648, + "step": 56100 + }, + { + "epoch": 5.038150807899461, + "grad_norm": 14.986556053161621, + "learning_rate": 5.035996409335727e-06, + "loss": 7.7876, + "step": 56125 + }, + { + "epoch": 5.040394973070018, + "grad_norm": 10.251988410949707, + "learning_rate": 5.038240574506285e-06, + "loss": 7.7234, + "step": 56150 + }, + { + "epoch": 5.042639138240575, + "grad_norm": 9.13516902923584, + "learning_rate": 5.040484739676841e-06, + "loss": 7.6798, + "step": 56175 + }, + { + "epoch": 5.044883303411131, + "grad_norm": 10.009552001953125, + "learning_rate": 5.042728904847397e-06, + "loss": 7.7222, + "step": 56200 + }, + { + "epoch": 5.047127468581688, + "grad_norm": 11.983941078186035, + "learning_rate": 5.044973070017954e-06, + "loss": 7.7009, + "step": 56225 + }, + { + "epoch": 5.049371633752244, + "grad_norm": 9.615638732910156, + "learning_rate": 5.047217235188511e-06, + "loss": 7.7826, + "step": 56250 + }, + { + "epoch": 5.051615798922801, + "grad_norm": 9.805880546569824, + "learning_rate": 5.049461400359067e-06, + "loss": 7.6697, + "step": 56275 + }, + { + "epoch": 5.053859964093357, + "grad_norm": 8.31063175201416, + "learning_rate": 5.0517055655296235e-06, + "loss": 7.6847, + "step": 56300 + }, + { + "epoch": 5.056104129263914, + "grad_norm": 8.975356101989746, + "learning_rate": 5.053949730700179e-06, + "loss": 7.8639, + "step": 56325 + }, + { + "epoch": 5.05834829443447, + "grad_norm": 11.00772476196289, + "learning_rate": 5.056193895870737e-06, + "loss": 7.7543, + "step": 56350 + }, + { + "epoch": 5.060592459605027, + "grad_norm": 9.456061363220215, + "learning_rate": 5.058438061041293e-06, + "loss": 7.7498, + "step": 56375 + }, + { + "epoch": 5.062836624775583, + "grad_norm": 10.090569496154785, + "learning_rate": 5.060682226211849e-06, + "loss": 7.745, + "step": 56400 + }, + { + "epoch": 5.06508078994614, + "grad_norm": 9.769320487976074, + "learning_rate": 5.0629263913824065e-06, + "loss": 7.6661, + "step": 56425 + }, + { + "epoch": 5.067324955116697, + "grad_norm": 9.002483367919922, + "learning_rate": 5.065170556552963e-06, + "loss": 7.8375, + "step": 56450 + }, + { + "epoch": 5.0695691202872535, + "grad_norm": 11.088233947753906, + "learning_rate": 5.067414721723519e-06, + "loss": 7.5063, + "step": 56475 + }, + { + "epoch": 5.07181328545781, + "grad_norm": 8.020485877990723, + "learning_rate": 5.0696588868940755e-06, + "loss": 7.7274, + "step": 56500 + }, + { + "epoch": 5.074057450628366, + "grad_norm": 8.63790512084961, + "learning_rate": 5.071903052064633e-06, + "loss": 7.6064, + "step": 56525 + }, + { + "epoch": 5.076301615798923, + "grad_norm": 8.584205627441406, + "learning_rate": 5.074147217235189e-06, + "loss": 7.8946, + "step": 56550 + }, + { + "epoch": 5.078545780969479, + "grad_norm": 10.999496459960938, + "learning_rate": 5.076391382405745e-06, + "loss": 7.8127, + "step": 56575 + }, + { + "epoch": 5.080789946140036, + "grad_norm": 8.290962219238281, + "learning_rate": 5.078635547576302e-06, + "loss": 7.7339, + "step": 56600 + }, + { + "epoch": 5.083034111310592, + "grad_norm": 11.380302429199219, + "learning_rate": 5.080879712746859e-06, + "loss": 7.739, + "step": 56625 + }, + { + "epoch": 5.085278276481149, + "grad_norm": 9.541581153869629, + "learning_rate": 5.083123877917415e-06, + "loss": 7.6885, + "step": 56650 + }, + { + "epoch": 5.087522441651705, + "grad_norm": 9.195602416992188, + "learning_rate": 5.085368043087972e-06, + "loss": 7.6588, + "step": 56675 + }, + { + "epoch": 5.089766606822262, + "grad_norm": 8.242143630981445, + "learning_rate": 5.087612208258528e-06, + "loss": 7.7127, + "step": 56700 + }, + { + "epoch": 5.092010771992818, + "grad_norm": 13.373574256896973, + "learning_rate": 5.089856373429085e-06, + "loss": 7.7377, + "step": 56725 + }, + { + "epoch": 5.094254937163376, + "grad_norm": 9.503652572631836, + "learning_rate": 5.092100538599642e-06, + "loss": 7.6221, + "step": 56750 + }, + { + "epoch": 5.096499102333932, + "grad_norm": 10.022549629211426, + "learning_rate": 5.0943447037701975e-06, + "loss": 7.662, + "step": 56775 + }, + { + "epoch": 5.098743267504489, + "grad_norm": 10.28764533996582, + "learning_rate": 5.096588868940755e-06, + "loss": 7.6972, + "step": 56800 + }, + { + "epoch": 5.100987432675045, + "grad_norm": 10.039887428283691, + "learning_rate": 5.0988330341113115e-06, + "loss": 7.7785, + "step": 56825 + }, + { + "epoch": 5.1032315978456015, + "grad_norm": 8.062430381774902, + "learning_rate": 5.101077199281867e-06, + "loss": 7.8576, + "step": 56850 + }, + { + "epoch": 5.105475763016158, + "grad_norm": 9.828313827514648, + "learning_rate": 5.103321364452424e-06, + "loss": 7.7406, + "step": 56875 + }, + { + "epoch": 5.1077199281867145, + "grad_norm": 11.139410018920898, + "learning_rate": 5.105565529622981e-06, + "loss": 7.6629, + "step": 56900 + }, + { + "epoch": 5.109964093357271, + "grad_norm": 10.220213890075684, + "learning_rate": 5.107809694793537e-06, + "loss": 7.5732, + "step": 56925 + }, + { + "epoch": 5.1122082585278275, + "grad_norm": 9.2566499710083, + "learning_rate": 5.110053859964094e-06, + "loss": 7.6375, + "step": 56950 + }, + { + "epoch": 5.114452423698384, + "grad_norm": 16.83420753479004, + "learning_rate": 5.1122980251346495e-06, + "loss": 7.7481, + "step": 56975 + }, + { + "epoch": 5.11669658886894, + "grad_norm": 8.975869178771973, + "learning_rate": 5.114542190305207e-06, + "loss": 7.868, + "step": 57000 + }, + { + "epoch": 5.118940754039497, + "grad_norm": 9.8814115524292, + "learning_rate": 5.116786355475764e-06, + "loss": 7.6797, + "step": 57025 + }, + { + "epoch": 5.121184919210054, + "grad_norm": 24.101486206054688, + "learning_rate": 5.119030520646319e-06, + "loss": 7.7731, + "step": 57050 + }, + { + "epoch": 5.123429084380611, + "grad_norm": 8.89622974395752, + "learning_rate": 5.121274685816876e-06, + "loss": 7.529, + "step": 57075 + }, + { + "epoch": 5.125673249551167, + "grad_norm": 8.839393615722656, + "learning_rate": 5.1235188509874335e-06, + "loss": 7.7111, + "step": 57100 + }, + { + "epoch": 5.127917414721724, + "grad_norm": 10.290761947631836, + "learning_rate": 5.12576301615799e-06, + "loss": 7.5855, + "step": 57125 + }, + { + "epoch": 5.13016157989228, + "grad_norm": 7.803911209106445, + "learning_rate": 5.128007181328546e-06, + "loss": 7.5648, + "step": 57150 + }, + { + "epoch": 5.132405745062837, + "grad_norm": 9.84412956237793, + "learning_rate": 5.130251346499103e-06, + "loss": 7.5728, + "step": 57175 + }, + { + "epoch": 5.134649910233393, + "grad_norm": 10.691458702087402, + "learning_rate": 5.13249551166966e-06, + "loss": 7.7019, + "step": 57200 + }, + { + "epoch": 5.13689407540395, + "grad_norm": 9.29505443572998, + "learning_rate": 5.134739676840216e-06, + "loss": 7.5857, + "step": 57225 + }, + { + "epoch": 5.139138240574506, + "grad_norm": 11.044669151306152, + "learning_rate": 5.136983842010772e-06, + "loss": 7.8443, + "step": 57250 + }, + { + "epoch": 5.141382405745063, + "grad_norm": 9.358799934387207, + "learning_rate": 5.13922800718133e-06, + "loss": 7.5733, + "step": 57275 + }, + { + "epoch": 5.143626570915619, + "grad_norm": 9.831872940063477, + "learning_rate": 5.1414721723518855e-06, + "loss": 7.8376, + "step": 57300 + }, + { + "epoch": 5.1458707360861755, + "grad_norm": 7.7552666664123535, + "learning_rate": 5.143716337522442e-06, + "loss": 7.6744, + "step": 57325 + }, + { + "epoch": 5.148114901256733, + "grad_norm": 9.880117416381836, + "learning_rate": 5.145960502692998e-06, + "loss": 7.7918, + "step": 57350 + }, + { + "epoch": 5.150359066427289, + "grad_norm": 9.598079681396484, + "learning_rate": 5.148204667863555e-06, + "loss": 7.5888, + "step": 57375 + }, + { + "epoch": 5.152603231597846, + "grad_norm": 8.474371910095215, + "learning_rate": 5.150448833034112e-06, + "loss": 7.8445, + "step": 57400 + }, + { + "epoch": 5.154847396768402, + "grad_norm": 9.129942893981934, + "learning_rate": 5.152692998204668e-06, + "loss": 7.6957, + "step": 57425 + }, + { + "epoch": 5.157091561938959, + "grad_norm": 9.522250175476074, + "learning_rate": 5.154937163375224e-06, + "loss": 7.6079, + "step": 57450 + }, + { + "epoch": 5.159335727109515, + "grad_norm": 9.692319869995117, + "learning_rate": 5.157181328545782e-06, + "loss": 7.8154, + "step": 57475 + }, + { + "epoch": 5.161579892280072, + "grad_norm": 9.320409774780273, + "learning_rate": 5.159425493716338e-06, + "loss": 7.6031, + "step": 57500 + }, + { + "epoch": 5.163824057450628, + "grad_norm": 10.463351249694824, + "learning_rate": 5.161669658886894e-06, + "loss": 7.6804, + "step": 57525 + }, + { + "epoch": 5.166068222621185, + "grad_norm": 10.134159088134766, + "learning_rate": 5.163913824057452e-06, + "loss": 7.6716, + "step": 57550 + }, + { + "epoch": 5.168312387791741, + "grad_norm": 8.7643461227417, + "learning_rate": 5.1661579892280074e-06, + "loss": 7.747, + "step": 57575 + }, + { + "epoch": 5.170556552962298, + "grad_norm": 11.359028816223145, + "learning_rate": 5.168402154398564e-06, + "loss": 7.6184, + "step": 57600 + }, + { + "epoch": 5.172800718132855, + "grad_norm": 9.287097930908203, + "learning_rate": 5.170646319569121e-06, + "loss": 7.706, + "step": 57625 + }, + { + "epoch": 5.1750448833034115, + "grad_norm": 8.09850025177002, + "learning_rate": 5.172890484739677e-06, + "loss": 7.7348, + "step": 57650 + }, + { + "epoch": 5.177289048473968, + "grad_norm": 8.25939655303955, + "learning_rate": 5.175134649910234e-06, + "loss": 7.5762, + "step": 57675 + }, + { + "epoch": 5.1795332136445245, + "grad_norm": 9.773100852966309, + "learning_rate": 5.1773788150807905e-06, + "loss": 7.7399, + "step": 57700 + }, + { + "epoch": 5.181777378815081, + "grad_norm": 9.005331039428711, + "learning_rate": 5.179622980251346e-06, + "loss": 7.8738, + "step": 57725 + }, + { + "epoch": 5.184021543985637, + "grad_norm": 10.072188377380371, + "learning_rate": 5.181867145421904e-06, + "loss": 7.5913, + "step": 57750 + }, + { + "epoch": 5.186265709156194, + "grad_norm": 8.136216163635254, + "learning_rate": 5.18411131059246e-06, + "loss": 7.6953, + "step": 57775 + }, + { + "epoch": 5.18850987432675, + "grad_norm": 10.679668426513672, + "learning_rate": 5.186355475763016e-06, + "loss": 7.776, + "step": 57800 + }, + { + "epoch": 5.190754039497307, + "grad_norm": 9.61020278930664, + "learning_rate": 5.188599640933573e-06, + "loss": 7.7117, + "step": 57825 + }, + { + "epoch": 5.192998204667863, + "grad_norm": 11.755073547363281, + "learning_rate": 5.19084380610413e-06, + "loss": 7.6603, + "step": 57850 + }, + { + "epoch": 5.19524236983842, + "grad_norm": 9.827613830566406, + "learning_rate": 5.193087971274686e-06, + "loss": 7.5544, + "step": 57875 + }, + { + "epoch": 5.197486535008976, + "grad_norm": 10.74515151977539, + "learning_rate": 5.195332136445243e-06, + "loss": 7.6286, + "step": 57900 + }, + { + "epoch": 5.199730700179533, + "grad_norm": 9.235673904418945, + "learning_rate": 5.1975763016158e-06, + "loss": 7.8532, + "step": 57925 + }, + { + "epoch": 5.20197486535009, + "grad_norm": 13.592889785766602, + "learning_rate": 5.199820466786356e-06, + "loss": 7.6388, + "step": 57950 + }, + { + "epoch": 5.204219030520647, + "grad_norm": 9.683856010437012, + "learning_rate": 5.2020646319569124e-06, + "loss": 7.6881, + "step": 57975 + }, + { + "epoch": 5.206463195691203, + "grad_norm": 10.123971939086914, + "learning_rate": 5.204308797127468e-06, + "loss": 7.7723, + "step": 58000 + }, + { + "epoch": 5.20870736086176, + "grad_norm": 8.230531692504883, + "learning_rate": 5.206552962298026e-06, + "loss": 7.7511, + "step": 58025 + }, + { + "epoch": 5.210951526032316, + "grad_norm": 10.402027130126953, + "learning_rate": 5.208797127468582e-06, + "loss": 7.5193, + "step": 58050 + }, + { + "epoch": 5.2131956912028725, + "grad_norm": 9.268167495727539, + "learning_rate": 5.211041292639139e-06, + "loss": 7.7405, + "step": 58075 + }, + { + "epoch": 5.215439856373429, + "grad_norm": 9.613805770874023, + "learning_rate": 5.213285457809695e-06, + "loss": 7.7714, + "step": 58100 + }, + { + "epoch": 5.2176840215439855, + "grad_norm": 11.376741409301758, + "learning_rate": 5.215529622980252e-06, + "loss": 7.6006, + "step": 58125 + }, + { + "epoch": 5.219928186714542, + "grad_norm": 10.096146583557129, + "learning_rate": 5.217773788150809e-06, + "loss": 7.5376, + "step": 58150 + }, + { + "epoch": 5.222172351885098, + "grad_norm": 11.072246551513672, + "learning_rate": 5.2200179533213645e-06, + "loss": 7.6985, + "step": 58175 + }, + { + "epoch": 5.224416517055655, + "grad_norm": 10.481771469116211, + "learning_rate": 5.222262118491921e-06, + "loss": 7.5178, + "step": 58200 + }, + { + "epoch": 5.226660682226212, + "grad_norm": 10.740564346313477, + "learning_rate": 5.224506283662479e-06, + "loss": 7.7843, + "step": 58225 + }, + { + "epoch": 5.228904847396769, + "grad_norm": 8.441429138183594, + "learning_rate": 5.226750448833034e-06, + "loss": 7.5068, + "step": 58250 + }, + { + "epoch": 5.231149012567325, + "grad_norm": 8.412349700927734, + "learning_rate": 5.228994614003591e-06, + "loss": 7.4484, + "step": 58275 + }, + { + "epoch": 5.233393177737882, + "grad_norm": 9.798705101013184, + "learning_rate": 5.2312387791741484e-06, + "loss": 7.6395, + "step": 58300 + }, + { + "epoch": 5.235637342908438, + "grad_norm": 9.515256881713867, + "learning_rate": 5.233482944344704e-06, + "loss": 7.7424, + "step": 58325 + }, + { + "epoch": 5.237881508078995, + "grad_norm": 9.949021339416504, + "learning_rate": 5.235727109515261e-06, + "loss": 7.6005, + "step": 58350 + }, + { + "epoch": 5.240125673249551, + "grad_norm": 12.375062942504883, + "learning_rate": 5.237971274685817e-06, + "loss": 7.942, + "step": 58375 + }, + { + "epoch": 5.242369838420108, + "grad_norm": 10.158859252929688, + "learning_rate": 5.240215439856374e-06, + "loss": 7.6424, + "step": 58400 + }, + { + "epoch": 5.244614003590664, + "grad_norm": 12.547576904296875, + "learning_rate": 5.242459605026931e-06, + "loss": 7.717, + "step": 58425 + }, + { + "epoch": 5.246858168761221, + "grad_norm": 8.407984733581543, + "learning_rate": 5.2447037701974864e-06, + "loss": 7.593, + "step": 58450 + }, + { + "epoch": 5.249102333931777, + "grad_norm": 8.762823104858398, + "learning_rate": 5.246947935368043e-06, + "loss": 7.6539, + "step": 58475 + }, + { + "epoch": 5.2513464991023335, + "grad_norm": 9.826586723327637, + "learning_rate": 5.2491921005386005e-06, + "loss": 7.6322, + "step": 58500 + }, + { + "epoch": 5.253590664272891, + "grad_norm": 8.776369094848633, + "learning_rate": 5.251436265709156e-06, + "loss": 7.5878, + "step": 58525 + }, + { + "epoch": 5.255834829443447, + "grad_norm": 10.048395156860352, + "learning_rate": 5.253680430879713e-06, + "loss": 7.5976, + "step": 58550 + }, + { + "epoch": 5.258078994614004, + "grad_norm": 9.97916316986084, + "learning_rate": 5.25592459605027e-06, + "loss": 7.6656, + "step": 58575 + }, + { + "epoch": 5.26032315978456, + "grad_norm": 9.699275970458984, + "learning_rate": 5.258168761220826e-06, + "loss": 7.4595, + "step": 58600 + }, + { + "epoch": 5.262567324955117, + "grad_norm": 8.79430103302002, + "learning_rate": 5.260412926391383e-06, + "loss": 7.4495, + "step": 58625 + }, + { + "epoch": 5.264811490125673, + "grad_norm": 9.82303237915039, + "learning_rate": 5.262657091561939e-06, + "loss": 7.7024, + "step": 58650 + }, + { + "epoch": 5.26705565529623, + "grad_norm": 8.872953414916992, + "learning_rate": 5.264901256732497e-06, + "loss": 7.5757, + "step": 58675 + }, + { + "epoch": 5.269299820466786, + "grad_norm": 12.229708671569824, + "learning_rate": 5.267145421903053e-06, + "loss": 7.7878, + "step": 58700 + }, + { + "epoch": 5.271543985637343, + "grad_norm": 9.87607479095459, + "learning_rate": 5.269389587073609e-06, + "loss": 7.7013, + "step": 58725 + }, + { + "epoch": 5.273788150807899, + "grad_norm": 9.421488761901855, + "learning_rate": 5.271633752244165e-06, + "loss": 7.6129, + "step": 58750 + }, + { + "epoch": 5.276032315978456, + "grad_norm": 10.221114158630371, + "learning_rate": 5.2738779174147224e-06, + "loss": 7.5945, + "step": 58775 + }, + { + "epoch": 5.278276481149012, + "grad_norm": 9.237082481384277, + "learning_rate": 5.276122082585279e-06, + "loss": 7.6784, + "step": 58800 + }, + { + "epoch": 5.2805206463195695, + "grad_norm": 12.320544242858887, + "learning_rate": 5.278366247755835e-06, + "loss": 7.5934, + "step": 58825 + }, + { + "epoch": 5.282764811490126, + "grad_norm": 10.094799995422363, + "learning_rate": 5.2806104129263914e-06, + "loss": 7.6876, + "step": 58850 + }, + { + "epoch": 5.2850089766606825, + "grad_norm": 10.313736915588379, + "learning_rate": 5.282854578096949e-06, + "loss": 7.805, + "step": 58875 + }, + { + "epoch": 5.287253141831239, + "grad_norm": 9.906643867492676, + "learning_rate": 5.285098743267505e-06, + "loss": 7.7956, + "step": 58900 + }, + { + "epoch": 5.289497307001795, + "grad_norm": 8.833810806274414, + "learning_rate": 5.287342908438061e-06, + "loss": 7.6454, + "step": 58925 + }, + { + "epoch": 5.291741472172352, + "grad_norm": 8.938261985778809, + "learning_rate": 5.289587073608619e-06, + "loss": 7.6411, + "step": 58950 + }, + { + "epoch": 5.293985637342908, + "grad_norm": 11.750243186950684, + "learning_rate": 5.2918312387791745e-06, + "loss": 7.5671, + "step": 58975 + }, + { + "epoch": 5.296229802513465, + "grad_norm": 8.99089241027832, + "learning_rate": 5.294075403949731e-06, + "loss": 7.7107, + "step": 59000 + }, + { + "epoch": 5.298473967684021, + "grad_norm": 11.1624755859375, + "learning_rate": 5.296319569120288e-06, + "loss": 7.6358, + "step": 59025 + }, + { + "epoch": 5.300718132854578, + "grad_norm": 9.643279075622559, + "learning_rate": 5.298563734290844e-06, + "loss": 7.723, + "step": 59050 + }, + { + "epoch": 5.302962298025134, + "grad_norm": 10.38420581817627, + "learning_rate": 5.300807899461401e-06, + "loss": 7.7653, + "step": 59075 + }, + { + "epoch": 5.305206463195692, + "grad_norm": 10.529306411743164, + "learning_rate": 5.303052064631958e-06, + "loss": 7.5542, + "step": 59100 + }, + { + "epoch": 5.307450628366248, + "grad_norm": 8.818406105041504, + "learning_rate": 5.305296229802513e-06, + "loss": 7.6453, + "step": 59125 + }, + { + "epoch": 5.309694793536805, + "grad_norm": 11.434507369995117, + "learning_rate": 5.307540394973071e-06, + "loss": 7.5969, + "step": 59150 + }, + { + "epoch": 5.311938958707361, + "grad_norm": 10.435087203979492, + "learning_rate": 5.3097845601436274e-06, + "loss": 7.6728, + "step": 59175 + }, + { + "epoch": 5.314183123877918, + "grad_norm": 9.308300971984863, + "learning_rate": 5.312028725314183e-06, + "loss": 7.623, + "step": 59200 + }, + { + "epoch": 5.316427289048474, + "grad_norm": 11.066710472106934, + "learning_rate": 5.31427289048474e-06, + "loss": 7.7331, + "step": 59225 + }, + { + "epoch": 5.3186714542190305, + "grad_norm": 12.195531845092773, + "learning_rate": 5.316517055655297e-06, + "loss": 7.9208, + "step": 59250 + }, + { + "epoch": 5.320915619389587, + "grad_norm": 9.932723999023438, + "learning_rate": 5.318761220825853e-06, + "loss": 7.5801, + "step": 59275 + }, + { + "epoch": 5.3231597845601435, + "grad_norm": 10.159737586975098, + "learning_rate": 5.32100538599641e-06, + "loss": 7.6809, + "step": 59300 + }, + { + "epoch": 5.3254039497307, + "grad_norm": 9.194977760314941, + "learning_rate": 5.323249551166967e-06, + "loss": 7.3202, + "step": 59325 + }, + { + "epoch": 5.3276481149012564, + "grad_norm": 12.792230606079102, + "learning_rate": 5.325493716337523e-06, + "loss": 7.7377, + "step": 59350 + }, + { + "epoch": 5.329892280071813, + "grad_norm": 7.926971435546875, + "learning_rate": 5.3277378815080795e-06, + "loss": 7.7237, + "step": 59375 + }, + { + "epoch": 5.332136445242369, + "grad_norm": 10.208791732788086, + "learning_rate": 5.329982046678635e-06, + "loss": 7.6601, + "step": 59400 + }, + { + "epoch": 5.334380610412927, + "grad_norm": 10.913378715515137, + "learning_rate": 5.332226211849193e-06, + "loss": 7.6452, + "step": 59425 + }, + { + "epoch": 5.336624775583483, + "grad_norm": 9.096741676330566, + "learning_rate": 5.334470377019749e-06, + "loss": 7.8229, + "step": 59450 + }, + { + "epoch": 5.33886894075404, + "grad_norm": 10.8193359375, + "learning_rate": 5.336714542190305e-06, + "loss": 7.8246, + "step": 59475 + }, + { + "epoch": 5.341113105924596, + "grad_norm": 9.150163650512695, + "learning_rate": 5.338958707360862e-06, + "loss": 7.6975, + "step": 59500 + }, + { + "epoch": 5.343357271095153, + "grad_norm": 10.634905815124512, + "learning_rate": 5.341202872531419e-06, + "loss": 7.7657, + "step": 59525 + }, + { + "epoch": 5.345601436265709, + "grad_norm": 8.439240455627441, + "learning_rate": 5.343447037701975e-06, + "loss": 7.5184, + "step": 59550 + }, + { + "epoch": 5.347845601436266, + "grad_norm": 8.276130676269531, + "learning_rate": 5.3456912028725316e-06, + "loss": 7.7029, + "step": 59575 + }, + { + "epoch": 5.350089766606822, + "grad_norm": 17.992034912109375, + "learning_rate": 5.347935368043088e-06, + "loss": 7.6975, + "step": 59600 + }, + { + "epoch": 5.352333931777379, + "grad_norm": 9.619964599609375, + "learning_rate": 5.350179533213646e-06, + "loss": 7.6632, + "step": 59625 + }, + { + "epoch": 5.354578096947935, + "grad_norm": 8.428370475769043, + "learning_rate": 5.352423698384201e-06, + "loss": 7.8636, + "step": 59650 + }, + { + "epoch": 5.3568222621184916, + "grad_norm": 10.307182312011719, + "learning_rate": 5.354667863554758e-06, + "loss": 7.4795, + "step": 59675 + }, + { + "epoch": 5.359066427289049, + "grad_norm": 9.897976875305176, + "learning_rate": 5.3569120287253155e-06, + "loss": 7.5856, + "step": 59700 + }, + { + "epoch": 5.361310592459605, + "grad_norm": 8.580826759338379, + "learning_rate": 5.359156193895871e-06, + "loss": 7.7058, + "step": 59725 + }, + { + "epoch": 5.363554757630162, + "grad_norm": 13.4871187210083, + "learning_rate": 5.361400359066428e-06, + "loss": 7.5088, + "step": 59750 + }, + { + "epoch": 5.365798922800718, + "grad_norm": 15.912814140319824, + "learning_rate": 5.363644524236984e-06, + "loss": 7.7797, + "step": 59775 + }, + { + "epoch": 5.368043087971275, + "grad_norm": 8.466780662536621, + "learning_rate": 5.365888689407541e-06, + "loss": 7.6105, + "step": 59800 + }, + { + "epoch": 5.370287253141831, + "grad_norm": 9.63807201385498, + "learning_rate": 5.368132854578098e-06, + "loss": 7.5904, + "step": 59825 + }, + { + "epoch": 5.372531418312388, + "grad_norm": 10.070378303527832, + "learning_rate": 5.3703770197486535e-06, + "loss": 7.7915, + "step": 59850 + }, + { + "epoch": 5.374775583482944, + "grad_norm": 11.109130859375, + "learning_rate": 5.37262118491921e-06, + "loss": 7.6451, + "step": 59875 + }, + { + "epoch": 5.377019748653501, + "grad_norm": 8.922284126281738, + "learning_rate": 5.3748653500897676e-06, + "loss": 7.5841, + "step": 59900 + }, + { + "epoch": 5.379263913824057, + "grad_norm": 10.165995597839355, + "learning_rate": 5.377109515260323e-06, + "loss": 7.7253, + "step": 59925 + }, + { + "epoch": 5.381508078994614, + "grad_norm": 9.901357650756836, + "learning_rate": 5.37935368043088e-06, + "loss": 7.7678, + "step": 59950 + }, + { + "epoch": 5.38375224416517, + "grad_norm": 9.271557807922363, + "learning_rate": 5.3815978456014366e-06, + "loss": 7.4822, + "step": 59975 + }, + { + "epoch": 5.385996409335727, + "grad_norm": 8.715646743774414, + "learning_rate": 5.383842010771993e-06, + "loss": 7.6197, + "step": 60000 + }, + { + "epoch": 5.388240574506284, + "grad_norm": 8.811176300048828, + "learning_rate": 5.38608617594255e-06, + "loss": 7.6484, + "step": 60025 + }, + { + "epoch": 5.3904847396768405, + "grad_norm": 13.162262916564941, + "learning_rate": 5.388330341113106e-06, + "loss": 7.6696, + "step": 60050 + }, + { + "epoch": 5.392728904847397, + "grad_norm": 9.486488342285156, + "learning_rate": 5.390574506283663e-06, + "loss": 7.7346, + "step": 60075 + }, + { + "epoch": 5.3949730700179535, + "grad_norm": 8.623913764953613, + "learning_rate": 5.39281867145422e-06, + "loss": 7.6749, + "step": 60100 + }, + { + "epoch": 5.39721723518851, + "grad_norm": 9.587614059448242, + "learning_rate": 5.395062836624776e-06, + "loss": 7.7301, + "step": 60125 + }, + { + "epoch": 5.399461400359066, + "grad_norm": 8.628618240356445, + "learning_rate": 5.397307001795332e-06, + "loss": 7.6066, + "step": 60150 + }, + { + "epoch": 5.401705565529623, + "grad_norm": 10.734697341918945, + "learning_rate": 5.3995511669658895e-06, + "loss": 7.5326, + "step": 60175 + }, + { + "epoch": 5.403949730700179, + "grad_norm": 10.127850532531738, + "learning_rate": 5.401795332136446e-06, + "loss": 7.5484, + "step": 60200 + }, + { + "epoch": 5.406193895870736, + "grad_norm": 10.219551086425781, + "learning_rate": 5.404039497307002e-06, + "loss": 7.5855, + "step": 60225 + }, + { + "epoch": 5.408438061041292, + "grad_norm": 10.39041519165039, + "learning_rate": 5.4062836624775585e-06, + "loss": 7.6124, + "step": 60250 + }, + { + "epoch": 5.410682226211849, + "grad_norm": 9.436785697937012, + "learning_rate": 5.408527827648116e-06, + "loss": 7.5206, + "step": 60275 + }, + { + "epoch": 5.412926391382406, + "grad_norm": 9.419328689575195, + "learning_rate": 5.410771992818672e-06, + "loss": 7.5494, + "step": 60300 + }, + { + "epoch": 5.415170556552963, + "grad_norm": 10.805438041687012, + "learning_rate": 5.413016157989228e-06, + "loss": 7.6297, + "step": 60325 + }, + { + "epoch": 5.417414721723519, + "grad_norm": 12.10112476348877, + "learning_rate": 5.415260323159784e-06, + "loss": 7.5258, + "step": 60350 + }, + { + "epoch": 5.419658886894076, + "grad_norm": 10.108992576599121, + "learning_rate": 5.4175044883303416e-06, + "loss": 7.6378, + "step": 60375 + }, + { + "epoch": 5.421903052064632, + "grad_norm": 9.326981544494629, + "learning_rate": 5.419748653500898e-06, + "loss": 7.6525, + "step": 60400 + }, + { + "epoch": 5.424147217235189, + "grad_norm": 9.00843334197998, + "learning_rate": 5.421992818671454e-06, + "loss": 7.4254, + "step": 60425 + }, + { + "epoch": 5.426391382405745, + "grad_norm": 9.347887992858887, + "learning_rate": 5.424236983842011e-06, + "loss": 7.4907, + "step": 60450 + }, + { + "epoch": 5.4286355475763015, + "grad_norm": 9.182339668273926, + "learning_rate": 5.426481149012568e-06, + "loss": 7.6441, + "step": 60475 + }, + { + "epoch": 5.430879712746858, + "grad_norm": 10.35143756866455, + "learning_rate": 5.428725314183124e-06, + "loss": 7.5881, + "step": 60500 + }, + { + "epoch": 5.4331238779174145, + "grad_norm": 8.759939193725586, + "learning_rate": 5.43096947935368e-06, + "loss": 7.6967, + "step": 60525 + }, + { + "epoch": 5.435368043087971, + "grad_norm": 10.519254684448242, + "learning_rate": 5.433213644524238e-06, + "loss": 7.6483, + "step": 60550 + }, + { + "epoch": 5.437612208258527, + "grad_norm": 8.57989501953125, + "learning_rate": 5.4354578096947945e-06, + "loss": 7.6482, + "step": 60575 + }, + { + "epoch": 5.439856373429085, + "grad_norm": 10.73281192779541, + "learning_rate": 5.43770197486535e-06, + "loss": 7.5385, + "step": 60600 + }, + { + "epoch": 5.442100538599641, + "grad_norm": 11.734580039978027, + "learning_rate": 5.439946140035907e-06, + "loss": 7.4376, + "step": 60625 + }, + { + "epoch": 5.444344703770198, + "grad_norm": 9.081700325012207, + "learning_rate": 5.442190305206464e-06, + "loss": 7.7178, + "step": 60650 + }, + { + "epoch": 5.446588868940754, + "grad_norm": 14.637162208557129, + "learning_rate": 5.44443447037702e-06, + "loss": 7.6335, + "step": 60675 + }, + { + "epoch": 5.448833034111311, + "grad_norm": 10.116673469543457, + "learning_rate": 5.446678635547577e-06, + "loss": 7.7043, + "step": 60700 + }, + { + "epoch": 5.451077199281867, + "grad_norm": 9.191705703735352, + "learning_rate": 5.448922800718134e-06, + "loss": 7.5034, + "step": 60725 + }, + { + "epoch": 5.453321364452424, + "grad_norm": 9.389591217041016, + "learning_rate": 5.45116696588869e-06, + "loss": 7.4066, + "step": 60750 + }, + { + "epoch": 5.45556552962298, + "grad_norm": 11.55097770690918, + "learning_rate": 5.4534111310592466e-06, + "loss": 7.6092, + "step": 60775 + }, + { + "epoch": 5.457809694793537, + "grad_norm": 9.36042308807373, + "learning_rate": 5.455655296229802e-06, + "loss": 7.8564, + "step": 60800 + }, + { + "epoch": 5.460053859964093, + "grad_norm": 11.745915412902832, + "learning_rate": 5.45789946140036e-06, + "loss": 7.6728, + "step": 60825 + }, + { + "epoch": 5.46229802513465, + "grad_norm": 9.398786544799805, + "learning_rate": 5.460143626570916e-06, + "loss": 7.7312, + "step": 60850 + }, + { + "epoch": 5.464542190305206, + "grad_norm": 10.128938674926758, + "learning_rate": 5.46229802513465e-06, + "loss": 7.6494, + "step": 60875 + }, + { + "epoch": 5.466786355475763, + "grad_norm": 10.877889633178711, + "learning_rate": 5.464542190305206e-06, + "loss": 7.6939, + "step": 60900 + }, + { + "epoch": 5.46903052064632, + "grad_norm": 9.860407829284668, + "learning_rate": 5.466786355475764e-06, + "loss": 7.6246, + "step": 60925 + }, + { + "epoch": 5.471274685816876, + "grad_norm": 9.08110237121582, + "learning_rate": 5.46903052064632e-06, + "loss": 7.6749, + "step": 60950 + }, + { + "epoch": 5.473518850987433, + "grad_norm": 11.561442375183105, + "learning_rate": 5.471274685816876e-06, + "loss": 7.6652, + "step": 60975 + }, + { + "epoch": 5.475763016157989, + "grad_norm": 9.743111610412598, + "learning_rate": 5.473518850987433e-06, + "loss": 7.6671, + "step": 61000 + }, + { + "epoch": 5.478007181328546, + "grad_norm": 8.671440124511719, + "learning_rate": 5.47576301615799e-06, + "loss": 7.5496, + "step": 61025 + }, + { + "epoch": 5.480251346499102, + "grad_norm": 8.64813232421875, + "learning_rate": 5.478007181328546e-06, + "loss": 7.5377, + "step": 61050 + }, + { + "epoch": 5.482495511669659, + "grad_norm": 10.75428295135498, + "learning_rate": 5.4802513464991025e-06, + "loss": 7.5453, + "step": 61075 + }, + { + "epoch": 5.484739676840215, + "grad_norm": 9.73268985748291, + "learning_rate": 5.48249551166966e-06, + "loss": 7.5654, + "step": 61100 + }, + { + "epoch": 5.486983842010772, + "grad_norm": 10.277050971984863, + "learning_rate": 5.484739676840216e-06, + "loss": 7.6605, + "step": 61125 + }, + { + "epoch": 5.489228007181328, + "grad_norm": 9.557150840759277, + "learning_rate": 5.486983842010772e-06, + "loss": 7.7176, + "step": 61150 + }, + { + "epoch": 5.491472172351886, + "grad_norm": 9.049446105957031, + "learning_rate": 5.489228007181329e-06, + "loss": 7.8246, + "step": 61175 + }, + { + "epoch": 5.493716337522442, + "grad_norm": 9.43244743347168, + "learning_rate": 5.491472172351886e-06, + "loss": 7.5462, + "step": 61200 + }, + { + "epoch": 5.4959605026929985, + "grad_norm": 8.362750053405762, + "learning_rate": 5.493716337522442e-06, + "loss": 7.6126, + "step": 61225 + }, + { + "epoch": 5.498204667863555, + "grad_norm": 11.434321403503418, + "learning_rate": 5.495960502692999e-06, + "loss": 7.4152, + "step": 61250 + }, + { + "epoch": 5.5004488330341115, + "grad_norm": 9.457329750061035, + "learning_rate": 5.4982046678635546e-06, + "loss": 7.7677, + "step": 61275 + }, + { + "epoch": 5.502692998204668, + "grad_norm": 10.148665428161621, + "learning_rate": 5.500448833034112e-06, + "loss": 7.5609, + "step": 61300 + }, + { + "epoch": 5.504937163375224, + "grad_norm": 9.371383666992188, + "learning_rate": 5.502692998204669e-06, + "loss": 7.4294, + "step": 61325 + }, + { + "epoch": 5.507181328545781, + "grad_norm": 8.989571571350098, + "learning_rate": 5.504937163375224e-06, + "loss": 7.6701, + "step": 61350 + }, + { + "epoch": 5.509425493716337, + "grad_norm": 10.70133113861084, + "learning_rate": 5.507181328545781e-06, + "loss": 7.7015, + "step": 61375 + }, + { + "epoch": 5.511669658886894, + "grad_norm": 9.847673416137695, + "learning_rate": 5.5094254937163385e-06, + "loss": 7.7178, + "step": 61400 + }, + { + "epoch": 5.51391382405745, + "grad_norm": 10.353143692016602, + "learning_rate": 5.511669658886894e-06, + "loss": 7.7246, + "step": 61425 + }, + { + "epoch": 5.516157989228007, + "grad_norm": 10.035849571228027, + "learning_rate": 5.513913824057451e-06, + "loss": 7.5169, + "step": 61450 + }, + { + "epoch": 5.518402154398563, + "grad_norm": 9.639896392822266, + "learning_rate": 5.516157989228008e-06, + "loss": 7.5652, + "step": 61475 + }, + { + "epoch": 5.520646319569121, + "grad_norm": 9.687488555908203, + "learning_rate": 5.518402154398564e-06, + "loss": 7.6155, + "step": 61500 + }, + { + "epoch": 5.522890484739677, + "grad_norm": 13.092938423156738, + "learning_rate": 5.520646319569121e-06, + "loss": 7.4602, + "step": 61525 + }, + { + "epoch": 5.525134649910234, + "grad_norm": 8.903818130493164, + "learning_rate": 5.522890484739677e-06, + "loss": 7.714, + "step": 61550 + }, + { + "epoch": 5.52737881508079, + "grad_norm": 11.856274604797363, + "learning_rate": 5.525134649910234e-06, + "loss": 7.6453, + "step": 61575 + }, + { + "epoch": 5.529622980251347, + "grad_norm": 10.297956466674805, + "learning_rate": 5.5273788150807906e-06, + "loss": 7.5885, + "step": 61600 + }, + { + "epoch": 5.531867145421903, + "grad_norm": 11.152332305908203, + "learning_rate": 5.529622980251347e-06, + "loss": 7.6691, + "step": 61625 + }, + { + "epoch": 5.5341113105924595, + "grad_norm": 11.10828971862793, + "learning_rate": 5.531867145421903e-06, + "loss": 7.4783, + "step": 61650 + }, + { + "epoch": 5.536355475763016, + "grad_norm": 11.121953964233398, + "learning_rate": 5.53411131059246e-06, + "loss": 7.7801, + "step": 61675 + }, + { + "epoch": 5.5385996409335725, + "grad_norm": 8.768472671508789, + "learning_rate": 5.536355475763017e-06, + "loss": 7.5366, + "step": 61700 + }, + { + "epoch": 5.540843806104129, + "grad_norm": 8.315673828125, + "learning_rate": 5.538599640933573e-06, + "loss": 7.7075, + "step": 61725 + }, + { + "epoch": 5.543087971274685, + "grad_norm": 10.894671440124512, + "learning_rate": 5.540843806104129e-06, + "loss": 7.3773, + "step": 61750 + }, + { + "epoch": 5.545332136445243, + "grad_norm": 8.84228229522705, + "learning_rate": 5.543087971274687e-06, + "loss": 7.4669, + "step": 61775 + }, + { + "epoch": 5.547576301615799, + "grad_norm": 10.580299377441406, + "learning_rate": 5.545332136445243e-06, + "loss": 7.4163, + "step": 61800 + }, + { + "epoch": 5.549820466786356, + "grad_norm": 8.856431007385254, + "learning_rate": 5.547576301615799e-06, + "loss": 7.6916, + "step": 61825 + }, + { + "epoch": 5.552064631956912, + "grad_norm": 10.768349647521973, + "learning_rate": 5.549820466786357e-06, + "loss": 7.6498, + "step": 61850 + }, + { + "epoch": 5.554308797127469, + "grad_norm": 10.676264762878418, + "learning_rate": 5.5520646319569125e-06, + "loss": 7.7522, + "step": 61875 + }, + { + "epoch": 5.556552962298025, + "grad_norm": 11.727278709411621, + "learning_rate": 5.554308797127469e-06, + "loss": 7.6946, + "step": 61900 + }, + { + "epoch": 5.558797127468582, + "grad_norm": 9.437605857849121, + "learning_rate": 5.556552962298025e-06, + "loss": 7.4652, + "step": 61925 + }, + { + "epoch": 5.561041292639138, + "grad_norm": 10.650228500366211, + "learning_rate": 5.558797127468582e-06, + "loss": 7.5953, + "step": 61950 + }, + { + "epoch": 5.563285457809695, + "grad_norm": 8.877728462219238, + "learning_rate": 5.561041292639139e-06, + "loss": 7.3825, + "step": 61975 + }, + { + "epoch": 5.565529622980251, + "grad_norm": 10.232831954956055, + "learning_rate": 5.563285457809695e-06, + "loss": 7.904, + "step": 62000 + }, + { + "epoch": 5.567773788150808, + "grad_norm": 8.718149185180664, + "learning_rate": 5.565529622980251e-06, + "loss": 7.5178, + "step": 62025 + }, + { + "epoch": 5.570017953321364, + "grad_norm": 10.110018730163574, + "learning_rate": 5.567773788150809e-06, + "loss": 7.4122, + "step": 62050 + }, + { + "epoch": 5.5722621184919205, + "grad_norm": 9.356905937194824, + "learning_rate": 5.5700179533213646e-06, + "loss": 7.5518, + "step": 62075 + }, + { + "epoch": 5.574506283662478, + "grad_norm": 10.27255630493164, + "learning_rate": 5.572262118491921e-06, + "loss": 7.5069, + "step": 62100 + }, + { + "epoch": 5.576750448833034, + "grad_norm": 10.485356330871582, + "learning_rate": 5.574506283662478e-06, + "loss": 7.4349, + "step": 62125 + }, + { + "epoch": 5.578994614003591, + "grad_norm": 16.62680435180664, + "learning_rate": 5.576750448833035e-06, + "loss": 7.5785, + "step": 62150 + }, + { + "epoch": 5.581238779174147, + "grad_norm": 10.396809577941895, + "learning_rate": 5.578994614003591e-06, + "loss": 7.3841, + "step": 62175 + }, + { + "epoch": 5.583482944344704, + "grad_norm": 15.176380157470703, + "learning_rate": 5.581238779174148e-06, + "loss": 7.5239, + "step": 62200 + }, + { + "epoch": 5.58572710951526, + "grad_norm": 8.44926929473877, + "learning_rate": 5.583482944344705e-06, + "loss": 7.6, + "step": 62225 + }, + { + "epoch": 5.587971274685817, + "grad_norm": 10.143630027770996, + "learning_rate": 5.585727109515261e-06, + "loss": 7.4631, + "step": 62250 + }, + { + "epoch": 5.590215439856373, + "grad_norm": 13.119927406311035, + "learning_rate": 5.5879712746858175e-06, + "loss": 7.4834, + "step": 62275 + }, + { + "epoch": 5.59245960502693, + "grad_norm": 11.079878807067871, + "learning_rate": 5.590215439856373e-06, + "loss": 7.7392, + "step": 62300 + }, + { + "epoch": 5.594703770197486, + "grad_norm": 12.147110939025879, + "learning_rate": 5.592459605026931e-06, + "loss": 7.7973, + "step": 62325 + }, + { + "epoch": 5.596947935368043, + "grad_norm": 13.574075698852539, + "learning_rate": 5.594703770197487e-06, + "loss": 7.4532, + "step": 62350 + }, + { + "epoch": 5.5991921005386, + "grad_norm": 10.634505271911621, + "learning_rate": 5.596947935368043e-06, + "loss": 7.6106, + "step": 62375 + }, + { + "epoch": 5.6014362657091565, + "grad_norm": 10.906079292297363, + "learning_rate": 5.5991921005386e-06, + "loss": 7.4377, + "step": 62400 + }, + { + "epoch": 5.603680430879713, + "grad_norm": 9.55339527130127, + "learning_rate": 5.601436265709157e-06, + "loss": 7.5642, + "step": 62425 + }, + { + "epoch": 5.6059245960502695, + "grad_norm": 8.403786659240723, + "learning_rate": 5.603680430879713e-06, + "loss": 7.7749, + "step": 62450 + }, + { + "epoch": 5.608168761220826, + "grad_norm": 11.408065795898438, + "learning_rate": 5.6059245960502695e-06, + "loss": 7.5949, + "step": 62475 + }, + { + "epoch": 5.6104129263913824, + "grad_norm": 10.991607666015625, + "learning_rate": 5.608168761220826e-06, + "loss": 7.5336, + "step": 62500 + }, + { + "epoch": 5.612657091561939, + "grad_norm": 9.936521530151367, + "learning_rate": 5.610412926391383e-06, + "loss": 7.5984, + "step": 62525 + }, + { + "epoch": 5.614901256732495, + "grad_norm": 14.477123260498047, + "learning_rate": 5.612657091561939e-06, + "loss": 7.432, + "step": 62550 + }, + { + "epoch": 5.617145421903052, + "grad_norm": 10.865318298339844, + "learning_rate": 5.614901256732496e-06, + "loss": 7.3099, + "step": 62575 + }, + { + "epoch": 5.619389587073608, + "grad_norm": 10.008889198303223, + "learning_rate": 5.617145421903053e-06, + "loss": 7.6042, + "step": 62600 + }, + { + "epoch": 5.621633752244165, + "grad_norm": 9.965439796447754, + "learning_rate": 5.619389587073609e-06, + "loss": 7.6794, + "step": 62625 + }, + { + "epoch": 5.623877917414722, + "grad_norm": 9.02977466583252, + "learning_rate": 5.621633752244166e-06, + "loss": 7.371, + "step": 62650 + }, + { + "epoch": 5.626122082585278, + "grad_norm": 9.308999061584473, + "learning_rate": 5.623877917414722e-06, + "loss": 7.7069, + "step": 62675 + }, + { + "epoch": 5.628366247755835, + "grad_norm": 9.341262817382812, + "learning_rate": 5.626122082585279e-06, + "loss": 7.7485, + "step": 62700 + }, + { + "epoch": 5.630610412926392, + "grad_norm": 9.5994234085083, + "learning_rate": 5.628366247755836e-06, + "loss": 7.4822, + "step": 62725 + }, + { + "epoch": 5.632854578096948, + "grad_norm": 10.581972122192383, + "learning_rate": 5.6306104129263915e-06, + "loss": 7.5707, + "step": 62750 + }, + { + "epoch": 5.635098743267505, + "grad_norm": 10.693232536315918, + "learning_rate": 5.632854578096948e-06, + "loss": 7.4258, + "step": 62775 + }, + { + "epoch": 5.637342908438061, + "grad_norm": 9.104724884033203, + "learning_rate": 5.6350987432675055e-06, + "loss": 7.3407, + "step": 62800 + }, + { + "epoch": 5.6395870736086176, + "grad_norm": 9.606040954589844, + "learning_rate": 5.637342908438061e-06, + "loss": 7.6177, + "step": 62825 + }, + { + "epoch": 5.641831238779174, + "grad_norm": 9.42577075958252, + "learning_rate": 5.639587073608618e-06, + "loss": 7.669, + "step": 62850 + }, + { + "epoch": 5.6440754039497305, + "grad_norm": 9.783003807067871, + "learning_rate": 5.641831238779175e-06, + "loss": 7.5751, + "step": 62875 + }, + { + "epoch": 5.646319569120287, + "grad_norm": 10.320673942565918, + "learning_rate": 5.644075403949731e-06, + "loss": 7.8018, + "step": 62900 + }, + { + "epoch": 5.6485637342908435, + "grad_norm": 10.316611289978027, + "learning_rate": 5.646229802513465e-06, + "loss": 7.5383, + "step": 62925 + }, + { + "epoch": 5.6508078994614, + "grad_norm": 11.804607391357422, + "learning_rate": 5.648473967684022e-06, + "loss": 7.4132, + "step": 62950 + }, + { + "epoch": 5.653052064631957, + "grad_norm": 9.010329246520996, + "learning_rate": 5.650718132854579e-06, + "loss": 7.7858, + "step": 62975 + }, + { + "epoch": 5.655296229802514, + "grad_norm": 10.79466724395752, + "learning_rate": 5.652962298025135e-06, + "loss": 7.8073, + "step": 63000 + }, + { + "epoch": 5.65754039497307, + "grad_norm": 10.218594551086426, + "learning_rate": 5.655206463195692e-06, + "loss": 7.4631, + "step": 63025 + }, + { + "epoch": 5.659784560143627, + "grad_norm": 11.134344100952148, + "learning_rate": 5.657450628366247e-06, + "loss": 7.5588, + "step": 63050 + }, + { + "epoch": 5.662028725314183, + "grad_norm": 11.12415599822998, + "learning_rate": 5.659694793536805e-06, + "loss": 7.4453, + "step": 63075 + }, + { + "epoch": 5.66427289048474, + "grad_norm": 8.48401165008545, + "learning_rate": 5.6619389587073615e-06, + "loss": 7.5218, + "step": 63100 + }, + { + "epoch": 5.666517055655296, + "grad_norm": 9.023018836975098, + "learning_rate": 5.664183123877918e-06, + "loss": 7.5325, + "step": 63125 + }, + { + "epoch": 5.668761220825853, + "grad_norm": 10.308551788330078, + "learning_rate": 5.666427289048474e-06, + "loss": 7.5704, + "step": 63150 + }, + { + "epoch": 5.671005385996409, + "grad_norm": 9.894794464111328, + "learning_rate": 5.668671454219031e-06, + "loss": 7.5701, + "step": 63175 + }, + { + "epoch": 5.673249551166966, + "grad_norm": 8.99880313873291, + "learning_rate": 5.670915619389588e-06, + "loss": 7.5916, + "step": 63200 + }, + { + "epoch": 5.675493716337522, + "grad_norm": 10.57407283782959, + "learning_rate": 5.673159784560144e-06, + "loss": 7.732, + "step": 63225 + }, + { + "epoch": 5.6777378815080795, + "grad_norm": 14.131706237792969, + "learning_rate": 5.675403949730701e-06, + "loss": 7.5238, + "step": 63250 + }, + { + "epoch": 5.679982046678636, + "grad_norm": 10.8331298828125, + "learning_rate": 5.677648114901258e-06, + "loss": 7.459, + "step": 63275 + }, + { + "epoch": 5.682226211849192, + "grad_norm": 10.284524917602539, + "learning_rate": 5.6798922800718136e-06, + "loss": 7.4286, + "step": 63300 + }, + { + "epoch": 5.684470377019749, + "grad_norm": 11.1963529586792, + "learning_rate": 5.68213644524237e-06, + "loss": 7.6135, + "step": 63325 + }, + { + "epoch": 5.686714542190305, + "grad_norm": 8.899638175964355, + "learning_rate": 5.684380610412928e-06, + "loss": 7.6422, + "step": 63350 + }, + { + "epoch": 5.688958707360862, + "grad_norm": 9.969193458557129, + "learning_rate": 5.686624775583483e-06, + "loss": 7.698, + "step": 63375 + }, + { + "epoch": 5.691202872531418, + "grad_norm": 8.796121597290039, + "learning_rate": 5.68886894075404e-06, + "loss": 7.6115, + "step": 63400 + }, + { + "epoch": 5.693447037701975, + "grad_norm": 10.135534286499023, + "learning_rate": 5.691113105924596e-06, + "loss": 7.315, + "step": 63425 + }, + { + "epoch": 5.695691202872531, + "grad_norm": 9.145771980285645, + "learning_rate": 5.693357271095153e-06, + "loss": 7.6272, + "step": 63450 + }, + { + "epoch": 5.697935368043088, + "grad_norm": 10.29287338256836, + "learning_rate": 5.69560143626571e-06, + "loss": 7.5967, + "step": 63475 + }, + { + "epoch": 5.700179533213644, + "grad_norm": 8.026521682739258, + "learning_rate": 5.697845601436266e-06, + "loss": 7.7495, + "step": 63500 + }, + { + "epoch": 5.702423698384201, + "grad_norm": 8.809864044189453, + "learning_rate": 5.700089766606822e-06, + "loss": 7.4416, + "step": 63525 + }, + { + "epoch": 5.704667863554757, + "grad_norm": 11.82730770111084, + "learning_rate": 5.70233393177738e-06, + "loss": 7.7237, + "step": 63550 + }, + { + "epoch": 5.706912028725315, + "grad_norm": 11.005682945251465, + "learning_rate": 5.7045780969479355e-06, + "loss": 7.6433, + "step": 63575 + }, + { + "epoch": 5.709156193895871, + "grad_norm": 11.089757919311523, + "learning_rate": 5.706822262118492e-06, + "loss": 7.575, + "step": 63600 + }, + { + "epoch": 5.7114003590664275, + "grad_norm": 12.11961841583252, + "learning_rate": 5.7090664272890495e-06, + "loss": 7.4753, + "step": 63625 + }, + { + "epoch": 5.713644524236984, + "grad_norm": 9.387213706970215, + "learning_rate": 5.711310592459605e-06, + "loss": 7.6579, + "step": 63650 + }, + { + "epoch": 5.7158886894075405, + "grad_norm": 11.187020301818848, + "learning_rate": 5.713554757630162e-06, + "loss": 7.6331, + "step": 63675 + }, + { + "epoch": 5.718132854578097, + "grad_norm": 9.346372604370117, + "learning_rate": 5.7157989228007185e-06, + "loss": 7.583, + "step": 63700 + }, + { + "epoch": 5.720377019748653, + "grad_norm": 9.436589241027832, + "learning_rate": 5.718043087971276e-06, + "loss": 7.5827, + "step": 63725 + }, + { + "epoch": 5.72262118491921, + "grad_norm": 9.489044189453125, + "learning_rate": 5.720287253141832e-06, + "loss": 7.4569, + "step": 63750 + }, + { + "epoch": 5.724865350089766, + "grad_norm": 11.036372184753418, + "learning_rate": 5.722531418312388e-06, + "loss": 7.4198, + "step": 63775 + }, + { + "epoch": 5.727109515260323, + "grad_norm": 10.056819915771484, + "learning_rate": 5.724775583482944e-06, + "loss": 7.6382, + "step": 63800 + }, + { + "epoch": 5.729353680430879, + "grad_norm": 11.047760009765625, + "learning_rate": 5.727019748653502e-06, + "loss": 7.4768, + "step": 63825 + }, + { + "epoch": 5.731597845601437, + "grad_norm": 14.484389305114746, + "learning_rate": 5.729263913824058e-06, + "loss": 7.8111, + "step": 63850 + }, + { + "epoch": 5.733842010771993, + "grad_norm": 11.790982246398926, + "learning_rate": 5.731508078994614e-06, + "loss": 7.5296, + "step": 63875 + }, + { + "epoch": 5.73608617594255, + "grad_norm": 9.272040367126465, + "learning_rate": 5.733752244165171e-06, + "loss": 7.6602, + "step": 63900 + }, + { + "epoch": 5.738330341113106, + "grad_norm": 9.616334915161133, + "learning_rate": 5.735996409335728e-06, + "loss": 7.6301, + "step": 63925 + }, + { + "epoch": 5.740574506283663, + "grad_norm": 11.378646850585938, + "learning_rate": 5.738240574506284e-06, + "loss": 7.4166, + "step": 63950 + }, + { + "epoch": 5.742818671454219, + "grad_norm": 13.79890251159668, + "learning_rate": 5.7404847396768405e-06, + "loss": 7.6552, + "step": 63975 + }, + { + "epoch": 5.745062836624776, + "grad_norm": 10.704466819763184, + "learning_rate": 5.742728904847398e-06, + "loss": 7.6468, + "step": 64000 + }, + { + "epoch": 5.747307001795332, + "grad_norm": 10.52642822265625, + "learning_rate": 5.744973070017954e-06, + "loss": 7.6895, + "step": 64025 + }, + { + "epoch": 5.7495511669658885, + "grad_norm": 9.254570007324219, + "learning_rate": 5.74721723518851e-06, + "loss": 7.5426, + "step": 64050 + }, + { + "epoch": 5.751795332136445, + "grad_norm": 10.454205513000488, + "learning_rate": 5.749461400359067e-06, + "loss": 7.7333, + "step": 64075 + }, + { + "epoch": 5.7540394973070015, + "grad_norm": 12.89731502532959, + "learning_rate": 5.7517055655296235e-06, + "loss": 7.6318, + "step": 64100 + }, + { + "epoch": 5.756283662477558, + "grad_norm": 11.087653160095215, + "learning_rate": 5.75394973070018e-06, + "loss": 7.5314, + "step": 64125 + }, + { + "epoch": 5.758527827648114, + "grad_norm": 10.755064010620117, + "learning_rate": 5.756193895870737e-06, + "loss": 7.4172, + "step": 64150 + }, + { + "epoch": 5.760771992818672, + "grad_norm": 10.206089973449707, + "learning_rate": 5.7584380610412925e-06, + "loss": 7.6235, + "step": 64175 + }, + { + "epoch": 5.763016157989228, + "grad_norm": 10.215435981750488, + "learning_rate": 5.76068222621185e-06, + "loss": 7.337, + "step": 64200 + }, + { + "epoch": 5.765260323159785, + "grad_norm": 9.34958553314209, + "learning_rate": 5.762926391382407e-06, + "loss": 7.5102, + "step": 64225 + }, + { + "epoch": 5.767504488330341, + "grad_norm": 8.471648216247559, + "learning_rate": 5.765170556552962e-06, + "loss": 7.6297, + "step": 64250 + }, + { + "epoch": 5.769748653500898, + "grad_norm": 9.627309799194336, + "learning_rate": 5.767414721723519e-06, + "loss": 7.5908, + "step": 64275 + }, + { + "epoch": 5.771992818671454, + "grad_norm": 8.675994873046875, + "learning_rate": 5.7696588868940765e-06, + "loss": 7.6758, + "step": 64300 + }, + { + "epoch": 5.774236983842011, + "grad_norm": 12.941842079162598, + "learning_rate": 5.771903052064632e-06, + "loss": 7.7226, + "step": 64325 + }, + { + "epoch": 5.776481149012567, + "grad_norm": 9.834450721740723, + "learning_rate": 5.774147217235189e-06, + "loss": 7.5181, + "step": 64350 + }, + { + "epoch": 5.778725314183124, + "grad_norm": 10.956109046936035, + "learning_rate": 5.776391382405746e-06, + "loss": 7.4142, + "step": 64375 + }, + { + "epoch": 5.78096947935368, + "grad_norm": 10.355422019958496, + "learning_rate": 5.778635547576302e-06, + "loss": 7.6839, + "step": 64400 + }, + { + "epoch": 5.783213644524237, + "grad_norm": 11.257526397705078, + "learning_rate": 5.780879712746859e-06, + "loss": 7.5107, + "step": 64425 + }, + { + "epoch": 5.785457809694794, + "grad_norm": 8.223873138427734, + "learning_rate": 5.7831238779174145e-06, + "loss": 7.5271, + "step": 64450 + }, + { + "epoch": 5.78770197486535, + "grad_norm": 9.177905082702637, + "learning_rate": 5.785368043087972e-06, + "loss": 7.468, + "step": 64475 + }, + { + "epoch": 5.789946140035907, + "grad_norm": 8.732272148132324, + "learning_rate": 5.7876122082585285e-06, + "loss": 7.3978, + "step": 64500 + }, + { + "epoch": 5.792190305206463, + "grad_norm": 10.062797546386719, + "learning_rate": 5.789856373429084e-06, + "loss": 7.4537, + "step": 64525 + }, + { + "epoch": 5.79443447037702, + "grad_norm": 10.382023811340332, + "learning_rate": 5.792100538599641e-06, + "loss": 7.5423, + "step": 64550 + }, + { + "epoch": 5.796678635547576, + "grad_norm": 9.786298751831055, + "learning_rate": 5.794344703770198e-06, + "loss": 7.6649, + "step": 64575 + }, + { + "epoch": 5.798922800718133, + "grad_norm": 14.369637489318848, + "learning_rate": 5.796588868940754e-06, + "loss": 7.3893, + "step": 64600 + }, + { + "epoch": 5.801166965888689, + "grad_norm": 10.71498966217041, + "learning_rate": 5.798833034111311e-06, + "loss": 7.4089, + "step": 64625 + }, + { + "epoch": 5.803411131059246, + "grad_norm": 11.726700782775879, + "learning_rate": 5.801077199281867e-06, + "loss": 7.5605, + "step": 64650 + }, + { + "epoch": 5.805655296229802, + "grad_norm": 10.030598640441895, + "learning_rate": 5.803321364452425e-06, + "loss": 7.5481, + "step": 64675 + }, + { + "epoch": 5.807899461400359, + "grad_norm": 10.632725715637207, + "learning_rate": 5.805565529622981e-06, + "loss": 7.6011, + "step": 64700 + }, + { + "epoch": 5.810143626570916, + "grad_norm": 9.949968338012695, + "learning_rate": 5.807809694793537e-06, + "loss": 7.6549, + "step": 64725 + }, + { + "epoch": 5.812387791741472, + "grad_norm": 9.435643196105957, + "learning_rate": 5.810053859964095e-06, + "loss": 7.6104, + "step": 64750 + }, + { + "epoch": 5.814631956912029, + "grad_norm": 9.28760051727295, + "learning_rate": 5.8122980251346505e-06, + "loss": 7.5149, + "step": 64775 + }, + { + "epoch": 5.8168761220825855, + "grad_norm": 9.495110511779785, + "learning_rate": 5.814542190305207e-06, + "loss": 7.6083, + "step": 64800 + }, + { + "epoch": 5.819120287253142, + "grad_norm": 10.163768768310547, + "learning_rate": 5.816786355475763e-06, + "loss": 7.5791, + "step": 64825 + }, + { + "epoch": 5.8213644524236985, + "grad_norm": 8.158746719360352, + "learning_rate": 5.81903052064632e-06, + "loss": 7.4006, + "step": 64850 + }, + { + "epoch": 5.823608617594255, + "grad_norm": 9.872208595275879, + "learning_rate": 5.821274685816877e-06, + "loss": 7.3864, + "step": 64875 + }, + { + "epoch": 5.825852782764811, + "grad_norm": 11.187078475952148, + "learning_rate": 5.823518850987433e-06, + "loss": 7.3014, + "step": 64900 + }, + { + "epoch": 5.828096947935368, + "grad_norm": 10.251884460449219, + "learning_rate": 5.825763016157989e-06, + "loss": 7.6071, + "step": 64925 + }, + { + "epoch": 5.830341113105924, + "grad_norm": 8.795920372009277, + "learning_rate": 5.828007181328547e-06, + "loss": 7.7773, + "step": 64950 + }, + { + "epoch": 5.832585278276481, + "grad_norm": 9.970499038696289, + "learning_rate": 5.8302513464991025e-06, + "loss": 7.5082, + "step": 64975 + }, + { + "epoch": 5.834829443447037, + "grad_norm": 10.867568016052246, + "learning_rate": 5.832495511669659e-06, + "loss": 7.3559, + "step": 65000 + }, + { + "epoch": 5.837073608617594, + "grad_norm": 8.215165138244629, + "learning_rate": 5.834739676840217e-06, + "loss": 7.6617, + "step": 65025 + }, + { + "epoch": 5.839317773788151, + "grad_norm": 12.228425979614258, + "learning_rate": 5.836983842010772e-06, + "loss": 7.514, + "step": 65050 + }, + { + "epoch": 5.841561938958708, + "grad_norm": 10.92136287689209, + "learning_rate": 5.839228007181329e-06, + "loss": 7.6581, + "step": 65075 + }, + { + "epoch": 5.843806104129264, + "grad_norm": 8.78610897064209, + "learning_rate": 5.841472172351886e-06, + "loss": 7.522, + "step": 65100 + }, + { + "epoch": 5.846050269299821, + "grad_norm": 10.976997375488281, + "learning_rate": 5.843716337522442e-06, + "loss": 7.5303, + "step": 65125 + }, + { + "epoch": 5.848294434470377, + "grad_norm": 12.805636405944824, + "learning_rate": 5.845960502692999e-06, + "loss": 7.633, + "step": 65150 + }, + { + "epoch": 5.850538599640934, + "grad_norm": 11.418402671813965, + "learning_rate": 5.8482046678635554e-06, + "loss": 7.4752, + "step": 65175 + }, + { + "epoch": 5.85278276481149, + "grad_norm": 9.861089706420898, + "learning_rate": 5.850448833034111e-06, + "loss": 7.568, + "step": 65200 + }, + { + "epoch": 5.8550269299820465, + "grad_norm": 10.707598686218262, + "learning_rate": 5.852692998204669e-06, + "loss": 7.665, + "step": 65225 + }, + { + "epoch": 5.857271095152603, + "grad_norm": 12.122179985046387, + "learning_rate": 5.854937163375225e-06, + "loss": 7.4191, + "step": 65250 + }, + { + "epoch": 5.8595152603231595, + "grad_norm": 11.576057434082031, + "learning_rate": 5.857181328545781e-06, + "loss": 7.5187, + "step": 65275 + }, + { + "epoch": 5.861759425493716, + "grad_norm": 8.980744361877441, + "learning_rate": 5.859425493716338e-06, + "loss": 7.5521, + "step": 65300 + }, + { + "epoch": 5.864003590664273, + "grad_norm": 11.415038108825684, + "learning_rate": 5.861669658886895e-06, + "loss": 7.5953, + "step": 65325 + }, + { + "epoch": 5.86624775583483, + "grad_norm": 11.606910705566406, + "learning_rate": 5.863913824057451e-06, + "loss": 7.3631, + "step": 65350 + }, + { + "epoch": 5.868491921005386, + "grad_norm": 9.033615112304688, + "learning_rate": 5.8661579892280075e-06, + "loss": 7.5369, + "step": 65375 + }, + { + "epoch": 5.870736086175943, + "grad_norm": 11.254317283630371, + "learning_rate": 5.868402154398565e-06, + "loss": 7.49, + "step": 65400 + }, + { + "epoch": 5.872980251346499, + "grad_norm": 10.311396598815918, + "learning_rate": 5.870646319569121e-06, + "loss": 7.5802, + "step": 65425 + }, + { + "epoch": 5.875224416517056, + "grad_norm": 10.553428649902344, + "learning_rate": 5.872890484739677e-06, + "loss": 7.3877, + "step": 65450 + }, + { + "epoch": 5.877468581687612, + "grad_norm": 8.633349418640137, + "learning_rate": 5.875134649910233e-06, + "loss": 7.4542, + "step": 65475 + }, + { + "epoch": 5.879712746858169, + "grad_norm": 9.827868461608887, + "learning_rate": 5.877378815080791e-06, + "loss": 7.4093, + "step": 65500 + }, + { + "epoch": 5.881956912028725, + "grad_norm": 11.127350807189941, + "learning_rate": 5.879622980251347e-06, + "loss": 7.4564, + "step": 65525 + }, + { + "epoch": 5.884201077199282, + "grad_norm": 11.651484489440918, + "learning_rate": 5.881867145421903e-06, + "loss": 7.46, + "step": 65550 + }, + { + "epoch": 5.886445242369838, + "grad_norm": 9.325936317443848, + "learning_rate": 5.88411131059246e-06, + "loss": 7.6303, + "step": 65575 + }, + { + "epoch": 5.888689407540395, + "grad_norm": 9.559435844421387, + "learning_rate": 5.886355475763017e-06, + "loss": 7.5515, + "step": 65600 + }, + { + "epoch": 5.890933572710951, + "grad_norm": 10.572776794433594, + "learning_rate": 5.888599640933574e-06, + "loss": 7.5873, + "step": 65625 + }, + { + "epoch": 5.8931777378815084, + "grad_norm": 10.71639633178711, + "learning_rate": 5.8908438061041294e-06, + "loss": 7.5513, + "step": 65650 + }, + { + "epoch": 5.895421903052065, + "grad_norm": 10.015830993652344, + "learning_rate": 5.893087971274686e-06, + "loss": 7.6678, + "step": 65675 + }, + { + "epoch": 5.897666068222621, + "grad_norm": 10.823591232299805, + "learning_rate": 5.8953321364452435e-06, + "loss": 7.4649, + "step": 65700 + }, + { + "epoch": 5.899910233393178, + "grad_norm": 10.797861099243164, + "learning_rate": 5.897576301615799e-06, + "loss": 7.7595, + "step": 65725 + }, + { + "epoch": 5.902154398563734, + "grad_norm": 14.479728698730469, + "learning_rate": 5.899820466786356e-06, + "loss": 7.717, + "step": 65750 + }, + { + "epoch": 5.904398563734291, + "grad_norm": 13.045822143554688, + "learning_rate": 5.902064631956913e-06, + "loss": 7.4939, + "step": 65775 + }, + { + "epoch": 5.906642728904847, + "grad_norm": 14.756202697753906, + "learning_rate": 5.904308797127469e-06, + "loss": 7.2959, + "step": 65800 + }, + { + "epoch": 5.908886894075404, + "grad_norm": 9.919795989990234, + "learning_rate": 5.906552962298026e-06, + "loss": 7.4317, + "step": 65825 + }, + { + "epoch": 5.91113105924596, + "grad_norm": 9.798335075378418, + "learning_rate": 5.9087971274685815e-06, + "loss": 7.6026, + "step": 65850 + }, + { + "epoch": 5.913375224416517, + "grad_norm": 10.342686653137207, + "learning_rate": 5.911041292639139e-06, + "loss": 7.4834, + "step": 65875 + }, + { + "epoch": 5.915619389587073, + "grad_norm": 11.586835861206055, + "learning_rate": 5.913285457809696e-06, + "loss": 7.5517, + "step": 65900 + }, + { + "epoch": 5.917863554757631, + "grad_norm": 9.45134162902832, + "learning_rate": 5.915529622980251e-06, + "loss": 7.7488, + "step": 65925 + }, + { + "epoch": 5.920107719928187, + "grad_norm": 10.508381843566895, + "learning_rate": 5.917773788150808e-06, + "loss": 7.3294, + "step": 65950 + }, + { + "epoch": 5.9223518850987436, + "grad_norm": 10.113921165466309, + "learning_rate": 5.9200179533213654e-06, + "loss": 7.5857, + "step": 65975 + }, + { + "epoch": 5.9245960502693, + "grad_norm": 10.909886360168457, + "learning_rate": 5.922262118491921e-06, + "loss": 7.4077, + "step": 66000 + }, + { + "epoch": 5.9268402154398565, + "grad_norm": 8.481833457946777, + "learning_rate": 5.924506283662478e-06, + "loss": 7.3693, + "step": 66025 + }, + { + "epoch": 5.929084380610413, + "grad_norm": 12.600994110107422, + "learning_rate": 5.926660682226212e-06, + "loss": 7.4579, + "step": 66050 + }, + { + "epoch": 5.9313285457809695, + "grad_norm": 14.66698169708252, + "learning_rate": 5.928904847396769e-06, + "loss": 7.4185, + "step": 66075 + }, + { + "epoch": 5.933572710951526, + "grad_norm": 11.084731101989746, + "learning_rate": 5.931149012567325e-06, + "loss": 7.4114, + "step": 66100 + }, + { + "epoch": 5.935816876122082, + "grad_norm": 9.44386100769043, + "learning_rate": 5.933393177737882e-06, + "loss": 7.3417, + "step": 66125 + }, + { + "epoch": 5.938061041292639, + "grad_norm": 12.616454124450684, + "learning_rate": 5.935637342908439e-06, + "loss": 7.7211, + "step": 66150 + }, + { + "epoch": 5.940305206463195, + "grad_norm": 10.44694995880127, + "learning_rate": 5.937881508078995e-06, + "loss": 7.5691, + "step": 66175 + }, + { + "epoch": 5.942549371633753, + "grad_norm": 12.131168365478516, + "learning_rate": 5.9401256732495515e-06, + "loss": 7.4795, + "step": 66200 + }, + { + "epoch": 5.944793536804308, + "grad_norm": 11.54875373840332, + "learning_rate": 5.942369838420108e-06, + "loss": 7.3972, + "step": 66225 + }, + { + "epoch": 5.947037701974866, + "grad_norm": 9.879904747009277, + "learning_rate": 5.944614003590666e-06, + "loss": 7.3718, + "step": 66250 + }, + { + "epoch": 5.949281867145422, + "grad_norm": 11.425763130187988, + "learning_rate": 5.946858168761221e-06, + "loss": 7.7428, + "step": 66275 + }, + { + "epoch": 5.951526032315979, + "grad_norm": 11.30843448638916, + "learning_rate": 5.949102333931778e-06, + "loss": 7.559, + "step": 66300 + }, + { + "epoch": 5.953770197486535, + "grad_norm": 10.323836326599121, + "learning_rate": 5.951346499102334e-06, + "loss": 7.603, + "step": 66325 + }, + { + "epoch": 5.956014362657092, + "grad_norm": 13.19066047668457, + "learning_rate": 5.953590664272891e-06, + "loss": 7.4414, + "step": 66350 + }, + { + "epoch": 5.958258527827648, + "grad_norm": 8.165266990661621, + "learning_rate": 5.955834829443448e-06, + "loss": 7.5726, + "step": 66375 + }, + { + "epoch": 5.960502692998205, + "grad_norm": 9.932510375976562, + "learning_rate": 5.958078994614004e-06, + "loss": 7.4202, + "step": 66400 + }, + { + "epoch": 5.962746858168761, + "grad_norm": 11.175251007080078, + "learning_rate": 5.96032315978456e-06, + "loss": 7.4841, + "step": 66425 + }, + { + "epoch": 5.9649910233393175, + "grad_norm": 8.69661808013916, + "learning_rate": 5.962567324955118e-06, + "loss": 7.2963, + "step": 66450 + }, + { + "epoch": 5.967235188509874, + "grad_norm": 10.786421775817871, + "learning_rate": 5.9648114901256734e-06, + "loss": 7.4605, + "step": 66475 + }, + { + "epoch": 5.9694793536804305, + "grad_norm": 10.311339378356934, + "learning_rate": 5.96705565529623e-06, + "loss": 7.5645, + "step": 66500 + }, + { + "epoch": 5.971723518850988, + "grad_norm": 9.785832405090332, + "learning_rate": 5.9692998204667875e-06, + "loss": 7.585, + "step": 66525 + }, + { + "epoch": 5.973967684021544, + "grad_norm": 13.641069412231445, + "learning_rate": 5.971543985637343e-06, + "loss": 7.4951, + "step": 66550 + }, + { + "epoch": 5.976211849192101, + "grad_norm": 9.12836742401123, + "learning_rate": 5.9737881508079e-06, + "loss": 7.6409, + "step": 66575 + }, + { + "epoch": 5.978456014362657, + "grad_norm": 12.401712417602539, + "learning_rate": 5.9760323159784565e-06, + "loss": 7.541, + "step": 66600 + }, + { + "epoch": 5.980700179533214, + "grad_norm": 9.86592960357666, + "learning_rate": 5.978276481149013e-06, + "loss": 7.7129, + "step": 66625 + }, + { + "epoch": 5.98294434470377, + "grad_norm": 10.243877410888672, + "learning_rate": 5.98052064631957e-06, + "loss": 7.7138, + "step": 66650 + }, + { + "epoch": 5.985188509874327, + "grad_norm": 9.840398788452148, + "learning_rate": 5.982764811490126e-06, + "loss": 7.674, + "step": 66675 + }, + { + "epoch": 5.987432675044883, + "grad_norm": 11.097267150878906, + "learning_rate": 5.985008976660682e-06, + "loss": 7.5796, + "step": 66700 + }, + { + "epoch": 5.98967684021544, + "grad_norm": 10.197983741760254, + "learning_rate": 5.98725314183124e-06, + "loss": 7.5508, + "step": 66725 + }, + { + "epoch": 5.991921005385996, + "grad_norm": 10.150491714477539, + "learning_rate": 5.989497307001796e-06, + "loss": 7.5847, + "step": 66750 + }, + { + "epoch": 5.994165170556553, + "grad_norm": 11.201029777526855, + "learning_rate": 5.991741472172352e-06, + "loss": 7.5607, + "step": 66775 + }, + { + "epoch": 5.99640933572711, + "grad_norm": 10.061843872070312, + "learning_rate": 5.993985637342909e-06, + "loss": 7.3918, + "step": 66800 + }, + { + "epoch": 5.998653500897666, + "grad_norm": 9.363324165344238, + "learning_rate": 5.996229802513466e-06, + "loss": 7.6635, + "step": 66825 + }, + { + "epoch": 6.0, + "eval_accuracy": 0.05201153265995838, + "eval_f1_macro": 0.0007895338392911678, + "eval_f1_micro": 0.05201153265995838, + "eval_f1_weighted": 0.014477036436801913, + "eval_loss": 7.966246128082275, + "eval_precision_macro": 0.0006891846437637676, + "eval_precision_micro": 0.05201153265995838, + "eval_precision_weighted": 0.010888190358541206, + "eval_recall_macro": 0.002391027332196912, + "eval_recall_micro": 0.05201153265995838, + "eval_recall_weighted": 0.05201153265995838, + "eval_runtime": 128.981, + "eval_samples_per_second": 406.052, + "eval_steps_per_second": 12.692, + "step": 66840 + }, + { + "epoch": 6.000897666068223, + "grad_norm": 9.829092979431152, + "learning_rate": 5.998473967684022e-06, + "loss": 7.3509, + "step": 66850 + }, + { + "epoch": 6.003141831238779, + "grad_norm": 10.256646156311035, + "learning_rate": 6.0007181328545784e-06, + "loss": 7.2644, + "step": 66875 + }, + { + "epoch": 6.005385996409336, + "grad_norm": 12.18589973449707, + "learning_rate": 6.002962298025136e-06, + "loss": 7.2829, + "step": 66900 + }, + { + "epoch": 6.007630161579892, + "grad_norm": 10.578293800354004, + "learning_rate": 6.005206463195692e-06, + "loss": 7.4121, + "step": 66925 + }, + { + "epoch": 6.009874326750449, + "grad_norm": 11.101888656616211, + "learning_rate": 6.007450628366248e-06, + "loss": 7.3169, + "step": 66950 + }, + { + "epoch": 6.012118491921005, + "grad_norm": 10.915926933288574, + "learning_rate": 6.009694793536804e-06, + "loss": 7.3647, + "step": 66975 + }, + { + "epoch": 6.014362657091562, + "grad_norm": 11.12590217590332, + "learning_rate": 6.0119389587073615e-06, + "loss": 7.2745, + "step": 67000 + }, + { + "epoch": 6.016606822262118, + "grad_norm": 10.98822021484375, + "learning_rate": 6.014183123877918e-06, + "loss": 7.2988, + "step": 67025 + }, + { + "epoch": 6.018850987432675, + "grad_norm": 9.773468017578125, + "learning_rate": 6.016427289048474e-06, + "loss": 7.1845, + "step": 67050 + }, + { + "epoch": 6.021095152603231, + "grad_norm": 9.58114242553711, + "learning_rate": 6.0186714542190305e-06, + "loss": 7.2913, + "step": 67075 + }, + { + "epoch": 6.023339317773788, + "grad_norm": 9.039715766906738, + "learning_rate": 6.020915619389588e-06, + "loss": 7.5356, + "step": 67100 + }, + { + "epoch": 6.025583482944345, + "grad_norm": 9.327583312988281, + "learning_rate": 6.023159784560144e-06, + "loss": 7.3144, + "step": 67125 + }, + { + "epoch": 6.027827648114902, + "grad_norm": 9.656584739685059, + "learning_rate": 6.0254039497307e-06, + "loss": 7.3043, + "step": 67150 + }, + { + "epoch": 6.030071813285458, + "grad_norm": 10.715460777282715, + "learning_rate": 6.027648114901257e-06, + "loss": 7.3966, + "step": 67175 + }, + { + "epoch": 6.0323159784560145, + "grad_norm": 10.063456535339355, + "learning_rate": 6.0298922800718144e-06, + "loss": 7.263, + "step": 67200 + }, + { + "epoch": 6.034560143626571, + "grad_norm": 9.484185218811035, + "learning_rate": 6.03213644524237e-06, + "loss": 7.3729, + "step": 67225 + }, + { + "epoch": 6.0368043087971275, + "grad_norm": 11.731531143188477, + "learning_rate": 6.034380610412927e-06, + "loss": 7.5401, + "step": 67250 + }, + { + "epoch": 6.039048473967684, + "grad_norm": 8.671502113342285, + "learning_rate": 6.036624775583484e-06, + "loss": 7.0371, + "step": 67275 + }, + { + "epoch": 6.04129263913824, + "grad_norm": 9.572068214416504, + "learning_rate": 6.03886894075404e-06, + "loss": 7.194, + "step": 67300 + }, + { + "epoch": 6.043536804308797, + "grad_norm": 8.487078666687012, + "learning_rate": 6.041113105924597e-06, + "loss": 7.4809, + "step": 67325 + }, + { + "epoch": 6.045780969479353, + "grad_norm": 9.452540397644043, + "learning_rate": 6.0433572710951524e-06, + "loss": 7.4055, + "step": 67350 + }, + { + "epoch": 6.04802513464991, + "grad_norm": 7.956407070159912, + "learning_rate": 6.04560143626571e-06, + "loss": 7.2264, + "step": 67375 + }, + { + "epoch": 6.050269299820466, + "grad_norm": 12.96489143371582, + "learning_rate": 6.0478456014362665e-06, + "loss": 7.5247, + "step": 67400 + }, + { + "epoch": 6.052513464991024, + "grad_norm": 9.143209457397461, + "learning_rate": 6.050089766606822e-06, + "loss": 7.4378, + "step": 67425 + }, + { + "epoch": 6.05475763016158, + "grad_norm": 8.351421356201172, + "learning_rate": 6.052333931777379e-06, + "loss": 7.3619, + "step": 67450 + }, + { + "epoch": 6.057001795332137, + "grad_norm": 11.090563774108887, + "learning_rate": 6.054578096947936e-06, + "loss": 7.3217, + "step": 67475 + }, + { + "epoch": 6.059245960502693, + "grad_norm": 8.939047813415527, + "learning_rate": 6.056822262118492e-06, + "loss": 7.3952, + "step": 67500 + }, + { + "epoch": 6.06149012567325, + "grad_norm": 10.07595443725586, + "learning_rate": 6.059066427289049e-06, + "loss": 7.3578, + "step": 67525 + }, + { + "epoch": 6.063734290843806, + "grad_norm": 10.587313652038574, + "learning_rate": 6.061310592459606e-06, + "loss": 7.2537, + "step": 67550 + }, + { + "epoch": 6.065978456014363, + "grad_norm": 11.880866050720215, + "learning_rate": 6.063554757630162e-06, + "loss": 7.3674, + "step": 67575 + }, + { + "epoch": 6.068222621184919, + "grad_norm": 9.966970443725586, + "learning_rate": 6.065798922800719e-06, + "loss": 7.5218, + "step": 67600 + }, + { + "epoch": 6.0704667863554755, + "grad_norm": 9.006058692932129, + "learning_rate": 6.068043087971275e-06, + "loss": 7.3545, + "step": 67625 + }, + { + "epoch": 6.072710951526032, + "grad_norm": 9.731264114379883, + "learning_rate": 6.070287253141832e-06, + "loss": 7.415, + "step": 67650 + }, + { + "epoch": 6.0749551166965885, + "grad_norm": 8.97692584991455, + "learning_rate": 6.0725314183123884e-06, + "loss": 7.3254, + "step": 67675 + }, + { + "epoch": 6.077199281867145, + "grad_norm": 9.070415496826172, + "learning_rate": 6.074775583482945e-06, + "loss": 7.3023, + "step": 67700 + }, + { + "epoch": 6.079443447037702, + "grad_norm": 8.869362831115723, + "learning_rate": 6.077019748653501e-06, + "loss": 7.227, + "step": 67725 + }, + { + "epoch": 6.081687612208259, + "grad_norm": 9.226993560791016, + "learning_rate": 6.079263913824058e-06, + "loss": 7.3001, + "step": 67750 + }, + { + "epoch": 6.083931777378815, + "grad_norm": 12.463308334350586, + "learning_rate": 6.081508078994615e-06, + "loss": 7.3037, + "step": 67775 + }, + { + "epoch": 6.086175942549372, + "grad_norm": 10.112553596496582, + "learning_rate": 6.083752244165171e-06, + "loss": 7.2911, + "step": 67800 + }, + { + "epoch": 6.088420107719928, + "grad_norm": 10.429906845092773, + "learning_rate": 6.085996409335727e-06, + "loss": 7.3014, + "step": 67825 + }, + { + "epoch": 6.090664272890485, + "grad_norm": 10.653236389160156, + "learning_rate": 6.088240574506285e-06, + "loss": 7.0718, + "step": 67850 + }, + { + "epoch": 6.092908438061041, + "grad_norm": 11.07476806640625, + "learning_rate": 6.0904847396768405e-06, + "loss": 7.5117, + "step": 67875 + }, + { + "epoch": 6.095152603231598, + "grad_norm": 12.698047637939453, + "learning_rate": 6.092728904847397e-06, + "loss": 7.4142, + "step": 67900 + }, + { + "epoch": 6.097396768402154, + "grad_norm": 9.6851167678833, + "learning_rate": 6.094973070017955e-06, + "loss": 7.4644, + "step": 67925 + }, + { + "epoch": 6.099640933572711, + "grad_norm": 8.105419158935547, + "learning_rate": 6.09721723518851e-06, + "loss": 7.2399, + "step": 67950 + }, + { + "epoch": 6.101885098743267, + "grad_norm": 9.600595474243164, + "learning_rate": 6.099461400359067e-06, + "loss": 7.3671, + "step": 67975 + }, + { + "epoch": 6.1041292639138245, + "grad_norm": 11.16218090057373, + "learning_rate": 6.101705565529623e-06, + "loss": 7.3277, + "step": 68000 + }, + { + "epoch": 6.106373429084381, + "grad_norm": 11.245944023132324, + "learning_rate": 6.10394973070018e-06, + "loss": 7.3353, + "step": 68025 + }, + { + "epoch": 6.108617594254937, + "grad_norm": 10.077742576599121, + "learning_rate": 6.106193895870737e-06, + "loss": 7.1722, + "step": 68050 + }, + { + "epoch": 6.110861759425494, + "grad_norm": 11.304350852966309, + "learning_rate": 6.108438061041293e-06, + "loss": 7.1605, + "step": 68075 + }, + { + "epoch": 6.11310592459605, + "grad_norm": 10.177559852600098, + "learning_rate": 6.110682226211849e-06, + "loss": 7.0824, + "step": 68100 + }, + { + "epoch": 6.115350089766607, + "grad_norm": 10.113236427307129, + "learning_rate": 6.112926391382407e-06, + "loss": 7.1484, + "step": 68125 + }, + { + "epoch": 6.117594254937163, + "grad_norm": 10.454411506652832, + "learning_rate": 6.115170556552963e-06, + "loss": 7.3248, + "step": 68150 + }, + { + "epoch": 6.11983842010772, + "grad_norm": 11.672314643859863, + "learning_rate": 6.117414721723519e-06, + "loss": 7.2723, + "step": 68175 + }, + { + "epoch": 6.122082585278276, + "grad_norm": 11.107830047607422, + "learning_rate": 6.119658886894076e-06, + "loss": 7.4208, + "step": 68200 + }, + { + "epoch": 6.124326750448833, + "grad_norm": 10.233985900878906, + "learning_rate": 6.121903052064633e-06, + "loss": 7.0474, + "step": 68225 + }, + { + "epoch": 6.126570915619389, + "grad_norm": 12.886422157287598, + "learning_rate": 6.124057450628367e-06, + "loss": 7.3788, + "step": 68250 + }, + { + "epoch": 6.128815080789946, + "grad_norm": 10.402198791503906, + "learning_rate": 6.126301615798923e-06, + "loss": 7.4985, + "step": 68275 + }, + { + "epoch": 6.131059245960503, + "grad_norm": 8.741645812988281, + "learning_rate": 6.12854578096948e-06, + "loss": 7.3163, + "step": 68300 + }, + { + "epoch": 6.13330341113106, + "grad_norm": 14.814192771911621, + "learning_rate": 6.130789946140037e-06, + "loss": 7.608, + "step": 68325 + }, + { + "epoch": 6.135547576301616, + "grad_norm": 10.850825309753418, + "learning_rate": 6.133034111310593e-06, + "loss": 7.3363, + "step": 68350 + }, + { + "epoch": 6.1377917414721725, + "grad_norm": 11.062671661376953, + "learning_rate": 6.135278276481149e-06, + "loss": 7.1899, + "step": 68375 + }, + { + "epoch": 6.140035906642729, + "grad_norm": 12.506834030151367, + "learning_rate": 6.137522441651707e-06, + "loss": 7.3434, + "step": 68400 + }, + { + "epoch": 6.1422800718132855, + "grad_norm": 11.132984161376953, + "learning_rate": 6.139766606822263e-06, + "loss": 7.3997, + "step": 68425 + }, + { + "epoch": 6.144524236983842, + "grad_norm": 10.63211727142334, + "learning_rate": 6.142010771992819e-06, + "loss": 7.4543, + "step": 68450 + }, + { + "epoch": 6.1467684021543985, + "grad_norm": 9.971512794494629, + "learning_rate": 6.144254937163375e-06, + "loss": 7.3976, + "step": 68475 + }, + { + "epoch": 6.149012567324955, + "grad_norm": 10.743717193603516, + "learning_rate": 6.1464991023339324e-06, + "loss": 7.4512, + "step": 68500 + }, + { + "epoch": 6.151256732495511, + "grad_norm": 10.002758979797363, + "learning_rate": 6.148743267504489e-06, + "loss": 7.2909, + "step": 68525 + }, + { + "epoch": 6.153500897666068, + "grad_norm": 12.286795616149902, + "learning_rate": 6.150987432675045e-06, + "loss": 7.513, + "step": 68550 + }, + { + "epoch": 6.155745062836624, + "grad_norm": 11.84028434753418, + "learning_rate": 6.1532315978456014e-06, + "loss": 7.3237, + "step": 68575 + }, + { + "epoch": 6.157989228007182, + "grad_norm": 10.17548656463623, + "learning_rate": 6.155475763016159e-06, + "loss": 7.2452, + "step": 68600 + }, + { + "epoch": 6.160233393177738, + "grad_norm": 11.526083946228027, + "learning_rate": 6.157719928186715e-06, + "loss": 7.3591, + "step": 68625 + }, + { + "epoch": 6.162477558348295, + "grad_norm": 8.932941436767578, + "learning_rate": 6.159964093357271e-06, + "loss": 7.3775, + "step": 68650 + }, + { + "epoch": 6.164721723518851, + "grad_norm": 10.043919563293457, + "learning_rate": 6.162208258527829e-06, + "loss": 7.4978, + "step": 68675 + }, + { + "epoch": 6.166965888689408, + "grad_norm": 8.588866233825684, + "learning_rate": 6.1644524236983845e-06, + "loss": 7.2455, + "step": 68700 + }, + { + "epoch": 6.169210053859964, + "grad_norm": 10.443575859069824, + "learning_rate": 6.166696588868941e-06, + "loss": 7.4364, + "step": 68725 + }, + { + "epoch": 6.171454219030521, + "grad_norm": 9.128968238830566, + "learning_rate": 6.168940754039498e-06, + "loss": 7.3003, + "step": 68750 + }, + { + "epoch": 6.173698384201077, + "grad_norm": 10.610875129699707, + "learning_rate": 6.171184919210055e-06, + "loss": 7.2764, + "step": 68775 + }, + { + "epoch": 6.175942549371634, + "grad_norm": 9.044340133666992, + "learning_rate": 6.173429084380611e-06, + "loss": 7.2025, + "step": 68800 + }, + { + "epoch": 6.17818671454219, + "grad_norm": 13.972766876220703, + "learning_rate": 6.175673249551168e-06, + "loss": 7.4288, + "step": 68825 + }, + { + "epoch": 6.1804308797127465, + "grad_norm": 10.939373970031738, + "learning_rate": 6.177917414721723e-06, + "loss": 7.2646, + "step": 68850 + }, + { + "epoch": 6.182675044883303, + "grad_norm": 11.050416946411133, + "learning_rate": 6.180161579892281e-06, + "loss": 7.1341, + "step": 68875 + }, + { + "epoch": 6.18491921005386, + "grad_norm": 10.001524925231934, + "learning_rate": 6.1824057450628374e-06, + "loss": 7.2358, + "step": 68900 + }, + { + "epoch": 6.187163375224417, + "grad_norm": 13.835500717163086, + "learning_rate": 6.184649910233393e-06, + "loss": 7.2616, + "step": 68925 + }, + { + "epoch": 6.189407540394973, + "grad_norm": 10.051883697509766, + "learning_rate": 6.18689407540395e-06, + "loss": 7.2508, + "step": 68950 + }, + { + "epoch": 6.19165170556553, + "grad_norm": 9.97620677947998, + "learning_rate": 6.189138240574507e-06, + "loss": 7.423, + "step": 68975 + }, + { + "epoch": 6.193895870736086, + "grad_norm": 10.972290992736816, + "learning_rate": 6.191382405745063e-06, + "loss": 7.2079, + "step": 69000 + }, + { + "epoch": 6.196140035906643, + "grad_norm": 11.513222694396973, + "learning_rate": 6.19362657091562e-06, + "loss": 7.3342, + "step": 69025 + }, + { + "epoch": 6.198384201077199, + "grad_norm": 11.739823341369629, + "learning_rate": 6.195870736086177e-06, + "loss": 7.2603, + "step": 69050 + }, + { + "epoch": 6.200628366247756, + "grad_norm": 11.787446022033691, + "learning_rate": 6.198114901256733e-06, + "loss": 7.5077, + "step": 69075 + }, + { + "epoch": 6.202872531418312, + "grad_norm": 9.610742568969727, + "learning_rate": 6.2003590664272895e-06, + "loss": 7.2314, + "step": 69100 + }, + { + "epoch": 6.205116696588869, + "grad_norm": 11.627431869506836, + "learning_rate": 6.202603231597846e-06, + "loss": 7.423, + "step": 69125 + }, + { + "epoch": 6.207360861759425, + "grad_norm": 12.430917739868164, + "learning_rate": 6.204847396768403e-06, + "loss": 7.2983, + "step": 69150 + }, + { + "epoch": 6.209605026929982, + "grad_norm": 10.92905044555664, + "learning_rate": 6.207091561938959e-06, + "loss": 7.236, + "step": 69175 + }, + { + "epoch": 6.211849192100539, + "grad_norm": 9.0597505569458, + "learning_rate": 6.209335727109516e-06, + "loss": 7.1183, + "step": 69200 + }, + { + "epoch": 6.2140933572710955, + "grad_norm": 9.86233901977539, + "learning_rate": 6.211579892280072e-06, + "loss": 7.4515, + "step": 69225 + }, + { + "epoch": 6.216337522441652, + "grad_norm": 10.63094425201416, + "learning_rate": 6.213824057450629e-06, + "loss": 7.2166, + "step": 69250 + }, + { + "epoch": 6.218581687612208, + "grad_norm": 9.763898849487305, + "learning_rate": 6.216068222621186e-06, + "loss": 7.2246, + "step": 69275 + }, + { + "epoch": 6.220825852782765, + "grad_norm": 9.57391357421875, + "learning_rate": 6.2183123877917416e-06, + "loss": 7.1206, + "step": 69300 + }, + { + "epoch": 6.223070017953321, + "grad_norm": 13.276796340942383, + "learning_rate": 6.220556552962298e-06, + "loss": 7.3116, + "step": 69325 + }, + { + "epoch": 6.225314183123878, + "grad_norm": 9.777149200439453, + "learning_rate": 6.222800718132856e-06, + "loss": 7.314, + "step": 69350 + }, + { + "epoch": 6.227558348294434, + "grad_norm": 9.593793869018555, + "learning_rate": 6.225044883303411e-06, + "loss": 7.2607, + "step": 69375 + }, + { + "epoch": 6.229802513464991, + "grad_norm": 12.719728469848633, + "learning_rate": 6.227289048473968e-06, + "loss": 7.3978, + "step": 69400 + }, + { + "epoch": 6.232046678635547, + "grad_norm": 12.820954322814941, + "learning_rate": 6.2295332136445255e-06, + "loss": 7.4037, + "step": 69425 + }, + { + "epoch": 6.234290843806104, + "grad_norm": 11.668316841125488, + "learning_rate": 6.231777378815081e-06, + "loss": 7.1843, + "step": 69450 + }, + { + "epoch": 6.236535008976661, + "grad_norm": 10.177596092224121, + "learning_rate": 6.234021543985638e-06, + "loss": 7.2942, + "step": 69475 + }, + { + "epoch": 6.238779174147218, + "grad_norm": 14.161917686462402, + "learning_rate": 6.236265709156194e-06, + "loss": 7.3738, + "step": 69500 + }, + { + "epoch": 6.241023339317774, + "grad_norm": 10.113015174865723, + "learning_rate": 6.238509874326751e-06, + "loss": 7.2052, + "step": 69525 + }, + { + "epoch": 6.243267504488331, + "grad_norm": 11.075775146484375, + "learning_rate": 6.240754039497308e-06, + "loss": 7.0967, + "step": 69550 + }, + { + "epoch": 6.245511669658887, + "grad_norm": 9.826265335083008, + "learning_rate": 6.2429982046678635e-06, + "loss": 7.4074, + "step": 69575 + }, + { + "epoch": 6.2477558348294435, + "grad_norm": 9.98036003112793, + "learning_rate": 6.24524236983842e-06, + "loss": 7.2033, + "step": 69600 + }, + { + "epoch": 6.25, + "grad_norm": 10.137947082519531, + "learning_rate": 6.2474865350089776e-06, + "loss": 7.3308, + "step": 69625 + }, + { + "epoch": 6.2522441651705565, + "grad_norm": 13.031664848327637, + "learning_rate": 6.249730700179534e-06, + "loss": 7.1747, + "step": 69650 + }, + { + "epoch": 6.254488330341113, + "grad_norm": 10.201323509216309, + "learning_rate": 6.25197486535009e-06, + "loss": 7.2476, + "step": 69675 + }, + { + "epoch": 6.256732495511669, + "grad_norm": 14.473282814025879, + "learning_rate": 6.254219030520647e-06, + "loss": 7.3846, + "step": 69700 + }, + { + "epoch": 6.258976660682226, + "grad_norm": 11.930091857910156, + "learning_rate": 6.256463195691204e-06, + "loss": 7.3287, + "step": 69725 + }, + { + "epoch": 6.261220825852782, + "grad_norm": 9.037915229797363, + "learning_rate": 6.25870736086176e-06, + "loss": 7.3404, + "step": 69750 + }, + { + "epoch": 6.263464991023339, + "grad_norm": 10.470358848571777, + "learning_rate": 6.260951526032316e-06, + "loss": 7.3038, + "step": 69775 + }, + { + "epoch": 6.265709156193896, + "grad_norm": 11.541390419006348, + "learning_rate": 6.263195691202874e-06, + "loss": 7.1941, + "step": 69800 + }, + { + "epoch": 6.267953321364453, + "grad_norm": 8.814519882202148, + "learning_rate": 6.26543985637343e-06, + "loss": 7.3793, + "step": 69825 + }, + { + "epoch": 6.270197486535009, + "grad_norm": 10.472984313964844, + "learning_rate": 6.267684021543986e-06, + "loss": 7.3618, + "step": 69850 + }, + { + "epoch": 6.272441651705566, + "grad_norm": 10.67203426361084, + "learning_rate": 6.269928186714542e-06, + "loss": 7.412, + "step": 69875 + }, + { + "epoch": 6.274685816876122, + "grad_norm": 12.595012664794922, + "learning_rate": 6.2721723518850995e-06, + "loss": 7.3274, + "step": 69900 + }, + { + "epoch": 6.276929982046679, + "grad_norm": 9.416701316833496, + "learning_rate": 6.274416517055656e-06, + "loss": 7.3688, + "step": 69925 + }, + { + "epoch": 6.279174147217235, + "grad_norm": 9.94314956665039, + "learning_rate": 6.276660682226212e-06, + "loss": 7.4872, + "step": 69950 + }, + { + "epoch": 6.281418312387792, + "grad_norm": 8.939887046813965, + "learning_rate": 6.2789048473967685e-06, + "loss": 7.3057, + "step": 69975 + }, + { + "epoch": 6.283662477558348, + "grad_norm": 9.09467887878418, + "learning_rate": 6.281149012567326e-06, + "loss": 7.2131, + "step": 70000 + }, + { + "epoch": 6.2859066427289045, + "grad_norm": 10.398765563964844, + "learning_rate": 6.283393177737882e-06, + "loss": 7.227, + "step": 70025 + }, + { + "epoch": 6.288150807899461, + "grad_norm": 13.385804176330566, + "learning_rate": 6.285637342908438e-06, + "loss": 7.1947, + "step": 70050 + }, + { + "epoch": 6.290394973070018, + "grad_norm": 9.315632820129395, + "learning_rate": 6.287881508078996e-06, + "loss": 7.3764, + "step": 70075 + }, + { + "epoch": 6.292639138240575, + "grad_norm": 9.019762992858887, + "learning_rate": 6.2901256732495516e-06, + "loss": 7.1945, + "step": 70100 + }, + { + "epoch": 6.294883303411131, + "grad_norm": 10.140368461608887, + "learning_rate": 6.292369838420108e-06, + "loss": 7.2191, + "step": 70125 + }, + { + "epoch": 6.297127468581688, + "grad_norm": 10.417601585388184, + "learning_rate": 6.294614003590665e-06, + "loss": 7.2922, + "step": 70150 + }, + { + "epoch": 6.299371633752244, + "grad_norm": 9.979846954345703, + "learning_rate": 6.296858168761221e-06, + "loss": 7.1986, + "step": 70175 + }, + { + "epoch": 6.301615798922801, + "grad_norm": 8.206429481506348, + "learning_rate": 6.299102333931778e-06, + "loss": 7.2438, + "step": 70200 + }, + { + "epoch": 6.303859964093357, + "grad_norm": 11.886724472045898, + "learning_rate": 6.301346499102335e-06, + "loss": 7.0744, + "step": 70225 + }, + { + "epoch": 6.306104129263914, + "grad_norm": 12.22385311126709, + "learning_rate": 6.30359066427289e-06, + "loss": 7.2972, + "step": 70250 + }, + { + "epoch": 6.30834829443447, + "grad_norm": 8.37578296661377, + "learning_rate": 6.305834829443448e-06, + "loss": 7.3343, + "step": 70275 + }, + { + "epoch": 6.310592459605027, + "grad_norm": 9.616117477416992, + "learning_rate": 6.3080789946140045e-06, + "loss": 7.2089, + "step": 70300 + }, + { + "epoch": 6.312836624775583, + "grad_norm": 9.9074068069458, + "learning_rate": 6.31032315978456e-06, + "loss": 7.358, + "step": 70325 + }, + { + "epoch": 6.31508078994614, + "grad_norm": 11.830098152160645, + "learning_rate": 6.312567324955117e-06, + "loss": 7.5625, + "step": 70350 + }, + { + "epoch": 6.317324955116697, + "grad_norm": 10.055526733398438, + "learning_rate": 6.314811490125674e-06, + "loss": 7.5561, + "step": 70375 + }, + { + "epoch": 6.3195691202872535, + "grad_norm": 10.60123062133789, + "learning_rate": 6.31705565529623e-06, + "loss": 7.3009, + "step": 70400 + }, + { + "epoch": 6.32181328545781, + "grad_norm": 13.269153594970703, + "learning_rate": 6.319299820466787e-06, + "loss": 7.5355, + "step": 70425 + }, + { + "epoch": 6.324057450628366, + "grad_norm": 9.057252883911133, + "learning_rate": 6.321543985637344e-06, + "loss": 7.2795, + "step": 70450 + }, + { + "epoch": 6.326301615798923, + "grad_norm": 10.569048881530762, + "learning_rate": 6.3237881508079e-06, + "loss": 7.2336, + "step": 70475 + }, + { + "epoch": 6.328545780969479, + "grad_norm": 11.534499168395996, + "learning_rate": 6.3260323159784566e-06, + "loss": 7.2419, + "step": 70500 + }, + { + "epoch": 6.330789946140036, + "grad_norm": 10.439385414123535, + "learning_rate": 6.328276481149012e-06, + "loss": 7.3172, + "step": 70525 + }, + { + "epoch": 6.333034111310592, + "grad_norm": 10.163830757141113, + "learning_rate": 6.33052064631957e-06, + "loss": 7.3854, + "step": 70550 + }, + { + "epoch": 6.335278276481149, + "grad_norm": 11.391217231750488, + "learning_rate": 6.332764811490126e-06, + "loss": 7.2606, + "step": 70575 + }, + { + "epoch": 6.337522441651705, + "grad_norm": 10.149394989013672, + "learning_rate": 6.335008976660683e-06, + "loss": 7.3284, + "step": 70600 + }, + { + "epoch": 6.339766606822262, + "grad_norm": 9.909098625183105, + "learning_rate": 6.337253141831239e-06, + "loss": 7.2445, + "step": 70625 + }, + { + "epoch": 6.342010771992818, + "grad_norm": 9.401453971862793, + "learning_rate": 6.339497307001796e-06, + "loss": 7.1699, + "step": 70650 + }, + { + "epoch": 6.344254937163376, + "grad_norm": 10.42232608795166, + "learning_rate": 6.341741472172353e-06, + "loss": 7.4268, + "step": 70675 + }, + { + "epoch": 6.346499102333932, + "grad_norm": 10.303500175476074, + "learning_rate": 6.343985637342909e-06, + "loss": 7.2776, + "step": 70700 + }, + { + "epoch": 6.348743267504489, + "grad_norm": 14.271076202392578, + "learning_rate": 6.346140035906643e-06, + "loss": 7.3309, + "step": 70725 + }, + { + "epoch": 6.350987432675045, + "grad_norm": 11.264866828918457, + "learning_rate": 6.3483842010772e-06, + "loss": 7.312, + "step": 70750 + }, + { + "epoch": 6.3532315978456015, + "grad_norm": 8.907125473022461, + "learning_rate": 6.350628366247757e-06, + "loss": 7.4133, + "step": 70775 + }, + { + "epoch": 6.355475763016158, + "grad_norm": 10.623636245727539, + "learning_rate": 6.3528725314183125e-06, + "loss": 7.5182, + "step": 70800 + }, + { + "epoch": 6.3577199281867145, + "grad_norm": 9.564785957336426, + "learning_rate": 6.35511669658887e-06, + "loss": 7.2429, + "step": 70825 + }, + { + "epoch": 6.359964093357271, + "grad_norm": 13.404255867004395, + "learning_rate": 6.3573608617594266e-06, + "loss": 7.2725, + "step": 70850 + }, + { + "epoch": 6.3622082585278275, + "grad_norm": 9.196087837219238, + "learning_rate": 6.359605026929982e-06, + "loss": 7.2936, + "step": 70875 + }, + { + "epoch": 6.364452423698384, + "grad_norm": 10.269205093383789, + "learning_rate": 6.361849192100539e-06, + "loss": 7.1459, + "step": 70900 + }, + { + "epoch": 6.36669658886894, + "grad_norm": 13.9072904586792, + "learning_rate": 6.364093357271096e-06, + "loss": 7.1367, + "step": 70925 + }, + { + "epoch": 6.368940754039498, + "grad_norm": 8.802763938903809, + "learning_rate": 6.366337522441652e-06, + "loss": 7.4228, + "step": 70950 + }, + { + "epoch": 6.371184919210054, + "grad_norm": 10.993894577026367, + "learning_rate": 6.368581687612209e-06, + "loss": 7.2961, + "step": 70975 + }, + { + "epoch": 6.373429084380611, + "grad_norm": 10.471091270446777, + "learning_rate": 6.3708258527827646e-06, + "loss": 7.328, + "step": 71000 + }, + { + "epoch": 6.375673249551167, + "grad_norm": 11.242839813232422, + "learning_rate": 6.373070017953322e-06, + "loss": 7.271, + "step": 71025 + }, + { + "epoch": 6.377917414721724, + "grad_norm": 11.286234855651855, + "learning_rate": 6.375314183123879e-06, + "loss": 7.3715, + "step": 71050 + }, + { + "epoch": 6.38016157989228, + "grad_norm": 12.442651748657227, + "learning_rate": 6.377558348294434e-06, + "loss": 7.2088, + "step": 71075 + }, + { + "epoch": 6.382405745062837, + "grad_norm": 12.350981712341309, + "learning_rate": 6.379802513464991e-06, + "loss": 7.1669, + "step": 71100 + }, + { + "epoch": 6.384649910233393, + "grad_norm": 17.44031524658203, + "learning_rate": 6.3820466786355485e-06, + "loss": 7.2554, + "step": 71125 + }, + { + "epoch": 6.38689407540395, + "grad_norm": 11.33200454711914, + "learning_rate": 6.384290843806104e-06, + "loss": 7.0965, + "step": 71150 + }, + { + "epoch": 6.389138240574506, + "grad_norm": 9.57513427734375, + "learning_rate": 6.386535008976661e-06, + "loss": 7.305, + "step": 71175 + }, + { + "epoch": 6.391382405745063, + "grad_norm": 9.972275733947754, + "learning_rate": 6.388779174147218e-06, + "loss": 7.3747, + "step": 71200 + }, + { + "epoch": 6.393626570915619, + "grad_norm": 10.386908531188965, + "learning_rate": 6.391023339317775e-06, + "loss": 7.1693, + "step": 71225 + }, + { + "epoch": 6.3958707360861755, + "grad_norm": 9.905481338500977, + "learning_rate": 6.393267504488331e-06, + "loss": 7.2029, + "step": 71250 + }, + { + "epoch": 6.398114901256733, + "grad_norm": 9.716893196105957, + "learning_rate": 6.395511669658887e-06, + "loss": 7.1483, + "step": 71275 + }, + { + "epoch": 6.400359066427289, + "grad_norm": 13.206552505493164, + "learning_rate": 6.397755834829445e-06, + "loss": 7.0422, + "step": 71300 + }, + { + "epoch": 6.402603231597846, + "grad_norm": 14.882104873657227, + "learning_rate": 6.4000000000000006e-06, + "loss": 7.6892, + "step": 71325 + }, + { + "epoch": 6.404847396768402, + "grad_norm": 11.507643699645996, + "learning_rate": 6.402244165170557e-06, + "loss": 7.289, + "step": 71350 + }, + { + "epoch": 6.407091561938959, + "grad_norm": 11.31379508972168, + "learning_rate": 6.404488330341113e-06, + "loss": 7.4535, + "step": 71375 + }, + { + "epoch": 6.409335727109515, + "grad_norm": 11.08597469329834, + "learning_rate": 6.40673249551167e-06, + "loss": 7.304, + "step": 71400 + }, + { + "epoch": 6.411579892280072, + "grad_norm": 10.791849136352539, + "learning_rate": 6.408976660682227e-06, + "loss": 7.2247, + "step": 71425 + }, + { + "epoch": 6.413824057450628, + "grad_norm": 10.791643142700195, + "learning_rate": 6.411220825852783e-06, + "loss": 7.2832, + "step": 71450 + }, + { + "epoch": 6.416068222621185, + "grad_norm": 9.843780517578125, + "learning_rate": 6.413464991023339e-06, + "loss": 7.2973, + "step": 71475 + }, + { + "epoch": 6.418312387791741, + "grad_norm": 15.745185852050781, + "learning_rate": 6.415709156193897e-06, + "loss": 7.1865, + "step": 71500 + }, + { + "epoch": 6.420556552962298, + "grad_norm": 11.532201766967773, + "learning_rate": 6.417953321364453e-06, + "loss": 7.4176, + "step": 71525 + }, + { + "epoch": 6.422800718132855, + "grad_norm": 11.349449157714844, + "learning_rate": 6.420197486535009e-06, + "loss": 7.2844, + "step": 71550 + }, + { + "epoch": 6.4250448833034115, + "grad_norm": 14.24781608581543, + "learning_rate": 6.422441651705567e-06, + "loss": 7.2029, + "step": 71575 + }, + { + "epoch": 6.427289048473968, + "grad_norm": 10.360746383666992, + "learning_rate": 6.4246858168761225e-06, + "loss": 7.3485, + "step": 71600 + }, + { + "epoch": 6.4295332136445245, + "grad_norm": 9.496199607849121, + "learning_rate": 6.426929982046679e-06, + "loss": 7.2631, + "step": 71625 + }, + { + "epoch": 6.431777378815081, + "grad_norm": 12.424043655395508, + "learning_rate": 6.429174147217236e-06, + "loss": 7.2906, + "step": 71650 + }, + { + "epoch": 6.434021543985637, + "grad_norm": 9.914804458618164, + "learning_rate": 6.431418312387792e-06, + "loss": 7.4377, + "step": 71675 + }, + { + "epoch": 6.436265709156194, + "grad_norm": 11.613337516784668, + "learning_rate": 6.433662477558349e-06, + "loss": 7.1785, + "step": 71700 + }, + { + "epoch": 6.43850987432675, + "grad_norm": 11.472223281860352, + "learning_rate": 6.4359066427289056e-06, + "loss": 7.4165, + "step": 71725 + }, + { + "epoch": 6.440754039497307, + "grad_norm": 11.97685432434082, + "learning_rate": 6.438150807899461e-06, + "loss": 7.2515, + "step": 71750 + }, + { + "epoch": 6.442998204667863, + "grad_norm": 11.56657886505127, + "learning_rate": 6.440394973070019e-06, + "loss": 7.3371, + "step": 71775 + }, + { + "epoch": 6.44524236983842, + "grad_norm": 13.24588394165039, + "learning_rate": 6.442639138240575e-06, + "loss": 6.9927, + "step": 71800 + }, + { + "epoch": 6.447486535008976, + "grad_norm": 8.968293190002441, + "learning_rate": 6.444883303411131e-06, + "loss": 7.1506, + "step": 71825 + }, + { + "epoch": 6.449730700179533, + "grad_norm": 10.981985092163086, + "learning_rate": 6.447127468581688e-06, + "loss": 7.2909, + "step": 71850 + }, + { + "epoch": 6.45197486535009, + "grad_norm": 12.403886795043945, + "learning_rate": 6.449371633752245e-06, + "loss": 7.362, + "step": 71875 + }, + { + "epoch": 6.454219030520647, + "grad_norm": 11.006831169128418, + "learning_rate": 6.451615798922801e-06, + "loss": 7.3425, + "step": 71900 + }, + { + "epoch": 6.456463195691203, + "grad_norm": 10.338288307189941, + "learning_rate": 6.453859964093358e-06, + "loss": 7.3741, + "step": 71925 + }, + { + "epoch": 6.45870736086176, + "grad_norm": 9.076534271240234, + "learning_rate": 6.456104129263915e-06, + "loss": 7.1697, + "step": 71950 + }, + { + "epoch": 6.460951526032316, + "grad_norm": 11.134077072143555, + "learning_rate": 6.458348294434471e-06, + "loss": 7.2777, + "step": 71975 + }, + { + "epoch": 6.4631956912028725, + "grad_norm": 9.654725074768066, + "learning_rate": 6.4605924596050275e-06, + "loss": 7.2816, + "step": 72000 + }, + { + "epoch": 6.465439856373429, + "grad_norm": 10.212964057922363, + "learning_rate": 6.462836624775583e-06, + "loss": 7.1996, + "step": 72025 + }, + { + "epoch": 6.4676840215439855, + "grad_norm": 11.717792510986328, + "learning_rate": 6.465080789946141e-06, + "loss": 7.1493, + "step": 72050 + }, + { + "epoch": 6.469928186714542, + "grad_norm": 11.515053749084473, + "learning_rate": 6.467324955116697e-06, + "loss": 7.1314, + "step": 72075 + }, + { + "epoch": 6.472172351885098, + "grad_norm": 12.614387512207031, + "learning_rate": 6.469569120287253e-06, + "loss": 7.3919, + "step": 72100 + }, + { + "epoch": 6.474416517055655, + "grad_norm": 12.071078300476074, + "learning_rate": 6.47181328545781e-06, + "loss": 7.1034, + "step": 72125 + }, + { + "epoch": 6.476660682226212, + "grad_norm": 13.950483322143555, + "learning_rate": 6.474057450628367e-06, + "loss": 7.2785, + "step": 72150 + }, + { + "epoch": 6.478904847396769, + "grad_norm": 10.626548767089844, + "learning_rate": 6.476301615798924e-06, + "loss": 7.1653, + "step": 72175 + }, + { + "epoch": 6.481149012567325, + "grad_norm": 10.907064437866211, + "learning_rate": 6.4785457809694795e-06, + "loss": 7.4954, + "step": 72200 + }, + { + "epoch": 6.483393177737882, + "grad_norm": 11.10688304901123, + "learning_rate": 6.480789946140037e-06, + "loss": 7.3842, + "step": 72225 + }, + { + "epoch": 6.485637342908438, + "grad_norm": 14.686075210571289, + "learning_rate": 6.483034111310594e-06, + "loss": 7.27, + "step": 72250 + }, + { + "epoch": 6.487881508078995, + "grad_norm": 16.494029998779297, + "learning_rate": 6.485278276481149e-06, + "loss": 7.1229, + "step": 72275 + }, + { + "epoch": 6.490125673249551, + "grad_norm": 10.618032455444336, + "learning_rate": 6.487522441651706e-06, + "loss": 7.3121, + "step": 72300 + }, + { + "epoch": 6.492369838420108, + "grad_norm": 13.463263511657715, + "learning_rate": 6.4897666068222635e-06, + "loss": 7.2907, + "step": 72325 + }, + { + "epoch": 6.494614003590664, + "grad_norm": 8.989644050598145, + "learning_rate": 6.492010771992819e-06, + "loss": 7.2323, + "step": 72350 + }, + { + "epoch": 6.496858168761221, + "grad_norm": 9.366023063659668, + "learning_rate": 6.494254937163376e-06, + "loss": 7.5859, + "step": 72375 + }, + { + "epoch": 6.499102333931777, + "grad_norm": 11.381954193115234, + "learning_rate": 6.496499102333932e-06, + "loss": 7.2949, + "step": 72400 + }, + { + "epoch": 6.501346499102334, + "grad_norm": 11.405661582946777, + "learning_rate": 6.498743267504489e-06, + "loss": 7.2111, + "step": 72425 + }, + { + "epoch": 6.50359066427289, + "grad_norm": 9.760736465454102, + "learning_rate": 6.500987432675046e-06, + "loss": 7.2608, + "step": 72450 + }, + { + "epoch": 6.505834829443447, + "grad_norm": 14.921845436096191, + "learning_rate": 6.5032315978456015e-06, + "loss": 7.4944, + "step": 72475 + }, + { + "epoch": 6.508078994614004, + "grad_norm": 12.619390487670898, + "learning_rate": 6.505475763016158e-06, + "loss": 7.2551, + "step": 72500 + }, + { + "epoch": 6.51032315978456, + "grad_norm": 11.49813175201416, + "learning_rate": 6.5077199281867155e-06, + "loss": 7.2372, + "step": 72525 + }, + { + "epoch": 6.512567324955117, + "grad_norm": 13.279525756835938, + "learning_rate": 6.509964093357271e-06, + "loss": 7.3871, + "step": 72550 + }, + { + "epoch": 6.514811490125673, + "grad_norm": 12.522918701171875, + "learning_rate": 6.512208258527828e-06, + "loss": 7.4306, + "step": 72575 + }, + { + "epoch": 6.51705565529623, + "grad_norm": 11.07654857635498, + "learning_rate": 6.514452423698385e-06, + "loss": 7.119, + "step": 72600 + }, + { + "epoch": 6.519299820466786, + "grad_norm": 12.053755760192871, + "learning_rate": 6.516696588868941e-06, + "loss": 7.2057, + "step": 72625 + }, + { + "epoch": 6.521543985637343, + "grad_norm": 13.094888687133789, + "learning_rate": 6.518940754039498e-06, + "loss": 7.3988, + "step": 72650 + }, + { + "epoch": 6.523788150807899, + "grad_norm": 9.129287719726562, + "learning_rate": 6.521184919210054e-06, + "loss": 7.2238, + "step": 72675 + }, + { + "epoch": 6.526032315978456, + "grad_norm": 11.65527057647705, + "learning_rate": 6.523429084380611e-06, + "loss": 7.2577, + "step": 72700 + }, + { + "epoch": 6.528276481149012, + "grad_norm": 10.806488990783691, + "learning_rate": 6.525673249551168e-06, + "loss": 7.2406, + "step": 72725 + }, + { + "epoch": 6.5305206463195695, + "grad_norm": 9.79490852355957, + "learning_rate": 6.527917414721724e-06, + "loss": 7.1863, + "step": 72750 + }, + { + "epoch": 6.532764811490126, + "grad_norm": 10.458491325378418, + "learning_rate": 6.53016157989228e-06, + "loss": 7.0246, + "step": 72775 + }, + { + "epoch": 6.5350089766606825, + "grad_norm": 11.089095115661621, + "learning_rate": 6.5324057450628375e-06, + "loss": 7.2713, + "step": 72800 + }, + { + "epoch": 6.537253141831239, + "grad_norm": 10.685979843139648, + "learning_rate": 6.534649910233394e-06, + "loss": 7.0999, + "step": 72825 + }, + { + "epoch": 6.539497307001795, + "grad_norm": 10.96831226348877, + "learning_rate": 6.53689407540395e-06, + "loss": 7.1494, + "step": 72850 + }, + { + "epoch": 6.541741472172352, + "grad_norm": 12.241168022155762, + "learning_rate": 6.5391382405745065e-06, + "loss": 7.2565, + "step": 72875 + }, + { + "epoch": 6.543985637342908, + "grad_norm": 11.273906707763672, + "learning_rate": 6.541382405745064e-06, + "loss": 7.1344, + "step": 72900 + }, + { + "epoch": 6.546229802513465, + "grad_norm": 9.24215030670166, + "learning_rate": 6.54362657091562e-06, + "loss": 7.1866, + "step": 72925 + }, + { + "epoch": 6.548473967684021, + "grad_norm": 12.932868957519531, + "learning_rate": 6.545780969479354e-06, + "loss": 7.5177, + "step": 72950 + }, + { + "epoch": 6.550718132854578, + "grad_norm": 11.153263092041016, + "learning_rate": 6.548025134649911e-06, + "loss": 7.3228, + "step": 72975 + }, + { + "epoch": 6.552962298025134, + "grad_norm": 13.925304412841797, + "learning_rate": 6.550269299820468e-06, + "loss": 7.3937, + "step": 73000 + }, + { + "epoch": 6.555206463195692, + "grad_norm": 13.274391174316406, + "learning_rate": 6.5525134649910235e-06, + "loss": 7.2851, + "step": 73025 + }, + { + "epoch": 6.557450628366247, + "grad_norm": 10.368252754211426, + "learning_rate": 6.55475763016158e-06, + "loss": 7.2557, + "step": 73050 + }, + { + "epoch": 6.559694793536805, + "grad_norm": 9.549860000610352, + "learning_rate": 6.557001795332138e-06, + "loss": 7.2773, + "step": 73075 + }, + { + "epoch": 6.561938958707361, + "grad_norm": 9.162616729736328, + "learning_rate": 6.559245960502693e-06, + "loss": 7.4563, + "step": 73100 + }, + { + "epoch": 6.564183123877918, + "grad_norm": 13.331947326660156, + "learning_rate": 6.56149012567325e-06, + "loss": 7.2942, + "step": 73125 + }, + { + "epoch": 6.566427289048474, + "grad_norm": 9.832636833190918, + "learning_rate": 6.563734290843807e-06, + "loss": 7.3719, + "step": 73150 + }, + { + "epoch": 6.5686714542190305, + "grad_norm": 12.36171817779541, + "learning_rate": 6.565978456014363e-06, + "loss": 7.3709, + "step": 73175 + }, + { + "epoch": 6.570915619389587, + "grad_norm": 13.82861614227295, + "learning_rate": 6.56822262118492e-06, + "loss": 7.0378, + "step": 73200 + }, + { + "epoch": 6.5731597845601435, + "grad_norm": 9.79139518737793, + "learning_rate": 6.5704667863554765e-06, + "loss": 7.2636, + "step": 73225 + }, + { + "epoch": 6.5754039497307, + "grad_norm": 9.398963928222656, + "learning_rate": 6.572710951526032e-06, + "loss": 7.3228, + "step": 73250 + }, + { + "epoch": 6.5776481149012564, + "grad_norm": 12.43901538848877, + "learning_rate": 6.57495511669659e-06, + "loss": 7.3508, + "step": 73275 + }, + { + "epoch": 6.579892280071813, + "grad_norm": 11.71102237701416, + "learning_rate": 6.577199281867146e-06, + "loss": 7.1744, + "step": 73300 + }, + { + "epoch": 6.582136445242369, + "grad_norm": 11.392264366149902, + "learning_rate": 6.579443447037702e-06, + "loss": 7.1149, + "step": 73325 + }, + { + "epoch": 6.584380610412927, + "grad_norm": 14.81397533416748, + "learning_rate": 6.5816876122082595e-06, + "loss": 7.1353, + "step": 73350 + }, + { + "epoch": 6.586624775583483, + "grad_norm": 10.645167350769043, + "learning_rate": 6.583931777378816e-06, + "loss": 7.3575, + "step": 73375 + }, + { + "epoch": 6.58886894075404, + "grad_norm": 10.415990829467773, + "learning_rate": 6.586175942549372e-06, + "loss": 7.295, + "step": 73400 + }, + { + "epoch": 6.591113105924596, + "grad_norm": 10.390096664428711, + "learning_rate": 6.5884201077199285e-06, + "loss": 7.1384, + "step": 73425 + }, + { + "epoch": 6.593357271095153, + "grad_norm": 9.475513458251953, + "learning_rate": 6.590664272890486e-06, + "loss": 7.3764, + "step": 73450 + }, + { + "epoch": 6.595601436265709, + "grad_norm": 10.15778923034668, + "learning_rate": 6.592908438061042e-06, + "loss": 7.1741, + "step": 73475 + }, + { + "epoch": 6.597845601436266, + "grad_norm": 10.713929176330566, + "learning_rate": 6.595152603231598e-06, + "loss": 7.3149, + "step": 73500 + }, + { + "epoch": 6.600089766606822, + "grad_norm": 11.420851707458496, + "learning_rate": 6.597396768402154e-06, + "loss": 7.3948, + "step": 73525 + }, + { + "epoch": 6.602333931777379, + "grad_norm": 15.836380958557129, + "learning_rate": 6.599640933572712e-06, + "loss": 7.3538, + "step": 73550 + }, + { + "epoch": 6.604578096947935, + "grad_norm": 13.205289840698242, + "learning_rate": 6.601885098743268e-06, + "loss": 7.0806, + "step": 73575 + }, + { + "epoch": 6.6068222621184916, + "grad_norm": 13.741181373596191, + "learning_rate": 6.604129263913824e-06, + "loss": 7.3982, + "step": 73600 + }, + { + "epoch": 6.609066427289049, + "grad_norm": 9.187962532043457, + "learning_rate": 6.606373429084381e-06, + "loss": 7.4331, + "step": 73625 + }, + { + "epoch": 6.611310592459605, + "grad_norm": 10.74998664855957, + "learning_rate": 6.608617594254938e-06, + "loss": 7.0999, + "step": 73650 + }, + { + "epoch": 6.613554757630162, + "grad_norm": 10.907756805419922, + "learning_rate": 6.610861759425494e-06, + "loss": 7.1725, + "step": 73675 + }, + { + "epoch": 6.615798922800718, + "grad_norm": 11.677279472351074, + "learning_rate": 6.6131059245960505e-06, + "loss": 7.3012, + "step": 73700 + }, + { + "epoch": 6.618043087971275, + "grad_norm": 10.894217491149902, + "learning_rate": 6.615350089766608e-06, + "loss": 7.2183, + "step": 73725 + }, + { + "epoch": 6.620287253141831, + "grad_norm": 11.791688919067383, + "learning_rate": 6.6175942549371645e-06, + "loss": 7.3126, + "step": 73750 + }, + { + "epoch": 6.622531418312388, + "grad_norm": 13.383397102355957, + "learning_rate": 6.61983842010772e-06, + "loss": 7.2639, + "step": 73775 + }, + { + "epoch": 6.624775583482944, + "grad_norm": 12.98088264465332, + "learning_rate": 6.622082585278277e-06, + "loss": 7.2392, + "step": 73800 + }, + { + "epoch": 6.627019748653501, + "grad_norm": 12.258731842041016, + "learning_rate": 6.624326750448834e-06, + "loss": 7.1529, + "step": 73825 + }, + { + "epoch": 6.629263913824057, + "grad_norm": 12.107407569885254, + "learning_rate": 6.62657091561939e-06, + "loss": 7.4322, + "step": 73850 + }, + { + "epoch": 6.631508078994614, + "grad_norm": 10.387789726257324, + "learning_rate": 6.628815080789947e-06, + "loss": 7.3042, + "step": 73875 + }, + { + "epoch": 6.63375224416517, + "grad_norm": 10.534290313720703, + "learning_rate": 6.6310592459605025e-06, + "loss": 7.1171, + "step": 73900 + }, + { + "epoch": 6.635996409335727, + "grad_norm": 11.081453323364258, + "learning_rate": 6.63330341113106e-06, + "loss": 7.4039, + "step": 73925 + }, + { + "epoch": 6.638240574506284, + "grad_norm": 9.601165771484375, + "learning_rate": 6.635547576301617e-06, + "loss": 7.2335, + "step": 73950 + }, + { + "epoch": 6.6404847396768405, + "grad_norm": 10.975202560424805, + "learning_rate": 6.637791741472172e-06, + "loss": 7.2226, + "step": 73975 + }, + { + "epoch": 6.642728904847397, + "grad_norm": 9.461324691772461, + "learning_rate": 6.640035906642729e-06, + "loss": 7.176, + "step": 74000 + }, + { + "epoch": 6.6449730700179535, + "grad_norm": 18.654748916625977, + "learning_rate": 6.6422800718132865e-06, + "loss": 7.3087, + "step": 74025 + }, + { + "epoch": 6.64721723518851, + "grad_norm": 17.012714385986328, + "learning_rate": 6.644524236983842e-06, + "loss": 7.2806, + "step": 74050 + }, + { + "epoch": 6.649461400359066, + "grad_norm": 12.602012634277344, + "learning_rate": 6.646768402154399e-06, + "loss": 7.155, + "step": 74075 + }, + { + "epoch": 6.651705565529623, + "grad_norm": 9.12929916381836, + "learning_rate": 6.649012567324956e-06, + "loss": 7.4508, + "step": 74100 + }, + { + "epoch": 6.653949730700179, + "grad_norm": 11.040406227111816, + "learning_rate": 6.651256732495512e-06, + "loss": 7.3635, + "step": 74125 + }, + { + "epoch": 6.656193895870736, + "grad_norm": 11.455739974975586, + "learning_rate": 6.653500897666069e-06, + "loss": 7.3721, + "step": 74150 + }, + { + "epoch": 6.658438061041292, + "grad_norm": 10.584799766540527, + "learning_rate": 6.655745062836625e-06, + "loss": 7.5647, + "step": 74175 + }, + { + "epoch": 6.660682226211849, + "grad_norm": 13.968071937561035, + "learning_rate": 6.657989228007182e-06, + "loss": 7.4558, + "step": 74200 + }, + { + "epoch": 6.662926391382406, + "grad_norm": 10.618939399719238, + "learning_rate": 6.6602333931777385e-06, + "loss": 7.2198, + "step": 74225 + }, + { + "epoch": 6.665170556552963, + "grad_norm": 15.480769157409668, + "learning_rate": 6.662477558348295e-06, + "loss": 7.3394, + "step": 74250 + }, + { + "epoch": 6.667414721723519, + "grad_norm": 12.59046459197998, + "learning_rate": 6.664721723518851e-06, + "loss": 7.0538, + "step": 74275 + }, + { + "epoch": 6.669658886894076, + "grad_norm": 10.038260459899902, + "learning_rate": 6.666965888689408e-06, + "loss": 7.2596, + "step": 74300 + }, + { + "epoch": 6.671903052064632, + "grad_norm": 12.201627731323242, + "learning_rate": 6.669210053859965e-06, + "loss": 7.2221, + "step": 74325 + }, + { + "epoch": 6.674147217235189, + "grad_norm": 9.645535469055176, + "learning_rate": 6.671454219030521e-06, + "loss": 7.3213, + "step": 74350 + }, + { + "epoch": 6.676391382405745, + "grad_norm": 11.566288948059082, + "learning_rate": 6.673698384201078e-06, + "loss": 7.0059, + "step": 74375 + }, + { + "epoch": 6.6786355475763015, + "grad_norm": 9.893118858337402, + "learning_rate": 6.675942549371635e-06, + "loss": 7.4395, + "step": 74400 + }, + { + "epoch": 6.680879712746858, + "grad_norm": 9.519179344177246, + "learning_rate": 6.678186714542191e-06, + "loss": 7.1968, + "step": 74425 + }, + { + "epoch": 6.6831238779174145, + "grad_norm": 9.0681791305542, + "learning_rate": 6.680430879712747e-06, + "loss": 7.1706, + "step": 74450 + }, + { + "epoch": 6.685368043087971, + "grad_norm": 8.948789596557617, + "learning_rate": 6.682675044883305e-06, + "loss": 7.236, + "step": 74475 + }, + { + "epoch": 6.687612208258528, + "grad_norm": 11.437298774719238, + "learning_rate": 6.6849192100538605e-06, + "loss": 7.2419, + "step": 74500 + }, + { + "epoch": 6.689856373429084, + "grad_norm": 12.912850379943848, + "learning_rate": 6.687163375224417e-06, + "loss": 7.0596, + "step": 74525 + }, + { + "epoch": 6.692100538599641, + "grad_norm": 12.372191429138184, + "learning_rate": 6.689407540394973e-06, + "loss": 7.2299, + "step": 74550 + }, + { + "epoch": 6.694344703770198, + "grad_norm": 10.880531311035156, + "learning_rate": 6.69165170556553e-06, + "loss": 7.23, + "step": 74575 + }, + { + "epoch": 6.696588868940754, + "grad_norm": 10.210481643676758, + "learning_rate": 6.693895870736087e-06, + "loss": 7.1022, + "step": 74600 + }, + { + "epoch": 6.698833034111311, + "grad_norm": 12.166997909545898, + "learning_rate": 6.696140035906643e-06, + "loss": 7.3521, + "step": 74625 + }, + { + "epoch": 6.701077199281867, + "grad_norm": 11.555604934692383, + "learning_rate": 6.698384201077199e-06, + "loss": 7.173, + "step": 74650 + }, + { + "epoch": 6.703321364452424, + "grad_norm": 9.333062171936035, + "learning_rate": 6.700628366247757e-06, + "loss": 7.1785, + "step": 74675 + }, + { + "epoch": 6.70556552962298, + "grad_norm": 11.045679092407227, + "learning_rate": 6.702872531418313e-06, + "loss": 7.5163, + "step": 74700 + }, + { + "epoch": 6.707809694793537, + "grad_norm": 10.978307723999023, + "learning_rate": 6.705116696588869e-06, + "loss": 7.1114, + "step": 74725 + }, + { + "epoch": 6.710053859964093, + "grad_norm": 9.924695014953613, + "learning_rate": 6.707360861759427e-06, + "loss": 7.3187, + "step": 74750 + }, + { + "epoch": 6.71229802513465, + "grad_norm": 10.144599914550781, + "learning_rate": 6.709605026929983e-06, + "loss": 7.3326, + "step": 74775 + }, + { + "epoch": 6.714542190305206, + "grad_norm": 9.950204849243164, + "learning_rate": 6.711849192100539e-06, + "loss": 7.2167, + "step": 74800 + }, + { + "epoch": 6.716786355475763, + "grad_norm": 12.476372718811035, + "learning_rate": 6.714093357271096e-06, + "loss": 7.2539, + "step": 74825 + }, + { + "epoch": 6.71903052064632, + "grad_norm": 11.428199768066406, + "learning_rate": 6.716337522441653e-06, + "loss": 7.1831, + "step": 74850 + }, + { + "epoch": 6.721274685816876, + "grad_norm": 9.863873481750488, + "learning_rate": 6.718581687612209e-06, + "loss": 7.4299, + "step": 74875 + }, + { + "epoch": 6.723518850987433, + "grad_norm": 11.711904525756836, + "learning_rate": 6.7208258527827654e-06, + "loss": 7.3741, + "step": 74900 + }, + { + "epoch": 6.725763016157989, + "grad_norm": 13.898558616638184, + "learning_rate": 6.723070017953321e-06, + "loss": 7.2457, + "step": 74925 + }, + { + "epoch": 6.728007181328546, + "grad_norm": 9.908242225646973, + "learning_rate": 6.725314183123879e-06, + "loss": 7.3093, + "step": 74950 + }, + { + "epoch": 6.730251346499102, + "grad_norm": 11.847241401672363, + "learning_rate": 6.727558348294435e-06, + "loss": 7.3933, + "step": 74975 + }, + { + "epoch": 6.732495511669659, + "grad_norm": 9.182645797729492, + "learning_rate": 6.729802513464991e-06, + "loss": 7.2711, + "step": 75000 + }, + { + "epoch": 6.734739676840215, + "grad_norm": 14.369656562805176, + "learning_rate": 6.732046678635548e-06, + "loss": 7.1238, + "step": 75025 + }, + { + "epoch": 6.736983842010772, + "grad_norm": 11.170183181762695, + "learning_rate": 6.734290843806105e-06, + "loss": 7.1929, + "step": 75050 + }, + { + "epoch": 6.739228007181328, + "grad_norm": 12.491527557373047, + "learning_rate": 6.736535008976661e-06, + "loss": 7.2412, + "step": 75075 + }, + { + "epoch": 6.741472172351886, + "grad_norm": 10.920154571533203, + "learning_rate": 6.7387791741472175e-06, + "loss": 7.2403, + "step": 75100 + }, + { + "epoch": 6.743716337522442, + "grad_norm": 10.5265531539917, + "learning_rate": 6.741023339317775e-06, + "loss": 7.2377, + "step": 75125 + }, + { + "epoch": 6.7459605026929985, + "grad_norm": 11.868908882141113, + "learning_rate": 6.743177737881509e-06, + "loss": 7.2183, + "step": 75150 + }, + { + "epoch": 6.748204667863555, + "grad_norm": 10.982571601867676, + "learning_rate": 6.745421903052065e-06, + "loss": 7.0971, + "step": 75175 + }, + { + "epoch": 6.7504488330341115, + "grad_norm": 9.449320793151855, + "learning_rate": 6.747666068222621e-06, + "loss": 7.3169, + "step": 75200 + }, + { + "epoch": 6.752692998204668, + "grad_norm": 10.14775562286377, + "learning_rate": 6.749910233393179e-06, + "loss": 7.0803, + "step": 75225 + }, + { + "epoch": 6.754937163375224, + "grad_norm": 10.895279884338379, + "learning_rate": 6.752154398563735e-06, + "loss": 7.2651, + "step": 75250 + }, + { + "epoch": 6.757181328545781, + "grad_norm": 9.862445831298828, + "learning_rate": 6.754398563734291e-06, + "loss": 7.31, + "step": 75275 + }, + { + "epoch": 6.759425493716337, + "grad_norm": 13.114846229553223, + "learning_rate": 6.756642728904848e-06, + "loss": 7.2996, + "step": 75300 + }, + { + "epoch": 6.761669658886894, + "grad_norm": 11.852275848388672, + "learning_rate": 6.758886894075405e-06, + "loss": 7.2342, + "step": 75325 + }, + { + "epoch": 6.76391382405745, + "grad_norm": 11.555021286010742, + "learning_rate": 6.761131059245961e-06, + "loss": 7.3116, + "step": 75350 + }, + { + "epoch": 6.766157989228007, + "grad_norm": 11.690221786499023, + "learning_rate": 6.763375224416518e-06, + "loss": 7.1942, + "step": 75375 + }, + { + "epoch": 6.768402154398563, + "grad_norm": 10.966025352478027, + "learning_rate": 6.7656193895870734e-06, + "loss": 7.1767, + "step": 75400 + }, + { + "epoch": 6.770646319569121, + "grad_norm": 12.991167068481445, + "learning_rate": 6.767863554757631e-06, + "loss": 7.0287, + "step": 75425 + }, + { + "epoch": 6.772890484739677, + "grad_norm": 10.198867797851562, + "learning_rate": 6.7701077199281875e-06, + "loss": 7.1782, + "step": 75450 + }, + { + "epoch": 6.775134649910234, + "grad_norm": 9.602873802185059, + "learning_rate": 6.772351885098743e-06, + "loss": 7.2459, + "step": 75475 + }, + { + "epoch": 6.77737881508079, + "grad_norm": 10.232697486877441, + "learning_rate": 6.774596050269301e-06, + "loss": 7.239, + "step": 75500 + }, + { + "epoch": 6.779622980251347, + "grad_norm": 11.159178733825684, + "learning_rate": 6.776840215439857e-06, + "loss": 7.0943, + "step": 75525 + }, + { + "epoch": 6.781867145421903, + "grad_norm": 10.025191307067871, + "learning_rate": 6.779084380610413e-06, + "loss": 7.1967, + "step": 75550 + }, + { + "epoch": 6.7841113105924595, + "grad_norm": 14.428872108459473, + "learning_rate": 6.78132854578097e-06, + "loss": 7.1777, + "step": 75575 + }, + { + "epoch": 6.786355475763016, + "grad_norm": 11.640372276306152, + "learning_rate": 6.783572710951527e-06, + "loss": 7.2964, + "step": 75600 + }, + { + "epoch": 6.7885996409335725, + "grad_norm": 10.323497772216797, + "learning_rate": 6.785816876122083e-06, + "loss": 7.3158, + "step": 75625 + }, + { + "epoch": 6.790843806104129, + "grad_norm": 9.486183166503906, + "learning_rate": 6.78806104129264e-06, + "loss": 7.2508, + "step": 75650 + }, + { + "epoch": 6.793087971274685, + "grad_norm": 10.747513771057129, + "learning_rate": 6.790305206463196e-06, + "loss": 7.0309, + "step": 75675 + }, + { + "epoch": 6.795332136445243, + "grad_norm": 13.168323516845703, + "learning_rate": 6.792549371633753e-06, + "loss": 7.0259, + "step": 75700 + }, + { + "epoch": 6.797576301615799, + "grad_norm": 12.352339744567871, + "learning_rate": 6.7947935368043094e-06, + "loss": 7.1346, + "step": 75725 + }, + { + "epoch": 6.799820466786356, + "grad_norm": 11.548859596252441, + "learning_rate": 6.797037701974866e-06, + "loss": 7.3908, + "step": 75750 + }, + { + "epoch": 6.802064631956912, + "grad_norm": 10.118538856506348, + "learning_rate": 6.799281867145422e-06, + "loss": 7.2506, + "step": 75775 + }, + { + "epoch": 6.804308797127469, + "grad_norm": 9.289041519165039, + "learning_rate": 6.801526032315979e-06, + "loss": 7.0539, + "step": 75800 + }, + { + "epoch": 6.806552962298025, + "grad_norm": 15.708856582641602, + "learning_rate": 6.803770197486536e-06, + "loss": 7.1472, + "step": 75825 + }, + { + "epoch": 6.808797127468582, + "grad_norm": 11.418462753295898, + "learning_rate": 6.806014362657092e-06, + "loss": 7.2126, + "step": 75850 + }, + { + "epoch": 6.811041292639138, + "grad_norm": 13.323988914489746, + "learning_rate": 6.808258527827649e-06, + "loss": 7.3106, + "step": 75875 + }, + { + "epoch": 6.813285457809695, + "grad_norm": 11.017227172851562, + "learning_rate": 6.810502692998206e-06, + "loss": 7.1701, + "step": 75900 + }, + { + "epoch": 6.815529622980251, + "grad_norm": 8.71892261505127, + "learning_rate": 6.8127468581687615e-06, + "loss": 7.2603, + "step": 75925 + }, + { + "epoch": 6.817773788150808, + "grad_norm": 10.641923904418945, + "learning_rate": 6.814991023339318e-06, + "loss": 7.1695, + "step": 75950 + }, + { + "epoch": 6.820017953321364, + "grad_norm": 10.756820678710938, + "learning_rate": 6.817235188509876e-06, + "loss": 7.0773, + "step": 75975 + }, + { + "epoch": 6.8222621184919205, + "grad_norm": 9.53046703338623, + "learning_rate": 6.819479353680431e-06, + "loss": 7.2953, + "step": 76000 + }, + { + "epoch": 6.824506283662478, + "grad_norm": 11.573638916015625, + "learning_rate": 6.821723518850988e-06, + "loss": 7.382, + "step": 76025 + }, + { + "epoch": 6.826750448833034, + "grad_norm": 11.270044326782227, + "learning_rate": 6.823967684021544e-06, + "loss": 7.1972, + "step": 76050 + }, + { + "epoch": 6.828994614003591, + "grad_norm": 10.953071594238281, + "learning_rate": 6.826211849192101e-06, + "loss": 7.1142, + "step": 76075 + }, + { + "epoch": 6.831238779174147, + "grad_norm": 11.13890552520752, + "learning_rate": 6.828456014362658e-06, + "loss": 7.2904, + "step": 76100 + }, + { + "epoch": 6.833482944344704, + "grad_norm": 12.172011375427246, + "learning_rate": 6.830700179533214e-06, + "loss": 7.1055, + "step": 76125 + }, + { + "epoch": 6.83572710951526, + "grad_norm": 10.920980453491211, + "learning_rate": 6.83294434470377e-06, + "loss": 7.3385, + "step": 76150 + }, + { + "epoch": 6.837971274685817, + "grad_norm": 12.282885551452637, + "learning_rate": 6.835188509874328e-06, + "loss": 7.0944, + "step": 76175 + }, + { + "epoch": 6.840215439856373, + "grad_norm": 10.058338165283203, + "learning_rate": 6.8374326750448834e-06, + "loss": 7.2958, + "step": 76200 + }, + { + "epoch": 6.84245960502693, + "grad_norm": 10.476973533630371, + "learning_rate": 6.83967684021544e-06, + "loss": 7.203, + "step": 76225 + }, + { + "epoch": 6.844703770197486, + "grad_norm": 11.94399642944336, + "learning_rate": 6.8419210053859975e-06, + "loss": 7.175, + "step": 76250 + }, + { + "epoch": 6.846947935368043, + "grad_norm": 10.661416053771973, + "learning_rate": 6.844165170556554e-06, + "loss": 7.3674, + "step": 76275 + }, + { + "epoch": 6.8491921005386, + "grad_norm": 10.999091148376465, + "learning_rate": 6.84640933572711e-06, + "loss": 7.175, + "step": 76300 + }, + { + "epoch": 6.8514362657091565, + "grad_norm": 11.593558311462402, + "learning_rate": 6.8486535008976665e-06, + "loss": 7.3252, + "step": 76325 + }, + { + "epoch": 6.853680430879713, + "grad_norm": 11.58349895477295, + "learning_rate": 6.850897666068224e-06, + "loss": 7.2877, + "step": 76350 + }, + { + "epoch": 6.8559245960502695, + "grad_norm": 11.194314002990723, + "learning_rate": 6.85314183123878e-06, + "loss": 7.1668, + "step": 76375 + }, + { + "epoch": 6.858168761220826, + "grad_norm": 12.480424880981445, + "learning_rate": 6.855385996409336e-06, + "loss": 7.1378, + "step": 76400 + }, + { + "epoch": 6.8604129263913824, + "grad_norm": 9.157910346984863, + "learning_rate": 6.857630161579892e-06, + "loss": 7.2819, + "step": 76425 + }, + { + "epoch": 6.862657091561939, + "grad_norm": 10.952082633972168, + "learning_rate": 6.85987432675045e-06, + "loss": 7.287, + "step": 76450 + }, + { + "epoch": 6.864901256732495, + "grad_norm": 10.839038848876953, + "learning_rate": 6.862118491921006e-06, + "loss": 7.3039, + "step": 76475 + }, + { + "epoch": 6.867145421903052, + "grad_norm": 10.169608116149902, + "learning_rate": 6.864362657091562e-06, + "loss": 7.2036, + "step": 76500 + }, + { + "epoch": 6.869389587073608, + "grad_norm": 12.087705612182617, + "learning_rate": 6.866606822262119e-06, + "loss": 7.1834, + "step": 76525 + }, + { + "epoch": 6.871633752244165, + "grad_norm": 12.886750221252441, + "learning_rate": 6.868850987432676e-06, + "loss": 7.458, + "step": 76550 + }, + { + "epoch": 6.873877917414722, + "grad_norm": 11.384218215942383, + "learning_rate": 6.871095152603232e-06, + "loss": 7.3714, + "step": 76575 + }, + { + "epoch": 6.876122082585278, + "grad_norm": 11.380516052246094, + "learning_rate": 6.8733393177737884e-06, + "loss": 7.4187, + "step": 76600 + }, + { + "epoch": 6.878366247755835, + "grad_norm": 10.751465797424316, + "learning_rate": 6.875583482944346e-06, + "loss": 7.1845, + "step": 76625 + }, + { + "epoch": 6.880610412926392, + "grad_norm": 14.057003021240234, + "learning_rate": 6.877827648114902e-06, + "loss": 7.397, + "step": 76650 + }, + { + "epoch": 6.882854578096948, + "grad_norm": 9.685476303100586, + "learning_rate": 6.880071813285458e-06, + "loss": 7.0667, + "step": 76675 + }, + { + "epoch": 6.885098743267505, + "grad_norm": 8.91285514831543, + "learning_rate": 6.882315978456015e-06, + "loss": 7.0916, + "step": 76700 + }, + { + "epoch": 6.887342908438061, + "grad_norm": 10.669816970825195, + "learning_rate": 6.8845601436265715e-06, + "loss": 7.1148, + "step": 76725 + }, + { + "epoch": 6.8895870736086176, + "grad_norm": 11.057476997375488, + "learning_rate": 6.886804308797128e-06, + "loss": 7.2966, + "step": 76750 + }, + { + "epoch": 6.891831238779174, + "grad_norm": 11.485709190368652, + "learning_rate": 6.889048473967685e-06, + "loss": 7.3889, + "step": 76775 + }, + { + "epoch": 6.8940754039497305, + "grad_norm": 10.69943618774414, + "learning_rate": 6.8912926391382405e-06, + "loss": 7.0917, + "step": 76800 + }, + { + "epoch": 6.896319569120287, + "grad_norm": 12.287141799926758, + "learning_rate": 6.893536804308798e-06, + "loss": 7.0284, + "step": 76825 + }, + { + "epoch": 6.8985637342908435, + "grad_norm": 10.916399002075195, + "learning_rate": 6.895780969479355e-06, + "loss": 7.1628, + "step": 76850 + }, + { + "epoch": 6.9008078994614, + "grad_norm": 12.318194389343262, + "learning_rate": 6.89802513464991e-06, + "loss": 7.0355, + "step": 76875 + }, + { + "epoch": 6.903052064631957, + "grad_norm": 9.842414855957031, + "learning_rate": 6.900269299820468e-06, + "loss": 7.1584, + "step": 76900 + }, + { + "epoch": 6.905296229802514, + "grad_norm": 11.51326847076416, + "learning_rate": 6.9025134649910244e-06, + "loss": 7.0972, + "step": 76925 + }, + { + "epoch": 6.90754039497307, + "grad_norm": 10.386106491088867, + "learning_rate": 6.90475763016158e-06, + "loss": 6.9722, + "step": 76950 + }, + { + "epoch": 6.909784560143627, + "grad_norm": 11.86064624786377, + "learning_rate": 6.907001795332137e-06, + "loss": 7.3718, + "step": 76975 + }, + { + "epoch": 6.912028725314183, + "grad_norm": 9.519308090209961, + "learning_rate": 6.909245960502694e-06, + "loss": 7.3341, + "step": 77000 + }, + { + "epoch": 6.91427289048474, + "grad_norm": 10.180307388305664, + "learning_rate": 6.91149012567325e-06, + "loss": 7.1207, + "step": 77025 + }, + { + "epoch": 6.916517055655296, + "grad_norm": 10.72646713256836, + "learning_rate": 6.913734290843807e-06, + "loss": 7.1744, + "step": 77050 + }, + { + "epoch": 6.918761220825853, + "grad_norm": 10.40217113494873, + "learning_rate": 6.9159784560143624e-06, + "loss": 7.4236, + "step": 77075 + }, + { + "epoch": 6.921005385996409, + "grad_norm": 10.558469772338867, + "learning_rate": 6.91822262118492e-06, + "loss": 7.208, + "step": 77100 + }, + { + "epoch": 6.923249551166966, + "grad_norm": 11.485169410705566, + "learning_rate": 6.9204667863554765e-06, + "loss": 7.2694, + "step": 77125 + }, + { + "epoch": 6.925493716337522, + "grad_norm": 13.373960494995117, + "learning_rate": 6.922710951526032e-06, + "loss": 7.3206, + "step": 77150 + }, + { + "epoch": 6.9277378815080795, + "grad_norm": 12.208877563476562, + "learning_rate": 6.924955116696589e-06, + "loss": 7.2691, + "step": 77175 + }, + { + "epoch": 6.929982046678636, + "grad_norm": 12.444807052612305, + "learning_rate": 6.927199281867146e-06, + "loss": 7.2852, + "step": 77200 + }, + { + "epoch": 6.932226211849192, + "grad_norm": 11.8712797164917, + "learning_rate": 6.929443447037703e-06, + "loss": 7.2758, + "step": 77225 + }, + { + "epoch": 6.934470377019749, + "grad_norm": 13.572479248046875, + "learning_rate": 6.931687612208259e-06, + "loss": 7.2505, + "step": 77250 + }, + { + "epoch": 6.936714542190305, + "grad_norm": 9.941105842590332, + "learning_rate": 6.933931777378816e-06, + "loss": 7.1278, + "step": 77275 + }, + { + "epoch": 6.938958707360862, + "grad_norm": 9.237920761108398, + "learning_rate": 6.936175942549373e-06, + "loss": 7.4927, + "step": 77300 + }, + { + "epoch": 6.941202872531418, + "grad_norm": 14.315759658813477, + "learning_rate": 6.938420107719929e-06, + "loss": 7.3095, + "step": 77325 + }, + { + "epoch": 6.943447037701975, + "grad_norm": 13.76396369934082, + "learning_rate": 6.940664272890485e-06, + "loss": 7.2102, + "step": 77350 + }, + { + "epoch": 6.945691202872531, + "grad_norm": 10.225892066955566, + "learning_rate": 6.942908438061043e-06, + "loss": 7.2399, + "step": 77375 + }, + { + "epoch": 6.947935368043088, + "grad_norm": 13.915824890136719, + "learning_rate": 6.9451526032315984e-06, + "loss": 7.3363, + "step": 77400 + }, + { + "epoch": 6.950179533213644, + "grad_norm": 9.969493865966797, + "learning_rate": 6.947396768402155e-06, + "loss": 7.0982, + "step": 77425 + }, + { + "epoch": 6.952423698384201, + "grad_norm": 11.064339637756348, + "learning_rate": 6.949640933572711e-06, + "loss": 7.1058, + "step": 77450 + }, + { + "epoch": 6.954667863554757, + "grad_norm": 10.99870777130127, + "learning_rate": 6.951885098743268e-06, + "loss": 7.4881, + "step": 77475 + }, + { + "epoch": 6.956912028725315, + "grad_norm": 9.317066192626953, + "learning_rate": 6.954129263913825e-06, + "loss": 7.0144, + "step": 77500 + }, + { + "epoch": 6.959156193895871, + "grad_norm": 11.559277534484863, + "learning_rate": 6.956373429084381e-06, + "loss": 7.0862, + "step": 77525 + }, + { + "epoch": 6.9614003590664275, + "grad_norm": 11.60411262512207, + "learning_rate": 6.958617594254937e-06, + "loss": 7.0714, + "step": 77550 + }, + { + "epoch": 6.963644524236984, + "grad_norm": 11.118532180786133, + "learning_rate": 6.960861759425495e-06, + "loss": 7.2782, + "step": 77575 + }, + { + "epoch": 6.9658886894075405, + "grad_norm": 11.511760711669922, + "learning_rate": 6.9631059245960505e-06, + "loss": 7.3178, + "step": 77600 + }, + { + "epoch": 6.968132854578097, + "grad_norm": 20.102697372436523, + "learning_rate": 6.965350089766607e-06, + "loss": 7.2294, + "step": 77625 + }, + { + "epoch": 6.970377019748653, + "grad_norm": 13.584903717041016, + "learning_rate": 6.9675942549371646e-06, + "loss": 7.3134, + "step": 77650 + }, + { + "epoch": 6.97262118491921, + "grad_norm": 11.58993911743164, + "learning_rate": 6.96983842010772e-06, + "loss": 7.309, + "step": 77675 + }, + { + "epoch": 6.974865350089766, + "grad_norm": 11.815709114074707, + "learning_rate": 6.972082585278277e-06, + "loss": 7.2551, + "step": 77700 + }, + { + "epoch": 6.977109515260323, + "grad_norm": 10.805977821350098, + "learning_rate": 6.9743267504488336e-06, + "loss": 7.134, + "step": 77725 + }, + { + "epoch": 6.979353680430879, + "grad_norm": 11.36467456817627, + "learning_rate": 6.97657091561939e-06, + "loss": 7.1905, + "step": 77750 + }, + { + "epoch": 6.981597845601437, + "grad_norm": 9.681744575500488, + "learning_rate": 6.978815080789947e-06, + "loss": 7.1869, + "step": 77775 + }, + { + "epoch": 6.983842010771993, + "grad_norm": 11.269296646118164, + "learning_rate": 6.981059245960503e-06, + "loss": 6.9928, + "step": 77800 + }, + { + "epoch": 6.98608617594255, + "grad_norm": 11.09477424621582, + "learning_rate": 6.983303411131059e-06, + "loss": 7.2174, + "step": 77825 + }, + { + "epoch": 6.988330341113106, + "grad_norm": 12.000593185424805, + "learning_rate": 6.985547576301617e-06, + "loss": 7.1916, + "step": 77850 + }, + { + "epoch": 6.990574506283663, + "grad_norm": 12.078262329101562, + "learning_rate": 6.987791741472173e-06, + "loss": 7.3048, + "step": 77875 + }, + { + "epoch": 6.992818671454219, + "grad_norm": 11.683846473693848, + "learning_rate": 6.990035906642729e-06, + "loss": 7.2582, + "step": 77900 + }, + { + "epoch": 6.995062836624776, + "grad_norm": 11.344538688659668, + "learning_rate": 6.992280071813286e-06, + "loss": 7.3395, + "step": 77925 + }, + { + "epoch": 6.997307001795332, + "grad_norm": 12.57683277130127, + "learning_rate": 6.994524236983843e-06, + "loss": 7.0017, + "step": 77950 + }, + { + "epoch": 6.9995511669658885, + "grad_norm": 14.495851516723633, + "learning_rate": 6.996768402154399e-06, + "loss": 7.0792, + "step": 77975 + }, + { + "epoch": 7.0, + "eval_accuracy": 0.06159662421476715, + "eval_f1_macro": 0.001260209074354303, + "eval_f1_micro": 0.06159662421476715, + "eval_f1_weighted": 0.020395300524329005, + "eval_loss": 7.725404739379883, + "eval_precision_macro": 0.0010624634335382574, + "eval_precision_micro": 0.06159662421476715, + "eval_precision_weighted": 0.01528681798028724, + "eval_recall_macro": 0.003444259520843647, + "eval_recall_micro": 0.06159662421476715, + "eval_recall_weighted": 0.06159662421476715, + "eval_runtime": 128.944, + "eval_samples_per_second": 406.169, + "eval_steps_per_second": 12.695, + "step": 77980 + }, + { + "epoch": 7.001795332136445, + "grad_norm": 9.6204252243042, + "learning_rate": 6.9990125673249555e-06, + "loss": 6.8503, + "step": 78000 + }, + { + "epoch": 7.0040394973070015, + "grad_norm": 10.684722900390625, + "learning_rate": 7.001256732495513e-06, + "loss": 7.0304, + "step": 78025 + }, + { + "epoch": 7.006283662477558, + "grad_norm": 9.143749237060547, + "learning_rate": 7.003500897666069e-06, + "loss": 7.0708, + "step": 78050 + }, + { + "epoch": 7.008527827648115, + "grad_norm": 10.02152156829834, + "learning_rate": 7.005745062836625e-06, + "loss": 7.1498, + "step": 78075 + }, + { + "epoch": 7.010771992818672, + "grad_norm": 14.525875091552734, + "learning_rate": 7.007989228007181e-06, + "loss": 6.8682, + "step": 78100 + }, + { + "epoch": 7.013016157989228, + "grad_norm": 10.511037826538086, + "learning_rate": 7.0102333931777386e-06, + "loss": 6.9795, + "step": 78125 + }, + { + "epoch": 7.015260323159785, + "grad_norm": 13.277899742126465, + "learning_rate": 7.012477558348295e-06, + "loss": 7.2382, + "step": 78150 + }, + { + "epoch": 7.017504488330341, + "grad_norm": 12.166326522827148, + "learning_rate": 7.014721723518852e-06, + "loss": 6.7721, + "step": 78175 + }, + { + "epoch": 7.019748653500898, + "grad_norm": 9.936442375183105, + "learning_rate": 7.0169658886894076e-06, + "loss": 7.1726, + "step": 78200 + }, + { + "epoch": 7.021992818671454, + "grad_norm": 11.686080932617188, + "learning_rate": 7.019210053859965e-06, + "loss": 6.8669, + "step": 78225 + }, + { + "epoch": 7.024236983842011, + "grad_norm": 11.074368476867676, + "learning_rate": 7.021454219030522e-06, + "loss": 7.0178, + "step": 78250 + }, + { + "epoch": 7.026481149012567, + "grad_norm": 11.005204200744629, + "learning_rate": 7.023608617594256e-06, + "loss": 7.0313, + "step": 78275 + }, + { + "epoch": 7.028725314183124, + "grad_norm": 10.543983459472656, + "learning_rate": 7.025852782764811e-06, + "loss": 7.0522, + "step": 78300 + }, + { + "epoch": 7.03096947935368, + "grad_norm": 12.962173461914062, + "learning_rate": 7.028096947935369e-06, + "loss": 6.8866, + "step": 78325 + }, + { + "epoch": 7.033213644524237, + "grad_norm": 12.168960571289062, + "learning_rate": 7.0303411131059255e-06, + "loss": 7.0727, + "step": 78350 + }, + { + "epoch": 7.035457809694794, + "grad_norm": 12.837242126464844, + "learning_rate": 7.032585278276481e-06, + "loss": 7.1945, + "step": 78375 + }, + { + "epoch": 7.03770197486535, + "grad_norm": 12.295308113098145, + "learning_rate": 7.034829443447039e-06, + "loss": 7.0487, + "step": 78400 + }, + { + "epoch": 7.039946140035907, + "grad_norm": 10.957308769226074, + "learning_rate": 7.037073608617595e-06, + "loss": 7.1656, + "step": 78425 + }, + { + "epoch": 7.042190305206463, + "grad_norm": 10.248066902160645, + "learning_rate": 7.039317773788151e-06, + "loss": 7.0354, + "step": 78450 + }, + { + "epoch": 7.04443447037702, + "grad_norm": 11.667706489562988, + "learning_rate": 7.041561938958708e-06, + "loss": 7.2131, + "step": 78475 + }, + { + "epoch": 7.046678635547576, + "grad_norm": 12.523640632629395, + "learning_rate": 7.043806104129265e-06, + "loss": 6.8195, + "step": 78500 + }, + { + "epoch": 7.048922800718133, + "grad_norm": 10.120912551879883, + "learning_rate": 7.046050269299821e-06, + "loss": 6.9675, + "step": 78525 + }, + { + "epoch": 7.051166965888689, + "grad_norm": 12.654841423034668, + "learning_rate": 7.0482944344703776e-06, + "loss": 7.003, + "step": 78550 + }, + { + "epoch": 7.053411131059246, + "grad_norm": 11.393359184265137, + "learning_rate": 7.050538599640933e-06, + "loss": 7.105, + "step": 78575 + }, + { + "epoch": 7.055655296229802, + "grad_norm": 12.229519844055176, + "learning_rate": 7.052782764811491e-06, + "loss": 6.9604, + "step": 78600 + }, + { + "epoch": 7.057899461400359, + "grad_norm": 9.712087631225586, + "learning_rate": 7.055026929982047e-06, + "loss": 6.9387, + "step": 78625 + }, + { + "epoch": 7.060143626570915, + "grad_norm": 10.128707885742188, + "learning_rate": 7.057271095152603e-06, + "loss": 6.8961, + "step": 78650 + }, + { + "epoch": 7.062387791741473, + "grad_norm": 10.423731803894043, + "learning_rate": 7.05951526032316e-06, + "loss": 7.0539, + "step": 78675 + }, + { + "epoch": 7.064631956912029, + "grad_norm": 12.15406608581543, + "learning_rate": 7.061759425493717e-06, + "loss": 7.045, + "step": 78700 + }, + { + "epoch": 7.0668761220825855, + "grad_norm": 11.408331871032715, + "learning_rate": 7.064003590664273e-06, + "loss": 7.0286, + "step": 78725 + }, + { + "epoch": 7.069120287253142, + "grad_norm": 12.176095008850098, + "learning_rate": 7.06624775583483e-06, + "loss": 6.8713, + "step": 78750 + }, + { + "epoch": 7.0713644524236985, + "grad_norm": 10.09225845336914, + "learning_rate": 7.068491921005387e-06, + "loss": 7.1296, + "step": 78775 + }, + { + "epoch": 7.073608617594255, + "grad_norm": 10.059694290161133, + "learning_rate": 7.070736086175944e-06, + "loss": 6.7888, + "step": 78800 + }, + { + "epoch": 7.075852782764811, + "grad_norm": 13.460572242736816, + "learning_rate": 7.0729802513464995e-06, + "loss": 7.1072, + "step": 78825 + }, + { + "epoch": 7.078096947935368, + "grad_norm": 14.100081443786621, + "learning_rate": 7.075224416517056e-06, + "loss": 6.7602, + "step": 78850 + }, + { + "epoch": 7.080341113105924, + "grad_norm": 9.718121528625488, + "learning_rate": 7.0774685816876136e-06, + "loss": 6.9788, + "step": 78875 + }, + { + "epoch": 7.082585278276481, + "grad_norm": 11.089181900024414, + "learning_rate": 7.079712746858169e-06, + "loss": 6.8518, + "step": 78900 + }, + { + "epoch": 7.084829443447037, + "grad_norm": 11.068573951721191, + "learning_rate": 7.081956912028726e-06, + "loss": 6.964, + "step": 78925 + }, + { + "epoch": 7.087073608617594, + "grad_norm": 12.55396842956543, + "learning_rate": 7.084201077199282e-06, + "loss": 6.9476, + "step": 78950 + }, + { + "epoch": 7.089317773788151, + "grad_norm": 14.330034255981445, + "learning_rate": 7.086445242369839e-06, + "loss": 7.0464, + "step": 78975 + }, + { + "epoch": 7.091561938958708, + "grad_norm": 11.356575012207031, + "learning_rate": 7.088689407540396e-06, + "loss": 6.9674, + "step": 79000 + }, + { + "epoch": 7.093806104129264, + "grad_norm": 10.529419898986816, + "learning_rate": 7.0909335727109516e-06, + "loss": 7.0737, + "step": 79025 + }, + { + "epoch": 7.096050269299821, + "grad_norm": 11.634669303894043, + "learning_rate": 7.093177737881509e-06, + "loss": 7.0361, + "step": 79050 + }, + { + "epoch": 7.098294434470377, + "grad_norm": 10.434720039367676, + "learning_rate": 7.095421903052066e-06, + "loss": 7.0223, + "step": 79075 + }, + { + "epoch": 7.100538599640934, + "grad_norm": 14.029146194458008, + "learning_rate": 7.097666068222621e-06, + "loss": 7.1802, + "step": 79100 + }, + { + "epoch": 7.10278276481149, + "grad_norm": 11.503131866455078, + "learning_rate": 7.099910233393178e-06, + "loss": 6.9201, + "step": 79125 + }, + { + "epoch": 7.1050269299820465, + "grad_norm": 11.816502571105957, + "learning_rate": 7.1021543985637355e-06, + "loss": 6.6596, + "step": 79150 + }, + { + "epoch": 7.107271095152603, + "grad_norm": 12.912046432495117, + "learning_rate": 7.104398563734291e-06, + "loss": 6.9738, + "step": 79175 + }, + { + "epoch": 7.1095152603231595, + "grad_norm": 11.143064498901367, + "learning_rate": 7.106642728904848e-06, + "loss": 6.8905, + "step": 79200 + }, + { + "epoch": 7.111759425493716, + "grad_norm": 9.59415340423584, + "learning_rate": 7.1088868940754045e-06, + "loss": 6.9945, + "step": 79225 + }, + { + "epoch": 7.1140035906642725, + "grad_norm": 12.017369270324707, + "learning_rate": 7.111131059245961e-06, + "loss": 6.9881, + "step": 79250 + }, + { + "epoch": 7.11624775583483, + "grad_norm": 11.436917304992676, + "learning_rate": 7.113375224416518e-06, + "loss": 7.1558, + "step": 79275 + }, + { + "epoch": 7.118491921005386, + "grad_norm": 14.036722183227539, + "learning_rate": 7.115619389587074e-06, + "loss": 7.08, + "step": 79300 + }, + { + "epoch": 7.120736086175943, + "grad_norm": 11.410258293151855, + "learning_rate": 7.11786355475763e-06, + "loss": 6.9732, + "step": 79325 + }, + { + "epoch": 7.122980251346499, + "grad_norm": 11.516627311706543, + "learning_rate": 7.1201077199281876e-06, + "loss": 7.0483, + "step": 79350 + }, + { + "epoch": 7.125224416517056, + "grad_norm": 12.30455207824707, + "learning_rate": 7.122351885098744e-06, + "loss": 6.9402, + "step": 79375 + }, + { + "epoch": 7.127468581687612, + "grad_norm": 11.028185844421387, + "learning_rate": 7.1245960502693e-06, + "loss": 7.1172, + "step": 79400 + }, + { + "epoch": 7.129712746858169, + "grad_norm": 11.455455780029297, + "learning_rate": 7.126840215439857e-06, + "loss": 7.0966, + "step": 79425 + }, + { + "epoch": 7.131956912028725, + "grad_norm": 15.846437454223633, + "learning_rate": 7.129084380610414e-06, + "loss": 6.8229, + "step": 79450 + }, + { + "epoch": 7.134201077199282, + "grad_norm": 10.986010551452637, + "learning_rate": 7.13132854578097e-06, + "loss": 7.0318, + "step": 79475 + }, + { + "epoch": 7.136445242369838, + "grad_norm": 11.300883293151855, + "learning_rate": 7.133572710951526e-06, + "loss": 6.9256, + "step": 79500 + }, + { + "epoch": 7.138689407540395, + "grad_norm": 14.86542797088623, + "learning_rate": 7.135816876122084e-06, + "loss": 7.0311, + "step": 79525 + }, + { + "epoch": 7.140933572710951, + "grad_norm": 11.3841552734375, + "learning_rate": 7.13806104129264e-06, + "loss": 7.1041, + "step": 79550 + }, + { + "epoch": 7.1431777378815084, + "grad_norm": 10.658662796020508, + "learning_rate": 7.140305206463196e-06, + "loss": 6.9969, + "step": 79575 + }, + { + "epoch": 7.145421903052065, + "grad_norm": 11.861370086669922, + "learning_rate": 7.142549371633752e-06, + "loss": 7.2432, + "step": 79600 + }, + { + "epoch": 7.147666068222621, + "grad_norm": 10.472166061401367, + "learning_rate": 7.1447935368043095e-06, + "loss": 7.0744, + "step": 79625 + }, + { + "epoch": 7.149910233393178, + "grad_norm": 13.125968933105469, + "learning_rate": 7.147037701974866e-06, + "loss": 7.1967, + "step": 79650 + }, + { + "epoch": 7.152154398563734, + "grad_norm": 11.560393333435059, + "learning_rate": 7.149281867145422e-06, + "loss": 7.0782, + "step": 79675 + }, + { + "epoch": 7.154398563734291, + "grad_norm": 10.914063453674316, + "learning_rate": 7.1515260323159785e-06, + "loss": 6.8125, + "step": 79700 + }, + { + "epoch": 7.156642728904847, + "grad_norm": 8.606705665588379, + "learning_rate": 7.153770197486536e-06, + "loss": 7.1591, + "step": 79725 + }, + { + "epoch": 7.158886894075404, + "grad_norm": 10.321446418762207, + "learning_rate": 7.1560143626570926e-06, + "loss": 7.2474, + "step": 79750 + }, + { + "epoch": 7.16113105924596, + "grad_norm": 11.233634948730469, + "learning_rate": 7.158258527827648e-06, + "loss": 6.9239, + "step": 79775 + }, + { + "epoch": 7.163375224416517, + "grad_norm": 11.2592134475708, + "learning_rate": 7.160502692998206e-06, + "loss": 7.0749, + "step": 79800 + }, + { + "epoch": 7.165619389587073, + "grad_norm": 8.707022666931152, + "learning_rate": 7.162746858168762e-06, + "loss": 6.9672, + "step": 79825 + }, + { + "epoch": 7.167863554757631, + "grad_norm": 12.894819259643555, + "learning_rate": 7.164991023339318e-06, + "loss": 6.9918, + "step": 79850 + }, + { + "epoch": 7.170107719928187, + "grad_norm": 10.792762756347656, + "learning_rate": 7.167235188509875e-06, + "loss": 7.1279, + "step": 79875 + }, + { + "epoch": 7.1723518850987436, + "grad_norm": 9.745238304138184, + "learning_rate": 7.169479353680432e-06, + "loss": 7.0023, + "step": 79900 + }, + { + "epoch": 7.1745960502693, + "grad_norm": 15.914952278137207, + "learning_rate": 7.171723518850988e-06, + "loss": 7.0582, + "step": 79925 + }, + { + "epoch": 7.1768402154398565, + "grad_norm": 14.01064682006836, + "learning_rate": 7.173967684021545e-06, + "loss": 6.9829, + "step": 79950 + }, + { + "epoch": 7.179084380610413, + "grad_norm": 13.712442398071289, + "learning_rate": 7.1762118491921e-06, + "loss": 6.9566, + "step": 79975 + }, + { + "epoch": 7.1813285457809695, + "grad_norm": 9.80897331237793, + "learning_rate": 7.178456014362658e-06, + "loss": 6.9777, + "step": 80000 + }, + { + "epoch": 7.183572710951526, + "grad_norm": 11.759711265563965, + "learning_rate": 7.1807001795332145e-06, + "loss": 7.1688, + "step": 80025 + }, + { + "epoch": 7.185816876122082, + "grad_norm": 11.774030685424805, + "learning_rate": 7.18294434470377e-06, + "loss": 7.0848, + "step": 80050 + }, + { + "epoch": 7.188061041292639, + "grad_norm": 11.174187660217285, + "learning_rate": 7.185188509874327e-06, + "loss": 6.9152, + "step": 80075 + }, + { + "epoch": 7.190305206463195, + "grad_norm": 12.902565002441406, + "learning_rate": 7.187432675044884e-06, + "loss": 7.0883, + "step": 80100 + }, + { + "epoch": 7.192549371633752, + "grad_norm": 12.21207046508789, + "learning_rate": 7.18967684021544e-06, + "loss": 6.9871, + "step": 80125 + }, + { + "epoch": 7.194793536804309, + "grad_norm": 12.623739242553711, + "learning_rate": 7.191921005385997e-06, + "loss": 6.9668, + "step": 80150 + }, + { + "epoch": 7.197037701974866, + "grad_norm": 12.289436340332031, + "learning_rate": 7.194165170556554e-06, + "loss": 6.8668, + "step": 80175 + }, + { + "epoch": 7.199281867145422, + "grad_norm": 14.992889404296875, + "learning_rate": 7.19640933572711e-06, + "loss": 7.1315, + "step": 80200 + }, + { + "epoch": 7.201526032315979, + "grad_norm": 10.814194679260254, + "learning_rate": 7.1986535008976666e-06, + "loss": 7.009, + "step": 80225 + }, + { + "epoch": 7.203770197486535, + "grad_norm": 10.8890962600708, + "learning_rate": 7.200897666068223e-06, + "loss": 7.1142, + "step": 80250 + }, + { + "epoch": 7.206014362657092, + "grad_norm": 12.813872337341309, + "learning_rate": 7.20314183123878e-06, + "loss": 7.072, + "step": 80275 + }, + { + "epoch": 7.208258527827648, + "grad_norm": 13.490029335021973, + "learning_rate": 7.205385996409336e-06, + "loss": 7.1596, + "step": 80300 + }, + { + "epoch": 7.210502692998205, + "grad_norm": 14.477034568786621, + "learning_rate": 7.207630161579893e-06, + "loss": 6.7942, + "step": 80325 + }, + { + "epoch": 7.212746858168761, + "grad_norm": 12.745403289794922, + "learning_rate": 7.209874326750449e-06, + "loss": 7.0146, + "step": 80350 + }, + { + "epoch": 7.2149910233393175, + "grad_norm": 9.742789268493652, + "learning_rate": 7.212118491921006e-06, + "loss": 6.8976, + "step": 80375 + }, + { + "epoch": 7.217235188509874, + "grad_norm": 10.0947904586792, + "learning_rate": 7.214362657091563e-06, + "loss": 7.0262, + "step": 80400 + }, + { + "epoch": 7.2194793536804305, + "grad_norm": 11.064336776733398, + "learning_rate": 7.216606822262119e-06, + "loss": 6.9887, + "step": 80425 + }, + { + "epoch": 7.221723518850988, + "grad_norm": 10.503732681274414, + "learning_rate": 7.218850987432675e-06, + "loss": 7.038, + "step": 80450 + }, + { + "epoch": 7.223967684021544, + "grad_norm": 9.156112670898438, + "learning_rate": 7.221095152603233e-06, + "loss": 7.018, + "step": 80475 + }, + { + "epoch": 7.226211849192101, + "grad_norm": 18.267019271850586, + "learning_rate": 7.2233393177737885e-06, + "loss": 6.9897, + "step": 80500 + }, + { + "epoch": 7.228456014362657, + "grad_norm": 9.749064445495605, + "learning_rate": 7.225583482944345e-06, + "loss": 6.8746, + "step": 80525 + }, + { + "epoch": 7.230700179533214, + "grad_norm": 8.735913276672363, + "learning_rate": 7.2278276481149025e-06, + "loss": 6.976, + "step": 80550 + }, + { + "epoch": 7.23294434470377, + "grad_norm": 10.767074584960938, + "learning_rate": 7.230071813285458e-06, + "loss": 6.9679, + "step": 80575 + }, + { + "epoch": 7.235188509874327, + "grad_norm": 12.866726875305176, + "learning_rate": 7.232315978456015e-06, + "loss": 6.8109, + "step": 80600 + }, + { + "epoch": 7.237432675044883, + "grad_norm": 10.092597961425781, + "learning_rate": 7.234560143626571e-06, + "loss": 6.9502, + "step": 80625 + }, + { + "epoch": 7.23967684021544, + "grad_norm": 10.505327224731445, + "learning_rate": 7.236804308797128e-06, + "loss": 6.9746, + "step": 80650 + }, + { + "epoch": 7.241921005385996, + "grad_norm": 11.644251823425293, + "learning_rate": 7.239048473967685e-06, + "loss": 6.8613, + "step": 80675 + }, + { + "epoch": 7.244165170556553, + "grad_norm": 10.270689010620117, + "learning_rate": 7.241292639138241e-06, + "loss": 6.89, + "step": 80700 + }, + { + "epoch": 7.246409335727109, + "grad_norm": 13.185018539428711, + "learning_rate": 7.243536804308797e-06, + "loss": 7.1328, + "step": 80725 + }, + { + "epoch": 7.2486535008976665, + "grad_norm": 11.369105339050293, + "learning_rate": 7.245780969479355e-06, + "loss": 6.9361, + "step": 80750 + }, + { + "epoch": 7.250897666068223, + "grad_norm": 10.556387901306152, + "learning_rate": 7.248025134649911e-06, + "loss": 6.9609, + "step": 80775 + }, + { + "epoch": 7.253141831238779, + "grad_norm": 12.592976570129395, + "learning_rate": 7.250269299820467e-06, + "loss": 6.9428, + "step": 80800 + }, + { + "epoch": 7.255385996409336, + "grad_norm": 11.361931800842285, + "learning_rate": 7.252513464991024e-06, + "loss": 6.8818, + "step": 80825 + }, + { + "epoch": 7.257630161579892, + "grad_norm": 11.426374435424805, + "learning_rate": 7.254757630161581e-06, + "loss": 6.9374, + "step": 80850 + }, + { + "epoch": 7.259874326750449, + "grad_norm": 11.502232551574707, + "learning_rate": 7.257001795332137e-06, + "loss": 6.9797, + "step": 80875 + }, + { + "epoch": 7.262118491921005, + "grad_norm": 14.397311210632324, + "learning_rate": 7.2592459605026935e-06, + "loss": 7.0524, + "step": 80900 + }, + { + "epoch": 7.264362657091562, + "grad_norm": 13.321002960205078, + "learning_rate": 7.261490125673251e-06, + "loss": 6.9266, + "step": 80925 + }, + { + "epoch": 7.266606822262118, + "grad_norm": 14.083715438842773, + "learning_rate": 7.263734290843807e-06, + "loss": 6.8424, + "step": 80950 + }, + { + "epoch": 7.268850987432675, + "grad_norm": 13.528890609741211, + "learning_rate": 7.265978456014363e-06, + "loss": 6.9424, + "step": 80975 + }, + { + "epoch": 7.271095152603231, + "grad_norm": 10.734893798828125, + "learning_rate": 7.268222621184919e-06, + "loss": 7.0523, + "step": 81000 + }, + { + "epoch": 7.273339317773788, + "grad_norm": 10.117459297180176, + "learning_rate": 7.2704667863554765e-06, + "loss": 7.0061, + "step": 81025 + }, + { + "epoch": 7.275583482944345, + "grad_norm": 11.360634803771973, + "learning_rate": 7.272710951526033e-06, + "loss": 7.2141, + "step": 81050 + }, + { + "epoch": 7.277827648114902, + "grad_norm": 9.660614967346191, + "learning_rate": 7.274955116696589e-06, + "loss": 6.917, + "step": 81075 + }, + { + "epoch": 7.280071813285458, + "grad_norm": 12.042798042297363, + "learning_rate": 7.2771992818671455e-06, + "loss": 6.9564, + "step": 81100 + }, + { + "epoch": 7.2823159784560145, + "grad_norm": 15.907037734985352, + "learning_rate": 7.279443447037703e-06, + "loss": 7.2511, + "step": 81125 + }, + { + "epoch": 7.284560143626571, + "grad_norm": 12.50130844116211, + "learning_rate": 7.281687612208259e-06, + "loss": 7.069, + "step": 81150 + }, + { + "epoch": 7.2868043087971275, + "grad_norm": 12.800305366516113, + "learning_rate": 7.283931777378815e-06, + "loss": 6.9956, + "step": 81175 + }, + { + "epoch": 7.289048473967684, + "grad_norm": 11.17619800567627, + "learning_rate": 7.286175942549373e-06, + "loss": 6.5794, + "step": 81200 + }, + { + "epoch": 7.29129263913824, + "grad_norm": 11.889313697814941, + "learning_rate": 7.288420107719929e-06, + "loss": 6.7705, + "step": 81225 + }, + { + "epoch": 7.293536804308797, + "grad_norm": 9.867574691772461, + "learning_rate": 7.290664272890485e-06, + "loss": 7.0563, + "step": 81250 + }, + { + "epoch": 7.295780969479353, + "grad_norm": 13.30628776550293, + "learning_rate": 7.292908438061042e-06, + "loss": 7.0107, + "step": 81275 + }, + { + "epoch": 7.29802513464991, + "grad_norm": 11.732073783874512, + "learning_rate": 7.295152603231599e-06, + "loss": 7.1252, + "step": 81300 + }, + { + "epoch": 7.300269299820467, + "grad_norm": 16.559724807739258, + "learning_rate": 7.297396768402155e-06, + "loss": 6.922, + "step": 81325 + }, + { + "epoch": 7.302513464991024, + "grad_norm": 10.791009902954102, + "learning_rate": 7.299551166965889e-06, + "loss": 6.9168, + "step": 81350 + }, + { + "epoch": 7.30475763016158, + "grad_norm": 11.13119125366211, + "learning_rate": 7.301795332136446e-06, + "loss": 7.2383, + "step": 81375 + }, + { + "epoch": 7.307001795332137, + "grad_norm": 12.268176078796387, + "learning_rate": 7.304039497307003e-06, + "loss": 7.0791, + "step": 81400 + }, + { + "epoch": 7.309245960502693, + "grad_norm": 13.499882698059082, + "learning_rate": 7.306283662477559e-06, + "loss": 6.8109, + "step": 81425 + }, + { + "epoch": 7.31149012567325, + "grad_norm": 11.078801155090332, + "learning_rate": 7.3085278276481155e-06, + "loss": 6.9822, + "step": 81450 + }, + { + "epoch": 7.313734290843806, + "grad_norm": 10.45136833190918, + "learning_rate": 7.310771992818671e-06, + "loss": 7.0914, + "step": 81475 + }, + { + "epoch": 7.315978456014363, + "grad_norm": 12.117834091186523, + "learning_rate": 7.313016157989229e-06, + "loss": 6.7089, + "step": 81500 + }, + { + "epoch": 7.318222621184919, + "grad_norm": 9.978744506835938, + "learning_rate": 7.315260323159785e-06, + "loss": 7.0978, + "step": 81525 + }, + { + "epoch": 7.3204667863554755, + "grad_norm": 10.59483814239502, + "learning_rate": 7.317504488330341e-06, + "loss": 7.0632, + "step": 81550 + }, + { + "epoch": 7.322710951526032, + "grad_norm": 10.841562271118164, + "learning_rate": 7.319748653500899e-06, + "loss": 7.0027, + "step": 81575 + }, + { + "epoch": 7.3249551166965885, + "grad_norm": 10.470500946044922, + "learning_rate": 7.321992818671455e-06, + "loss": 7.0383, + "step": 81600 + }, + { + "epoch": 7.327199281867145, + "grad_norm": 10.606712341308594, + "learning_rate": 7.324236983842011e-06, + "loss": 7.158, + "step": 81625 + }, + { + "epoch": 7.329443447037702, + "grad_norm": 11.102705955505371, + "learning_rate": 7.326481149012568e-06, + "loss": 7.0702, + "step": 81650 + }, + { + "epoch": 7.331687612208259, + "grad_norm": 11.058761596679688, + "learning_rate": 7.328725314183125e-06, + "loss": 6.9689, + "step": 81675 + }, + { + "epoch": 7.333931777378815, + "grad_norm": 10.457879066467285, + "learning_rate": 7.330969479353681e-06, + "loss": 7.2918, + "step": 81700 + }, + { + "epoch": 7.336175942549372, + "grad_norm": 10.752019882202148, + "learning_rate": 7.3332136445242375e-06, + "loss": 7.017, + "step": 81725 + }, + { + "epoch": 7.338420107719928, + "grad_norm": 10.627870559692383, + "learning_rate": 7.335457809694794e-06, + "loss": 7.1312, + "step": 81750 + }, + { + "epoch": 7.340664272890485, + "grad_norm": 10.283477783203125, + "learning_rate": 7.337701974865351e-06, + "loss": 7.0476, + "step": 81775 + }, + { + "epoch": 7.342908438061041, + "grad_norm": 12.882774353027344, + "learning_rate": 7.339946140035907e-06, + "loss": 6.8575, + "step": 81800 + }, + { + "epoch": 7.345152603231598, + "grad_norm": 10.07178783416748, + "learning_rate": 7.342190305206464e-06, + "loss": 6.8706, + "step": 81825 + }, + { + "epoch": 7.347396768402154, + "grad_norm": 11.36489486694336, + "learning_rate": 7.34443447037702e-06, + "loss": 6.9442, + "step": 81850 + }, + { + "epoch": 7.349640933572711, + "grad_norm": 8.98350715637207, + "learning_rate": 7.346678635547577e-06, + "loss": 6.9273, + "step": 81875 + }, + { + "epoch": 7.351885098743267, + "grad_norm": 11.553561210632324, + "learning_rate": 7.348922800718134e-06, + "loss": 6.912, + "step": 81900 + }, + { + "epoch": 7.3541292639138245, + "grad_norm": 13.547800064086914, + "learning_rate": 7.3511669658886895e-06, + "loss": 6.9633, + "step": 81925 + }, + { + "epoch": 7.356373429084381, + "grad_norm": 14.310856819152832, + "learning_rate": 7.353411131059247e-06, + "loss": 7.1309, + "step": 81950 + }, + { + "epoch": 7.358617594254937, + "grad_norm": 10.591049194335938, + "learning_rate": 7.355655296229804e-06, + "loss": 6.9669, + "step": 81975 + }, + { + "epoch": 7.360861759425494, + "grad_norm": 13.09131145477295, + "learning_rate": 7.357899461400359e-06, + "loss": 7.0035, + "step": 82000 + }, + { + "epoch": 7.36310592459605, + "grad_norm": 11.081475257873535, + "learning_rate": 7.360143626570916e-06, + "loss": 6.9782, + "step": 82025 + }, + { + "epoch": 7.365350089766607, + "grad_norm": 9.76803970336914, + "learning_rate": 7.3623877917414735e-06, + "loss": 6.9672, + "step": 82050 + }, + { + "epoch": 7.367594254937163, + "grad_norm": 10.6868314743042, + "learning_rate": 7.364631956912029e-06, + "loss": 6.8671, + "step": 82075 + }, + { + "epoch": 7.36983842010772, + "grad_norm": 12.546939849853516, + "learning_rate": 7.366876122082586e-06, + "loss": 7.019, + "step": 82100 + }, + { + "epoch": 7.372082585278276, + "grad_norm": 12.750497817993164, + "learning_rate": 7.369120287253142e-06, + "loss": 6.9124, + "step": 82125 + }, + { + "epoch": 7.374326750448833, + "grad_norm": 11.351529121398926, + "learning_rate": 7.371364452423699e-06, + "loss": 7.0285, + "step": 82150 + }, + { + "epoch": 7.376570915619389, + "grad_norm": 12.111541748046875, + "learning_rate": 7.373608617594256e-06, + "loss": 6.9594, + "step": 82175 + }, + { + "epoch": 7.378815080789946, + "grad_norm": 10.469437599182129, + "learning_rate": 7.3758527827648115e-06, + "loss": 6.7429, + "step": 82200 + }, + { + "epoch": 7.381059245960502, + "grad_norm": 9.779574394226074, + "learning_rate": 7.378096947935368e-06, + "loss": 6.8984, + "step": 82225 + }, + { + "epoch": 7.38330341113106, + "grad_norm": 13.02514934539795, + "learning_rate": 7.3803411131059255e-06, + "loss": 6.8938, + "step": 82250 + }, + { + "epoch": 7.385547576301616, + "grad_norm": 10.253472328186035, + "learning_rate": 7.382585278276482e-06, + "loss": 6.8453, + "step": 82275 + }, + { + "epoch": 7.3877917414721725, + "grad_norm": 13.285163879394531, + "learning_rate": 7.384829443447038e-06, + "loss": 6.7295, + "step": 82300 + }, + { + "epoch": 7.390035906642729, + "grad_norm": 10.40170669555664, + "learning_rate": 7.387073608617595e-06, + "loss": 7.1427, + "step": 82325 + }, + { + "epoch": 7.3922800718132855, + "grad_norm": 12.27161979675293, + "learning_rate": 7.389317773788152e-06, + "loss": 6.9904, + "step": 82350 + }, + { + "epoch": 7.394524236983842, + "grad_norm": 9.362683296203613, + "learning_rate": 7.391561938958708e-06, + "loss": 7.1813, + "step": 82375 + }, + { + "epoch": 7.3967684021543985, + "grad_norm": 10.524234771728516, + "learning_rate": 7.393806104129264e-06, + "loss": 6.9313, + "step": 82400 + }, + { + "epoch": 7.399012567324955, + "grad_norm": 11.216941833496094, + "learning_rate": 7.396050269299822e-06, + "loss": 6.9306, + "step": 82425 + }, + { + "epoch": 7.401256732495511, + "grad_norm": 11.153081893920898, + "learning_rate": 7.398294434470378e-06, + "loss": 6.9942, + "step": 82450 + }, + { + "epoch": 7.403500897666068, + "grad_norm": 10.728699684143066, + "learning_rate": 7.400538599640934e-06, + "loss": 7.1088, + "step": 82475 + }, + { + "epoch": 7.405745062836624, + "grad_norm": 22.602781295776367, + "learning_rate": 7.40278276481149e-06, + "loss": 6.9576, + "step": 82500 + }, + { + "epoch": 7.407989228007182, + "grad_norm": 11.127198219299316, + "learning_rate": 7.4050269299820475e-06, + "loss": 7.0171, + "step": 82525 + }, + { + "epoch": 7.410233393177738, + "grad_norm": 12.4783296585083, + "learning_rate": 7.407271095152604e-06, + "loss": 6.7792, + "step": 82550 + }, + { + "epoch": 7.412477558348295, + "grad_norm": 12.585015296936035, + "learning_rate": 7.40951526032316e-06, + "loss": 7.0439, + "step": 82575 + }, + { + "epoch": 7.414721723518851, + "grad_norm": 14.473796844482422, + "learning_rate": 7.4117594254937165e-06, + "loss": 7.0304, + "step": 82600 + }, + { + "epoch": 7.416965888689408, + "grad_norm": 14.047775268554688, + "learning_rate": 7.414003590664274e-06, + "loss": 7.01, + "step": 82625 + }, + { + "epoch": 7.419210053859964, + "grad_norm": 9.436056137084961, + "learning_rate": 7.41624775583483e-06, + "loss": 6.8633, + "step": 82650 + }, + { + "epoch": 7.421454219030521, + "grad_norm": 10.278822898864746, + "learning_rate": 7.418491921005386e-06, + "loss": 6.9923, + "step": 82675 + }, + { + "epoch": 7.423698384201077, + "grad_norm": 10.651102066040039, + "learning_rate": 7.420736086175944e-06, + "loss": 6.8, + "step": 82700 + }, + { + "epoch": 7.425942549371634, + "grad_norm": 12.251317977905273, + "learning_rate": 7.4229802513464995e-06, + "loss": 7.1001, + "step": 82725 + }, + { + "epoch": 7.42818671454219, + "grad_norm": 10.934837341308594, + "learning_rate": 7.425224416517056e-06, + "loss": 6.8992, + "step": 82750 + }, + { + "epoch": 7.4304308797127465, + "grad_norm": 11.211463928222656, + "learning_rate": 7.427468581687613e-06, + "loss": 6.8502, + "step": 82775 + }, + { + "epoch": 7.432675044883303, + "grad_norm": 14.186153411865234, + "learning_rate": 7.429712746858169e-06, + "loss": 7.1549, + "step": 82800 + }, + { + "epoch": 7.43491921005386, + "grad_norm": 9.178536415100098, + "learning_rate": 7.431956912028726e-06, + "loss": 7.0046, + "step": 82825 + }, + { + "epoch": 7.437163375224417, + "grad_norm": 11.839761734008789, + "learning_rate": 7.434201077199283e-06, + "loss": 7.0461, + "step": 82850 + }, + { + "epoch": 7.439407540394973, + "grad_norm": 10.35836124420166, + "learning_rate": 7.436445242369838e-06, + "loss": 6.8764, + "step": 82875 + }, + { + "epoch": 7.44165170556553, + "grad_norm": 10.576817512512207, + "learning_rate": 7.438689407540396e-06, + "loss": 6.8988, + "step": 82900 + }, + { + "epoch": 7.443895870736086, + "grad_norm": 11.6895751953125, + "learning_rate": 7.4409335727109525e-06, + "loss": 6.89, + "step": 82925 + }, + { + "epoch": 7.446140035906643, + "grad_norm": 10.69385051727295, + "learning_rate": 7.443177737881508e-06, + "loss": 6.9347, + "step": 82950 + }, + { + "epoch": 7.448384201077199, + "grad_norm": 12.83753776550293, + "learning_rate": 7.445421903052065e-06, + "loss": 6.9279, + "step": 82975 + }, + { + "epoch": 7.450628366247756, + "grad_norm": 14.174725532531738, + "learning_rate": 7.447666068222622e-06, + "loss": 6.9598, + "step": 83000 + }, + { + "epoch": 7.452872531418312, + "grad_norm": 11.029766082763672, + "learning_rate": 7.449910233393178e-06, + "loss": 6.9586, + "step": 83025 + }, + { + "epoch": 7.455116696588869, + "grad_norm": 12.730446815490723, + "learning_rate": 7.452154398563735e-06, + "loss": 6.9654, + "step": 83050 + }, + { + "epoch": 7.457360861759425, + "grad_norm": 11.458307266235352, + "learning_rate": 7.454398563734292e-06, + "loss": 6.9439, + "step": 83075 + }, + { + "epoch": 7.459605026929982, + "grad_norm": 13.794890403747559, + "learning_rate": 7.456642728904848e-06, + "loss": 7.0874, + "step": 83100 + }, + { + "epoch": 7.461849192100539, + "grad_norm": 11.341353416442871, + "learning_rate": 7.4588868940754045e-06, + "loss": 6.8516, + "step": 83125 + }, + { + "epoch": 7.4640933572710955, + "grad_norm": 10.955180168151855, + "learning_rate": 7.46113105924596e-06, + "loss": 6.9084, + "step": 83150 + }, + { + "epoch": 7.466337522441652, + "grad_norm": 11.539389610290527, + "learning_rate": 7.463375224416518e-06, + "loss": 6.9469, + "step": 83175 + }, + { + "epoch": 7.468581687612208, + "grad_norm": 9.801163673400879, + "learning_rate": 7.465619389587074e-06, + "loss": 7.1392, + "step": 83200 + }, + { + "epoch": 7.470825852782765, + "grad_norm": 11.938000679016113, + "learning_rate": 7.467863554757631e-06, + "loss": 6.8474, + "step": 83225 + }, + { + "epoch": 7.473070017953321, + "grad_norm": 12.345878601074219, + "learning_rate": 7.470107719928187e-06, + "loss": 7.0222, + "step": 83250 + }, + { + "epoch": 7.475314183123878, + "grad_norm": 12.572527885437012, + "learning_rate": 7.472351885098744e-06, + "loss": 7.0877, + "step": 83275 + }, + { + "epoch": 7.477558348294434, + "grad_norm": 10.109928131103516, + "learning_rate": 7.474596050269301e-06, + "loss": 6.9026, + "step": 83300 + }, + { + "epoch": 7.479802513464991, + "grad_norm": 11.345059394836426, + "learning_rate": 7.476840215439857e-06, + "loss": 6.8223, + "step": 83325 + }, + { + "epoch": 7.482046678635547, + "grad_norm": 11.363851547241211, + "learning_rate": 7.479084380610414e-06, + "loss": 7.0309, + "step": 83350 + }, + { + "epoch": 7.484290843806104, + "grad_norm": 11.460016250610352, + "learning_rate": 7.481328545780971e-06, + "loss": 6.9255, + "step": 83375 + }, + { + "epoch": 7.486535008976661, + "grad_norm": 12.944345474243164, + "learning_rate": 7.4835727109515264e-06, + "loss": 7.0222, + "step": 83400 + }, + { + "epoch": 7.488779174147218, + "grad_norm": 12.860617637634277, + "learning_rate": 7.485816876122083e-06, + "loss": 7.1294, + "step": 83425 + }, + { + "epoch": 7.491023339317774, + "grad_norm": 11.678791046142578, + "learning_rate": 7.4880610412926405e-06, + "loss": 6.8889, + "step": 83450 + }, + { + "epoch": 7.493267504488331, + "grad_norm": 11.92329216003418, + "learning_rate": 7.490305206463196e-06, + "loss": 7.0553, + "step": 83475 + }, + { + "epoch": 7.495511669658887, + "grad_norm": 15.835587501525879, + "learning_rate": 7.492549371633753e-06, + "loss": 7.0854, + "step": 83500 + }, + { + "epoch": 7.4977558348294435, + "grad_norm": 13.73170280456543, + "learning_rate": 7.494793536804309e-06, + "loss": 6.8917, + "step": 83525 + }, + { + "epoch": 7.5, + "grad_norm": 10.138802528381348, + "learning_rate": 7.497037701974866e-06, + "loss": 6.8446, + "step": 83550 + }, + { + "epoch": 7.5022441651705565, + "grad_norm": 12.642066955566406, + "learning_rate": 7.499281867145423e-06, + "loss": 7.2946, + "step": 83575 + }, + { + "epoch": 7.504488330341113, + "grad_norm": 11.259531021118164, + "learning_rate": 7.5015260323159785e-06, + "loss": 6.973, + "step": 83600 + }, + { + "epoch": 7.506732495511669, + "grad_norm": 12.371447563171387, + "learning_rate": 7.503770197486535e-06, + "loss": 6.8122, + "step": 83625 + }, + { + "epoch": 7.508976660682226, + "grad_norm": 11.001914978027344, + "learning_rate": 7.506014362657093e-06, + "loss": 6.9641, + "step": 83650 + }, + { + "epoch": 7.511220825852782, + "grad_norm": 11.006735801696777, + "learning_rate": 7.508258527827648e-06, + "loss": 7.2048, + "step": 83675 + }, + { + "epoch": 7.513464991023339, + "grad_norm": 10.026503562927246, + "learning_rate": 7.510502692998205e-06, + "loss": 6.9737, + "step": 83700 + }, + { + "epoch": 7.515709156193896, + "grad_norm": 12.12936019897461, + "learning_rate": 7.5127468581687624e-06, + "loss": 6.9895, + "step": 83725 + }, + { + "epoch": 7.517953321364453, + "grad_norm": 13.405523300170898, + "learning_rate": 7.514991023339318e-06, + "loss": 6.994, + "step": 83750 + }, + { + "epoch": 7.520197486535009, + "grad_norm": 10.42353630065918, + "learning_rate": 7.517235188509875e-06, + "loss": 6.7646, + "step": 83775 + }, + { + "epoch": 7.522441651705566, + "grad_norm": 13.586066246032715, + "learning_rate": 7.5194793536804314e-06, + "loss": 6.9971, + "step": 83800 + }, + { + "epoch": 7.524685816876122, + "grad_norm": 13.26130485534668, + "learning_rate": 7.521723518850989e-06, + "loss": 6.8623, + "step": 83825 + }, + { + "epoch": 7.526929982046679, + "grad_norm": 12.112799644470215, + "learning_rate": 7.523967684021545e-06, + "loss": 6.8495, + "step": 83850 + }, + { + "epoch": 7.529174147217235, + "grad_norm": 10.905085563659668, + "learning_rate": 7.526211849192101e-06, + "loss": 6.9623, + "step": 83875 + }, + { + "epoch": 7.531418312387792, + "grad_norm": 10.527405738830566, + "learning_rate": 7.528456014362657e-06, + "loss": 7.0835, + "step": 83900 + }, + { + "epoch": 7.533662477558348, + "grad_norm": 10.165383338928223, + "learning_rate": 7.5307001795332145e-06, + "loss": 7.0784, + "step": 83925 + }, + { + "epoch": 7.5359066427289045, + "grad_norm": 11.370438575744629, + "learning_rate": 7.532944344703771e-06, + "loss": 7.0018, + "step": 83950 + }, + { + "epoch": 7.538150807899461, + "grad_norm": 11.30592155456543, + "learning_rate": 7.535098743267505e-06, + "loss": 6.9312, + "step": 83975 + }, + { + "epoch": 7.540394973070018, + "grad_norm": 12.65178108215332, + "learning_rate": 7.537342908438061e-06, + "loss": 6.9144, + "step": 84000 + }, + { + "epoch": 7.542639138240575, + "grad_norm": 10.724345207214355, + "learning_rate": 7.539587073608618e-06, + "loss": 7.0713, + "step": 84025 + }, + { + "epoch": 7.544883303411131, + "grad_norm": 10.865346908569336, + "learning_rate": 7.541831238779175e-06, + "loss": 7.0387, + "step": 84050 + }, + { + "epoch": 7.547127468581688, + "grad_norm": 10.906403541564941, + "learning_rate": 7.544075403949731e-06, + "loss": 7.0246, + "step": 84075 + }, + { + "epoch": 7.549371633752244, + "grad_norm": 12.565633773803711, + "learning_rate": 7.546319569120288e-06, + "loss": 6.9621, + "step": 84100 + }, + { + "epoch": 7.551615798922801, + "grad_norm": 10.649481773376465, + "learning_rate": 7.548563734290845e-06, + "loss": 6.8067, + "step": 84125 + }, + { + "epoch": 7.553859964093357, + "grad_norm": 11.952442169189453, + "learning_rate": 7.550807899461401e-06, + "loss": 6.7849, + "step": 84150 + }, + { + "epoch": 7.556104129263914, + "grad_norm": 12.754776954650879, + "learning_rate": 7.553052064631957e-06, + "loss": 7.2188, + "step": 84175 + }, + { + "epoch": 7.55834829443447, + "grad_norm": 12.924092292785645, + "learning_rate": 7.555296229802515e-06, + "loss": 7.0268, + "step": 84200 + }, + { + "epoch": 7.560592459605027, + "grad_norm": 13.11525821685791, + "learning_rate": 7.5575403949730704e-06, + "loss": 6.9515, + "step": 84225 + }, + { + "epoch": 7.562836624775583, + "grad_norm": 8.56275749206543, + "learning_rate": 7.559784560143627e-06, + "loss": 6.912, + "step": 84250 + }, + { + "epoch": 7.5650807899461405, + "grad_norm": 10.829798698425293, + "learning_rate": 7.562028725314184e-06, + "loss": 6.8436, + "step": 84275 + }, + { + "epoch": 7.567324955116696, + "grad_norm": 9.076409339904785, + "learning_rate": 7.56427289048474e-06, + "loss": 6.9104, + "step": 84300 + }, + { + "epoch": 7.5695691202872535, + "grad_norm": 12.741874694824219, + "learning_rate": 7.566517055655297e-06, + "loss": 6.9826, + "step": 84325 + }, + { + "epoch": 7.57181328545781, + "grad_norm": 10.725357055664062, + "learning_rate": 7.5687612208258535e-06, + "loss": 6.7808, + "step": 84350 + }, + { + "epoch": 7.574057450628366, + "grad_norm": 10.986882209777832, + "learning_rate": 7.571005385996409e-06, + "loss": 6.8534, + "step": 84375 + }, + { + "epoch": 7.576301615798923, + "grad_norm": 10.302948951721191, + "learning_rate": 7.573249551166967e-06, + "loss": 6.867, + "step": 84400 + }, + { + "epoch": 7.578545780969479, + "grad_norm": 10.4717378616333, + "learning_rate": 7.575493716337523e-06, + "loss": 6.9424, + "step": 84425 + }, + { + "epoch": 7.580789946140036, + "grad_norm": 14.359646797180176, + "learning_rate": 7.577737881508079e-06, + "loss": 6.9163, + "step": 84450 + }, + { + "epoch": 7.583034111310592, + "grad_norm": 14.217012405395508, + "learning_rate": 7.579982046678637e-06, + "loss": 6.7919, + "step": 84475 + }, + { + "epoch": 7.585278276481149, + "grad_norm": 13.686928749084473, + "learning_rate": 7.582226211849193e-06, + "loss": 7.045, + "step": 84500 + }, + { + "epoch": 7.587522441651705, + "grad_norm": 13.529449462890625, + "learning_rate": 7.584470377019749e-06, + "loss": 6.7613, + "step": 84525 + }, + { + "epoch": 7.589766606822262, + "grad_norm": 13.248819351196289, + "learning_rate": 7.586714542190306e-06, + "loss": 7.0153, + "step": 84550 + }, + { + "epoch": 7.592010771992818, + "grad_norm": 12.732885360717773, + "learning_rate": 7.588958707360863e-06, + "loss": 6.7698, + "step": 84575 + }, + { + "epoch": 7.594254937163376, + "grad_norm": 10.77596378326416, + "learning_rate": 7.591202872531419e-06, + "loss": 7.05, + "step": 84600 + }, + { + "epoch": 7.596499102333932, + "grad_norm": 11.889184951782227, + "learning_rate": 7.5934470377019754e-06, + "loss": 6.8215, + "step": 84625 + }, + { + "epoch": 7.598743267504489, + "grad_norm": 11.728381156921387, + "learning_rate": 7.595691202872531e-06, + "loss": 6.9706, + "step": 84650 + }, + { + "epoch": 7.600987432675045, + "grad_norm": 15.01768970489502, + "learning_rate": 7.597935368043089e-06, + "loss": 6.8843, + "step": 84675 + }, + { + "epoch": 7.6032315978456015, + "grad_norm": 9.994674682617188, + "learning_rate": 7.600179533213645e-06, + "loss": 6.9207, + "step": 84700 + }, + { + "epoch": 7.605475763016158, + "grad_norm": 10.70443344116211, + "learning_rate": 7.602423698384201e-06, + "loss": 7.0031, + "step": 84725 + }, + { + "epoch": 7.6077199281867145, + "grad_norm": 11.344264030456543, + "learning_rate": 7.604667863554758e-06, + "loss": 6.759, + "step": 84750 + }, + { + "epoch": 7.609964093357271, + "grad_norm": 11.196703910827637, + "learning_rate": 7.606912028725315e-06, + "loss": 6.8474, + "step": 84775 + }, + { + "epoch": 7.6122082585278275, + "grad_norm": 10.368355751037598, + "learning_rate": 7.609156193895872e-06, + "loss": 6.9537, + "step": 84800 + }, + { + "epoch": 7.614452423698384, + "grad_norm": 12.797274589538574, + "learning_rate": 7.6114003590664275e-06, + "loss": 6.9944, + "step": 84825 + }, + { + "epoch": 7.61669658886894, + "grad_norm": 10.861352920532227, + "learning_rate": 7.613644524236985e-06, + "loss": 6.8249, + "step": 84850 + }, + { + "epoch": 7.618940754039498, + "grad_norm": 13.389528274536133, + "learning_rate": 7.615888689407542e-06, + "loss": 7.0191, + "step": 84875 + }, + { + "epoch": 7.621184919210053, + "grad_norm": 11.635929107666016, + "learning_rate": 7.618132854578097e-06, + "loss": 6.7376, + "step": 84900 + }, + { + "epoch": 7.623429084380611, + "grad_norm": 9.720861434936523, + "learning_rate": 7.620377019748654e-06, + "loss": 7.0435, + "step": 84925 + }, + { + "epoch": 7.625673249551167, + "grad_norm": 11.047475814819336, + "learning_rate": 7.6226211849192114e-06, + "loss": 6.9959, + "step": 84950 + }, + { + "epoch": 7.627917414721724, + "grad_norm": 12.997077941894531, + "learning_rate": 7.624865350089767e-06, + "loss": 6.8112, + "step": 84975 + }, + { + "epoch": 7.63016157989228, + "grad_norm": 16.129478454589844, + "learning_rate": 7.627109515260324e-06, + "loss": 6.9531, + "step": 85000 + }, + { + "epoch": 7.632405745062837, + "grad_norm": 11.374859809875488, + "learning_rate": 7.62935368043088e-06, + "loss": 7.0365, + "step": 85025 + }, + { + "epoch": 7.634649910233393, + "grad_norm": 12.476303100585938, + "learning_rate": 7.631597845601438e-06, + "loss": 6.852, + "step": 85050 + }, + { + "epoch": 7.63689407540395, + "grad_norm": 13.31424331665039, + "learning_rate": 7.633842010771994e-06, + "loss": 7.0844, + "step": 85075 + }, + { + "epoch": 7.639138240574506, + "grad_norm": 11.007489204406738, + "learning_rate": 7.63608617594255e-06, + "loss": 7.1659, + "step": 85100 + }, + { + "epoch": 7.641382405745063, + "grad_norm": 11.137946128845215, + "learning_rate": 7.638330341113105e-06, + "loss": 6.9434, + "step": 85125 + }, + { + "epoch": 7.643626570915619, + "grad_norm": 10.686476707458496, + "learning_rate": 7.640574506283663e-06, + "loss": 6.906, + "step": 85150 + }, + { + "epoch": 7.6458707360861755, + "grad_norm": 11.5156888961792, + "learning_rate": 7.64281867145422e-06, + "loss": 6.9822, + "step": 85175 + }, + { + "epoch": 7.648114901256733, + "grad_norm": 11.42072582244873, + "learning_rate": 7.645062836624776e-06, + "loss": 6.8389, + "step": 85200 + }, + { + "epoch": 7.650359066427289, + "grad_norm": 10.546635627746582, + "learning_rate": 7.647307001795333e-06, + "loss": 6.788, + "step": 85225 + }, + { + "epoch": 7.652603231597846, + "grad_norm": 10.548582077026367, + "learning_rate": 7.649551166965889e-06, + "loss": 7.0397, + "step": 85250 + }, + { + "epoch": 7.654847396768402, + "grad_norm": 11.206052780151367, + "learning_rate": 7.651795332136447e-06, + "loss": 6.8692, + "step": 85275 + }, + { + "epoch": 7.657091561938959, + "grad_norm": 10.413423538208008, + "learning_rate": 7.654039497307002e-06, + "loss": 7.0798, + "step": 85300 + }, + { + "epoch": 7.659335727109515, + "grad_norm": 15.835567474365234, + "learning_rate": 7.65628366247756e-06, + "loss": 6.8328, + "step": 85325 + }, + { + "epoch": 7.661579892280072, + "grad_norm": 12.241540908813477, + "learning_rate": 7.658527827648116e-06, + "loss": 6.8503, + "step": 85350 + }, + { + "epoch": 7.663824057450628, + "grad_norm": 12.812426567077637, + "learning_rate": 7.660771992818671e-06, + "loss": 6.9953, + "step": 85375 + }, + { + "epoch": 7.666068222621185, + "grad_norm": 12.426287651062012, + "learning_rate": 7.663016157989229e-06, + "loss": 6.8675, + "step": 85400 + }, + { + "epoch": 7.668312387791741, + "grad_norm": 10.542205810546875, + "learning_rate": 7.665260323159786e-06, + "loss": 6.9997, + "step": 85425 + }, + { + "epoch": 7.670556552962298, + "grad_norm": 13.173736572265625, + "learning_rate": 7.667504488330342e-06, + "loss": 6.743, + "step": 85450 + }, + { + "epoch": 7.672800718132855, + "grad_norm": 13.162904739379883, + "learning_rate": 7.669748653500898e-06, + "loss": 6.9437, + "step": 85475 + }, + { + "epoch": 7.6750448833034115, + "grad_norm": 13.82711124420166, + "learning_rate": 7.671992818671455e-06, + "loss": 6.9853, + "step": 85500 + }, + { + "epoch": 7.677289048473968, + "grad_norm": 11.186212539672852, + "learning_rate": 7.674236983842011e-06, + "loss": 7.0693, + "step": 85525 + }, + { + "epoch": 7.6795332136445245, + "grad_norm": 11.474502563476562, + "learning_rate": 7.676481149012569e-06, + "loss": 6.9264, + "step": 85550 + }, + { + "epoch": 7.681777378815081, + "grad_norm": 13.189702987670898, + "learning_rate": 7.678725314183124e-06, + "loss": 7.052, + "step": 85575 + }, + { + "epoch": 7.684021543985637, + "grad_norm": 14.159173011779785, + "learning_rate": 7.680969479353682e-06, + "loss": 6.9251, + "step": 85600 + }, + { + "epoch": 7.686265709156194, + "grad_norm": 12.214414596557617, + "learning_rate": 7.683213644524238e-06, + "loss": 6.5772, + "step": 85625 + }, + { + "epoch": 7.68850987432675, + "grad_norm": 10.979900360107422, + "learning_rate": 7.685457809694793e-06, + "loss": 6.9635, + "step": 85650 + }, + { + "epoch": 7.690754039497307, + "grad_norm": 12.855430603027344, + "learning_rate": 7.68770197486535e-06, + "loss": 7.028, + "step": 85675 + }, + { + "epoch": 7.692998204667863, + "grad_norm": 11.6830415725708, + "learning_rate": 7.689946140035908e-06, + "loss": 6.7604, + "step": 85700 + }, + { + "epoch": 7.69524236983842, + "grad_norm": 13.02591323852539, + "learning_rate": 7.692190305206464e-06, + "loss": 6.9349, + "step": 85725 + }, + { + "epoch": 7.697486535008976, + "grad_norm": 13.57638168334961, + "learning_rate": 7.69443447037702e-06, + "loss": 6.6209, + "step": 85750 + }, + { + "epoch": 7.699730700179533, + "grad_norm": 14.482358932495117, + "learning_rate": 7.696678635547577e-06, + "loss": 6.9268, + "step": 85775 + }, + { + "epoch": 7.70197486535009, + "grad_norm": 12.594042778015137, + "learning_rate": 7.698922800718133e-06, + "loss": 6.9757, + "step": 85800 + }, + { + "epoch": 7.704219030520647, + "grad_norm": 11.174958229064941, + "learning_rate": 7.70116696588869e-06, + "loss": 7.0455, + "step": 85825 + }, + { + "epoch": 7.706463195691203, + "grad_norm": 11.756101608276367, + "learning_rate": 7.703411131059246e-06, + "loss": 6.7978, + "step": 85850 + }, + { + "epoch": 7.70870736086176, + "grad_norm": 11.835110664367676, + "learning_rate": 7.705655296229804e-06, + "loss": 7.0092, + "step": 85875 + }, + { + "epoch": 7.710951526032316, + "grad_norm": 13.484200477600098, + "learning_rate": 7.70789946140036e-06, + "loss": 6.9207, + "step": 85900 + }, + { + "epoch": 7.7131956912028725, + "grad_norm": 10.667430877685547, + "learning_rate": 7.710143626570917e-06, + "loss": 6.9238, + "step": 85925 + }, + { + "epoch": 7.715439856373429, + "grad_norm": 12.227505683898926, + "learning_rate": 7.712387791741473e-06, + "loss": 6.8303, + "step": 85950 + }, + { + "epoch": 7.7176840215439855, + "grad_norm": 10.775405883789062, + "learning_rate": 7.71463195691203e-06, + "loss": 6.81, + "step": 85975 + }, + { + "epoch": 7.719928186714542, + "grad_norm": 11.631196022033691, + "learning_rate": 7.716876122082586e-06, + "loss": 7.0513, + "step": 86000 + }, + { + "epoch": 7.722172351885098, + "grad_norm": 11.314697265625, + "learning_rate": 7.719120287253142e-06, + "loss": 7.0459, + "step": 86025 + }, + { + "epoch": 7.724416517055655, + "grad_norm": 10.018441200256348, + "learning_rate": 7.721364452423699e-06, + "loss": 6.823, + "step": 86050 + }, + { + "epoch": 7.726660682226212, + "grad_norm": 17.11212158203125, + "learning_rate": 7.723608617594257e-06, + "loss": 6.7775, + "step": 86075 + }, + { + "epoch": 7.728904847396769, + "grad_norm": 12.286527633666992, + "learning_rate": 7.725852782764812e-06, + "loss": 6.8704, + "step": 86100 + }, + { + "epoch": 7.731149012567325, + "grad_norm": 10.847532272338867, + "learning_rate": 7.728096947935368e-06, + "loss": 6.8909, + "step": 86125 + }, + { + "epoch": 7.733393177737882, + "grad_norm": 11.047266960144043, + "learning_rate": 7.730251346499102e-06, + "loss": 6.9553, + "step": 86150 + }, + { + "epoch": 7.735637342908438, + "grad_norm": 12.927014350891113, + "learning_rate": 7.73249551166966e-06, + "loss": 6.9213, + "step": 86175 + }, + { + "epoch": 7.737881508078995, + "grad_norm": 9.820618629455566, + "learning_rate": 7.734739676840215e-06, + "loss": 6.976, + "step": 86200 + }, + { + "epoch": 7.740125673249551, + "grad_norm": 11.564702033996582, + "learning_rate": 7.736983842010773e-06, + "loss": 7.0276, + "step": 86225 + }, + { + "epoch": 7.742369838420108, + "grad_norm": 12.011716842651367, + "learning_rate": 7.73922800718133e-06, + "loss": 7.1557, + "step": 86250 + }, + { + "epoch": 7.744614003590664, + "grad_norm": 9.98680591583252, + "learning_rate": 7.741472172351886e-06, + "loss": 7.0621, + "step": 86275 + }, + { + "epoch": 7.746858168761221, + "grad_norm": 10.447504043579102, + "learning_rate": 7.743716337522442e-06, + "loss": 6.9418, + "step": 86300 + }, + { + "epoch": 7.749102333931777, + "grad_norm": 12.029853820800781, + "learning_rate": 7.745960502693e-06, + "loss": 6.9044, + "step": 86325 + }, + { + "epoch": 7.751346499102334, + "grad_norm": 16.2181339263916, + "learning_rate": 7.748204667863555e-06, + "loss": 6.9028, + "step": 86350 + }, + { + "epoch": 7.75359066427289, + "grad_norm": 11.11961555480957, + "learning_rate": 7.750448833034113e-06, + "loss": 6.7325, + "step": 86375 + }, + { + "epoch": 7.755834829443447, + "grad_norm": 13.797930717468262, + "learning_rate": 7.752692998204668e-06, + "loss": 6.8931, + "step": 86400 + }, + { + "epoch": 7.758078994614004, + "grad_norm": 12.109404563903809, + "learning_rate": 7.754937163375224e-06, + "loss": 6.8451, + "step": 86425 + }, + { + "epoch": 7.76032315978456, + "grad_norm": 11.452766418457031, + "learning_rate": 7.757181328545782e-06, + "loss": 7.1656, + "step": 86450 + }, + { + "epoch": 7.762567324955117, + "grad_norm": 12.731559753417969, + "learning_rate": 7.759425493716339e-06, + "loss": 7.0197, + "step": 86475 + }, + { + "epoch": 7.764811490125673, + "grad_norm": 10.786626815795898, + "learning_rate": 7.761669658886895e-06, + "loss": 6.7496, + "step": 86500 + }, + { + "epoch": 7.76705565529623, + "grad_norm": 10.617570877075195, + "learning_rate": 7.76391382405745e-06, + "loss": 6.9674, + "step": 86525 + }, + { + "epoch": 7.769299820466786, + "grad_norm": 10.822609901428223, + "learning_rate": 7.766157989228008e-06, + "loss": 7.0379, + "step": 86550 + }, + { + "epoch": 7.771543985637343, + "grad_norm": 12.591691017150879, + "learning_rate": 7.768402154398564e-06, + "loss": 7.0073, + "step": 86575 + }, + { + "epoch": 7.773788150807899, + "grad_norm": 10.794010162353516, + "learning_rate": 7.770646319569121e-06, + "loss": 7.1328, + "step": 86600 + }, + { + "epoch": 7.776032315978456, + "grad_norm": 14.545469284057617, + "learning_rate": 7.772890484739679e-06, + "loss": 6.7806, + "step": 86625 + }, + { + "epoch": 7.778276481149012, + "grad_norm": 10.526585578918457, + "learning_rate": 7.775134649910234e-06, + "loss": 6.7752, + "step": 86650 + }, + { + "epoch": 7.7805206463195695, + "grad_norm": 13.90031623840332, + "learning_rate": 7.77737881508079e-06, + "loss": 7.1176, + "step": 86675 + }, + { + "epoch": 7.782764811490126, + "grad_norm": 9.815380096435547, + "learning_rate": 7.779622980251346e-06, + "loss": 6.7825, + "step": 86700 + }, + { + "epoch": 7.7850089766606825, + "grad_norm": 16.714420318603516, + "learning_rate": 7.781867145421903e-06, + "loss": 7.0617, + "step": 86725 + }, + { + "epoch": 7.787253141831239, + "grad_norm": 11.353870391845703, + "learning_rate": 7.784111310592461e-06, + "loss": 6.8595, + "step": 86750 + }, + { + "epoch": 7.789497307001795, + "grad_norm": 11.523234367370605, + "learning_rate": 7.786355475763017e-06, + "loss": 6.8909, + "step": 86775 + }, + { + "epoch": 7.791741472172352, + "grad_norm": 13.900197982788086, + "learning_rate": 7.788599640933572e-06, + "loss": 6.7235, + "step": 86800 + }, + { + "epoch": 7.793985637342908, + "grad_norm": 10.1730318069458, + "learning_rate": 7.79084380610413e-06, + "loss": 6.8988, + "step": 86825 + }, + { + "epoch": 7.796229802513465, + "grad_norm": 12.931896209716797, + "learning_rate": 7.793087971274687e-06, + "loss": 7.0046, + "step": 86850 + }, + { + "epoch": 7.798473967684021, + "grad_norm": 14.04401969909668, + "learning_rate": 7.795332136445243e-06, + "loss": 7.0476, + "step": 86875 + }, + { + "epoch": 7.800718132854578, + "grad_norm": 13.50241756439209, + "learning_rate": 7.797576301615799e-06, + "loss": 7.1589, + "step": 86900 + }, + { + "epoch": 7.802962298025134, + "grad_norm": 11.471029281616211, + "learning_rate": 7.799820466786356e-06, + "loss": 7.0925, + "step": 86925 + }, + { + "epoch": 7.805206463195692, + "grad_norm": 11.461629867553711, + "learning_rate": 7.802064631956912e-06, + "loss": 6.8848, + "step": 86950 + }, + { + "epoch": 7.807450628366247, + "grad_norm": 12.092886924743652, + "learning_rate": 7.80430879712747e-06, + "loss": 7.4645, + "step": 86975 + }, + { + "epoch": 7.809694793536805, + "grad_norm": 10.533390998840332, + "learning_rate": 7.806552962298027e-06, + "loss": 6.6593, + "step": 87000 + }, + { + "epoch": 7.811938958707361, + "grad_norm": 13.010260581970215, + "learning_rate": 7.808797127468583e-06, + "loss": 7.0926, + "step": 87025 + }, + { + "epoch": 7.814183123877918, + "grad_norm": 14.842456817626953, + "learning_rate": 7.811041292639139e-06, + "loss": 6.916, + "step": 87050 + }, + { + "epoch": 7.816427289048474, + "grad_norm": 10.127785682678223, + "learning_rate": 7.813285457809694e-06, + "loss": 6.8479, + "step": 87075 + }, + { + "epoch": 7.8186714542190305, + "grad_norm": 11.722616195678711, + "learning_rate": 7.815529622980252e-06, + "loss": 6.8784, + "step": 87100 + }, + { + "epoch": 7.820915619389587, + "grad_norm": 11.76123332977295, + "learning_rate": 7.81777378815081e-06, + "loss": 6.9212, + "step": 87125 + }, + { + "epoch": 7.8231597845601435, + "grad_norm": 12.423851013183594, + "learning_rate": 7.820017953321365e-06, + "loss": 6.8252, + "step": 87150 + }, + { + "epoch": 7.8254039497307, + "grad_norm": 14.998824119567871, + "learning_rate": 7.82226211849192e-06, + "loss": 6.9255, + "step": 87175 + }, + { + "epoch": 7.8276481149012564, + "grad_norm": 11.23087215423584, + "learning_rate": 7.824506283662478e-06, + "loss": 6.954, + "step": 87200 + }, + { + "epoch": 7.829892280071813, + "grad_norm": 12.423251152038574, + "learning_rate": 7.826750448833034e-06, + "loss": 6.9197, + "step": 87225 + }, + { + "epoch": 7.832136445242369, + "grad_norm": 11.6273193359375, + "learning_rate": 7.828994614003591e-06, + "loss": 6.8481, + "step": 87250 + }, + { + "epoch": 7.834380610412927, + "grad_norm": 11.154300689697266, + "learning_rate": 7.831238779174147e-06, + "loss": 6.8868, + "step": 87275 + }, + { + "epoch": 7.836624775583483, + "grad_norm": 12.648314476013184, + "learning_rate": 7.833482944344705e-06, + "loss": 6.6531, + "step": 87300 + }, + { + "epoch": 7.83886894075404, + "grad_norm": 12.050822257995605, + "learning_rate": 7.83572710951526e-06, + "loss": 7.0568, + "step": 87325 + }, + { + "epoch": 7.841113105924596, + "grad_norm": 10.024081230163574, + "learning_rate": 7.837971274685818e-06, + "loss": 7.0584, + "step": 87350 + }, + { + "epoch": 7.843357271095153, + "grad_norm": 13.23332691192627, + "learning_rate": 7.840215439856374e-06, + "loss": 6.8487, + "step": 87375 + }, + { + "epoch": 7.845601436265709, + "grad_norm": 10.634153366088867, + "learning_rate": 7.842459605026931e-06, + "loss": 6.9603, + "step": 87400 + }, + { + "epoch": 7.847845601436266, + "grad_norm": 11.861431121826172, + "learning_rate": 7.844703770197487e-06, + "loss": 7.0145, + "step": 87425 + }, + { + "epoch": 7.850089766606822, + "grad_norm": 11.66434383392334, + "learning_rate": 7.846947935368043e-06, + "loss": 6.8051, + "step": 87450 + }, + { + "epoch": 7.852333931777379, + "grad_norm": 11.68253231048584, + "learning_rate": 7.8491921005386e-06, + "loss": 7.0543, + "step": 87475 + }, + { + "epoch": 7.854578096947935, + "grad_norm": 13.559669494628906, + "learning_rate": 7.851436265709158e-06, + "loss": 6.8736, + "step": 87500 + }, + { + "epoch": 7.8568222621184916, + "grad_norm": 11.674641609191895, + "learning_rate": 7.853680430879713e-06, + "loss": 7.1976, + "step": 87525 + }, + { + "epoch": 7.859066427289049, + "grad_norm": 11.607339859008789, + "learning_rate": 7.85592459605027e-06, + "loss": 7.0012, + "step": 87550 + }, + { + "epoch": 7.861310592459605, + "grad_norm": 10.307097434997559, + "learning_rate": 7.858168761220827e-06, + "loss": 6.977, + "step": 87575 + }, + { + "epoch": 7.863554757630162, + "grad_norm": 13.29431438446045, + "learning_rate": 7.860412926391382e-06, + "loss": 6.8461, + "step": 87600 + }, + { + "epoch": 7.865798922800718, + "grad_norm": 11.145401954650879, + "learning_rate": 7.86265709156194e-06, + "loss": 6.8057, + "step": 87625 + }, + { + "epoch": 7.868043087971275, + "grad_norm": 13.97718620300293, + "learning_rate": 7.864901256732496e-06, + "loss": 6.8292, + "step": 87650 + }, + { + "epoch": 7.870287253141831, + "grad_norm": 10.555792808532715, + "learning_rate": 7.867145421903053e-06, + "loss": 6.8281, + "step": 87675 + }, + { + "epoch": 7.872531418312388, + "grad_norm": 9.619808197021484, + "learning_rate": 7.869389587073609e-06, + "loss": 6.9202, + "step": 87700 + }, + { + "epoch": 7.874775583482944, + "grad_norm": 10.56817626953125, + "learning_rate": 7.871633752244165e-06, + "loss": 6.982, + "step": 87725 + }, + { + "epoch": 7.877019748653501, + "grad_norm": 10.50408935546875, + "learning_rate": 7.873877917414722e-06, + "loss": 6.6951, + "step": 87750 + }, + { + "epoch": 7.879263913824057, + "grad_norm": 9.975687026977539, + "learning_rate": 7.87612208258528e-06, + "loss": 6.7333, + "step": 87775 + }, + { + "epoch": 7.881508078994614, + "grad_norm": 14.653034210205078, + "learning_rate": 7.878366247755835e-06, + "loss": 6.9326, + "step": 87800 + }, + { + "epoch": 7.88375224416517, + "grad_norm": 9.667840003967285, + "learning_rate": 7.880610412926391e-06, + "loss": 6.9227, + "step": 87825 + }, + { + "epoch": 7.885996409335727, + "grad_norm": 11.281652450561523, + "learning_rate": 7.882854578096949e-06, + "loss": 7.0042, + "step": 87850 + }, + { + "epoch": 7.888240574506284, + "grad_norm": 12.152596473693848, + "learning_rate": 7.885098743267506e-06, + "loss": 6.8519, + "step": 87875 + }, + { + "epoch": 7.8904847396768405, + "grad_norm": 11.428849220275879, + "learning_rate": 7.887342908438062e-06, + "loss": 7.0614, + "step": 87900 + }, + { + "epoch": 7.892728904847397, + "grad_norm": 10.79477596282959, + "learning_rate": 7.889587073608618e-06, + "loss": 6.9843, + "step": 87925 + }, + { + "epoch": 7.8949730700179535, + "grad_norm": 13.062361717224121, + "learning_rate": 7.891831238779175e-06, + "loss": 7.0883, + "step": 87950 + }, + { + "epoch": 7.89721723518851, + "grad_norm": 9.971893310546875, + "learning_rate": 7.89407540394973e-06, + "loss": 6.8984, + "step": 87975 + }, + { + "epoch": 7.899461400359066, + "grad_norm": 12.747640609741211, + "learning_rate": 7.896319569120288e-06, + "loss": 6.7154, + "step": 88000 + }, + { + "epoch": 7.901705565529623, + "grad_norm": 13.412044525146484, + "learning_rate": 7.898563734290846e-06, + "loss": 6.9392, + "step": 88025 + }, + { + "epoch": 7.903949730700179, + "grad_norm": 12.23696231842041, + "learning_rate": 7.900807899461401e-06, + "loss": 7.0501, + "step": 88050 + }, + { + "epoch": 7.906193895870736, + "grad_norm": 15.082104682922363, + "learning_rate": 7.903052064631957e-06, + "loss": 7.1229, + "step": 88075 + }, + { + "epoch": 7.908438061041292, + "grad_norm": 10.25049877166748, + "learning_rate": 7.905296229802513e-06, + "loss": 7.0451, + "step": 88100 + }, + { + "epoch": 7.910682226211849, + "grad_norm": 10.964566230773926, + "learning_rate": 7.90754039497307e-06, + "loss": 7.0327, + "step": 88125 + }, + { + "epoch": 7.912926391382406, + "grad_norm": 11.630402565002441, + "learning_rate": 7.909784560143628e-06, + "loss": 7.0587, + "step": 88150 + }, + { + "epoch": 7.915170556552963, + "grad_norm": 14.050610542297363, + "learning_rate": 7.912028725314184e-06, + "loss": 6.7442, + "step": 88175 + }, + { + "epoch": 7.917414721723519, + "grad_norm": 12.260645866394043, + "learning_rate": 7.91427289048474e-06, + "loss": 6.9871, + "step": 88200 + }, + { + "epoch": 7.919658886894076, + "grad_norm": 11.1692533493042, + "learning_rate": 7.916517055655297e-06, + "loss": 6.8924, + "step": 88225 + }, + { + "epoch": 7.921903052064632, + "grad_norm": 13.824788093566895, + "learning_rate": 7.918761220825853e-06, + "loss": 6.881, + "step": 88250 + }, + { + "epoch": 7.924147217235189, + "grad_norm": 11.570984840393066, + "learning_rate": 7.92100538599641e-06, + "loss": 7.1194, + "step": 88275 + }, + { + "epoch": 7.926391382405745, + "grad_norm": 11.897928237915039, + "learning_rate": 7.923249551166966e-06, + "loss": 6.9688, + "step": 88300 + }, + { + "epoch": 7.9286355475763015, + "grad_norm": 12.314791679382324, + "learning_rate": 7.925493716337523e-06, + "loss": 7.0521, + "step": 88325 + }, + { + "epoch": 7.930879712746858, + "grad_norm": 13.814423561096191, + "learning_rate": 7.92773788150808e-06, + "loss": 7.0579, + "step": 88350 + }, + { + "epoch": 7.9331238779174145, + "grad_norm": 12.475587844848633, + "learning_rate": 7.929982046678637e-06, + "loss": 7.1822, + "step": 88375 + }, + { + "epoch": 7.935368043087971, + "grad_norm": 12.138313293457031, + "learning_rate": 7.932226211849194e-06, + "loss": 6.836, + "step": 88400 + }, + { + "epoch": 7.937612208258528, + "grad_norm": 13.276442527770996, + "learning_rate": 7.93447037701975e-06, + "loss": 6.914, + "step": 88425 + }, + { + "epoch": 7.939856373429084, + "grad_norm": 12.199323654174805, + "learning_rate": 7.936714542190306e-06, + "loss": 6.8768, + "step": 88450 + }, + { + "epoch": 7.942100538599641, + "grad_norm": 11.62320613861084, + "learning_rate": 7.938958707360861e-06, + "loss": 6.7344, + "step": 88475 + }, + { + "epoch": 7.944344703770198, + "grad_norm": 10.222034454345703, + "learning_rate": 7.941202872531419e-06, + "loss": 6.7707, + "step": 88500 + }, + { + "epoch": 7.946588868940754, + "grad_norm": 14.974617958068848, + "learning_rate": 7.943447037701976e-06, + "loss": 6.9219, + "step": 88525 + }, + { + "epoch": 7.948833034111311, + "grad_norm": 12.11795711517334, + "learning_rate": 7.945691202872532e-06, + "loss": 6.8847, + "step": 88550 + }, + { + "epoch": 7.951077199281867, + "grad_norm": 10.93553352355957, + "learning_rate": 7.947935368043088e-06, + "loss": 6.769, + "step": 88575 + }, + { + "epoch": 7.953321364452424, + "grad_norm": 12.450016975402832, + "learning_rate": 7.950179533213645e-06, + "loss": 7.1919, + "step": 88600 + }, + { + "epoch": 7.95556552962298, + "grad_norm": 12.055841445922852, + "learning_rate": 7.952423698384201e-06, + "loss": 7.034, + "step": 88625 + }, + { + "epoch": 7.957809694793537, + "grad_norm": 13.671492576599121, + "learning_rate": 7.954667863554759e-06, + "loss": 6.8371, + "step": 88650 + }, + { + "epoch": 7.960053859964093, + "grad_norm": 11.686847686767578, + "learning_rate": 7.956822262118493e-06, + "loss": 6.9662, + "step": 88675 + }, + { + "epoch": 7.96229802513465, + "grad_norm": 10.738042831420898, + "learning_rate": 7.95906642728905e-06, + "loss": 7.0447, + "step": 88700 + }, + { + "epoch": 7.964542190305206, + "grad_norm": 10.932969093322754, + "learning_rate": 7.961310592459606e-06, + "loss": 6.8512, + "step": 88725 + }, + { + "epoch": 7.966786355475763, + "grad_norm": 10.52639102935791, + "learning_rate": 7.963554757630162e-06, + "loss": 7.2651, + "step": 88750 + }, + { + "epoch": 7.96903052064632, + "grad_norm": 12.088666915893555, + "learning_rate": 7.965798922800719e-06, + "loss": 6.7928, + "step": 88775 + }, + { + "epoch": 7.971274685816876, + "grad_norm": 15.626638412475586, + "learning_rate": 7.968043087971275e-06, + "loss": 7.0578, + "step": 88800 + }, + { + "epoch": 7.973518850987433, + "grad_norm": 11.994172096252441, + "learning_rate": 7.970287253141832e-06, + "loss": 6.9159, + "step": 88825 + }, + { + "epoch": 7.975763016157989, + "grad_norm": 14.24830436706543, + "learning_rate": 7.972531418312388e-06, + "loss": 6.8188, + "step": 88850 + }, + { + "epoch": 7.978007181328546, + "grad_norm": 12.0084810256958, + "learning_rate": 7.974775583482945e-06, + "loss": 6.9584, + "step": 88875 + }, + { + "epoch": 7.980251346499102, + "grad_norm": 14.801278114318848, + "learning_rate": 7.977019748653501e-06, + "loss": 7.0143, + "step": 88900 + }, + { + "epoch": 7.982495511669659, + "grad_norm": 13.928526878356934, + "learning_rate": 7.979263913824059e-06, + "loss": 6.7138, + "step": 88925 + }, + { + "epoch": 7.984739676840215, + "grad_norm": 10.234553337097168, + "learning_rate": 7.981508078994614e-06, + "loss": 7.1035, + "step": 88950 + }, + { + "epoch": 7.986983842010772, + "grad_norm": 13.120739936828613, + "learning_rate": 7.983752244165172e-06, + "loss": 6.8729, + "step": 88975 + }, + { + "epoch": 7.989228007181328, + "grad_norm": 10.249099731445312, + "learning_rate": 7.985996409335728e-06, + "loss": 6.9375, + "step": 89000 + }, + { + "epoch": 7.991472172351886, + "grad_norm": 13.610127449035645, + "learning_rate": 7.988240574506283e-06, + "loss": 7.0725, + "step": 89025 + }, + { + "epoch": 7.993716337522442, + "grad_norm": 14.08362865447998, + "learning_rate": 7.990484739676841e-06, + "loss": 7.002, + "step": 89050 + }, + { + "epoch": 7.9959605026929985, + "grad_norm": 10.694058418273926, + "learning_rate": 7.992728904847398e-06, + "loss": 6.9281, + "step": 89075 + }, + { + "epoch": 7.998204667863555, + "grad_norm": 11.103165626525879, + "learning_rate": 7.994973070017954e-06, + "loss": 6.9508, + "step": 89100 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.06709564088366143, + "eval_f1_macro": 0.0017964218733059856, + "eval_f1_micro": 0.06709564088366143, + "eval_f1_weighted": 0.024507495768924317, + "eval_loss": 7.569960117340088, + "eval_precision_macro": 0.0015810635741307687, + "eval_precision_micro": 0.06709564088366143, + "eval_precision_weighted": 0.018940439111444305, + "eval_recall_macro": 0.004429317299910357, + "eval_recall_micro": 0.06709564088366143, + "eval_recall_weighted": 0.06709564088366143, + "eval_runtime": 128.1302, + "eval_samples_per_second": 408.748, + "eval_steps_per_second": 12.776, + "step": 89120 + }, + { + "epoch": 8.000448833034111, + "grad_norm": 10.893725395202637, + "learning_rate": 7.99721723518851e-06, + "loss": 6.9036, + "step": 89125 + }, + { + "epoch": 8.002692998204667, + "grad_norm": 9.924653053283691, + "learning_rate": 7.999461400359067e-06, + "loss": 6.6747, + "step": 89150 + }, + { + "epoch": 8.004937163375224, + "grad_norm": 11.08837604522705, + "learning_rate": 8.001705565529623e-06, + "loss": 6.7748, + "step": 89175 + }, + { + "epoch": 8.007181328545782, + "grad_norm": 12.850152015686035, + "learning_rate": 8.00394973070018e-06, + "loss": 6.94, + "step": 89200 + }, + { + "epoch": 8.009425493716337, + "grad_norm": 11.687116622924805, + "learning_rate": 8.006193895870736e-06, + "loss": 6.6786, + "step": 89225 + }, + { + "epoch": 8.011669658886895, + "grad_norm": 12.887269973754883, + "learning_rate": 8.008438061041294e-06, + "loss": 6.823, + "step": 89250 + }, + { + "epoch": 8.01391382405745, + "grad_norm": 12.531646728515625, + "learning_rate": 8.01068222621185e-06, + "loss": 6.6716, + "step": 89275 + }, + { + "epoch": 8.016157989228008, + "grad_norm": 13.612051963806152, + "learning_rate": 8.012926391382405e-06, + "loss": 6.7183, + "step": 89300 + }, + { + "epoch": 8.018402154398563, + "grad_norm": 19.135725021362305, + "learning_rate": 8.015170556552963e-06, + "loss": 6.5824, + "step": 89325 + }, + { + "epoch": 8.02064631956912, + "grad_norm": 11.803299903869629, + "learning_rate": 8.01741472172352e-06, + "loss": 6.5941, + "step": 89350 + }, + { + "epoch": 8.022890484739676, + "grad_norm": 10.720213890075684, + "learning_rate": 8.019658886894076e-06, + "loss": 6.6767, + "step": 89375 + }, + { + "epoch": 8.025134649910234, + "grad_norm": 13.511064529418945, + "learning_rate": 8.021903052064632e-06, + "loss": 6.5931, + "step": 89400 + }, + { + "epoch": 8.02737881508079, + "grad_norm": 12.856791496276855, + "learning_rate": 8.02414721723519e-06, + "loss": 6.6269, + "step": 89425 + }, + { + "epoch": 8.029622980251347, + "grad_norm": 11.227065086364746, + "learning_rate": 8.026391382405747e-06, + "loss": 6.6559, + "step": 89450 + }, + { + "epoch": 8.031867145421902, + "grad_norm": 10.842206001281738, + "learning_rate": 8.028635547576303e-06, + "loss": 6.7918, + "step": 89475 + }, + { + "epoch": 8.03411131059246, + "grad_norm": 14.217710494995117, + "learning_rate": 8.030879712746858e-06, + "loss": 6.5712, + "step": 89500 + }, + { + "epoch": 8.036355475763017, + "grad_norm": 10.4103422164917, + "learning_rate": 8.033123877917416e-06, + "loss": 6.9551, + "step": 89525 + }, + { + "epoch": 8.038599640933572, + "grad_norm": 12.188045501708984, + "learning_rate": 8.035368043087972e-06, + "loss": 6.5912, + "step": 89550 + }, + { + "epoch": 8.04084380610413, + "grad_norm": 12.719904899597168, + "learning_rate": 8.037612208258529e-06, + "loss": 6.8164, + "step": 89575 + }, + { + "epoch": 8.043087971274685, + "grad_norm": 11.19282341003418, + "learning_rate": 8.039856373429085e-06, + "loss": 6.6641, + "step": 89600 + }, + { + "epoch": 8.045332136445243, + "grad_norm": 11.714044570922852, + "learning_rate": 8.042100538599642e-06, + "loss": 6.8775, + "step": 89625 + }, + { + "epoch": 8.047576301615798, + "grad_norm": 11.42071533203125, + "learning_rate": 8.044344703770198e-06, + "loss": 6.7116, + "step": 89650 + }, + { + "epoch": 8.049820466786356, + "grad_norm": 16.380523681640625, + "learning_rate": 8.046588868940754e-06, + "loss": 6.8576, + "step": 89675 + }, + { + "epoch": 8.052064631956911, + "grad_norm": 11.731369972229004, + "learning_rate": 8.048833034111311e-06, + "loss": 6.6485, + "step": 89700 + }, + { + "epoch": 8.054308797127469, + "grad_norm": 13.751173973083496, + "learning_rate": 8.051077199281869e-06, + "loss": 6.675, + "step": 89725 + }, + { + "epoch": 8.056552962298024, + "grad_norm": 11.843663215637207, + "learning_rate": 8.053321364452424e-06, + "loss": 6.7022, + "step": 89750 + }, + { + "epoch": 8.058797127468582, + "grad_norm": 15.860100746154785, + "learning_rate": 8.05556552962298e-06, + "loss": 6.3767, + "step": 89775 + }, + { + "epoch": 8.061041292639139, + "grad_norm": 12.183211326599121, + "learning_rate": 8.057809694793538e-06, + "loss": 6.5095, + "step": 89800 + }, + { + "epoch": 8.063285457809695, + "grad_norm": 11.781852722167969, + "learning_rate": 8.060053859964093e-06, + "loss": 6.8668, + "step": 89825 + }, + { + "epoch": 8.065529622980252, + "grad_norm": 13.789640426635742, + "learning_rate": 8.062298025134651e-06, + "loss": 6.7129, + "step": 89850 + }, + { + "epoch": 8.067773788150808, + "grad_norm": 10.019376754760742, + "learning_rate": 8.064542190305207e-06, + "loss": 6.8351, + "step": 89875 + }, + { + "epoch": 8.070017953321365, + "grad_norm": 10.694501876831055, + "learning_rate": 8.066786355475764e-06, + "loss": 6.9595, + "step": 89900 + }, + { + "epoch": 8.07226211849192, + "grad_norm": 12.29908561706543, + "learning_rate": 8.06903052064632e-06, + "loss": 6.6079, + "step": 89925 + }, + { + "epoch": 8.074506283662478, + "grad_norm": 11.58143138885498, + "learning_rate": 8.071274685816877e-06, + "loss": 6.7049, + "step": 89950 + }, + { + "epoch": 8.076750448833034, + "grad_norm": 12.395988464355469, + "learning_rate": 8.073518850987433e-06, + "loss": 6.6877, + "step": 89975 + }, + { + "epoch": 8.07899461400359, + "grad_norm": 11.55416202545166, + "learning_rate": 8.07576301615799e-06, + "loss": 6.6591, + "step": 90000 + }, + { + "epoch": 8.081238779174146, + "grad_norm": 11.703386306762695, + "learning_rate": 8.078007181328546e-06, + "loss": 6.6154, + "step": 90025 + }, + { + "epoch": 8.083482944344704, + "grad_norm": 12.721657752990723, + "learning_rate": 8.080251346499102e-06, + "loss": 6.585, + "step": 90050 + }, + { + "epoch": 8.085727109515261, + "grad_norm": 12.156530380249023, + "learning_rate": 8.08249551166966e-06, + "loss": 6.7881, + "step": 90075 + }, + { + "epoch": 8.087971274685817, + "grad_norm": 11.978278160095215, + "learning_rate": 8.084739676840217e-06, + "loss": 6.6892, + "step": 90100 + }, + { + "epoch": 8.090215439856374, + "grad_norm": 9.14209270477295, + "learning_rate": 8.086983842010773e-06, + "loss": 6.7496, + "step": 90125 + }, + { + "epoch": 8.09245960502693, + "grad_norm": 11.344133377075195, + "learning_rate": 8.089228007181329e-06, + "loss": 6.7226, + "step": 90150 + }, + { + "epoch": 8.094703770197487, + "grad_norm": 9.805061340332031, + "learning_rate": 8.091472172351886e-06, + "loss": 6.7111, + "step": 90175 + }, + { + "epoch": 8.096947935368043, + "grad_norm": 14.927505493164062, + "learning_rate": 8.093716337522442e-06, + "loss": 6.8414, + "step": 90200 + }, + { + "epoch": 8.0991921005386, + "grad_norm": 11.319808006286621, + "learning_rate": 8.095960502693e-06, + "loss": 6.7319, + "step": 90225 + }, + { + "epoch": 8.101436265709156, + "grad_norm": 11.88205337524414, + "learning_rate": 8.098204667863555e-06, + "loss": 6.7078, + "step": 90250 + }, + { + "epoch": 8.103680430879713, + "grad_norm": 11.631815910339355, + "learning_rate": 8.100448833034113e-06, + "loss": 6.7846, + "step": 90275 + }, + { + "epoch": 8.105924596050269, + "grad_norm": 13.024479866027832, + "learning_rate": 8.102692998204668e-06, + "loss": 6.6754, + "step": 90300 + }, + { + "epoch": 8.108168761220826, + "grad_norm": 13.511468887329102, + "learning_rate": 8.104937163375226e-06, + "loss": 6.7619, + "step": 90325 + }, + { + "epoch": 8.110412926391382, + "grad_norm": 10.855725288391113, + "learning_rate": 8.107181328545782e-06, + "loss": 6.6731, + "step": 90350 + }, + { + "epoch": 8.112657091561939, + "grad_norm": 13.848235130310059, + "learning_rate": 8.109425493716339e-06, + "loss": 6.8285, + "step": 90375 + }, + { + "epoch": 8.114901256732496, + "grad_norm": 10.932981491088867, + "learning_rate": 8.111669658886895e-06, + "loss": 6.6662, + "step": 90400 + }, + { + "epoch": 8.117145421903052, + "grad_norm": 12.36020565032959, + "learning_rate": 8.11391382405745e-06, + "loss": 6.6803, + "step": 90425 + }, + { + "epoch": 8.11938958707361, + "grad_norm": 11.97225284576416, + "learning_rate": 8.116157989228008e-06, + "loss": 6.7277, + "step": 90450 + }, + { + "epoch": 8.121633752244165, + "grad_norm": 11.274802207946777, + "learning_rate": 8.118402154398565e-06, + "loss": 6.8301, + "step": 90475 + }, + { + "epoch": 8.123877917414722, + "grad_norm": 11.887615203857422, + "learning_rate": 8.120646319569121e-06, + "loss": 6.5581, + "step": 90500 + }, + { + "epoch": 8.126122082585278, + "grad_norm": 12.267580032348633, + "learning_rate": 8.122890484739677e-06, + "loss": 6.5796, + "step": 90525 + }, + { + "epoch": 8.128366247755835, + "grad_norm": 12.43082332611084, + "learning_rate": 8.125134649910234e-06, + "loss": 6.5544, + "step": 90550 + }, + { + "epoch": 8.13061041292639, + "grad_norm": 12.85740852355957, + "learning_rate": 8.12737881508079e-06, + "loss": 6.5766, + "step": 90575 + }, + { + "epoch": 8.132854578096948, + "grad_norm": 12.433923721313477, + "learning_rate": 8.129622980251348e-06, + "loss": 6.7719, + "step": 90600 + }, + { + "epoch": 8.135098743267504, + "grad_norm": 12.112196922302246, + "learning_rate": 8.131867145421903e-06, + "loss": 6.7838, + "step": 90625 + }, + { + "epoch": 8.137342908438061, + "grad_norm": 10.68923282623291, + "learning_rate": 8.134111310592461e-06, + "loss": 6.5842, + "step": 90650 + }, + { + "epoch": 8.139587073608618, + "grad_norm": 11.63250732421875, + "learning_rate": 8.136355475763017e-06, + "loss": 6.696, + "step": 90675 + }, + { + "epoch": 8.141831238779174, + "grad_norm": 11.337833404541016, + "learning_rate": 8.138599640933572e-06, + "loss": 6.8277, + "step": 90700 + }, + { + "epoch": 8.144075403949731, + "grad_norm": 11.109478950500488, + "learning_rate": 8.14084380610413e-06, + "loss": 6.9976, + "step": 90725 + }, + { + "epoch": 8.146319569120287, + "grad_norm": 14.982721328735352, + "learning_rate": 8.143087971274687e-06, + "loss": 6.5871, + "step": 90750 + }, + { + "epoch": 8.148563734290844, + "grad_norm": 11.6317777633667, + "learning_rate": 8.145332136445243e-06, + "loss": 6.6498, + "step": 90775 + }, + { + "epoch": 8.1508078994614, + "grad_norm": 12.228524208068848, + "learning_rate": 8.147576301615799e-06, + "loss": 6.6573, + "step": 90800 + }, + { + "epoch": 8.153052064631957, + "grad_norm": 11.742822647094727, + "learning_rate": 8.149820466786356e-06, + "loss": 6.5854, + "step": 90825 + }, + { + "epoch": 8.155296229802513, + "grad_norm": 13.231059074401855, + "learning_rate": 8.152064631956914e-06, + "loss": 6.6885, + "step": 90850 + }, + { + "epoch": 8.15754039497307, + "grad_norm": 11.465946197509766, + "learning_rate": 8.15430879712747e-06, + "loss": 6.788, + "step": 90875 + }, + { + "epoch": 8.159784560143626, + "grad_norm": 12.749308586120605, + "learning_rate": 8.156552962298025e-06, + "loss": 6.5896, + "step": 90900 + }, + { + "epoch": 8.162028725314183, + "grad_norm": 14.277674674987793, + "learning_rate": 8.158797127468583e-06, + "loss": 6.9475, + "step": 90925 + }, + { + "epoch": 8.164272890484739, + "grad_norm": 10.872325897216797, + "learning_rate": 8.161041292639139e-06, + "loss": 6.602, + "step": 90950 + }, + { + "epoch": 8.166517055655296, + "grad_norm": 12.278039932250977, + "learning_rate": 8.163285457809696e-06, + "loss": 6.5109, + "step": 90975 + }, + { + "epoch": 8.168761220825854, + "grad_norm": 14.883376121520996, + "learning_rate": 8.165529622980252e-06, + "loss": 6.5644, + "step": 91000 + }, + { + "epoch": 8.17100538599641, + "grad_norm": 15.055330276489258, + "learning_rate": 8.16777378815081e-06, + "loss": 6.5484, + "step": 91025 + }, + { + "epoch": 8.173249551166966, + "grad_norm": 11.86784553527832, + "learning_rate": 8.170017953321365e-06, + "loss": 6.5652, + "step": 91050 + }, + { + "epoch": 8.175493716337522, + "grad_norm": 14.140316009521484, + "learning_rate": 8.17226211849192e-06, + "loss": 6.884, + "step": 91075 + }, + { + "epoch": 8.17773788150808, + "grad_norm": 10.941006660461426, + "learning_rate": 8.174506283662478e-06, + "loss": 6.6144, + "step": 91100 + }, + { + "epoch": 8.179982046678635, + "grad_norm": 12.326054573059082, + "learning_rate": 8.176660682226212e-06, + "loss": 6.8059, + "step": 91125 + }, + { + "epoch": 8.182226211849192, + "grad_norm": 12.88784408569336, + "learning_rate": 8.17890484739677e-06, + "loss": 6.8471, + "step": 91150 + }, + { + "epoch": 8.184470377019748, + "grad_norm": 12.82504940032959, + "learning_rate": 8.181149012567326e-06, + "loss": 6.6257, + "step": 91175 + }, + { + "epoch": 8.186714542190305, + "grad_norm": 13.531026840209961, + "learning_rate": 8.183393177737881e-06, + "loss": 6.4624, + "step": 91200 + }, + { + "epoch": 8.188958707360861, + "grad_norm": 12.127076148986816, + "learning_rate": 8.185637342908439e-06, + "loss": 6.6819, + "step": 91225 + }, + { + "epoch": 8.191202872531418, + "grad_norm": 14.923761367797852, + "learning_rate": 8.187881508078995e-06, + "loss": 6.6163, + "step": 91250 + }, + { + "epoch": 8.193447037701976, + "grad_norm": 15.363482475280762, + "learning_rate": 8.190125673249552e-06, + "loss": 6.5691, + "step": 91275 + }, + { + "epoch": 8.195691202872531, + "grad_norm": 10.673319816589355, + "learning_rate": 8.19236983842011e-06, + "loss": 6.5869, + "step": 91300 + }, + { + "epoch": 8.197935368043089, + "grad_norm": 10.888690948486328, + "learning_rate": 8.194614003590665e-06, + "loss": 6.5375, + "step": 91325 + }, + { + "epoch": 8.200179533213644, + "grad_norm": 12.64423942565918, + "learning_rate": 8.196858168761221e-06, + "loss": 6.7768, + "step": 91350 + }, + { + "epoch": 8.202423698384202, + "grad_norm": 11.827288627624512, + "learning_rate": 8.199102333931778e-06, + "loss": 6.5437, + "step": 91375 + }, + { + "epoch": 8.204667863554757, + "grad_norm": 12.880770683288574, + "learning_rate": 8.201346499102334e-06, + "loss": 6.7477, + "step": 91400 + }, + { + "epoch": 8.206912028725315, + "grad_norm": 12.252667427062988, + "learning_rate": 8.203590664272892e-06, + "loss": 6.5967, + "step": 91425 + }, + { + "epoch": 8.20915619389587, + "grad_norm": 9.740447044372559, + "learning_rate": 8.205834829443447e-06, + "loss": 6.8786, + "step": 91450 + }, + { + "epoch": 8.211400359066428, + "grad_norm": 14.36905574798584, + "learning_rate": 8.208078994614003e-06, + "loss": 6.9439, + "step": 91475 + }, + { + "epoch": 8.213644524236983, + "grad_norm": 12.63491439819336, + "learning_rate": 8.21032315978456e-06, + "loss": 6.8248, + "step": 91500 + }, + { + "epoch": 8.21588868940754, + "grad_norm": 12.403112411499023, + "learning_rate": 8.212567324955118e-06, + "loss": 6.7202, + "step": 91525 + }, + { + "epoch": 8.218132854578098, + "grad_norm": 12.455216407775879, + "learning_rate": 8.214811490125674e-06, + "loss": 6.7916, + "step": 91550 + }, + { + "epoch": 8.220377019748653, + "grad_norm": 11.308056831359863, + "learning_rate": 8.21705565529623e-06, + "loss": 6.916, + "step": 91575 + }, + { + "epoch": 8.22262118491921, + "grad_norm": 11.1338529586792, + "learning_rate": 8.219299820466787e-06, + "loss": 6.9029, + "step": 91600 + }, + { + "epoch": 8.224865350089766, + "grad_norm": 11.701810836791992, + "learning_rate": 8.221543985637343e-06, + "loss": 6.692, + "step": 91625 + }, + { + "epoch": 8.227109515260324, + "grad_norm": 12.216734886169434, + "learning_rate": 8.2237881508079e-06, + "loss": 6.6695, + "step": 91650 + }, + { + "epoch": 8.22935368043088, + "grad_norm": 11.276915550231934, + "learning_rate": 8.226032315978458e-06, + "loss": 6.6037, + "step": 91675 + }, + { + "epoch": 8.231597845601437, + "grad_norm": 12.09537410736084, + "learning_rate": 8.228276481149014e-06, + "loss": 6.7409, + "step": 91700 + }, + { + "epoch": 8.233842010771992, + "grad_norm": 12.533995628356934, + "learning_rate": 8.23052064631957e-06, + "loss": 6.8233, + "step": 91725 + }, + { + "epoch": 8.23608617594255, + "grad_norm": 12.257575988769531, + "learning_rate": 8.232764811490125e-06, + "loss": 6.7431, + "step": 91750 + }, + { + "epoch": 8.238330341113105, + "grad_norm": 12.450362205505371, + "learning_rate": 8.235008976660683e-06, + "loss": 6.7019, + "step": 91775 + }, + { + "epoch": 8.240574506283663, + "grad_norm": 11.222847938537598, + "learning_rate": 8.23725314183124e-06, + "loss": 6.7252, + "step": 91800 + }, + { + "epoch": 8.242818671454218, + "grad_norm": 12.134832382202148, + "learning_rate": 8.239497307001796e-06, + "loss": 6.7811, + "step": 91825 + }, + { + "epoch": 8.245062836624776, + "grad_norm": 16.95819854736328, + "learning_rate": 8.241741472172352e-06, + "loss": 6.7201, + "step": 91850 + }, + { + "epoch": 8.247307001795333, + "grad_norm": 10.40034008026123, + "learning_rate": 8.243985637342909e-06, + "loss": 6.6346, + "step": 91875 + }, + { + "epoch": 8.249551166965889, + "grad_norm": 12.653993606567383, + "learning_rate": 8.246229802513467e-06, + "loss": 6.811, + "step": 91900 + }, + { + "epoch": 8.251795332136446, + "grad_norm": 14.384180068969727, + "learning_rate": 8.248473967684022e-06, + "loss": 6.6975, + "step": 91925 + }, + { + "epoch": 8.254039497307001, + "grad_norm": 12.107657432556152, + "learning_rate": 8.250718132854578e-06, + "loss": 6.8573, + "step": 91950 + }, + { + "epoch": 8.256283662477559, + "grad_norm": 11.006041526794434, + "learning_rate": 8.252962298025136e-06, + "loss": 6.6385, + "step": 91975 + }, + { + "epoch": 8.258527827648114, + "grad_norm": 15.803927421569824, + "learning_rate": 8.255206463195691e-06, + "loss": 6.8748, + "step": 92000 + }, + { + "epoch": 8.260771992818672, + "grad_norm": 10.90997314453125, + "learning_rate": 8.257450628366249e-06, + "loss": 6.7208, + "step": 92025 + }, + { + "epoch": 8.263016157989227, + "grad_norm": 13.932628631591797, + "learning_rate": 8.259694793536806e-06, + "loss": 6.7173, + "step": 92050 + }, + { + "epoch": 8.265260323159785, + "grad_norm": 12.254186630249023, + "learning_rate": 8.261938958707362e-06, + "loss": 6.8368, + "step": 92075 + }, + { + "epoch": 8.26750448833034, + "grad_norm": 11.596794128417969, + "learning_rate": 8.264183123877918e-06, + "loss": 6.802, + "step": 92100 + }, + { + "epoch": 8.269748653500898, + "grad_norm": 10.778088569641113, + "learning_rate": 8.266427289048474e-06, + "loss": 6.5988, + "step": 92125 + }, + { + "epoch": 8.271992818671453, + "grad_norm": 13.212989807128906, + "learning_rate": 8.268671454219031e-06, + "loss": 6.6746, + "step": 92150 + }, + { + "epoch": 8.27423698384201, + "grad_norm": 14.430342674255371, + "learning_rate": 8.270915619389588e-06, + "loss": 6.7509, + "step": 92175 + }, + { + "epoch": 8.276481149012568, + "grad_norm": 14.556343078613281, + "learning_rate": 8.273159784560144e-06, + "loss": 6.5751, + "step": 92200 + }, + { + "epoch": 8.278725314183124, + "grad_norm": 11.320582389831543, + "learning_rate": 8.275314183123878e-06, + "loss": 6.6651, + "step": 92225 + }, + { + "epoch": 8.280969479353681, + "grad_norm": 16.431936264038086, + "learning_rate": 8.277558348294436e-06, + "loss": 6.7674, + "step": 92250 + }, + { + "epoch": 8.283213644524237, + "grad_norm": 12.11632251739502, + "learning_rate": 8.279802513464991e-06, + "loss": 6.6968, + "step": 92275 + }, + { + "epoch": 8.285457809694794, + "grad_norm": 15.397802352905273, + "learning_rate": 8.282046678635547e-06, + "loss": 6.7134, + "step": 92300 + }, + { + "epoch": 8.28770197486535, + "grad_norm": 12.630739212036133, + "learning_rate": 8.284290843806105e-06, + "loss": 6.8465, + "step": 92325 + }, + { + "epoch": 8.289946140035907, + "grad_norm": 12.067208290100098, + "learning_rate": 8.286535008976662e-06, + "loss": 6.5785, + "step": 92350 + }, + { + "epoch": 8.292190305206462, + "grad_norm": 14.028731346130371, + "learning_rate": 8.288779174147218e-06, + "loss": 6.6203, + "step": 92375 + }, + { + "epoch": 8.29443447037702, + "grad_norm": 12.343050003051758, + "learning_rate": 8.291023339317774e-06, + "loss": 6.7432, + "step": 92400 + }, + { + "epoch": 8.296678635547575, + "grad_norm": 10.438252449035645, + "learning_rate": 8.293267504488331e-06, + "loss": 6.8914, + "step": 92425 + }, + { + "epoch": 8.298922800718133, + "grad_norm": 12.116363525390625, + "learning_rate": 8.295511669658887e-06, + "loss": 6.6443, + "step": 92450 + }, + { + "epoch": 8.30116696588869, + "grad_norm": 12.499375343322754, + "learning_rate": 8.297755834829444e-06, + "loss": 6.8117, + "step": 92475 + }, + { + "epoch": 8.303411131059246, + "grad_norm": 15.515640258789062, + "learning_rate": 8.3e-06, + "loss": 6.7637, + "step": 92500 + }, + { + "epoch": 8.305655296229803, + "grad_norm": 11.003232955932617, + "learning_rate": 8.302244165170558e-06, + "loss": 6.532, + "step": 92525 + }, + { + "epoch": 8.307899461400359, + "grad_norm": 11.928780555725098, + "learning_rate": 8.304488330341113e-06, + "loss": 6.5493, + "step": 92550 + }, + { + "epoch": 8.310143626570916, + "grad_norm": 12.450206756591797, + "learning_rate": 8.30673249551167e-06, + "loss": 6.6993, + "step": 92575 + }, + { + "epoch": 8.312387791741472, + "grad_norm": 13.780129432678223, + "learning_rate": 8.308976660682227e-06, + "loss": 6.6238, + "step": 92600 + }, + { + "epoch": 8.314631956912029, + "grad_norm": 14.470551490783691, + "learning_rate": 8.311220825852784e-06, + "loss": 7.0666, + "step": 92625 + }, + { + "epoch": 8.316876122082585, + "grad_norm": 12.579374313354492, + "learning_rate": 8.31346499102334e-06, + "loss": 6.821, + "step": 92650 + }, + { + "epoch": 8.319120287253142, + "grad_norm": 13.022006034851074, + "learning_rate": 8.315709156193896e-06, + "loss": 6.6177, + "step": 92675 + }, + { + "epoch": 8.321364452423698, + "grad_norm": 13.488304138183594, + "learning_rate": 8.317953321364453e-06, + "loss": 6.6982, + "step": 92700 + }, + { + "epoch": 8.323608617594255, + "grad_norm": 11.311760902404785, + "learning_rate": 8.32019748653501e-06, + "loss": 6.6877, + "step": 92725 + }, + { + "epoch": 8.325852782764812, + "grad_norm": 12.758604049682617, + "learning_rate": 8.322441651705566e-06, + "loss": 6.6108, + "step": 92750 + }, + { + "epoch": 8.328096947935368, + "grad_norm": 11.35584545135498, + "learning_rate": 8.324685816876122e-06, + "loss": 6.7306, + "step": 92775 + }, + { + "epoch": 8.330341113105925, + "grad_norm": 13.758017539978027, + "learning_rate": 8.32692998204668e-06, + "loss": 6.76, + "step": 92800 + }, + { + "epoch": 8.33258527827648, + "grad_norm": 11.723893165588379, + "learning_rate": 8.329174147217235e-06, + "loss": 6.7393, + "step": 92825 + }, + { + "epoch": 8.334829443447038, + "grad_norm": 12.250651359558105, + "learning_rate": 8.331418312387793e-06, + "loss": 6.7094, + "step": 92850 + }, + { + "epoch": 8.337073608617594, + "grad_norm": 12.747035026550293, + "learning_rate": 8.333662477558349e-06, + "loss": 6.6983, + "step": 92875 + }, + { + "epoch": 8.339317773788151, + "grad_norm": 11.166705131530762, + "learning_rate": 8.335906642728906e-06, + "loss": 6.7851, + "step": 92900 + }, + { + "epoch": 8.341561938958707, + "grad_norm": 12.132503509521484, + "learning_rate": 8.338150807899462e-06, + "loss": 6.6442, + "step": 92925 + }, + { + "epoch": 8.343806104129264, + "grad_norm": 12.160259246826172, + "learning_rate": 8.34039497307002e-06, + "loss": 6.624, + "step": 92950 + }, + { + "epoch": 8.34605026929982, + "grad_norm": 11.109405517578125, + "learning_rate": 8.342639138240575e-06, + "loss": 6.8851, + "step": 92975 + }, + { + "epoch": 8.348294434470377, + "grad_norm": 13.641729354858398, + "learning_rate": 8.344883303411132e-06, + "loss": 6.5493, + "step": 93000 + }, + { + "epoch": 8.350538599640934, + "grad_norm": 13.126416206359863, + "learning_rate": 8.347127468581688e-06, + "loss": 6.3511, + "step": 93025 + }, + { + "epoch": 8.35278276481149, + "grad_norm": 10.378361701965332, + "learning_rate": 8.349371633752244e-06, + "loss": 6.7001, + "step": 93050 + }, + { + "epoch": 8.355026929982047, + "grad_norm": 12.91038990020752, + "learning_rate": 8.351615798922801e-06, + "loss": 6.6796, + "step": 93075 + }, + { + "epoch": 8.357271095152603, + "grad_norm": 12.760645866394043, + "learning_rate": 8.353859964093359e-06, + "loss": 6.6903, + "step": 93100 + }, + { + "epoch": 8.35951526032316, + "grad_norm": 11.973479270935059, + "learning_rate": 8.356104129263915e-06, + "loss": 6.6305, + "step": 93125 + }, + { + "epoch": 8.361759425493716, + "grad_norm": 18.108558654785156, + "learning_rate": 8.35834829443447e-06, + "loss": 6.6419, + "step": 93150 + }, + { + "epoch": 8.364003590664273, + "grad_norm": 13.661123275756836, + "learning_rate": 8.360592459605028e-06, + "loss": 6.7415, + "step": 93175 + }, + { + "epoch": 8.366247755834829, + "grad_norm": 11.82676887512207, + "learning_rate": 8.362836624775584e-06, + "loss": 6.7098, + "step": 93200 + }, + { + "epoch": 8.368491921005386, + "grad_norm": 13.49092960357666, + "learning_rate": 8.365080789946141e-06, + "loss": 6.7269, + "step": 93225 + }, + { + "epoch": 8.370736086175942, + "grad_norm": 14.427236557006836, + "learning_rate": 8.367324955116697e-06, + "loss": 6.6449, + "step": 93250 + }, + { + "epoch": 8.3729802513465, + "grad_norm": 11.744227409362793, + "learning_rate": 8.369569120287254e-06, + "loss": 6.8394, + "step": 93275 + }, + { + "epoch": 8.375224416517055, + "grad_norm": 15.865598678588867, + "learning_rate": 8.37181328545781e-06, + "loss": 6.7547, + "step": 93300 + }, + { + "epoch": 8.377468581687612, + "grad_norm": 11.660062789916992, + "learning_rate": 8.374057450628366e-06, + "loss": 6.8159, + "step": 93325 + }, + { + "epoch": 8.37971274685817, + "grad_norm": 14.166010856628418, + "learning_rate": 8.376301615798923e-06, + "loss": 6.6667, + "step": 93350 + }, + { + "epoch": 8.381956912028725, + "grad_norm": 11.960041999816895, + "learning_rate": 8.37854578096948e-06, + "loss": 6.6101, + "step": 93375 + }, + { + "epoch": 8.384201077199283, + "grad_norm": 10.878419876098633, + "learning_rate": 8.380789946140037e-06, + "loss": 6.7358, + "step": 93400 + }, + { + "epoch": 8.386445242369838, + "grad_norm": 12.86672306060791, + "learning_rate": 8.383034111310592e-06, + "loss": 6.7964, + "step": 93425 + }, + { + "epoch": 8.388689407540395, + "grad_norm": 12.212451934814453, + "learning_rate": 8.38527827648115e-06, + "loss": 6.7592, + "step": 93450 + }, + { + "epoch": 8.390933572710951, + "grad_norm": 9.82118034362793, + "learning_rate": 8.387522441651707e-06, + "loss": 6.5592, + "step": 93475 + }, + { + "epoch": 8.393177737881508, + "grad_norm": 12.048615455627441, + "learning_rate": 8.389766606822263e-06, + "loss": 6.5795, + "step": 93500 + }, + { + "epoch": 8.395421903052064, + "grad_norm": 11.945093154907227, + "learning_rate": 8.392010771992819e-06, + "loss": 6.5, + "step": 93525 + }, + { + "epoch": 8.397666068222621, + "grad_norm": 12.499398231506348, + "learning_rate": 8.394254937163376e-06, + "loss": 6.5305, + "step": 93550 + }, + { + "epoch": 8.399910233393177, + "grad_norm": 10.791783332824707, + "learning_rate": 8.396499102333932e-06, + "loss": 6.4266, + "step": 93575 + }, + { + "epoch": 8.402154398563734, + "grad_norm": 12.566157341003418, + "learning_rate": 8.39874326750449e-06, + "loss": 6.6638, + "step": 93600 + }, + { + "epoch": 8.40439856373429, + "grad_norm": 15.590743064880371, + "learning_rate": 8.400987432675045e-06, + "loss": 6.8687, + "step": 93625 + }, + { + "epoch": 8.406642728904847, + "grad_norm": 14.909618377685547, + "learning_rate": 8.403231597845603e-06, + "loss": 6.9065, + "step": 93650 + }, + { + "epoch": 8.408886894075405, + "grad_norm": 12.09634017944336, + "learning_rate": 8.405475763016159e-06, + "loss": 6.8059, + "step": 93675 + }, + { + "epoch": 8.41113105924596, + "grad_norm": 13.703232765197754, + "learning_rate": 8.407719928186714e-06, + "loss": 6.5475, + "step": 93700 + }, + { + "epoch": 8.413375224416518, + "grad_norm": 12.722075462341309, + "learning_rate": 8.409964093357272e-06, + "loss": 6.7096, + "step": 93725 + }, + { + "epoch": 8.415619389587073, + "grad_norm": 12.148754119873047, + "learning_rate": 8.41220825852783e-06, + "loss": 6.7664, + "step": 93750 + }, + { + "epoch": 8.41786355475763, + "grad_norm": 14.097867965698242, + "learning_rate": 8.414452423698385e-06, + "loss": 6.6407, + "step": 93775 + }, + { + "epoch": 8.420107719928186, + "grad_norm": 12.714947700500488, + "learning_rate": 8.41669658886894e-06, + "loss": 6.7677, + "step": 93800 + }, + { + "epoch": 8.422351885098744, + "grad_norm": 13.751261711120605, + "learning_rate": 8.418940754039498e-06, + "loss": 6.7365, + "step": 93825 + }, + { + "epoch": 8.4245960502693, + "grad_norm": 12.40906047821045, + "learning_rate": 8.421184919210054e-06, + "loss": 6.684, + "step": 93850 + }, + { + "epoch": 8.426840215439857, + "grad_norm": 12.64323902130127, + "learning_rate": 8.423429084380611e-06, + "loss": 6.6574, + "step": 93875 + }, + { + "epoch": 8.429084380610412, + "grad_norm": 14.31631851196289, + "learning_rate": 8.425673249551167e-06, + "loss": 6.805, + "step": 93900 + }, + { + "epoch": 8.43132854578097, + "grad_norm": 10.502531051635742, + "learning_rate": 8.427917414721725e-06, + "loss": 6.7156, + "step": 93925 + }, + { + "epoch": 8.433572710951527, + "grad_norm": 13.499920845031738, + "learning_rate": 8.43016157989228e-06, + "loss": 6.5062, + "step": 93950 + }, + { + "epoch": 8.435816876122082, + "grad_norm": 10.9510498046875, + "learning_rate": 8.432405745062838e-06, + "loss": 6.5847, + "step": 93975 + }, + { + "epoch": 8.43806104129264, + "grad_norm": 13.480298042297363, + "learning_rate": 8.434649910233394e-06, + "loss": 6.6252, + "step": 94000 + }, + { + "epoch": 8.440305206463195, + "grad_norm": 13.83859634399414, + "learning_rate": 8.436894075403951e-06, + "loss": 6.8538, + "step": 94025 + }, + { + "epoch": 8.442549371633753, + "grad_norm": 9.975552558898926, + "learning_rate": 8.439138240574507e-06, + "loss": 6.8254, + "step": 94050 + }, + { + "epoch": 8.444793536804308, + "grad_norm": 13.822813034057617, + "learning_rate": 8.441382405745063e-06, + "loss": 6.5494, + "step": 94075 + }, + { + "epoch": 8.447037701974866, + "grad_norm": 11.44621753692627, + "learning_rate": 8.44362657091562e-06, + "loss": 6.6823, + "step": 94100 + }, + { + "epoch": 8.449281867145421, + "grad_norm": 12.264030456542969, + "learning_rate": 8.445870736086178e-06, + "loss": 6.9802, + "step": 94125 + }, + { + "epoch": 8.451526032315979, + "grad_norm": 11.882315635681152, + "learning_rate": 8.448114901256733e-06, + "loss": 6.4935, + "step": 94150 + }, + { + "epoch": 8.453770197486534, + "grad_norm": 12.317831993103027, + "learning_rate": 8.450359066427289e-06, + "loss": 6.7382, + "step": 94175 + }, + { + "epoch": 8.456014362657092, + "grad_norm": 14.023643493652344, + "learning_rate": 8.452603231597847e-06, + "loss": 6.8311, + "step": 94200 + }, + { + "epoch": 8.458258527827649, + "grad_norm": 11.515290260314941, + "learning_rate": 8.454847396768402e-06, + "loss": 6.5357, + "step": 94225 + }, + { + "epoch": 8.460502692998205, + "grad_norm": 14.153326988220215, + "learning_rate": 8.45709156193896e-06, + "loss": 6.5519, + "step": 94250 + }, + { + "epoch": 8.462746858168762, + "grad_norm": 16.22002601623535, + "learning_rate": 8.459335727109516e-06, + "loss": 6.6187, + "step": 94275 + }, + { + "epoch": 8.464991023339318, + "grad_norm": 13.054065704345703, + "learning_rate": 8.461579892280073e-06, + "loss": 6.6371, + "step": 94300 + }, + { + "epoch": 8.467235188509875, + "grad_norm": 15.358997344970703, + "learning_rate": 8.463824057450629e-06, + "loss": 6.8524, + "step": 94325 + }, + { + "epoch": 8.46947935368043, + "grad_norm": 10.805742263793945, + "learning_rate": 8.466068222621186e-06, + "loss": 6.5409, + "step": 94350 + }, + { + "epoch": 8.471723518850988, + "grad_norm": 10.790205955505371, + "learning_rate": 8.468312387791742e-06, + "loss": 6.7256, + "step": 94375 + }, + { + "epoch": 8.473967684021543, + "grad_norm": 14.86347484588623, + "learning_rate": 8.4705565529623e-06, + "loss": 6.3152, + "step": 94400 + }, + { + "epoch": 8.4762118491921, + "grad_norm": 12.066336631774902, + "learning_rate": 8.472800718132855e-06, + "loss": 6.6965, + "step": 94425 + }, + { + "epoch": 8.478456014362656, + "grad_norm": 13.456534385681152, + "learning_rate": 8.475044883303411e-06, + "loss": 6.5214, + "step": 94450 + }, + { + "epoch": 8.480700179533214, + "grad_norm": 10.750638008117676, + "learning_rate": 8.477289048473969e-06, + "loss": 6.6364, + "step": 94475 + }, + { + "epoch": 8.48294434470377, + "grad_norm": 12.828737258911133, + "learning_rate": 8.479533213644526e-06, + "loss": 6.6664, + "step": 94500 + }, + { + "epoch": 8.485188509874327, + "grad_norm": 13.649569511413574, + "learning_rate": 8.481777378815082e-06, + "loss": 6.5687, + "step": 94525 + }, + { + "epoch": 8.487432675044884, + "grad_norm": 11.726101875305176, + "learning_rate": 8.484021543985638e-06, + "loss": 6.9182, + "step": 94550 + }, + { + "epoch": 8.48967684021544, + "grad_norm": 14.131293296813965, + "learning_rate": 8.486265709156195e-06, + "loss": 6.822, + "step": 94575 + }, + { + "epoch": 8.491921005385997, + "grad_norm": 19.61222267150879, + "learning_rate": 8.48850987432675e-06, + "loss": 6.4005, + "step": 94600 + }, + { + "epoch": 8.494165170556553, + "grad_norm": 11.357909202575684, + "learning_rate": 8.490754039497308e-06, + "loss": 6.6633, + "step": 94625 + }, + { + "epoch": 8.49640933572711, + "grad_norm": 10.219314575195312, + "learning_rate": 8.492998204667864e-06, + "loss": 6.618, + "step": 94650 + }, + { + "epoch": 8.498653500897666, + "grad_norm": 12.409692764282227, + "learning_rate": 8.495242369838421e-06, + "loss": 6.7719, + "step": 94675 + }, + { + "epoch": 8.500897666068223, + "grad_norm": 14.167600631713867, + "learning_rate": 8.497486535008977e-06, + "loss": 6.6377, + "step": 94700 + }, + { + "epoch": 8.503141831238779, + "grad_norm": 11.191832542419434, + "learning_rate": 8.499730700179533e-06, + "loss": 6.4891, + "step": 94725 + }, + { + "epoch": 8.505385996409336, + "grad_norm": 13.001180648803711, + "learning_rate": 8.50197486535009e-06, + "loss": 6.7549, + "step": 94750 + }, + { + "epoch": 8.507630161579891, + "grad_norm": 15.03341293334961, + "learning_rate": 8.504219030520648e-06, + "loss": 6.5794, + "step": 94775 + }, + { + "epoch": 8.509874326750449, + "grad_norm": 12.992362022399902, + "learning_rate": 8.506463195691204e-06, + "loss": 6.6513, + "step": 94800 + }, + { + "epoch": 8.512118491921004, + "grad_norm": 12.043025970458984, + "learning_rate": 8.50870736086176e-06, + "loss": 6.8332, + "step": 94825 + }, + { + "epoch": 8.514362657091562, + "grad_norm": 15.279191970825195, + "learning_rate": 8.510951526032317e-06, + "loss": 6.8578, + "step": 94850 + }, + { + "epoch": 8.51660682226212, + "grad_norm": 10.499835014343262, + "learning_rate": 8.513195691202873e-06, + "loss": 6.6127, + "step": 94875 + }, + { + "epoch": 8.518850987432675, + "grad_norm": 13.017276763916016, + "learning_rate": 8.51543985637343e-06, + "loss": 6.5355, + "step": 94900 + }, + { + "epoch": 8.521095152603232, + "grad_norm": 11.805363655090332, + "learning_rate": 8.517684021543986e-06, + "loss": 6.6638, + "step": 94925 + }, + { + "epoch": 8.523339317773788, + "grad_norm": 11.237983703613281, + "learning_rate": 8.519928186714543e-06, + "loss": 6.7149, + "step": 94950 + }, + { + "epoch": 8.525583482944345, + "grad_norm": 10.789929389953613, + "learning_rate": 8.522172351885099e-06, + "loss": 6.8031, + "step": 94975 + }, + { + "epoch": 8.5278276481149, + "grad_norm": 11.793042182922363, + "learning_rate": 8.524416517055657e-06, + "loss": 6.8799, + "step": 95000 + }, + { + "epoch": 8.530071813285458, + "grad_norm": 14.645045280456543, + "learning_rate": 8.526660682226212e-06, + "loss": 6.6894, + "step": 95025 + }, + { + "epoch": 8.532315978456014, + "grad_norm": 11.257546424865723, + "learning_rate": 8.52890484739677e-06, + "loss": 6.6927, + "step": 95050 + }, + { + "epoch": 8.534560143626571, + "grad_norm": 10.553267478942871, + "learning_rate": 8.531149012567326e-06, + "loss": 6.6557, + "step": 95075 + }, + { + "epoch": 8.536804308797127, + "grad_norm": 10.36174488067627, + "learning_rate": 8.533393177737881e-06, + "loss": 6.6013, + "step": 95100 + }, + { + "epoch": 8.539048473967684, + "grad_norm": 13.0820951461792, + "learning_rate": 8.535637342908439e-06, + "loss": 6.7024, + "step": 95125 + }, + { + "epoch": 8.541292639138241, + "grad_norm": 10.427326202392578, + "learning_rate": 8.537881508078996e-06, + "loss": 6.6344, + "step": 95150 + }, + { + "epoch": 8.543536804308797, + "grad_norm": 15.597009658813477, + "learning_rate": 8.540125673249552e-06, + "loss": 6.7239, + "step": 95175 + }, + { + "epoch": 8.545780969479354, + "grad_norm": 13.521751403808594, + "learning_rate": 8.542369838420108e-06, + "loss": 6.4718, + "step": 95200 + }, + { + "epoch": 8.54802513464991, + "grad_norm": 13.069845199584961, + "learning_rate": 8.544614003590665e-06, + "loss": 6.6586, + "step": 95225 + }, + { + "epoch": 8.550269299820467, + "grad_norm": 12.880742073059082, + "learning_rate": 8.546858168761221e-06, + "loss": 6.6671, + "step": 95250 + }, + { + "epoch": 8.552513464991023, + "grad_norm": 12.511231422424316, + "learning_rate": 8.549102333931779e-06, + "loss": 6.8484, + "step": 95275 + }, + { + "epoch": 8.55475763016158, + "grad_norm": 12.877594947814941, + "learning_rate": 8.551346499102334e-06, + "loss": 6.8071, + "step": 95300 + }, + { + "epoch": 8.557001795332136, + "grad_norm": 12.785994529724121, + "learning_rate": 8.553590664272892e-06, + "loss": 6.6916, + "step": 95325 + }, + { + "epoch": 8.559245960502693, + "grad_norm": 17.03653335571289, + "learning_rate": 8.555834829443448e-06, + "loss": 6.7061, + "step": 95350 + }, + { + "epoch": 8.561490125673249, + "grad_norm": 12.51469612121582, + "learning_rate": 8.557989228007182e-06, + "loss": 6.9055, + "step": 95375 + }, + { + "epoch": 8.563734290843806, + "grad_norm": 9.832930564880371, + "learning_rate": 8.560233393177739e-06, + "loss": 6.6695, + "step": 95400 + }, + { + "epoch": 8.565978456014363, + "grad_norm": 11.258909225463867, + "learning_rate": 8.562477558348295e-06, + "loss": 6.6245, + "step": 95425 + }, + { + "epoch": 8.568222621184919, + "grad_norm": 12.816901206970215, + "learning_rate": 8.564721723518852e-06, + "loss": 6.625, + "step": 95450 + }, + { + "epoch": 8.570466786355476, + "grad_norm": 14.345751762390137, + "learning_rate": 8.566965888689408e-06, + "loss": 6.5375, + "step": 95475 + }, + { + "epoch": 8.572710951526032, + "grad_norm": 13.084672927856445, + "learning_rate": 8.569210053859964e-06, + "loss": 6.5316, + "step": 95500 + }, + { + "epoch": 8.57495511669659, + "grad_norm": 14.733793258666992, + "learning_rate": 8.571454219030521e-06, + "loss": 6.7993, + "step": 95525 + }, + { + "epoch": 8.577199281867145, + "grad_norm": 13.344637870788574, + "learning_rate": 8.573698384201079e-06, + "loss": 6.8467, + "step": 95550 + }, + { + "epoch": 8.579443447037702, + "grad_norm": 11.320901870727539, + "learning_rate": 8.575942549371634e-06, + "loss": 6.7316, + "step": 95575 + }, + { + "epoch": 8.581687612208258, + "grad_norm": 11.345437049865723, + "learning_rate": 8.578186714542192e-06, + "loss": 6.7616, + "step": 95600 + }, + { + "epoch": 8.583931777378815, + "grad_norm": 12.170719146728516, + "learning_rate": 8.580430879712748e-06, + "loss": 6.5141, + "step": 95625 + }, + { + "epoch": 8.58617594254937, + "grad_norm": 16.259946823120117, + "learning_rate": 8.582675044883303e-06, + "loss": 6.607, + "step": 95650 + }, + { + "epoch": 8.588420107719928, + "grad_norm": 10.564281463623047, + "learning_rate": 8.584919210053861e-06, + "loss": 6.6186, + "step": 95675 + }, + { + "epoch": 8.590664272890486, + "grad_norm": 11.138690948486328, + "learning_rate": 8.587163375224418e-06, + "loss": 6.8388, + "step": 95700 + }, + { + "epoch": 8.592908438061041, + "grad_norm": 10.048398971557617, + "learning_rate": 8.589407540394974e-06, + "loss": 6.7164, + "step": 95725 + }, + { + "epoch": 8.595152603231599, + "grad_norm": 10.331938743591309, + "learning_rate": 8.59165170556553e-06, + "loss": 6.8284, + "step": 95750 + }, + { + "epoch": 8.597396768402154, + "grad_norm": 15.07302474975586, + "learning_rate": 8.593895870736086e-06, + "loss": 6.6405, + "step": 95775 + }, + { + "epoch": 8.599640933572712, + "grad_norm": 13.424948692321777, + "learning_rate": 8.596140035906643e-06, + "loss": 6.6792, + "step": 95800 + }, + { + "epoch": 8.601885098743267, + "grad_norm": 12.257489204406738, + "learning_rate": 8.5983842010772e-06, + "loss": 6.6998, + "step": 95825 + }, + { + "epoch": 8.604129263913824, + "grad_norm": 13.26054573059082, + "learning_rate": 8.600628366247756e-06, + "loss": 6.8825, + "step": 95850 + }, + { + "epoch": 8.60637342908438, + "grad_norm": 10.400275230407715, + "learning_rate": 8.602872531418312e-06, + "loss": 6.6263, + "step": 95875 + }, + { + "epoch": 8.608617594254937, + "grad_norm": 12.715744018554688, + "learning_rate": 8.60511669658887e-06, + "loss": 6.5442, + "step": 95900 + }, + { + "epoch": 8.610861759425493, + "grad_norm": 12.541098594665527, + "learning_rate": 8.607360861759427e-06, + "loss": 6.6108, + "step": 95925 + }, + { + "epoch": 8.61310592459605, + "grad_norm": 10.450981140136719, + "learning_rate": 8.609605026929983e-06, + "loss": 6.8595, + "step": 95950 + }, + { + "epoch": 8.615350089766606, + "grad_norm": 13.96268367767334, + "learning_rate": 8.61184919210054e-06, + "loss": 6.4948, + "step": 95975 + }, + { + "epoch": 8.617594254937163, + "grad_norm": 12.51034164428711, + "learning_rate": 8.614093357271096e-06, + "loss": 6.8991, + "step": 96000 + }, + { + "epoch": 8.61983842010772, + "grad_norm": 11.962325096130371, + "learning_rate": 8.616337522441652e-06, + "loss": 6.6092, + "step": 96025 + }, + { + "epoch": 8.622082585278276, + "grad_norm": 12.146551132202148, + "learning_rate": 8.61858168761221e-06, + "loss": 6.611, + "step": 96050 + }, + { + "epoch": 8.624326750448834, + "grad_norm": 11.80412769317627, + "learning_rate": 8.620825852782767e-06, + "loss": 6.516, + "step": 96075 + }, + { + "epoch": 8.62657091561939, + "grad_norm": 12.024552345275879, + "learning_rate": 8.623070017953323e-06, + "loss": 6.5964, + "step": 96100 + }, + { + "epoch": 8.628815080789947, + "grad_norm": 12.814855575561523, + "learning_rate": 8.625314183123878e-06, + "loss": 6.4589, + "step": 96125 + }, + { + "epoch": 8.631059245960502, + "grad_norm": 11.485803604125977, + "learning_rate": 8.627558348294434e-06, + "loss": 6.7213, + "step": 96150 + }, + { + "epoch": 8.63330341113106, + "grad_norm": 11.717217445373535, + "learning_rate": 8.629802513464992e-06, + "loss": 6.5956, + "step": 96175 + }, + { + "epoch": 8.635547576301615, + "grad_norm": 11.310052871704102, + "learning_rate": 8.632046678635549e-06, + "loss": 6.5256, + "step": 96200 + }, + { + "epoch": 8.637791741472173, + "grad_norm": 13.499818801879883, + "learning_rate": 8.634290843806105e-06, + "loss": 6.5253, + "step": 96225 + }, + { + "epoch": 8.640035906642728, + "grad_norm": 12.593222618103027, + "learning_rate": 8.63653500897666e-06, + "loss": 6.7928, + "step": 96250 + }, + { + "epoch": 8.642280071813286, + "grad_norm": 12.064352035522461, + "learning_rate": 8.638779174147218e-06, + "loss": 6.8678, + "step": 96275 + }, + { + "epoch": 8.644524236983841, + "grad_norm": 10.995774269104004, + "learning_rate": 8.641023339317774e-06, + "loss": 6.6779, + "step": 96300 + }, + { + "epoch": 8.646768402154398, + "grad_norm": 14.459524154663086, + "learning_rate": 8.643267504488331e-06, + "loss": 6.7379, + "step": 96325 + }, + { + "epoch": 8.649012567324956, + "grad_norm": 13.334378242492676, + "learning_rate": 8.645511669658889e-06, + "loss": 6.7791, + "step": 96350 + }, + { + "epoch": 8.651256732495511, + "grad_norm": 13.43887710571289, + "learning_rate": 8.647755834829444e-06, + "loss": 6.5674, + "step": 96375 + }, + { + "epoch": 8.653500897666069, + "grad_norm": 13.363248825073242, + "learning_rate": 8.65e-06, + "loss": 6.6727, + "step": 96400 + }, + { + "epoch": 8.655745062836624, + "grad_norm": 11.51230239868164, + "learning_rate": 8.652244165170558e-06, + "loss": 6.4661, + "step": 96425 + }, + { + "epoch": 8.657989228007182, + "grad_norm": 17.58278465270996, + "learning_rate": 8.654488330341113e-06, + "loss": 6.5922, + "step": 96450 + }, + { + "epoch": 8.660233393177737, + "grad_norm": 15.741951942443848, + "learning_rate": 8.656732495511671e-06, + "loss": 6.5293, + "step": 96475 + }, + { + "epoch": 8.662477558348295, + "grad_norm": 12.948196411132812, + "learning_rate": 8.658976660682227e-06, + "loss": 6.6758, + "step": 96500 + }, + { + "epoch": 8.66472172351885, + "grad_norm": 13.311129570007324, + "learning_rate": 8.661220825852782e-06, + "loss": 6.6194, + "step": 96525 + }, + { + "epoch": 8.666965888689408, + "grad_norm": 12.18950080871582, + "learning_rate": 8.66346499102334e-06, + "loss": 6.6665, + "step": 96550 + }, + { + "epoch": 8.669210053859963, + "grad_norm": 11.101162910461426, + "learning_rate": 8.665709156193897e-06, + "loss": 6.7803, + "step": 96575 + }, + { + "epoch": 8.67145421903052, + "grad_norm": 14.891246795654297, + "learning_rate": 8.667953321364453e-06, + "loss": 6.5284, + "step": 96600 + }, + { + "epoch": 8.673698384201078, + "grad_norm": 12.910101890563965, + "learning_rate": 8.670197486535009e-06, + "loss": 6.5605, + "step": 96625 + }, + { + "epoch": 8.675942549371634, + "grad_norm": 16.24932289123535, + "learning_rate": 8.672441651705566e-06, + "loss": 6.3793, + "step": 96650 + }, + { + "epoch": 8.678186714542191, + "grad_norm": 14.233362197875977, + "learning_rate": 8.674685816876122e-06, + "loss": 6.5896, + "step": 96675 + }, + { + "epoch": 8.680430879712747, + "grad_norm": 10.811127662658691, + "learning_rate": 8.67692998204668e-06, + "loss": 6.5131, + "step": 96700 + }, + { + "epoch": 8.682675044883304, + "grad_norm": 12.297187805175781, + "learning_rate": 8.679174147217237e-06, + "loss": 6.5946, + "step": 96725 + }, + { + "epoch": 8.68491921005386, + "grad_norm": 12.703540802001953, + "learning_rate": 8.681418312387793e-06, + "loss": 6.7472, + "step": 96750 + }, + { + "epoch": 8.687163375224417, + "grad_norm": 12.69092082977295, + "learning_rate": 8.683662477558349e-06, + "loss": 6.5841, + "step": 96775 + }, + { + "epoch": 8.689407540394972, + "grad_norm": 12.74277114868164, + "learning_rate": 8.685906642728904e-06, + "loss": 6.731, + "step": 96800 + }, + { + "epoch": 8.69165170556553, + "grad_norm": 12.744725227355957, + "learning_rate": 8.688150807899462e-06, + "loss": 6.6995, + "step": 96825 + }, + { + "epoch": 8.693895870736085, + "grad_norm": 12.013405799865723, + "learning_rate": 8.69039497307002e-06, + "loss": 6.7016, + "step": 96850 + }, + { + "epoch": 8.696140035906643, + "grad_norm": 10.605201721191406, + "learning_rate": 8.692639138240575e-06, + "loss": 6.8346, + "step": 96875 + }, + { + "epoch": 8.6983842010772, + "grad_norm": 14.60102367401123, + "learning_rate": 8.69488330341113e-06, + "loss": 6.7878, + "step": 96900 + }, + { + "epoch": 8.700628366247756, + "grad_norm": 11.2462158203125, + "learning_rate": 8.697127468581688e-06, + "loss": 6.38, + "step": 96925 + }, + { + "epoch": 8.702872531418313, + "grad_norm": 12.36123275756836, + "learning_rate": 8.699371633752246e-06, + "loss": 6.6192, + "step": 96950 + }, + { + "epoch": 8.705116696588869, + "grad_norm": 14.337181091308594, + "learning_rate": 8.701615798922801e-06, + "loss": 6.6637, + "step": 96975 + }, + { + "epoch": 8.707360861759426, + "grad_norm": 14.243643760681152, + "learning_rate": 8.703859964093359e-06, + "loss": 6.825, + "step": 97000 + }, + { + "epoch": 8.709605026929982, + "grad_norm": 12.404500961303711, + "learning_rate": 8.706104129263915e-06, + "loss": 6.66, + "step": 97025 + }, + { + "epoch": 8.711849192100539, + "grad_norm": 17.3852596282959, + "learning_rate": 8.70834829443447e-06, + "loss": 6.8311, + "step": 97050 + }, + { + "epoch": 8.714093357271095, + "grad_norm": 20.11705780029297, + "learning_rate": 8.710592459605028e-06, + "loss": 6.8898, + "step": 97075 + }, + { + "epoch": 8.716337522441652, + "grad_norm": 14.617064476013184, + "learning_rate": 8.712836624775585e-06, + "loss": 6.786, + "step": 97100 + }, + { + "epoch": 8.718581687612208, + "grad_norm": 14.173860549926758, + "learning_rate": 8.715080789946141e-06, + "loss": 6.9034, + "step": 97125 + }, + { + "epoch": 8.720825852782765, + "grad_norm": 13.324385643005371, + "learning_rate": 8.717324955116697e-06, + "loss": 6.6055, + "step": 97150 + }, + { + "epoch": 8.723070017953322, + "grad_norm": 12.581557273864746, + "learning_rate": 8.719569120287253e-06, + "loss": 6.7813, + "step": 97175 + }, + { + "epoch": 8.725314183123878, + "grad_norm": 11.567113876342773, + "learning_rate": 8.72181328545781e-06, + "loss": 6.3948, + "step": 97200 + }, + { + "epoch": 8.727558348294435, + "grad_norm": 15.26741886138916, + "learning_rate": 8.724057450628368e-06, + "loss": 6.5176, + "step": 97225 + }, + { + "epoch": 8.72980251346499, + "grad_norm": 14.1632719039917, + "learning_rate": 8.726301615798923e-06, + "loss": 6.6699, + "step": 97250 + }, + { + "epoch": 8.732046678635548, + "grad_norm": 14.388888359069824, + "learning_rate": 8.72854578096948e-06, + "loss": 6.8423, + "step": 97275 + }, + { + "epoch": 8.734290843806104, + "grad_norm": 12.597354888916016, + "learning_rate": 8.730789946140037e-06, + "loss": 6.7429, + "step": 97300 + }, + { + "epoch": 8.736535008976661, + "grad_norm": 12.585145950317383, + "learning_rate": 8.733034111310592e-06, + "loss": 6.71, + "step": 97325 + }, + { + "epoch": 8.738779174147217, + "grad_norm": 11.766454696655273, + "learning_rate": 8.73527827648115e-06, + "loss": 6.803, + "step": 97350 + }, + { + "epoch": 8.741023339317774, + "grad_norm": 14.326452255249023, + "learning_rate": 8.737522441651707e-06, + "loss": 6.4942, + "step": 97375 + }, + { + "epoch": 8.74326750448833, + "grad_norm": 11.654810905456543, + "learning_rate": 8.739766606822263e-06, + "loss": 6.9157, + "step": 97400 + }, + { + "epoch": 8.745511669658887, + "grad_norm": 16.095069885253906, + "learning_rate": 8.742010771992819e-06, + "loss": 6.3399, + "step": 97425 + }, + { + "epoch": 8.747755834829443, + "grad_norm": 14.289933204650879, + "learning_rate": 8.744254937163376e-06, + "loss": 6.5905, + "step": 97450 + }, + { + "epoch": 8.75, + "grad_norm": 13.174484252929688, + "learning_rate": 8.746499102333934e-06, + "loss": 6.491, + "step": 97475 + }, + { + "epoch": 8.752244165170557, + "grad_norm": 16.19005584716797, + "learning_rate": 8.74874326750449e-06, + "loss": 6.6877, + "step": 97500 + }, + { + "epoch": 8.754488330341113, + "grad_norm": 14.612138748168945, + "learning_rate": 8.750987432675045e-06, + "loss": 6.7518, + "step": 97525 + }, + { + "epoch": 8.75673249551167, + "grad_norm": 9.763134956359863, + "learning_rate": 8.753231597845601e-06, + "loss": 6.6871, + "step": 97550 + }, + { + "epoch": 8.758976660682226, + "grad_norm": 12.208136558532715, + "learning_rate": 8.755475763016159e-06, + "loss": 6.5568, + "step": 97575 + }, + { + "epoch": 8.761220825852783, + "grad_norm": 13.410249710083008, + "learning_rate": 8.757719928186716e-06, + "loss": 6.8326, + "step": 97600 + }, + { + "epoch": 8.763464991023339, + "grad_norm": 14.165125846862793, + "learning_rate": 8.759964093357272e-06, + "loss": 6.6384, + "step": 97625 + }, + { + "epoch": 8.765709156193896, + "grad_norm": 12.027939796447754, + "learning_rate": 8.762208258527828e-06, + "loss": 6.5756, + "step": 97650 + }, + { + "epoch": 8.767953321364452, + "grad_norm": 10.207423210144043, + "learning_rate": 8.764452423698385e-06, + "loss": 6.6125, + "step": 97675 + }, + { + "epoch": 8.77019748653501, + "grad_norm": 12.998360633850098, + "learning_rate": 8.76669658886894e-06, + "loss": 6.7821, + "step": 97700 + }, + { + "epoch": 8.772441651705565, + "grad_norm": 13.195469856262207, + "learning_rate": 8.768940754039498e-06, + "loss": 6.915, + "step": 97725 + }, + { + "epoch": 8.774685816876122, + "grad_norm": 11.232208251953125, + "learning_rate": 8.771184919210056e-06, + "loss": 6.4976, + "step": 97750 + }, + { + "epoch": 8.776929982046678, + "grad_norm": 9.889286994934082, + "learning_rate": 8.773429084380611e-06, + "loss": 6.8241, + "step": 97775 + }, + { + "epoch": 8.779174147217235, + "grad_norm": 12.330528259277344, + "learning_rate": 8.775673249551167e-06, + "loss": 6.6187, + "step": 97800 + }, + { + "epoch": 8.781418312387792, + "grad_norm": 12.047918319702148, + "learning_rate": 8.777917414721725e-06, + "loss": 6.6911, + "step": 97825 + }, + { + "epoch": 8.783662477558348, + "grad_norm": 14.168540954589844, + "learning_rate": 8.78016157989228e-06, + "loss": 6.6102, + "step": 97850 + }, + { + "epoch": 8.785906642728905, + "grad_norm": 11.96166706085205, + "learning_rate": 8.782405745062838e-06, + "loss": 6.6651, + "step": 97875 + }, + { + "epoch": 8.788150807899461, + "grad_norm": 11.796030044555664, + "learning_rate": 8.784649910233394e-06, + "loss": 6.6607, + "step": 97900 + }, + { + "epoch": 8.790394973070018, + "grad_norm": 14.147052764892578, + "learning_rate": 8.78689407540395e-06, + "loss": 6.8095, + "step": 97925 + }, + { + "epoch": 8.792639138240574, + "grad_norm": 14.755071640014648, + "learning_rate": 8.789138240574507e-06, + "loss": 6.5489, + "step": 97950 + }, + { + "epoch": 8.794883303411131, + "grad_norm": 11.247925758361816, + "learning_rate": 8.791382405745064e-06, + "loss": 6.6575, + "step": 97975 + }, + { + "epoch": 8.797127468581687, + "grad_norm": 9.97737979888916, + "learning_rate": 8.79362657091562e-06, + "loss": 6.6305, + "step": 98000 + }, + { + "epoch": 8.799371633752244, + "grad_norm": 12.060519218444824, + "learning_rate": 8.795870736086176e-06, + "loss": 6.7016, + "step": 98025 + }, + { + "epoch": 8.8016157989228, + "grad_norm": 11.615440368652344, + "learning_rate": 8.798114901256733e-06, + "loss": 6.6656, + "step": 98050 + }, + { + "epoch": 8.803859964093357, + "grad_norm": 17.023033142089844, + "learning_rate": 8.80035906642729e-06, + "loss": 6.5783, + "step": 98075 + }, + { + "epoch": 8.806104129263915, + "grad_norm": 16.70408821105957, + "learning_rate": 8.802603231597847e-06, + "loss": 6.8479, + "step": 98100 + }, + { + "epoch": 8.80834829443447, + "grad_norm": 11.629545211791992, + "learning_rate": 8.804847396768404e-06, + "loss": 6.7246, + "step": 98125 + }, + { + "epoch": 8.810592459605028, + "grad_norm": 15.934242248535156, + "learning_rate": 8.80709156193896e-06, + "loss": 6.4299, + "step": 98150 + }, + { + "epoch": 8.812836624775583, + "grad_norm": 11.630258560180664, + "learning_rate": 8.809335727109516e-06, + "loss": 6.526, + "step": 98175 + }, + { + "epoch": 8.81508078994614, + "grad_norm": 14.702563285827637, + "learning_rate": 8.811579892280071e-06, + "loss": 6.7863, + "step": 98200 + }, + { + "epoch": 8.817324955116696, + "grad_norm": 14.500388145446777, + "learning_rate": 8.813824057450629e-06, + "loss": 6.7782, + "step": 98225 + }, + { + "epoch": 8.819569120287253, + "grad_norm": 16.682579040527344, + "learning_rate": 8.816068222621186e-06, + "loss": 6.8283, + "step": 98250 + }, + { + "epoch": 8.821813285457809, + "grad_norm": 11.073346138000488, + "learning_rate": 8.818312387791742e-06, + "loss": 6.8541, + "step": 98275 + }, + { + "epoch": 8.824057450628366, + "grad_norm": 12.065454483032227, + "learning_rate": 8.820556552962298e-06, + "loss": 6.745, + "step": 98300 + }, + { + "epoch": 8.826301615798922, + "grad_norm": 15.44114875793457, + "learning_rate": 8.822800718132855e-06, + "loss": 6.6839, + "step": 98325 + }, + { + "epoch": 8.82854578096948, + "grad_norm": 12.062435150146484, + "learning_rate": 8.825044883303411e-06, + "loss": 6.5991, + "step": 98350 + }, + { + "epoch": 8.830789946140037, + "grad_norm": 12.201584815979004, + "learning_rate": 8.827289048473969e-06, + "loss": 6.6515, + "step": 98375 + }, + { + "epoch": 8.833034111310592, + "grad_norm": 10.548948287963867, + "learning_rate": 8.829533213644524e-06, + "loss": 6.6214, + "step": 98400 + }, + { + "epoch": 8.83527827648115, + "grad_norm": 12.091026306152344, + "learning_rate": 8.831777378815082e-06, + "loss": 6.7324, + "step": 98425 + }, + { + "epoch": 8.837522441651705, + "grad_norm": 16.115028381347656, + "learning_rate": 8.834021543985638e-06, + "loss": 6.6399, + "step": 98450 + }, + { + "epoch": 8.839766606822263, + "grad_norm": 10.398847579956055, + "learning_rate": 8.836265709156195e-06, + "loss": 6.7981, + "step": 98475 + }, + { + "epoch": 8.842010771992818, + "grad_norm": 14.405200958251953, + "learning_rate": 8.838509874326752e-06, + "loss": 6.6231, + "step": 98500 + }, + { + "epoch": 8.844254937163376, + "grad_norm": 11.27393913269043, + "learning_rate": 8.840754039497308e-06, + "loss": 6.6038, + "step": 98525 + }, + { + "epoch": 8.846499102333931, + "grad_norm": 13.461312294006348, + "learning_rate": 8.842998204667864e-06, + "loss": 6.7386, + "step": 98550 + }, + { + "epoch": 8.848743267504489, + "grad_norm": 13.088689804077148, + "learning_rate": 8.84524236983842e-06, + "loss": 6.5646, + "step": 98575 + }, + { + "epoch": 8.850987432675044, + "grad_norm": 15.858865737915039, + "learning_rate": 8.847486535008977e-06, + "loss": 6.6586, + "step": 98600 + }, + { + "epoch": 8.853231597845602, + "grad_norm": 11.645283699035645, + "learning_rate": 8.849730700179535e-06, + "loss": 6.5277, + "step": 98625 + }, + { + "epoch": 8.855475763016159, + "grad_norm": 12.657540321350098, + "learning_rate": 8.85197486535009e-06, + "loss": 6.5982, + "step": 98650 + }, + { + "epoch": 8.857719928186714, + "grad_norm": 13.131637573242188, + "learning_rate": 8.854219030520646e-06, + "loss": 6.6282, + "step": 98675 + }, + { + "epoch": 8.859964093357272, + "grad_norm": 14.544132232666016, + "learning_rate": 8.856463195691204e-06, + "loss": 6.5945, + "step": 98700 + }, + { + "epoch": 8.862208258527827, + "grad_norm": 13.914116859436035, + "learning_rate": 8.85870736086176e-06, + "loss": 6.5488, + "step": 98725 + }, + { + "epoch": 8.864452423698385, + "grad_norm": 12.105886459350586, + "learning_rate": 8.860951526032317e-06, + "loss": 6.6922, + "step": 98750 + }, + { + "epoch": 8.86669658886894, + "grad_norm": 10.72050952911377, + "learning_rate": 8.863195691202873e-06, + "loss": 6.8052, + "step": 98775 + }, + { + "epoch": 8.868940754039498, + "grad_norm": 12.20400619506836, + "learning_rate": 8.86543985637343e-06, + "loss": 6.77, + "step": 98800 + }, + { + "epoch": 8.871184919210053, + "grad_norm": 15.416425704956055, + "learning_rate": 8.867684021543986e-06, + "loss": 6.5719, + "step": 98825 + }, + { + "epoch": 8.87342908438061, + "grad_norm": 12.236030578613281, + "learning_rate": 8.869928186714543e-06, + "loss": 6.594, + "step": 98850 + }, + { + "epoch": 8.875673249551166, + "grad_norm": 11.431305885314941, + "learning_rate": 8.8721723518851e-06, + "loss": 6.6694, + "step": 98875 + }, + { + "epoch": 8.877917414721724, + "grad_norm": 11.865811347961426, + "learning_rate": 8.874416517055657e-06, + "loss": 6.5213, + "step": 98900 + }, + { + "epoch": 8.88016157989228, + "grad_norm": 10.582343101501465, + "learning_rate": 8.876660682226212e-06, + "loss": 6.7344, + "step": 98925 + }, + { + "epoch": 8.882405745062837, + "grad_norm": 11.931846618652344, + "learning_rate": 8.878904847396768e-06, + "loss": 6.7278, + "step": 98950 + }, + { + "epoch": 8.884649910233394, + "grad_norm": 12.310480117797852, + "learning_rate": 8.881149012567326e-06, + "loss": 6.2867, + "step": 98975 + }, + { + "epoch": 8.88689407540395, + "grad_norm": 13.565872192382812, + "learning_rate": 8.883393177737883e-06, + "loss": 6.6052, + "step": 99000 + }, + { + "epoch": 8.889138240574507, + "grad_norm": 13.894827842712402, + "learning_rate": 8.885637342908439e-06, + "loss": 6.5232, + "step": 99025 + }, + { + "epoch": 8.891382405745063, + "grad_norm": 11.576489448547363, + "learning_rate": 8.887881508078995e-06, + "loss": 6.8384, + "step": 99050 + }, + { + "epoch": 8.89362657091562, + "grad_norm": 12.599846839904785, + "learning_rate": 8.890125673249552e-06, + "loss": 6.6793, + "step": 99075 + }, + { + "epoch": 8.895870736086176, + "grad_norm": 11.540206909179688, + "learning_rate": 8.892369838420108e-06, + "loss": 6.678, + "step": 99100 + }, + { + "epoch": 8.898114901256733, + "grad_norm": 13.214176177978516, + "learning_rate": 8.894614003590665e-06, + "loss": 6.6114, + "step": 99125 + }, + { + "epoch": 8.900359066427288, + "grad_norm": 14.207098007202148, + "learning_rate": 8.896858168761223e-06, + "loss": 6.5871, + "step": 99150 + }, + { + "epoch": 8.902603231597846, + "grad_norm": 10.896808624267578, + "learning_rate": 8.899102333931779e-06, + "loss": 6.6624, + "step": 99175 + }, + { + "epoch": 8.904847396768401, + "grad_norm": 12.708535194396973, + "learning_rate": 8.901346499102334e-06, + "loss": 6.7, + "step": 99200 + }, + { + "epoch": 8.907091561938959, + "grad_norm": 13.366960525512695, + "learning_rate": 8.90359066427289e-06, + "loss": 6.6959, + "step": 99225 + }, + { + "epoch": 8.909335727109514, + "grad_norm": 11.95833683013916, + "learning_rate": 8.905834829443448e-06, + "loss": 6.7161, + "step": 99250 + }, + { + "epoch": 8.911579892280072, + "grad_norm": 11.887898445129395, + "learning_rate": 8.908078994614005e-06, + "loss": 6.6812, + "step": 99275 + }, + { + "epoch": 8.91382405745063, + "grad_norm": 12.443777084350586, + "learning_rate": 8.91032315978456e-06, + "loss": 6.7669, + "step": 99300 + }, + { + "epoch": 8.916068222621185, + "grad_norm": 12.869447708129883, + "learning_rate": 8.912567324955117e-06, + "loss": 6.4693, + "step": 99325 + }, + { + "epoch": 8.918312387791742, + "grad_norm": 12.58298397064209, + "learning_rate": 8.914811490125674e-06, + "loss": 6.6968, + "step": 99350 + }, + { + "epoch": 8.920556552962298, + "grad_norm": 11.942317962646484, + "learning_rate": 8.917055655296231e-06, + "loss": 6.6341, + "step": 99375 + }, + { + "epoch": 8.922800718132855, + "grad_norm": 12.792378425598145, + "learning_rate": 8.919299820466787e-06, + "loss": 6.7359, + "step": 99400 + }, + { + "epoch": 8.92504488330341, + "grad_norm": 15.224624633789062, + "learning_rate": 8.921543985637343e-06, + "loss": 6.9138, + "step": 99425 + }, + { + "epoch": 8.927289048473968, + "grad_norm": 17.34048843383789, + "learning_rate": 8.9237881508079e-06, + "loss": 6.6485, + "step": 99450 + }, + { + "epoch": 8.929533213644524, + "grad_norm": 13.27348804473877, + "learning_rate": 8.926032315978456e-06, + "loss": 6.7052, + "step": 99475 + }, + { + "epoch": 8.931777378815081, + "grad_norm": 14.627366065979004, + "learning_rate": 8.928276481149014e-06, + "loss": 6.5415, + "step": 99500 + }, + { + "epoch": 8.934021543985637, + "grad_norm": 16.402523040771484, + "learning_rate": 8.930520646319571e-06, + "loss": 6.5803, + "step": 99525 + }, + { + "epoch": 8.936265709156194, + "grad_norm": 12.163031578063965, + "learning_rate": 8.932764811490127e-06, + "loss": 6.603, + "step": 99550 + }, + { + "epoch": 8.938509874326751, + "grad_norm": 14.005494117736816, + "learning_rate": 8.935008976660683e-06, + "loss": 6.516, + "step": 99575 + }, + { + "epoch": 8.940754039497307, + "grad_norm": 14.187764167785645, + "learning_rate": 8.937253141831238e-06, + "loss": 6.733, + "step": 99600 + }, + { + "epoch": 8.942998204667864, + "grad_norm": 13.61208724975586, + "learning_rate": 8.939497307001796e-06, + "loss": 6.5445, + "step": 99625 + }, + { + "epoch": 8.94524236983842, + "grad_norm": 11.296825408935547, + "learning_rate": 8.941741472172353e-06, + "loss": 6.3456, + "step": 99650 + }, + { + "epoch": 8.947486535008977, + "grad_norm": 12.624124526977539, + "learning_rate": 8.943985637342909e-06, + "loss": 6.5643, + "step": 99675 + }, + { + "epoch": 8.949730700179533, + "grad_norm": 13.939628601074219, + "learning_rate": 8.946229802513465e-06, + "loss": 6.5982, + "step": 99700 + }, + { + "epoch": 8.95197486535009, + "grad_norm": 13.203315734863281, + "learning_rate": 8.948473967684022e-06, + "loss": 6.8269, + "step": 99725 + }, + { + "epoch": 8.954219030520646, + "grad_norm": 13.751965522766113, + "learning_rate": 8.950628366247756e-06, + "loss": 6.7096, + "step": 99750 + }, + { + "epoch": 8.956463195691203, + "grad_norm": 14.459602355957031, + "learning_rate": 8.952872531418312e-06, + "loss": 6.9104, + "step": 99775 + }, + { + "epoch": 8.958707360861759, + "grad_norm": 15.753278732299805, + "learning_rate": 8.95511669658887e-06, + "loss": 6.5286, + "step": 99800 + }, + { + "epoch": 8.960951526032316, + "grad_norm": 10.368764877319336, + "learning_rate": 8.957360861759427e-06, + "loss": 6.5399, + "step": 99825 + }, + { + "epoch": 8.963195691202873, + "grad_norm": 9.729665756225586, + "learning_rate": 8.959605026929983e-06, + "loss": 6.726, + "step": 99850 + }, + { + "epoch": 8.965439856373429, + "grad_norm": 12.435331344604492, + "learning_rate": 8.961849192100539e-06, + "loss": 6.754, + "step": 99875 + }, + { + "epoch": 8.967684021543986, + "grad_norm": 13.341442108154297, + "learning_rate": 8.964093357271096e-06, + "loss": 6.7306, + "step": 99900 + }, + { + "epoch": 8.969928186714542, + "grad_norm": 13.610530853271484, + "learning_rate": 8.966337522441652e-06, + "loss": 6.4367, + "step": 99925 + }, + { + "epoch": 8.9721723518851, + "grad_norm": 11.093850135803223, + "learning_rate": 8.96858168761221e-06, + "loss": 6.5707, + "step": 99950 + }, + { + "epoch": 8.974416517055655, + "grad_norm": 10.222867012023926, + "learning_rate": 8.970825852782765e-06, + "loss": 6.4909, + "step": 99975 + }, + { + "epoch": 8.976660682226212, + "grad_norm": 19.479663848876953, + "learning_rate": 8.973070017953323e-06, + "loss": 6.7235, + "step": 100000 + }, + { + "epoch": 8.978904847396768, + "grad_norm": 14.102149963378906, + "learning_rate": 8.975314183123878e-06, + "loss": 6.5276, + "step": 100025 + }, + { + "epoch": 8.981149012567325, + "grad_norm": 12.58425521850586, + "learning_rate": 8.977558348294436e-06, + "loss": 6.5867, + "step": 100050 + }, + { + "epoch": 8.98339317773788, + "grad_norm": 14.809758186340332, + "learning_rate": 8.979802513464992e-06, + "loss": 6.7164, + "step": 100075 + }, + { + "epoch": 8.985637342908438, + "grad_norm": 11.487586975097656, + "learning_rate": 8.982046678635549e-06, + "loss": 6.3121, + "step": 100100 + }, + { + "epoch": 8.987881508078996, + "grad_norm": 11.479657173156738, + "learning_rate": 8.984290843806105e-06, + "loss": 6.4592, + "step": 100125 + }, + { + "epoch": 8.990125673249551, + "grad_norm": 10.80176830291748, + "learning_rate": 8.98653500897666e-06, + "loss": 6.8035, + "step": 100150 + }, + { + "epoch": 8.992369838420109, + "grad_norm": 10.843179702758789, + "learning_rate": 8.988779174147218e-06, + "loss": 6.3818, + "step": 100175 + }, + { + "epoch": 8.994614003590664, + "grad_norm": 13.138623237609863, + "learning_rate": 8.991023339317775e-06, + "loss": 6.7386, + "step": 100200 + }, + { + "epoch": 8.996858168761221, + "grad_norm": 25.281129837036133, + "learning_rate": 8.993267504488331e-06, + "loss": 6.566, + "step": 100225 + }, + { + "epoch": 8.999102333931777, + "grad_norm": 12.619494438171387, + "learning_rate": 8.995511669658887e-06, + "loss": 6.7472, + "step": 100250 + }, + { + "epoch": 9.0, + "eval_accuracy": 0.07372119221736391, + "eval_f1_macro": 0.0025567619976640844, + "eval_f1_micro": 0.07372119221736391, + "eval_f1_weighted": 0.03052229568240256, + "eval_loss": 7.368748188018799, + "eval_precision_macro": 0.002455985920688981, + "eval_precision_micro": 0.07372119221736391, + "eval_precision_weighted": 0.02461576798868031, + "eval_recall_macro": 0.005724603394285497, + "eval_recall_micro": 0.07372119221736391, + "eval_recall_weighted": 0.07372119221736391, + "eval_runtime": 128.9758, + "eval_samples_per_second": 406.068, + "eval_steps_per_second": 12.692, + "step": 100260 + }, + { + "epoch": 9.001346499102334, + "grad_norm": 10.510261535644531, + "learning_rate": 8.997755834829444e-06, + "loss": 6.3479, + "step": 100275 + }, + { + "epoch": 9.00359066427289, + "grad_norm": 12.156265258789062, + "learning_rate": 9e-06, + "loss": 6.3743, + "step": 100300 + }, + { + "epoch": 9.005834829443447, + "grad_norm": 11.077884674072266, + "learning_rate": 9.002244165170558e-06, + "loss": 6.3311, + "step": 100325 + }, + { + "epoch": 9.008078994614003, + "grad_norm": 12.203634262084961, + "learning_rate": 9.004488330341113e-06, + "loss": 6.489, + "step": 100350 + }, + { + "epoch": 9.01032315978456, + "grad_norm": 11.785304069519043, + "learning_rate": 9.006732495511671e-06, + "loss": 6.3616, + "step": 100375 + }, + { + "epoch": 9.012567324955116, + "grad_norm": 13.85549545288086, + "learning_rate": 9.008976660682227e-06, + "loss": 6.2967, + "step": 100400 + }, + { + "epoch": 9.014811490125673, + "grad_norm": 15.528587341308594, + "learning_rate": 9.011220825852784e-06, + "loss": 6.5002, + "step": 100425 + }, + { + "epoch": 9.01705565529623, + "grad_norm": 11.213828086853027, + "learning_rate": 9.01346499102334e-06, + "loss": 6.4868, + "step": 100450 + }, + { + "epoch": 9.019299820466786, + "grad_norm": 11.514039993286133, + "learning_rate": 9.015709156193897e-06, + "loss": 6.2628, + "step": 100475 + }, + { + "epoch": 9.021543985637344, + "grad_norm": 15.143702507019043, + "learning_rate": 9.017953321364453e-06, + "loss": 6.5082, + "step": 100500 + }, + { + "epoch": 9.0237881508079, + "grad_norm": 10.743465423583984, + "learning_rate": 9.020197486535009e-06, + "loss": 6.246, + "step": 100525 + }, + { + "epoch": 9.026032315978457, + "grad_norm": 12.181488037109375, + "learning_rate": 9.022441651705566e-06, + "loss": 6.4814, + "step": 100550 + }, + { + "epoch": 9.028276481149012, + "grad_norm": 10.768160820007324, + "learning_rate": 9.024685816876124e-06, + "loss": 6.6522, + "step": 100575 + }, + { + "epoch": 9.03052064631957, + "grad_norm": 13.921645164489746, + "learning_rate": 9.02692998204668e-06, + "loss": 6.2277, + "step": 100600 + }, + { + "epoch": 9.032764811490125, + "grad_norm": 11.472830772399902, + "learning_rate": 9.029174147217235e-06, + "loss": 6.4833, + "step": 100625 + }, + { + "epoch": 9.035008976660682, + "grad_norm": 15.354777336120605, + "learning_rate": 9.031418312387793e-06, + "loss": 6.2515, + "step": 100650 + }, + { + "epoch": 9.037253141831238, + "grad_norm": 11.249044418334961, + "learning_rate": 9.033662477558349e-06, + "loss": 6.4424, + "step": 100675 + }, + { + "epoch": 9.039497307001795, + "grad_norm": 13.209831237792969, + "learning_rate": 9.035906642728906e-06, + "loss": 6.4146, + "step": 100700 + }, + { + "epoch": 9.041741472172351, + "grad_norm": 11.421797752380371, + "learning_rate": 9.038150807899462e-06, + "loss": 6.2452, + "step": 100725 + }, + { + "epoch": 9.043985637342908, + "grad_norm": 11.888664245605469, + "learning_rate": 9.04039497307002e-06, + "loss": 6.4108, + "step": 100750 + }, + { + "epoch": 9.046229802513466, + "grad_norm": 16.1672420501709, + "learning_rate": 9.042639138240575e-06, + "loss": 6.4897, + "step": 100775 + }, + { + "epoch": 9.048473967684021, + "grad_norm": 11.7051362991333, + "learning_rate": 9.04488330341113e-06, + "loss": 6.1394, + "step": 100800 + }, + { + "epoch": 9.050718132854579, + "grad_norm": 12.381804466247559, + "learning_rate": 9.047127468581688e-06, + "loss": 6.5234, + "step": 100825 + }, + { + "epoch": 9.052962298025134, + "grad_norm": 12.526471138000488, + "learning_rate": 9.049371633752246e-06, + "loss": 6.4039, + "step": 100850 + }, + { + "epoch": 9.055206463195692, + "grad_norm": 12.94444465637207, + "learning_rate": 9.051615798922802e-06, + "loss": 6.5746, + "step": 100875 + }, + { + "epoch": 9.057450628366247, + "grad_norm": 13.056926727294922, + "learning_rate": 9.053859964093357e-06, + "loss": 6.0809, + "step": 100900 + }, + { + "epoch": 9.059694793536805, + "grad_norm": 13.626959800720215, + "learning_rate": 9.056104129263915e-06, + "loss": 6.2423, + "step": 100925 + }, + { + "epoch": 9.06193895870736, + "grad_norm": 12.378629684448242, + "learning_rate": 9.058348294434472e-06, + "loss": 6.3727, + "step": 100950 + }, + { + "epoch": 9.064183123877918, + "grad_norm": 12.176326751708984, + "learning_rate": 9.060592459605028e-06, + "loss": 6.7308, + "step": 100975 + }, + { + "epoch": 9.066427289048473, + "grad_norm": 12.463343620300293, + "learning_rate": 9.062836624775584e-06, + "loss": 6.4037, + "step": 101000 + }, + { + "epoch": 9.06867145421903, + "grad_norm": 13.44317626953125, + "learning_rate": 9.065080789946141e-06, + "loss": 6.5568, + "step": 101025 + }, + { + "epoch": 9.070915619389588, + "grad_norm": 12.813921928405762, + "learning_rate": 9.067324955116697e-06, + "loss": 6.4385, + "step": 101050 + }, + { + "epoch": 9.073159784560143, + "grad_norm": 16.297863006591797, + "learning_rate": 9.069569120287254e-06, + "loss": 6.3643, + "step": 101075 + }, + { + "epoch": 9.0754039497307, + "grad_norm": 12.376840591430664, + "learning_rate": 9.07181328545781e-06, + "loss": 6.5124, + "step": 101100 + }, + { + "epoch": 9.077648114901256, + "grad_norm": 10.924262046813965, + "learning_rate": 9.074057450628368e-06, + "loss": 6.4629, + "step": 101125 + }, + { + "epoch": 9.079892280071814, + "grad_norm": 10.859965324401855, + "learning_rate": 9.076301615798923e-06, + "loss": 6.198, + "step": 101150 + }, + { + "epoch": 9.08213644524237, + "grad_norm": 13.788814544677734, + "learning_rate": 9.07854578096948e-06, + "loss": 6.5627, + "step": 101175 + }, + { + "epoch": 9.084380610412927, + "grad_norm": 13.38847827911377, + "learning_rate": 9.080789946140037e-06, + "loss": 6.359, + "step": 101200 + }, + { + "epoch": 9.086624775583482, + "grad_norm": 10.859132766723633, + "learning_rate": 9.083034111310594e-06, + "loss": 6.3238, + "step": 101225 + }, + { + "epoch": 9.08886894075404, + "grad_norm": 13.158913612365723, + "learning_rate": 9.08527827648115e-06, + "loss": 6.4464, + "step": 101250 + }, + { + "epoch": 9.091113105924595, + "grad_norm": 12.283122062683105, + "learning_rate": 9.087522441651706e-06, + "loss": 6.3247, + "step": 101275 + }, + { + "epoch": 9.093357271095153, + "grad_norm": 9.849767684936523, + "learning_rate": 9.089766606822263e-06, + "loss": 6.4371, + "step": 101300 + }, + { + "epoch": 9.09560143626571, + "grad_norm": 15.05694580078125, + "learning_rate": 9.092010771992819e-06, + "loss": 6.4945, + "step": 101325 + }, + { + "epoch": 9.097845601436266, + "grad_norm": 13.79715633392334, + "learning_rate": 9.094254937163376e-06, + "loss": 6.2929, + "step": 101350 + }, + { + "epoch": 9.100089766606823, + "grad_norm": 11.904982566833496, + "learning_rate": 9.096499102333932e-06, + "loss": 6.2706, + "step": 101375 + }, + { + "epoch": 9.102333931777379, + "grad_norm": 11.828036308288574, + "learning_rate": 9.09874326750449e-06, + "loss": 6.7329, + "step": 101400 + }, + { + "epoch": 9.104578096947936, + "grad_norm": 10.806059837341309, + "learning_rate": 9.100987432675045e-06, + "loss": 6.4417, + "step": 101425 + }, + { + "epoch": 9.106822262118492, + "grad_norm": 12.144794464111328, + "learning_rate": 9.103231597845603e-06, + "loss": 6.5293, + "step": 101450 + }, + { + "epoch": 9.109066427289049, + "grad_norm": 12.365259170532227, + "learning_rate": 9.105475763016159e-06, + "loss": 6.43, + "step": 101475 + }, + { + "epoch": 9.111310592459605, + "grad_norm": 11.921772003173828, + "learning_rate": 9.107719928186716e-06, + "loss": 6.715, + "step": 101500 + }, + { + "epoch": 9.113554757630162, + "grad_norm": 12.305421829223633, + "learning_rate": 9.109964093357272e-06, + "loss": 6.2613, + "step": 101525 + }, + { + "epoch": 9.115798922800717, + "grad_norm": 13.138167381286621, + "learning_rate": 9.112208258527828e-06, + "loss": 6.3744, + "step": 101550 + }, + { + "epoch": 9.118043087971275, + "grad_norm": 16.21400260925293, + "learning_rate": 9.114452423698385e-06, + "loss": 6.5652, + "step": 101575 + }, + { + "epoch": 9.12028725314183, + "grad_norm": 12.93182373046875, + "learning_rate": 9.116696588868943e-06, + "loss": 6.5809, + "step": 101600 + }, + { + "epoch": 9.122531418312388, + "grad_norm": 13.125479698181152, + "learning_rate": 9.118940754039498e-06, + "loss": 6.3871, + "step": 101625 + }, + { + "epoch": 9.124775583482945, + "grad_norm": 14.524245262145996, + "learning_rate": 9.121184919210054e-06, + "loss": 6.5676, + "step": 101650 + }, + { + "epoch": 9.1270197486535, + "grad_norm": 15.742608070373535, + "learning_rate": 9.123429084380612e-06, + "loss": 6.5442, + "step": 101675 + }, + { + "epoch": 9.129263913824058, + "grad_norm": 11.688802719116211, + "learning_rate": 9.125673249551167e-06, + "loss": 6.5033, + "step": 101700 + }, + { + "epoch": 9.131508078994614, + "grad_norm": 11.75794506072998, + "learning_rate": 9.127917414721725e-06, + "loss": 6.3608, + "step": 101725 + }, + { + "epoch": 9.133752244165171, + "grad_norm": 11.398234367370605, + "learning_rate": 9.13016157989228e-06, + "loss": 6.1612, + "step": 101750 + }, + { + "epoch": 9.135996409335727, + "grad_norm": 17.01484489440918, + "learning_rate": 9.132405745062838e-06, + "loss": 6.5147, + "step": 101775 + }, + { + "epoch": 9.138240574506284, + "grad_norm": 12.866793632507324, + "learning_rate": 9.134649910233394e-06, + "loss": 6.3918, + "step": 101800 + }, + { + "epoch": 9.14048473967684, + "grad_norm": 11.540843963623047, + "learning_rate": 9.136804308797128e-06, + "loss": 6.4821, + "step": 101825 + }, + { + "epoch": 9.142728904847397, + "grad_norm": 12.42692756652832, + "learning_rate": 9.139048473967684e-06, + "loss": 6.42, + "step": 101850 + }, + { + "epoch": 9.144973070017953, + "grad_norm": 11.646912574768066, + "learning_rate": 9.141292639138241e-06, + "loss": 6.6006, + "step": 101875 + }, + { + "epoch": 9.14721723518851, + "grad_norm": 11.859711647033691, + "learning_rate": 9.143536804308798e-06, + "loss": 6.0938, + "step": 101900 + }, + { + "epoch": 9.149461400359067, + "grad_norm": 14.009509086608887, + "learning_rate": 9.145780969479354e-06, + "loss": 6.6477, + "step": 101925 + }, + { + "epoch": 9.151705565529623, + "grad_norm": 10.580940246582031, + "learning_rate": 9.14802513464991e-06, + "loss": 6.5079, + "step": 101950 + }, + { + "epoch": 9.15394973070018, + "grad_norm": 15.191463470458984, + "learning_rate": 9.150269299820467e-06, + "loss": 6.5507, + "step": 101975 + }, + { + "epoch": 9.156193895870736, + "grad_norm": 11.240462303161621, + "learning_rate": 9.152513464991025e-06, + "loss": 6.2405, + "step": 102000 + }, + { + "epoch": 9.158438061041293, + "grad_norm": 14.464875221252441, + "learning_rate": 9.15475763016158e-06, + "loss": 6.6814, + "step": 102025 + }, + { + "epoch": 9.160682226211849, + "grad_norm": 13.086214065551758, + "learning_rate": 9.157001795332138e-06, + "loss": 6.4227, + "step": 102050 + }, + { + "epoch": 9.162926391382406, + "grad_norm": 12.462900161743164, + "learning_rate": 9.159245960502694e-06, + "loss": 6.486, + "step": 102075 + }, + { + "epoch": 9.165170556552962, + "grad_norm": 12.717019081115723, + "learning_rate": 9.16149012567325e-06, + "loss": 6.3343, + "step": 102100 + }, + { + "epoch": 9.16741472172352, + "grad_norm": 12.935893058776855, + "learning_rate": 9.163734290843807e-06, + "loss": 6.4301, + "step": 102125 + }, + { + "epoch": 9.169658886894075, + "grad_norm": 11.834382057189941, + "learning_rate": 9.165978456014365e-06, + "loss": 6.4738, + "step": 102150 + }, + { + "epoch": 9.171903052064632, + "grad_norm": 13.71847152709961, + "learning_rate": 9.16822262118492e-06, + "loss": 6.4435, + "step": 102175 + }, + { + "epoch": 9.174147217235188, + "grad_norm": 11.075526237487793, + "learning_rate": 9.170466786355476e-06, + "loss": 6.2438, + "step": 102200 + }, + { + "epoch": 9.176391382405745, + "grad_norm": 11.344983100891113, + "learning_rate": 9.172710951526032e-06, + "loss": 6.4644, + "step": 102225 + }, + { + "epoch": 9.178635547576302, + "grad_norm": 10.612468719482422, + "learning_rate": 9.17495511669659e-06, + "loss": 6.3526, + "step": 102250 + }, + { + "epoch": 9.180879712746858, + "grad_norm": 14.722274780273438, + "learning_rate": 9.177199281867147e-06, + "loss": 6.402, + "step": 102275 + }, + { + "epoch": 9.183123877917415, + "grad_norm": 10.479104042053223, + "learning_rate": 9.179443447037703e-06, + "loss": 6.0346, + "step": 102300 + }, + { + "epoch": 9.185368043087971, + "grad_norm": 12.530048370361328, + "learning_rate": 9.181687612208258e-06, + "loss": 6.5342, + "step": 102325 + }, + { + "epoch": 9.187612208258528, + "grad_norm": 12.11065673828125, + "learning_rate": 9.183931777378816e-06, + "loss": 6.3555, + "step": 102350 + }, + { + "epoch": 9.189856373429084, + "grad_norm": 13.111006736755371, + "learning_rate": 9.186175942549372e-06, + "loss": 6.1828, + "step": 102375 + }, + { + "epoch": 9.192100538599641, + "grad_norm": 12.485138893127441, + "learning_rate": 9.188420107719929e-06, + "loss": 6.5131, + "step": 102400 + }, + { + "epoch": 9.194344703770197, + "grad_norm": 12.619248390197754, + "learning_rate": 9.190664272890487e-06, + "loss": 6.3578, + "step": 102425 + }, + { + "epoch": 9.196588868940754, + "grad_norm": 14.01384162902832, + "learning_rate": 9.192908438061042e-06, + "loss": 6.3851, + "step": 102450 + }, + { + "epoch": 9.19883303411131, + "grad_norm": 11.268627166748047, + "learning_rate": 9.195152603231598e-06, + "loss": 6.5582, + "step": 102475 + }, + { + "epoch": 9.201077199281867, + "grad_norm": 15.917984962463379, + "learning_rate": 9.197396768402156e-06, + "loss": 6.5202, + "step": 102500 + }, + { + "epoch": 9.203321364452425, + "grad_norm": 14.881120681762695, + "learning_rate": 9.199640933572713e-06, + "loss": 6.4799, + "step": 102525 + }, + { + "epoch": 9.20556552962298, + "grad_norm": 15.299811363220215, + "learning_rate": 9.201885098743269e-06, + "loss": 6.4584, + "step": 102550 + }, + { + "epoch": 9.207809694793538, + "grad_norm": 12.1554536819458, + "learning_rate": 9.204129263913825e-06, + "loss": 6.3728, + "step": 102575 + }, + { + "epoch": 9.210053859964093, + "grad_norm": 12.031976699829102, + "learning_rate": 9.20637342908438e-06, + "loss": 6.3731, + "step": 102600 + }, + { + "epoch": 9.21229802513465, + "grad_norm": 11.683928489685059, + "learning_rate": 9.208617594254938e-06, + "loss": 6.5028, + "step": 102625 + }, + { + "epoch": 9.214542190305206, + "grad_norm": 13.618297576904297, + "learning_rate": 9.210861759425495e-06, + "loss": 6.3767, + "step": 102650 + }, + { + "epoch": 9.216786355475763, + "grad_norm": 11.727862358093262, + "learning_rate": 9.213105924596051e-06, + "loss": 6.2699, + "step": 102675 + }, + { + "epoch": 9.219030520646319, + "grad_norm": 12.217309951782227, + "learning_rate": 9.215350089766607e-06, + "loss": 6.3279, + "step": 102700 + }, + { + "epoch": 9.221274685816876, + "grad_norm": 12.016396522521973, + "learning_rate": 9.217594254937164e-06, + "loss": 6.3668, + "step": 102725 + }, + { + "epoch": 9.223518850987432, + "grad_norm": 13.602384567260742, + "learning_rate": 9.21983842010772e-06, + "loss": 6.2044, + "step": 102750 + }, + { + "epoch": 9.22576301615799, + "grad_norm": 12.438458442687988, + "learning_rate": 9.222082585278277e-06, + "loss": 6.3353, + "step": 102775 + }, + { + "epoch": 9.228007181328545, + "grad_norm": 14.560842514038086, + "learning_rate": 9.224326750448835e-06, + "loss": 6.3063, + "step": 102800 + }, + { + "epoch": 9.230251346499102, + "grad_norm": 12.00201416015625, + "learning_rate": 9.22657091561939e-06, + "loss": 6.4904, + "step": 102825 + }, + { + "epoch": 9.23249551166966, + "grad_norm": 15.946187019348145, + "learning_rate": 9.228815080789946e-06, + "loss": 6.3318, + "step": 102850 + }, + { + "epoch": 9.234739676840215, + "grad_norm": 15.11262035369873, + "learning_rate": 9.231059245960504e-06, + "loss": 6.2832, + "step": 102875 + }, + { + "epoch": 9.236983842010773, + "grad_norm": 11.271397590637207, + "learning_rate": 9.23330341113106e-06, + "loss": 6.491, + "step": 102900 + }, + { + "epoch": 9.239228007181328, + "grad_norm": 12.22497272491455, + "learning_rate": 9.235547576301617e-06, + "loss": 6.3943, + "step": 102925 + }, + { + "epoch": 9.241472172351886, + "grad_norm": 12.570024490356445, + "learning_rate": 9.237791741472173e-06, + "loss": 6.4952, + "step": 102950 + }, + { + "epoch": 9.243716337522441, + "grad_norm": 14.506901741027832, + "learning_rate": 9.240035906642729e-06, + "loss": 6.5383, + "step": 102975 + }, + { + "epoch": 9.245960502692999, + "grad_norm": 11.359461784362793, + "learning_rate": 9.242280071813286e-06, + "loss": 6.6167, + "step": 103000 + }, + { + "epoch": 9.248204667863554, + "grad_norm": 12.633959770202637, + "learning_rate": 9.244524236983844e-06, + "loss": 6.2604, + "step": 103025 + }, + { + "epoch": 9.250448833034111, + "grad_norm": 16.005229949951172, + "learning_rate": 9.2467684021544e-06, + "loss": 6.542, + "step": 103050 + }, + { + "epoch": 9.252692998204667, + "grad_norm": 11.657487869262695, + "learning_rate": 9.249012567324955e-06, + "loss": 6.3965, + "step": 103075 + }, + { + "epoch": 9.254937163375224, + "grad_norm": 14.366698265075684, + "learning_rate": 9.251256732495513e-06, + "loss": 6.4174, + "step": 103100 + }, + { + "epoch": 9.257181328545782, + "grad_norm": 11.645773887634277, + "learning_rate": 9.253500897666068e-06, + "loss": 6.195, + "step": 103125 + }, + { + "epoch": 9.259425493716337, + "grad_norm": 13.030241966247559, + "learning_rate": 9.255745062836626e-06, + "loss": 6.4661, + "step": 103150 + }, + { + "epoch": 9.261669658886895, + "grad_norm": 10.756841659545898, + "learning_rate": 9.257989228007183e-06, + "loss": 6.5021, + "step": 103175 + }, + { + "epoch": 9.26391382405745, + "grad_norm": 13.88847827911377, + "learning_rate": 9.260233393177739e-06, + "loss": 6.4601, + "step": 103200 + }, + { + "epoch": 9.266157989228008, + "grad_norm": 12.183505058288574, + "learning_rate": 9.262477558348295e-06, + "loss": 6.2064, + "step": 103225 + }, + { + "epoch": 9.268402154398563, + "grad_norm": 11.842512130737305, + "learning_rate": 9.26472172351885e-06, + "loss": 6.3722, + "step": 103250 + }, + { + "epoch": 9.27064631956912, + "grad_norm": 11.082062721252441, + "learning_rate": 9.266965888689408e-06, + "loss": 6.4615, + "step": 103275 + }, + { + "epoch": 9.272890484739676, + "grad_norm": 16.51767921447754, + "learning_rate": 9.269210053859966e-06, + "loss": 6.3297, + "step": 103300 + }, + { + "epoch": 9.275134649910234, + "grad_norm": 15.001440048217773, + "learning_rate": 9.271454219030521e-06, + "loss": 6.118, + "step": 103325 + }, + { + "epoch": 9.27737881508079, + "grad_norm": 13.854418754577637, + "learning_rate": 9.273698384201077e-06, + "loss": 6.3077, + "step": 103350 + }, + { + "epoch": 9.279622980251347, + "grad_norm": 11.158513069152832, + "learning_rate": 9.275942549371635e-06, + "loss": 6.4697, + "step": 103375 + }, + { + "epoch": 9.281867145421902, + "grad_norm": 18.932613372802734, + "learning_rate": 9.27818671454219e-06, + "loss": 6.5689, + "step": 103400 + }, + { + "epoch": 9.28411131059246, + "grad_norm": 14.096111297607422, + "learning_rate": 9.280430879712748e-06, + "loss": 6.4084, + "step": 103425 + }, + { + "epoch": 9.286355475763017, + "grad_norm": 15.014046669006348, + "learning_rate": 9.282675044883304e-06, + "loss": 6.4153, + "step": 103450 + }, + { + "epoch": 9.288599640933572, + "grad_norm": 13.05613899230957, + "learning_rate": 9.284919210053861e-06, + "loss": 6.3657, + "step": 103475 + }, + { + "epoch": 9.29084380610413, + "grad_norm": 12.497053146362305, + "learning_rate": 9.287163375224417e-06, + "loss": 6.4278, + "step": 103500 + }, + { + "epoch": 9.293087971274685, + "grad_norm": 12.757352828979492, + "learning_rate": 9.289407540394974e-06, + "loss": 6.3855, + "step": 103525 + }, + { + "epoch": 9.295332136445243, + "grad_norm": 11.897615432739258, + "learning_rate": 9.291651705565532e-06, + "loss": 6.5567, + "step": 103550 + }, + { + "epoch": 9.297576301615798, + "grad_norm": 13.511356353759766, + "learning_rate": 9.293895870736087e-06, + "loss": 6.4326, + "step": 103575 + }, + { + "epoch": 9.299820466786356, + "grad_norm": 10.918742179870605, + "learning_rate": 9.296140035906643e-06, + "loss": 6.4992, + "step": 103600 + }, + { + "epoch": 9.302064631956911, + "grad_norm": 12.02066421508789, + "learning_rate": 9.298384201077199e-06, + "loss": 6.4408, + "step": 103625 + }, + { + "epoch": 9.304308797127469, + "grad_norm": 13.554341316223145, + "learning_rate": 9.300628366247756e-06, + "loss": 6.3014, + "step": 103650 + }, + { + "epoch": 9.306552962298024, + "grad_norm": 15.23718547821045, + "learning_rate": 9.302872531418314e-06, + "loss": 6.522, + "step": 103675 + }, + { + "epoch": 9.308797127468582, + "grad_norm": 14.617758750915527, + "learning_rate": 9.30511669658887e-06, + "loss": 6.5721, + "step": 103700 + }, + { + "epoch": 9.311041292639139, + "grad_norm": 12.974705696105957, + "learning_rate": 9.307360861759425e-06, + "loss": 6.4953, + "step": 103725 + }, + { + "epoch": 9.313285457809695, + "grad_norm": 15.44774341583252, + "learning_rate": 9.309605026929983e-06, + "loss": 6.185, + "step": 103750 + }, + { + "epoch": 9.315529622980252, + "grad_norm": 11.806844711303711, + "learning_rate": 9.311849192100539e-06, + "loss": 6.4305, + "step": 103775 + }, + { + "epoch": 9.317773788150808, + "grad_norm": 11.786248207092285, + "learning_rate": 9.314093357271096e-06, + "loss": 6.4941, + "step": 103800 + }, + { + "epoch": 9.320017953321365, + "grad_norm": 10.683141708374023, + "learning_rate": 9.316337522441654e-06, + "loss": 6.4004, + "step": 103825 + }, + { + "epoch": 9.32226211849192, + "grad_norm": 13.948647499084473, + "learning_rate": 9.31858168761221e-06, + "loss": 6.376, + "step": 103850 + }, + { + "epoch": 9.324506283662478, + "grad_norm": 15.043383598327637, + "learning_rate": 9.320825852782765e-06, + "loss": 6.5462, + "step": 103875 + }, + { + "epoch": 9.326750448833034, + "grad_norm": 12.957610130310059, + "learning_rate": 9.323070017953323e-06, + "loss": 6.612, + "step": 103900 + }, + { + "epoch": 9.32899461400359, + "grad_norm": 12.550429344177246, + "learning_rate": 9.325314183123878e-06, + "loss": 6.3743, + "step": 103925 + }, + { + "epoch": 9.331238779174146, + "grad_norm": 14.187219619750977, + "learning_rate": 9.327558348294436e-06, + "loss": 6.4287, + "step": 103950 + }, + { + "epoch": 9.333482944344704, + "grad_norm": 13.833624839782715, + "learning_rate": 9.329802513464992e-06, + "loss": 6.5543, + "step": 103975 + }, + { + "epoch": 9.335727109515261, + "grad_norm": 12.933545112609863, + "learning_rate": 9.331956912028726e-06, + "loss": 6.7205, + "step": 104000 + }, + { + "epoch": 9.337971274685817, + "grad_norm": 11.329670906066895, + "learning_rate": 9.334201077199283e-06, + "loss": 6.499, + "step": 104025 + }, + { + "epoch": 9.340215439856374, + "grad_norm": 14.617602348327637, + "learning_rate": 9.336445242369839e-06, + "loss": 6.2907, + "step": 104050 + }, + { + "epoch": 9.34245960502693, + "grad_norm": 17.252649307250977, + "learning_rate": 9.338689407540396e-06, + "loss": 6.553, + "step": 104075 + }, + { + "epoch": 9.344703770197487, + "grad_norm": 11.609710693359375, + "learning_rate": 9.340933572710952e-06, + "loss": 6.4264, + "step": 104100 + }, + { + "epoch": 9.346947935368043, + "grad_norm": 12.226161003112793, + "learning_rate": 9.34317773788151e-06, + "loss": 6.3206, + "step": 104125 + }, + { + "epoch": 9.3491921005386, + "grad_norm": 11.449350357055664, + "learning_rate": 9.345421903052065e-06, + "loss": 6.3047, + "step": 104150 + }, + { + "epoch": 9.351436265709156, + "grad_norm": 13.884167671203613, + "learning_rate": 9.347666068222621e-06, + "loss": 6.4956, + "step": 104175 + }, + { + "epoch": 9.353680430879713, + "grad_norm": 12.963262557983398, + "learning_rate": 9.349910233393179e-06, + "loss": 6.5908, + "step": 104200 + }, + { + "epoch": 9.355924596050269, + "grad_norm": 13.320496559143066, + "learning_rate": 9.352154398563736e-06, + "loss": 6.2689, + "step": 104225 + }, + { + "epoch": 9.358168761220826, + "grad_norm": 11.171613693237305, + "learning_rate": 9.354398563734292e-06, + "loss": 6.4488, + "step": 104250 + }, + { + "epoch": 9.360412926391382, + "grad_norm": 13.23499870300293, + "learning_rate": 9.356642728904848e-06, + "loss": 6.5235, + "step": 104275 + }, + { + "epoch": 9.362657091561939, + "grad_norm": 12.053414344787598, + "learning_rate": 9.358886894075405e-06, + "loss": 6.3121, + "step": 104300 + }, + { + "epoch": 9.364901256732496, + "grad_norm": 14.63707160949707, + "learning_rate": 9.36113105924596e-06, + "loss": 6.5258, + "step": 104325 + }, + { + "epoch": 9.367145421903052, + "grad_norm": 12.785804748535156, + "learning_rate": 9.363375224416518e-06, + "loss": 6.4685, + "step": 104350 + }, + { + "epoch": 9.36938958707361, + "grad_norm": 13.612741470336914, + "learning_rate": 9.365619389587074e-06, + "loss": 6.4409, + "step": 104375 + }, + { + "epoch": 9.371633752244165, + "grad_norm": 12.945833206176758, + "learning_rate": 9.367863554757631e-06, + "loss": 6.5401, + "step": 104400 + }, + { + "epoch": 9.373877917414722, + "grad_norm": 12.277961730957031, + "learning_rate": 9.370107719928187e-06, + "loss": 6.3907, + "step": 104425 + }, + { + "epoch": 9.376122082585278, + "grad_norm": 15.223895072937012, + "learning_rate": 9.372351885098745e-06, + "loss": 6.3915, + "step": 104450 + }, + { + "epoch": 9.378366247755835, + "grad_norm": 16.76180648803711, + "learning_rate": 9.3745960502693e-06, + "loss": 6.6293, + "step": 104475 + }, + { + "epoch": 9.38061041292639, + "grad_norm": 10.911585807800293, + "learning_rate": 9.376840215439858e-06, + "loss": 6.4594, + "step": 104500 + }, + { + "epoch": 9.382854578096948, + "grad_norm": 12.020665168762207, + "learning_rate": 9.379084380610414e-06, + "loss": 6.5358, + "step": 104525 + }, + { + "epoch": 9.385098743267504, + "grad_norm": 11.775627136230469, + "learning_rate": 9.38132854578097e-06, + "loss": 6.4838, + "step": 104550 + }, + { + "epoch": 9.387342908438061, + "grad_norm": 13.675743103027344, + "learning_rate": 9.383572710951527e-06, + "loss": 6.3171, + "step": 104575 + }, + { + "epoch": 9.389587073608618, + "grad_norm": 16.8325252532959, + "learning_rate": 9.385816876122084e-06, + "loss": 6.1147, + "step": 104600 + }, + { + "epoch": 9.391831238779174, + "grad_norm": 18.22374153137207, + "learning_rate": 9.38806104129264e-06, + "loss": 6.5933, + "step": 104625 + }, + { + "epoch": 9.394075403949731, + "grad_norm": 11.343647003173828, + "learning_rate": 9.390305206463196e-06, + "loss": 6.4198, + "step": 104650 + }, + { + "epoch": 9.396319569120287, + "grad_norm": 13.427556037902832, + "learning_rate": 9.392549371633753e-06, + "loss": 6.5181, + "step": 104675 + }, + { + "epoch": 9.398563734290844, + "grad_norm": 13.387423515319824, + "learning_rate": 9.394793536804309e-06, + "loss": 6.3122, + "step": 104700 + }, + { + "epoch": 9.4008078994614, + "grad_norm": 12.492395401000977, + "learning_rate": 9.397037701974867e-06, + "loss": 6.4436, + "step": 104725 + }, + { + "epoch": 9.403052064631957, + "grad_norm": 10.768617630004883, + "learning_rate": 9.399281867145422e-06, + "loss": 6.3626, + "step": 104750 + }, + { + "epoch": 9.405296229802513, + "grad_norm": 13.172008514404297, + "learning_rate": 9.40152603231598e-06, + "loss": 6.3311, + "step": 104775 + }, + { + "epoch": 9.40754039497307, + "grad_norm": 12.844744682312012, + "learning_rate": 9.403770197486536e-06, + "loss": 6.5648, + "step": 104800 + }, + { + "epoch": 9.409784560143626, + "grad_norm": 12.1707181930542, + "learning_rate": 9.406014362657091e-06, + "loss": 6.2959, + "step": 104825 + }, + { + "epoch": 9.412028725314183, + "grad_norm": 14.522943496704102, + "learning_rate": 9.408258527827649e-06, + "loss": 6.2552, + "step": 104850 + }, + { + "epoch": 9.414272890484739, + "grad_norm": 13.527265548706055, + "learning_rate": 9.410502692998206e-06, + "loss": 6.4713, + "step": 104875 + }, + { + "epoch": 9.416517055655296, + "grad_norm": 10.736381530761719, + "learning_rate": 9.412746858168762e-06, + "loss": 6.3522, + "step": 104900 + }, + { + "epoch": 9.418761220825854, + "grad_norm": 12.269417762756348, + "learning_rate": 9.414991023339318e-06, + "loss": 6.6091, + "step": 104925 + }, + { + "epoch": 9.42100538599641, + "grad_norm": 12.741127014160156, + "learning_rate": 9.417235188509875e-06, + "loss": 6.4751, + "step": 104950 + }, + { + "epoch": 9.423249551166966, + "grad_norm": 15.980728149414062, + "learning_rate": 9.419479353680431e-06, + "loss": 6.3391, + "step": 104975 + }, + { + "epoch": 9.425493716337522, + "grad_norm": 13.3069429397583, + "learning_rate": 9.421723518850988e-06, + "loss": 6.3925, + "step": 105000 + }, + { + "epoch": 9.42773788150808, + "grad_norm": 13.045138359069824, + "learning_rate": 9.423967684021544e-06, + "loss": 6.2797, + "step": 105025 + }, + { + "epoch": 9.429982046678635, + "grad_norm": 18.27940559387207, + "learning_rate": 9.426211849192102e-06, + "loss": 6.4336, + "step": 105050 + }, + { + "epoch": 9.432226211849192, + "grad_norm": 12.283141136169434, + "learning_rate": 9.428456014362657e-06, + "loss": 6.5574, + "step": 105075 + }, + { + "epoch": 9.434470377019748, + "grad_norm": 15.243672370910645, + "learning_rate": 9.430700179533215e-06, + "loss": 6.3008, + "step": 105100 + }, + { + "epoch": 9.436714542190305, + "grad_norm": 12.054439544677734, + "learning_rate": 9.43294434470377e-06, + "loss": 6.4373, + "step": 105125 + }, + { + "epoch": 9.438958707360861, + "grad_norm": 13.891838073730469, + "learning_rate": 9.435188509874328e-06, + "loss": 6.3945, + "step": 105150 + }, + { + "epoch": 9.441202872531418, + "grad_norm": 15.428339958190918, + "learning_rate": 9.437432675044884e-06, + "loss": 6.3994, + "step": 105175 + }, + { + "epoch": 9.443447037701976, + "grad_norm": 13.97810173034668, + "learning_rate": 9.43967684021544e-06, + "loss": 6.5473, + "step": 105200 + }, + { + "epoch": 9.445691202872531, + "grad_norm": 11.456852912902832, + "learning_rate": 9.441921005385997e-06, + "loss": 6.3244, + "step": 105225 + }, + { + "epoch": 9.447935368043089, + "grad_norm": 11.495838165283203, + "learning_rate": 9.444165170556555e-06, + "loss": 6.4343, + "step": 105250 + }, + { + "epoch": 9.450179533213644, + "grad_norm": 10.942477226257324, + "learning_rate": 9.44640933572711e-06, + "loss": 6.2041, + "step": 105275 + }, + { + "epoch": 9.452423698384202, + "grad_norm": 12.8779935836792, + "learning_rate": 9.448653500897666e-06, + "loss": 6.4116, + "step": 105300 + }, + { + "epoch": 9.454667863554757, + "grad_norm": 13.973786354064941, + "learning_rate": 9.450897666068224e-06, + "loss": 6.4081, + "step": 105325 + }, + { + "epoch": 9.456912028725315, + "grad_norm": 11.647880554199219, + "learning_rate": 9.45314183123878e-06, + "loss": 6.5382, + "step": 105350 + }, + { + "epoch": 9.45915619389587, + "grad_norm": 13.784834861755371, + "learning_rate": 9.455385996409337e-06, + "loss": 6.2972, + "step": 105375 + }, + { + "epoch": 9.461400359066428, + "grad_norm": 14.048850059509277, + "learning_rate": 9.457630161579893e-06, + "loss": 6.1408, + "step": 105400 + }, + { + "epoch": 9.463644524236983, + "grad_norm": 13.909639358520508, + "learning_rate": 9.45987432675045e-06, + "loss": 6.2921, + "step": 105425 + }, + { + "epoch": 9.46588868940754, + "grad_norm": 16.81739616394043, + "learning_rate": 9.462118491921006e-06, + "loss": 6.1714, + "step": 105450 + }, + { + "epoch": 9.468132854578098, + "grad_norm": 11.6475191116333, + "learning_rate": 9.464362657091563e-06, + "loss": 6.6698, + "step": 105475 + }, + { + "epoch": 9.470377019748653, + "grad_norm": 13.447057723999023, + "learning_rate": 9.466606822262119e-06, + "loss": 6.5216, + "step": 105500 + }, + { + "epoch": 9.47262118491921, + "grad_norm": 16.447811126708984, + "learning_rate": 9.468850987432677e-06, + "loss": 6.6075, + "step": 105525 + }, + { + "epoch": 9.474865350089766, + "grad_norm": 13.36626148223877, + "learning_rate": 9.471095152603232e-06, + "loss": 6.4988, + "step": 105550 + }, + { + "epoch": 9.477109515260324, + "grad_norm": 11.414298057556152, + "learning_rate": 9.473339317773788e-06, + "loss": 6.4346, + "step": 105575 + }, + { + "epoch": 9.47935368043088, + "grad_norm": 12.723348617553711, + "learning_rate": 9.475583482944346e-06, + "loss": 6.2716, + "step": 105600 + }, + { + "epoch": 9.481597845601437, + "grad_norm": 19.448667526245117, + "learning_rate": 9.477827648114903e-06, + "loss": 6.4269, + "step": 105625 + }, + { + "epoch": 9.483842010771992, + "grad_norm": 12.48808765411377, + "learning_rate": 9.480071813285459e-06, + "loss": 6.5728, + "step": 105650 + }, + { + "epoch": 9.48608617594255, + "grad_norm": 13.928885459899902, + "learning_rate": 9.482315978456015e-06, + "loss": 6.4328, + "step": 105675 + }, + { + "epoch": 9.488330341113105, + "grad_norm": 13.665495872497559, + "learning_rate": 9.484560143626572e-06, + "loss": 6.4607, + "step": 105700 + }, + { + "epoch": 9.490574506283663, + "grad_norm": 13.373543739318848, + "learning_rate": 9.486804308797128e-06, + "loss": 6.3925, + "step": 105725 + }, + { + "epoch": 9.492818671454218, + "grad_norm": 12.11533260345459, + "learning_rate": 9.489048473967685e-06, + "loss": 6.6194, + "step": 105750 + }, + { + "epoch": 9.495062836624776, + "grad_norm": 12.933317184448242, + "learning_rate": 9.491292639138241e-06, + "loss": 6.5145, + "step": 105775 + }, + { + "epoch": 9.497307001795333, + "grad_norm": 15.318145751953125, + "learning_rate": 9.493536804308798e-06, + "loss": 6.5973, + "step": 105800 + }, + { + "epoch": 9.499551166965889, + "grad_norm": 14.316102027893066, + "learning_rate": 9.495780969479354e-06, + "loss": 6.2736, + "step": 105825 + }, + { + "epoch": 9.501795332136446, + "grad_norm": 14.845878601074219, + "learning_rate": 9.49802513464991e-06, + "loss": 6.4272, + "step": 105850 + }, + { + "epoch": 9.504039497307001, + "grad_norm": 13.211219787597656, + "learning_rate": 9.500269299820467e-06, + "loss": 6.2435, + "step": 105875 + }, + { + "epoch": 9.506283662477559, + "grad_norm": 15.539787292480469, + "learning_rate": 9.502513464991025e-06, + "loss": 6.4888, + "step": 105900 + }, + { + "epoch": 9.508527827648114, + "grad_norm": 15.879250526428223, + "learning_rate": 9.50475763016158e-06, + "loss": 6.3682, + "step": 105925 + }, + { + "epoch": 9.510771992818672, + "grad_norm": 13.560327529907227, + "learning_rate": 9.507001795332136e-06, + "loss": 6.4397, + "step": 105950 + }, + { + "epoch": 9.513016157989227, + "grad_norm": 14.969319343566895, + "learning_rate": 9.509245960502694e-06, + "loss": 6.2784, + "step": 105975 + }, + { + "epoch": 9.515260323159785, + "grad_norm": 11.584208488464355, + "learning_rate": 9.511490125673251e-06, + "loss": 6.3627, + "step": 106000 + }, + { + "epoch": 9.51750448833034, + "grad_norm": 9.81027889251709, + "learning_rate": 9.513644524236985e-06, + "loss": 6.2914, + "step": 106025 + }, + { + "epoch": 9.519748653500898, + "grad_norm": 12.312421798706055, + "learning_rate": 9.515888689407541e-06, + "loss": 6.3267, + "step": 106050 + }, + { + "epoch": 9.521992818671453, + "grad_norm": 13.887837409973145, + "learning_rate": 9.518132854578099e-06, + "loss": 6.5846, + "step": 106075 + }, + { + "epoch": 9.52423698384201, + "grad_norm": 12.269282341003418, + "learning_rate": 9.520377019748654e-06, + "loss": 6.3027, + "step": 106100 + }, + { + "epoch": 9.526481149012568, + "grad_norm": 13.895957946777344, + "learning_rate": 9.52262118491921e-06, + "loss": 6.5868, + "step": 106125 + }, + { + "epoch": 9.528725314183124, + "grad_norm": 9.970773696899414, + "learning_rate": 9.524865350089768e-06, + "loss": 6.5845, + "step": 106150 + }, + { + "epoch": 9.530969479353681, + "grad_norm": 12.003915786743164, + "learning_rate": 9.527109515260325e-06, + "loss": 6.3404, + "step": 106175 + }, + { + "epoch": 9.533213644524237, + "grad_norm": 12.292943000793457, + "learning_rate": 9.529353680430881e-06, + "loss": 6.3329, + "step": 106200 + }, + { + "epoch": 9.535457809694794, + "grad_norm": 13.750739097595215, + "learning_rate": 9.531597845601437e-06, + "loss": 6.4523, + "step": 106225 + }, + { + "epoch": 9.53770197486535, + "grad_norm": 11.686034202575684, + "learning_rate": 9.533842010771992e-06, + "loss": 6.4971, + "step": 106250 + }, + { + "epoch": 9.539946140035907, + "grad_norm": 12.613044738769531, + "learning_rate": 9.53608617594255e-06, + "loss": 6.5903, + "step": 106275 + }, + { + "epoch": 9.542190305206462, + "grad_norm": 13.821603775024414, + "learning_rate": 9.538330341113107e-06, + "loss": 6.3823, + "step": 106300 + }, + { + "epoch": 9.54443447037702, + "grad_norm": 17.00397491455078, + "learning_rate": 9.540574506283663e-06, + "loss": 6.5331, + "step": 106325 + }, + { + "epoch": 9.546678635547575, + "grad_norm": 11.279273986816406, + "learning_rate": 9.54281867145422e-06, + "loss": 6.4915, + "step": 106350 + }, + { + "epoch": 9.548922800718133, + "grad_norm": 11.997570991516113, + "learning_rate": 9.545062836624776e-06, + "loss": 6.4428, + "step": 106375 + }, + { + "epoch": 9.55116696588869, + "grad_norm": 10.930870056152344, + "learning_rate": 9.547307001795332e-06, + "loss": 6.4848, + "step": 106400 + }, + { + "epoch": 9.553411131059246, + "grad_norm": 11.8528470993042, + "learning_rate": 9.54955116696589e-06, + "loss": 6.3988, + "step": 106425 + }, + { + "epoch": 9.555655296229803, + "grad_norm": 11.497407913208008, + "learning_rate": 9.551795332136447e-06, + "loss": 6.3574, + "step": 106450 + }, + { + "epoch": 9.557899461400359, + "grad_norm": 11.010323524475098, + "learning_rate": 9.554039497307003e-06, + "loss": 6.2358, + "step": 106475 + }, + { + "epoch": 9.560143626570916, + "grad_norm": 13.82494068145752, + "learning_rate": 9.556283662477559e-06, + "loss": 6.46, + "step": 106500 + }, + { + "epoch": 9.562387791741472, + "grad_norm": 16.269819259643555, + "learning_rate": 9.558527827648116e-06, + "loss": 6.2148, + "step": 106525 + }, + { + "epoch": 9.564631956912029, + "grad_norm": 12.219965934753418, + "learning_rate": 9.560771992818672e-06, + "loss": 6.2506, + "step": 106550 + }, + { + "epoch": 9.566876122082585, + "grad_norm": 12.22597599029541, + "learning_rate": 9.56301615798923e-06, + "loss": 6.6351, + "step": 106575 + }, + { + "epoch": 9.569120287253142, + "grad_norm": 14.448294639587402, + "learning_rate": 9.565260323159785e-06, + "loss": 6.1655, + "step": 106600 + }, + { + "epoch": 9.571364452423698, + "grad_norm": 12.525994300842285, + "learning_rate": 9.56750448833034e-06, + "loss": 6.3608, + "step": 106625 + }, + { + "epoch": 9.573608617594255, + "grad_norm": 12.693790435791016, + "learning_rate": 9.569748653500898e-06, + "loss": 6.2961, + "step": 106650 + }, + { + "epoch": 9.575852782764812, + "grad_norm": 14.719560623168945, + "learning_rate": 9.571992818671456e-06, + "loss": 6.4376, + "step": 106675 + }, + { + "epoch": 9.578096947935368, + "grad_norm": 13.145798683166504, + "learning_rate": 9.574236983842011e-06, + "loss": 6.4024, + "step": 106700 + }, + { + "epoch": 9.580341113105925, + "grad_norm": 14.016873359680176, + "learning_rate": 9.576481149012569e-06, + "loss": 6.599, + "step": 106725 + }, + { + "epoch": 9.58258527827648, + "grad_norm": 12.538500785827637, + "learning_rate": 9.578725314183125e-06, + "loss": 6.4312, + "step": 106750 + }, + { + "epoch": 9.584829443447038, + "grad_norm": 18.154722213745117, + "learning_rate": 9.58096947935368e-06, + "loss": 6.586, + "step": 106775 + }, + { + "epoch": 9.587073608617594, + "grad_norm": 14.458498001098633, + "learning_rate": 9.583213644524238e-06, + "loss": 6.4479, + "step": 106800 + }, + { + "epoch": 9.589317773788151, + "grad_norm": 14.289872169494629, + "learning_rate": 9.585457809694795e-06, + "loss": 6.5472, + "step": 106825 + }, + { + "epoch": 9.591561938958707, + "grad_norm": 14.030860900878906, + "learning_rate": 9.587701974865351e-06, + "loss": 6.4899, + "step": 106850 + }, + { + "epoch": 9.593806104129264, + "grad_norm": 12.164823532104492, + "learning_rate": 9.589946140035907e-06, + "loss": 6.4688, + "step": 106875 + }, + { + "epoch": 9.59605026929982, + "grad_norm": 12.462658882141113, + "learning_rate": 9.592190305206463e-06, + "loss": 6.2848, + "step": 106900 + }, + { + "epoch": 9.598294434470377, + "grad_norm": 15.385250091552734, + "learning_rate": 9.59443447037702e-06, + "loss": 6.4229, + "step": 106925 + }, + { + "epoch": 9.600538599640934, + "grad_norm": 11.503856658935547, + "learning_rate": 9.596678635547578e-06, + "loss": 6.403, + "step": 106950 + }, + { + "epoch": 9.60278276481149, + "grad_norm": 12.275575637817383, + "learning_rate": 9.598922800718133e-06, + "loss": 6.56, + "step": 106975 + }, + { + "epoch": 9.605026929982047, + "grad_norm": 14.446584701538086, + "learning_rate": 9.60116696588869e-06, + "loss": 6.337, + "step": 107000 + }, + { + "epoch": 9.607271095152603, + "grad_norm": 13.605598449707031, + "learning_rate": 9.603411131059247e-06, + "loss": 6.2759, + "step": 107025 + }, + { + "epoch": 9.60951526032316, + "grad_norm": 21.263874053955078, + "learning_rate": 9.605655296229804e-06, + "loss": 6.2113, + "step": 107050 + }, + { + "epoch": 9.611759425493716, + "grad_norm": 12.988906860351562, + "learning_rate": 9.60789946140036e-06, + "loss": 6.4859, + "step": 107075 + }, + { + "epoch": 9.614003590664273, + "grad_norm": 13.714360237121582, + "learning_rate": 9.610143626570917e-06, + "loss": 6.4787, + "step": 107100 + }, + { + "epoch": 9.616247755834829, + "grad_norm": 12.41856861114502, + "learning_rate": 9.612387791741473e-06, + "loss": 6.2411, + "step": 107125 + }, + { + "epoch": 9.618491921005386, + "grad_norm": 11.285481452941895, + "learning_rate": 9.614631956912029e-06, + "loss": 6.3034, + "step": 107150 + }, + { + "epoch": 9.620736086175942, + "grad_norm": 13.711465835571289, + "learning_rate": 9.616876122082586e-06, + "loss": 6.1448, + "step": 107175 + }, + { + "epoch": 9.6229802513465, + "grad_norm": 13.078324317932129, + "learning_rate": 9.619120287253144e-06, + "loss": 6.3829, + "step": 107200 + }, + { + "epoch": 9.625224416517055, + "grad_norm": 21.82048797607422, + "learning_rate": 9.6213644524237e-06, + "loss": 6.3932, + "step": 107225 + }, + { + "epoch": 9.627468581687612, + "grad_norm": 15.360684394836426, + "learning_rate": 9.623608617594255e-06, + "loss": 6.3632, + "step": 107250 + }, + { + "epoch": 9.62971274685817, + "grad_norm": 13.741127014160156, + "learning_rate": 9.625852782764811e-06, + "loss": 6.3376, + "step": 107275 + }, + { + "epoch": 9.631956912028725, + "grad_norm": 13.486806869506836, + "learning_rate": 9.628096947935369e-06, + "loss": 6.4813, + "step": 107300 + }, + { + "epoch": 9.634201077199283, + "grad_norm": 14.130350112915039, + "learning_rate": 9.630341113105926e-06, + "loss": 6.0117, + "step": 107325 + }, + { + "epoch": 9.636445242369838, + "grad_norm": 14.355613708496094, + "learning_rate": 9.632585278276482e-06, + "loss": 6.1609, + "step": 107350 + }, + { + "epoch": 9.638689407540395, + "grad_norm": 12.17120361328125, + "learning_rate": 9.634829443447038e-06, + "loss": 6.3922, + "step": 107375 + }, + { + "epoch": 9.640933572710951, + "grad_norm": 13.651363372802734, + "learning_rate": 9.637073608617595e-06, + "loss": 6.3674, + "step": 107400 + }, + { + "epoch": 9.643177737881508, + "grad_norm": 10.3859224319458, + "learning_rate": 9.63931777378815e-06, + "loss": 6.1757, + "step": 107425 + }, + { + "epoch": 9.645421903052064, + "grad_norm": 12.458842277526855, + "learning_rate": 9.641561938958708e-06, + "loss": 6.6465, + "step": 107450 + }, + { + "epoch": 9.647666068222621, + "grad_norm": 14.089179992675781, + "learning_rate": 9.643806104129266e-06, + "loss": 6.6372, + "step": 107475 + }, + { + "epoch": 9.649910233393177, + "grad_norm": 13.787422180175781, + "learning_rate": 9.646050269299821e-06, + "loss": 6.5057, + "step": 107500 + }, + { + "epoch": 9.652154398563734, + "grad_norm": 13.063244819641113, + "learning_rate": 9.648294434470377e-06, + "loss": 6.1793, + "step": 107525 + }, + { + "epoch": 9.65439856373429, + "grad_norm": 16.2415714263916, + "learning_rate": 9.650538599640935e-06, + "loss": 6.481, + "step": 107550 + }, + { + "epoch": 9.656642728904847, + "grad_norm": 12.52452278137207, + "learning_rate": 9.652782764811492e-06, + "loss": 6.6077, + "step": 107575 + }, + { + "epoch": 9.658886894075405, + "grad_norm": 12.080723762512207, + "learning_rate": 9.655026929982048e-06, + "loss": 6.3007, + "step": 107600 + }, + { + "epoch": 9.66113105924596, + "grad_norm": 12.235962867736816, + "learning_rate": 9.657271095152604e-06, + "loss": 6.5355, + "step": 107625 + }, + { + "epoch": 9.663375224416518, + "grad_norm": 13.180590629577637, + "learning_rate": 9.65951526032316e-06, + "loss": 6.5015, + "step": 107650 + }, + { + "epoch": 9.665619389587073, + "grad_norm": 13.081302642822266, + "learning_rate": 9.661759425493717e-06, + "loss": 6.4207, + "step": 107675 + }, + { + "epoch": 9.66786355475763, + "grad_norm": 15.60968017578125, + "learning_rate": 9.664003590664274e-06, + "loss": 6.4019, + "step": 107700 + }, + { + "epoch": 9.670107719928186, + "grad_norm": 13.941032409667969, + "learning_rate": 9.66624775583483e-06, + "loss": 6.3066, + "step": 107725 + }, + { + "epoch": 9.672351885098744, + "grad_norm": 21.22992706298828, + "learning_rate": 9.668491921005386e-06, + "loss": 6.6063, + "step": 107750 + }, + { + "epoch": 9.6745960502693, + "grad_norm": 14.702470779418945, + "learning_rate": 9.670736086175943e-06, + "loss": 6.5659, + "step": 107775 + }, + { + "epoch": 9.676840215439857, + "grad_norm": 14.1359224319458, + "learning_rate": 9.6729802513465e-06, + "loss": 6.5666, + "step": 107800 + }, + { + "epoch": 9.679084380610412, + "grad_norm": 12.357477188110352, + "learning_rate": 9.675224416517057e-06, + "loss": 6.252, + "step": 107825 + }, + { + "epoch": 9.68132854578097, + "grad_norm": 11.197810173034668, + "learning_rate": 9.677468581687614e-06, + "loss": 6.3284, + "step": 107850 + }, + { + "epoch": 9.683572710951527, + "grad_norm": 11.917259216308594, + "learning_rate": 9.67971274685817e-06, + "loss": 6.4801, + "step": 107875 + }, + { + "epoch": 9.685816876122082, + "grad_norm": 13.461578369140625, + "learning_rate": 9.681956912028726e-06, + "loss": 6.3343, + "step": 107900 + }, + { + "epoch": 9.68806104129264, + "grad_norm": 13.134796142578125, + "learning_rate": 9.684201077199283e-06, + "loss": 6.428, + "step": 107925 + }, + { + "epoch": 9.690305206463195, + "grad_norm": 12.268779754638672, + "learning_rate": 9.686445242369839e-06, + "loss": 6.5234, + "step": 107950 + }, + { + "epoch": 9.692549371633753, + "grad_norm": 16.056123733520508, + "learning_rate": 9.688689407540396e-06, + "loss": 6.4796, + "step": 107975 + }, + { + "epoch": 9.694793536804308, + "grad_norm": 17.51797866821289, + "learning_rate": 9.690933572710952e-06, + "loss": 6.2808, + "step": 108000 + }, + { + "epoch": 9.697037701974866, + "grad_norm": 10.888681411743164, + "learning_rate": 9.693177737881508e-06, + "loss": 6.3344, + "step": 108025 + }, + { + "epoch": 9.699281867145421, + "grad_norm": 13.053838729858398, + "learning_rate": 9.695421903052065e-06, + "loss": 6.2415, + "step": 108050 + }, + { + "epoch": 9.701526032315979, + "grad_norm": 14.821563720703125, + "learning_rate": 9.697666068222623e-06, + "loss": 6.4328, + "step": 108075 + }, + { + "epoch": 9.703770197486534, + "grad_norm": 14.09903335571289, + "learning_rate": 9.699910233393179e-06, + "loss": 6.251, + "step": 108100 + }, + { + "epoch": 9.706014362657092, + "grad_norm": 14.381698608398438, + "learning_rate": 9.702154398563734e-06, + "loss": 6.4022, + "step": 108125 + }, + { + "epoch": 9.708258527827649, + "grad_norm": 13.632722854614258, + "learning_rate": 9.70430879712747e-06, + "loss": 6.3852, + "step": 108150 + }, + { + "epoch": 9.710502692998205, + "grad_norm": 19.330141067504883, + "learning_rate": 9.706552962298026e-06, + "loss": 6.709, + "step": 108175 + }, + { + "epoch": 9.712746858168762, + "grad_norm": 10.728434562683105, + "learning_rate": 9.708797127468582e-06, + "loss": 6.3482, + "step": 108200 + }, + { + "epoch": 9.714991023339318, + "grad_norm": 14.511509895324707, + "learning_rate": 9.711041292639139e-06, + "loss": 6.6634, + "step": 108225 + }, + { + "epoch": 9.717235188509875, + "grad_norm": 12.238296508789062, + "learning_rate": 9.713285457809696e-06, + "loss": 6.3385, + "step": 108250 + }, + { + "epoch": 9.71947935368043, + "grad_norm": 12.592646598815918, + "learning_rate": 9.715529622980252e-06, + "loss": 6.1743, + "step": 108275 + }, + { + "epoch": 9.721723518850988, + "grad_norm": 19.924381256103516, + "learning_rate": 9.717773788150808e-06, + "loss": 6.4265, + "step": 108300 + }, + { + "epoch": 9.723967684021543, + "grad_norm": 15.769845008850098, + "learning_rate": 9.720017953321365e-06, + "loss": 6.458, + "step": 108325 + }, + { + "epoch": 9.7262118491921, + "grad_norm": 11.470673561096191, + "learning_rate": 9.722262118491921e-06, + "loss": 6.2473, + "step": 108350 + }, + { + "epoch": 9.728456014362656, + "grad_norm": 12.993226051330566, + "learning_rate": 9.724506283662479e-06, + "loss": 6.4053, + "step": 108375 + }, + { + "epoch": 9.730700179533214, + "grad_norm": 11.660843849182129, + "learning_rate": 9.726750448833034e-06, + "loss": 6.1824, + "step": 108400 + }, + { + "epoch": 9.732944344703771, + "grad_norm": 14.907675743103027, + "learning_rate": 9.728994614003592e-06, + "loss": 6.4295, + "step": 108425 + }, + { + "epoch": 9.735188509874327, + "grad_norm": 13.273019790649414, + "learning_rate": 9.731238779174148e-06, + "loss": 6.3734, + "step": 108450 + }, + { + "epoch": 9.737432675044884, + "grad_norm": 14.438289642333984, + "learning_rate": 9.733482944344703e-06, + "loss": 6.3695, + "step": 108475 + }, + { + "epoch": 9.73967684021544, + "grad_norm": 13.452004432678223, + "learning_rate": 9.735727109515261e-06, + "loss": 6.4818, + "step": 108500 + }, + { + "epoch": 9.741921005385997, + "grad_norm": 10.873201370239258, + "learning_rate": 9.737971274685818e-06, + "loss": 6.3801, + "step": 108525 + }, + { + "epoch": 9.744165170556553, + "grad_norm": 17.06882095336914, + "learning_rate": 9.740215439856374e-06, + "loss": 6.6555, + "step": 108550 + }, + { + "epoch": 9.74640933572711, + "grad_norm": 15.245879173278809, + "learning_rate": 9.74245960502693e-06, + "loss": 6.5677, + "step": 108575 + }, + { + "epoch": 9.748653500897666, + "grad_norm": 12.823203086853027, + "learning_rate": 9.744703770197487e-06, + "loss": 6.42, + "step": 108600 + }, + { + "epoch": 9.750897666068223, + "grad_norm": 11.411410331726074, + "learning_rate": 9.746947935368045e-06, + "loss": 6.323, + "step": 108625 + }, + { + "epoch": 9.753141831238779, + "grad_norm": 14.636033058166504, + "learning_rate": 9.7491921005386e-06, + "loss": 6.2549, + "step": 108650 + }, + { + "epoch": 9.755385996409336, + "grad_norm": 13.905294418334961, + "learning_rate": 9.751436265709156e-06, + "loss": 6.6822, + "step": 108675 + }, + { + "epoch": 9.757630161579891, + "grad_norm": 12.98100757598877, + "learning_rate": 9.753680430879714e-06, + "loss": 6.3411, + "step": 108700 + }, + { + "epoch": 9.759874326750449, + "grad_norm": 13.583991050720215, + "learning_rate": 9.75592459605027e-06, + "loss": 6.174, + "step": 108725 + }, + { + "epoch": 9.762118491921004, + "grad_norm": 15.756292343139648, + "learning_rate": 9.758168761220827e-06, + "loss": 6.2863, + "step": 108750 + }, + { + "epoch": 9.764362657091562, + "grad_norm": 15.298417091369629, + "learning_rate": 9.760412926391383e-06, + "loss": 6.5302, + "step": 108775 + }, + { + "epoch": 9.76660682226212, + "grad_norm": 13.113062858581543, + "learning_rate": 9.76265709156194e-06, + "loss": 6.6263, + "step": 108800 + }, + { + "epoch": 9.768850987432675, + "grad_norm": 16.303987503051758, + "learning_rate": 9.764901256732496e-06, + "loss": 6.3375, + "step": 108825 + }, + { + "epoch": 9.771095152603232, + "grad_norm": 12.581841468811035, + "learning_rate": 9.767145421903052e-06, + "loss": 6.4724, + "step": 108850 + }, + { + "epoch": 9.773339317773788, + "grad_norm": 13.053592681884766, + "learning_rate": 9.76938958707361e-06, + "loss": 6.4168, + "step": 108875 + }, + { + "epoch": 9.775583482944345, + "grad_norm": 18.823152542114258, + "learning_rate": 9.771633752244167e-06, + "loss": 6.7379, + "step": 108900 + }, + { + "epoch": 9.7778276481149, + "grad_norm": 12.259645462036133, + "learning_rate": 9.773877917414723e-06, + "loss": 6.3789, + "step": 108925 + }, + { + "epoch": 9.780071813285458, + "grad_norm": 12.5889253616333, + "learning_rate": 9.776122082585278e-06, + "loss": 6.353, + "step": 108950 + }, + { + "epoch": 9.782315978456014, + "grad_norm": 11.568065643310547, + "learning_rate": 9.778366247755836e-06, + "loss": 6.6738, + "step": 108975 + }, + { + "epoch": 9.784560143626571, + "grad_norm": 12.418069839477539, + "learning_rate": 9.780610412926392e-06, + "loss": 6.5324, + "step": 109000 + }, + { + "epoch": 9.786804308797127, + "grad_norm": 16.768478393554688, + "learning_rate": 9.782854578096949e-06, + "loss": 6.4208, + "step": 109025 + }, + { + "epoch": 9.789048473967684, + "grad_norm": 14.25368881225586, + "learning_rate": 9.785098743267505e-06, + "loss": 6.2731, + "step": 109050 + }, + { + "epoch": 9.791292639138241, + "grad_norm": 11.176106452941895, + "learning_rate": 9.787342908438062e-06, + "loss": 6.5955, + "step": 109075 + }, + { + "epoch": 9.793536804308797, + "grad_norm": 13.584867477416992, + "learning_rate": 9.789587073608618e-06, + "loss": 6.4458, + "step": 109100 + }, + { + "epoch": 9.795780969479354, + "grad_norm": 12.384140014648438, + "learning_rate": 9.791831238779175e-06, + "loss": 6.2968, + "step": 109125 + }, + { + "epoch": 9.79802513464991, + "grad_norm": 15.7565279006958, + "learning_rate": 9.794075403949731e-06, + "loss": 6.63, + "step": 109150 + }, + { + "epoch": 9.800269299820467, + "grad_norm": 13.954254150390625, + "learning_rate": 9.796319569120289e-06, + "loss": 6.2482, + "step": 109175 + }, + { + "epoch": 9.802513464991023, + "grad_norm": 13.599989891052246, + "learning_rate": 9.798563734290844e-06, + "loss": 6.3905, + "step": 109200 + }, + { + "epoch": 9.80475763016158, + "grad_norm": 11.902372360229492, + "learning_rate": 9.8008078994614e-06, + "loss": 6.3129, + "step": 109225 + }, + { + "epoch": 9.807001795332136, + "grad_norm": 14.550816535949707, + "learning_rate": 9.803052064631958e-06, + "loss": 6.6863, + "step": 109250 + }, + { + "epoch": 9.809245960502693, + "grad_norm": 13.842496871948242, + "learning_rate": 9.805296229802515e-06, + "loss": 6.4883, + "step": 109275 + }, + { + "epoch": 9.811490125673249, + "grad_norm": 9.80747127532959, + "learning_rate": 9.807540394973071e-06, + "loss": 6.3179, + "step": 109300 + }, + { + "epoch": 9.813734290843806, + "grad_norm": 12.114779472351074, + "learning_rate": 9.809784560143627e-06, + "loss": 6.5748, + "step": 109325 + }, + { + "epoch": 9.815978456014363, + "grad_norm": 14.310303688049316, + "learning_rate": 9.812028725314184e-06, + "loss": 6.3159, + "step": 109350 + }, + { + "epoch": 9.818222621184919, + "grad_norm": 14.68010425567627, + "learning_rate": 9.81427289048474e-06, + "loss": 6.4294, + "step": 109375 + }, + { + "epoch": 9.820466786355476, + "grad_norm": 13.997249603271484, + "learning_rate": 9.816517055655297e-06, + "loss": 6.4538, + "step": 109400 + }, + { + "epoch": 9.822710951526032, + "grad_norm": 13.918299674987793, + "learning_rate": 9.818761220825853e-06, + "loss": 6.4013, + "step": 109425 + }, + { + "epoch": 9.82495511669659, + "grad_norm": 15.224852561950684, + "learning_rate": 9.82100538599641e-06, + "loss": 6.6911, + "step": 109450 + }, + { + "epoch": 9.827199281867145, + "grad_norm": 11.776430130004883, + "learning_rate": 9.823249551166966e-06, + "loss": 6.7449, + "step": 109475 + }, + { + "epoch": 9.829443447037702, + "grad_norm": 15.443318367004395, + "learning_rate": 9.825493716337524e-06, + "loss": 6.4196, + "step": 109500 + }, + { + "epoch": 9.831687612208258, + "grad_norm": 13.471929550170898, + "learning_rate": 9.82773788150808e-06, + "loss": 6.4218, + "step": 109525 + }, + { + "epoch": 9.833931777378815, + "grad_norm": 13.10338020324707, + "learning_rate": 9.829982046678637e-06, + "loss": 6.4311, + "step": 109550 + }, + { + "epoch": 9.83617594254937, + "grad_norm": 13.209121704101562, + "learning_rate": 9.832226211849193e-06, + "loss": 6.333, + "step": 109575 + }, + { + "epoch": 9.838420107719928, + "grad_norm": 13.153351783752441, + "learning_rate": 9.834470377019749e-06, + "loss": 6.1677, + "step": 109600 + }, + { + "epoch": 9.840664272890486, + "grad_norm": 11.809558868408203, + "learning_rate": 9.836714542190306e-06, + "loss": 6.3924, + "step": 109625 + }, + { + "epoch": 9.842908438061041, + "grad_norm": 11.93437671661377, + "learning_rate": 9.838958707360864e-06, + "loss": 6.5102, + "step": 109650 + }, + { + "epoch": 9.845152603231599, + "grad_norm": 12.748261451721191, + "learning_rate": 9.84120287253142e-06, + "loss": 6.5036, + "step": 109675 + }, + { + "epoch": 9.847396768402154, + "grad_norm": 12.803120613098145, + "learning_rate": 9.843447037701975e-06, + "loss": 6.3877, + "step": 109700 + }, + { + "epoch": 9.849640933572712, + "grad_norm": 13.084535598754883, + "learning_rate": 9.845691202872533e-06, + "loss": 6.4778, + "step": 109725 + }, + { + "epoch": 9.851885098743267, + "grad_norm": 13.307491302490234, + "learning_rate": 9.847935368043088e-06, + "loss": 6.1828, + "step": 109750 + }, + { + "epoch": 9.854129263913824, + "grad_norm": 12.24569320678711, + "learning_rate": 9.850179533213646e-06, + "loss": 6.3444, + "step": 109775 + }, + { + "epoch": 9.85637342908438, + "grad_norm": 11.43465805053711, + "learning_rate": 9.852423698384202e-06, + "loss": 6.47, + "step": 109800 + }, + { + "epoch": 9.858617594254937, + "grad_norm": 14.643242835998535, + "learning_rate": 9.854667863554759e-06, + "loss": 6.5682, + "step": 109825 + }, + { + "epoch": 9.860861759425493, + "grad_norm": 15.309720993041992, + "learning_rate": 9.856912028725315e-06, + "loss": 6.4407, + "step": 109850 + }, + { + "epoch": 9.86310592459605, + "grad_norm": 11.837284088134766, + "learning_rate": 9.85915619389587e-06, + "loss": 6.2539, + "step": 109875 + }, + { + "epoch": 9.865350089766606, + "grad_norm": 12.303644180297852, + "learning_rate": 9.861400359066428e-06, + "loss": 6.3862, + "step": 109900 + }, + { + "epoch": 9.867594254937163, + "grad_norm": 10.34244155883789, + "learning_rate": 9.863644524236985e-06, + "loss": 6.2812, + "step": 109925 + }, + { + "epoch": 9.86983842010772, + "grad_norm": 14.185670852661133, + "learning_rate": 9.865888689407541e-06, + "loss": 6.6626, + "step": 109950 + }, + { + "epoch": 9.872082585278276, + "grad_norm": 11.519030570983887, + "learning_rate": 9.868132854578097e-06, + "loss": 6.3142, + "step": 109975 + }, + { + "epoch": 9.874326750448834, + "grad_norm": 14.364169120788574, + "learning_rate": 9.870377019748654e-06, + "loss": 6.2585, + "step": 110000 + }, + { + "epoch": 9.87657091561939, + "grad_norm": 12.973367691040039, + "learning_rate": 9.87262118491921e-06, + "loss": 6.3069, + "step": 110025 + }, + { + "epoch": 9.878815080789947, + "grad_norm": 13.68610668182373, + "learning_rate": 9.874865350089768e-06, + "loss": 6.5642, + "step": 110050 + }, + { + "epoch": 9.881059245960502, + "grad_norm": 11.674675941467285, + "learning_rate": 9.877109515260323e-06, + "loss": 6.5218, + "step": 110075 + }, + { + "epoch": 9.88330341113106, + "grad_norm": 15.104235649108887, + "learning_rate": 9.879353680430881e-06, + "loss": 6.1805, + "step": 110100 + }, + { + "epoch": 9.885547576301615, + "grad_norm": 13.448314666748047, + "learning_rate": 9.881597845601437e-06, + "loss": 6.3879, + "step": 110125 + }, + { + "epoch": 9.887791741472173, + "grad_norm": 15.330083847045898, + "learning_rate": 9.883842010771994e-06, + "loss": 6.2703, + "step": 110150 + }, + { + "epoch": 9.890035906642728, + "grad_norm": 12.690573692321777, + "learning_rate": 9.88608617594255e-06, + "loss": 6.6071, + "step": 110175 + }, + { + "epoch": 9.892280071813286, + "grad_norm": 13.180569648742676, + "learning_rate": 9.888330341113107e-06, + "loss": 6.2397, + "step": 110200 + }, + { + "epoch": 9.894524236983841, + "grad_norm": 13.205240249633789, + "learning_rate": 9.890574506283663e-06, + "loss": 6.6912, + "step": 110225 + }, + { + "epoch": 9.896768402154398, + "grad_norm": 12.219529151916504, + "learning_rate": 9.892818671454219e-06, + "loss": 6.5914, + "step": 110250 + }, + { + "epoch": 9.899012567324956, + "grad_norm": 13.317253112792969, + "learning_rate": 9.895062836624776e-06, + "loss": 6.3225, + "step": 110275 + }, + { + "epoch": 9.901256732495511, + "grad_norm": 18.605087280273438, + "learning_rate": 9.89721723518851e-06, + "loss": 6.3121, + "step": 110300 + }, + { + "epoch": 9.903500897666069, + "grad_norm": 13.443893432617188, + "learning_rate": 9.899461400359068e-06, + "loss": 6.2853, + "step": 110325 + }, + { + "epoch": 9.905745062836624, + "grad_norm": 12.873085021972656, + "learning_rate": 9.901705565529624e-06, + "loss": 6.3816, + "step": 110350 + }, + { + "epoch": 9.907989228007182, + "grad_norm": 14.679572105407715, + "learning_rate": 9.903949730700181e-06, + "loss": 6.2009, + "step": 110375 + }, + { + "epoch": 9.910233393177737, + "grad_norm": 14.211260795593262, + "learning_rate": 9.906193895870737e-06, + "loss": 6.5649, + "step": 110400 + }, + { + "epoch": 9.912477558348295, + "grad_norm": 12.29214096069336, + "learning_rate": 9.908438061041293e-06, + "loss": 6.4086, + "step": 110425 + }, + { + "epoch": 9.91472172351885, + "grad_norm": 12.233564376831055, + "learning_rate": 9.91068222621185e-06, + "loss": 6.4471, + "step": 110450 + }, + { + "epoch": 9.916965888689408, + "grad_norm": 14.230286598205566, + "learning_rate": 9.912926391382408e-06, + "loss": 6.2727, + "step": 110475 + }, + { + "epoch": 9.919210053859963, + "grad_norm": 16.16468048095703, + "learning_rate": 9.915170556552963e-06, + "loss": 6.1732, + "step": 110500 + }, + { + "epoch": 9.92145421903052, + "grad_norm": 13.606269836425781, + "learning_rate": 9.917414721723519e-06, + "loss": 6.2955, + "step": 110525 + }, + { + "epoch": 9.923698384201078, + "grad_norm": 12.974431037902832, + "learning_rate": 9.919658886894077e-06, + "loss": 6.3589, + "step": 110550 + }, + { + "epoch": 9.925942549371634, + "grad_norm": 14.1532564163208, + "learning_rate": 9.921903052064632e-06, + "loss": 6.3475, + "step": 110575 + }, + { + "epoch": 9.928186714542191, + "grad_norm": 13.775023460388184, + "learning_rate": 9.92414721723519e-06, + "loss": 6.6245, + "step": 110600 + }, + { + "epoch": 9.930430879712747, + "grad_norm": 17.151395797729492, + "learning_rate": 9.926391382405746e-06, + "loss": 6.3388, + "step": 110625 + }, + { + "epoch": 9.932675044883304, + "grad_norm": 11.358728408813477, + "learning_rate": 9.928635547576301e-06, + "loss": 6.4436, + "step": 110650 + }, + { + "epoch": 9.93491921005386, + "grad_norm": 14.211457252502441, + "learning_rate": 9.930879712746859e-06, + "loss": 6.4671, + "step": 110675 + }, + { + "epoch": 9.937163375224417, + "grad_norm": 12.531098365783691, + "learning_rate": 9.933123877917416e-06, + "loss": 6.5664, + "step": 110700 + }, + { + "epoch": 9.939407540394972, + "grad_norm": 12.32366943359375, + "learning_rate": 9.935368043087972e-06, + "loss": 6.3577, + "step": 110725 + }, + { + "epoch": 9.94165170556553, + "grad_norm": 12.405170440673828, + "learning_rate": 9.93761220825853e-06, + "loss": 6.5352, + "step": 110750 + }, + { + "epoch": 9.943895870736085, + "grad_norm": 13.98643684387207, + "learning_rate": 9.939856373429085e-06, + "loss": 6.4475, + "step": 110775 + }, + { + "epoch": 9.946140035906643, + "grad_norm": 14.178044319152832, + "learning_rate": 9.94201077199282e-06, + "loss": 6.3911, + "step": 110800 + }, + { + "epoch": 9.9483842010772, + "grad_norm": 11.323468208312988, + "learning_rate": 9.944254937163375e-06, + "loss": 6.3926, + "step": 110825 + }, + { + "epoch": 9.950628366247756, + "grad_norm": 10.921385765075684, + "learning_rate": 9.946499102333932e-06, + "loss": 6.3825, + "step": 110850 + }, + { + "epoch": 9.952872531418313, + "grad_norm": 13.023432731628418, + "learning_rate": 9.94874326750449e-06, + "loss": 6.4502, + "step": 110875 + }, + { + "epoch": 9.955116696588869, + "grad_norm": 17.037811279296875, + "learning_rate": 9.950987432675046e-06, + "loss": 6.6418, + "step": 110900 + }, + { + "epoch": 9.957360861759426, + "grad_norm": 13.7045259475708, + "learning_rate": 9.953231597845601e-06, + "loss": 6.2596, + "step": 110925 + }, + { + "epoch": 9.959605026929982, + "grad_norm": 19.840457916259766, + "learning_rate": 9.955475763016159e-06, + "loss": 6.4102, + "step": 110950 + }, + { + "epoch": 9.961849192100539, + "grad_norm": 12.960018157958984, + "learning_rate": 9.957719928186715e-06, + "loss": 6.5596, + "step": 110975 + }, + { + "epoch": 9.964093357271095, + "grad_norm": 13.206340789794922, + "learning_rate": 9.959964093357272e-06, + "loss": 6.2755, + "step": 111000 + }, + { + "epoch": 9.966337522441652, + "grad_norm": 15.024995803833008, + "learning_rate": 9.962208258527828e-06, + "loss": 6.6206, + "step": 111025 + }, + { + "epoch": 9.968581687612208, + "grad_norm": 11.857934951782227, + "learning_rate": 9.964452423698385e-06, + "loss": 6.5398, + "step": 111050 + }, + { + "epoch": 9.970825852782765, + "grad_norm": 15.09706974029541, + "learning_rate": 9.966696588868941e-06, + "loss": 6.45, + "step": 111075 + }, + { + "epoch": 9.973070017953322, + "grad_norm": 14.477025032043457, + "learning_rate": 9.968940754039499e-06, + "loss": 6.4451, + "step": 111100 + }, + { + "epoch": 9.975314183123878, + "grad_norm": 11.296525001525879, + "learning_rate": 9.971184919210054e-06, + "loss": 6.3373, + "step": 111125 + }, + { + "epoch": 9.977558348294435, + "grad_norm": 11.637763023376465, + "learning_rate": 9.973429084380612e-06, + "loss": 6.6321, + "step": 111150 + }, + { + "epoch": 9.97980251346499, + "grad_norm": 18.25012969970703, + "learning_rate": 9.975673249551168e-06, + "loss": 6.4476, + "step": 111175 + }, + { + "epoch": 9.982046678635548, + "grad_norm": 15.719344139099121, + "learning_rate": 9.977917414721723e-06, + "loss": 6.4323, + "step": 111200 + }, + { + "epoch": 9.984290843806104, + "grad_norm": 14.773994445800781, + "learning_rate": 9.980161579892281e-06, + "loss": 6.439, + "step": 111225 + }, + { + "epoch": 9.986535008976661, + "grad_norm": 18.892919540405273, + "learning_rate": 9.982405745062838e-06, + "loss": 6.357, + "step": 111250 + }, + { + "epoch": 9.988779174147217, + "grad_norm": 14.04003620147705, + "learning_rate": 9.984649910233394e-06, + "loss": 6.3873, + "step": 111275 + }, + { + "epoch": 9.991023339317774, + "grad_norm": 15.945775032043457, + "learning_rate": 9.98689407540395e-06, + "loss": 6.5037, + "step": 111300 + }, + { + "epoch": 9.99326750448833, + "grad_norm": 13.563767433166504, + "learning_rate": 9.989138240574507e-06, + "loss": 6.4505, + "step": 111325 + }, + { + "epoch": 9.995511669658887, + "grad_norm": 11.788583755493164, + "learning_rate": 9.991382405745063e-06, + "loss": 6.3894, + "step": 111350 + }, + { + "epoch": 9.997755834829443, + "grad_norm": 12.527373313903809, + "learning_rate": 9.99362657091562e-06, + "loss": 6.4222, + "step": 111375 + }, + { + "epoch": 10.0, + "grad_norm": 15.889495849609375, + "learning_rate": 9.995870736086178e-06, + "loss": 6.4641, + "step": 111400 + }, + { + "epoch": 10.0, + "eval_accuracy": 0.07713898382754472, + "eval_f1_macro": 0.003395527327337749, + "eval_f1_micro": 0.07713898382754472, + "eval_f1_weighted": 0.034601557751348884, + "eval_loss": 7.221798419952393, + "eval_precision_macro": 0.003128529492437783, + "eval_precision_micro": 0.07713898382754472, + "eval_precision_weighted": 0.02734143040994157, + "eval_recall_macro": 0.007043311360089639, + "eval_recall_micro": 0.07713898382754472, + "eval_recall_weighted": 0.07713898382754472, + "eval_runtime": 129.3381, + "eval_samples_per_second": 404.931, + "eval_steps_per_second": 12.657, + "step": 111400 + }, + { + "epoch": 10.002244165170557, + "grad_norm": 11.019133567810059, + "learning_rate": 9.998114901256734e-06, + "loss": 6.1551, + "step": 111425 + }, + { + "epoch": 10.004488330341113, + "grad_norm": 13.418596267700195, + "learning_rate": 9.999960103730302e-06, + "loss": 6.2851, + "step": 111450 + }, + { + "epoch": 10.00673249551167, + "grad_norm": 13.545694351196289, + "learning_rate": 9.999710752044685e-06, + "loss": 6.2886, + "step": 111475 + }, + { + "epoch": 10.008976660682226, + "grad_norm": 13.118175506591797, + "learning_rate": 9.999461400359067e-06, + "loss": 6.2371, + "step": 111500 + }, + { + "epoch": 10.011220825852783, + "grad_norm": 12.583810806274414, + "learning_rate": 9.999212048673451e-06, + "loss": 6.1056, + "step": 111525 + }, + { + "epoch": 10.013464991023339, + "grad_norm": 13.93835163116455, + "learning_rate": 9.998962696987833e-06, + "loss": 6.2293, + "step": 111550 + }, + { + "epoch": 10.015709156193896, + "grad_norm": 12.22034740447998, + "learning_rate": 9.998713345302216e-06, + "loss": 6.1866, + "step": 111575 + }, + { + "epoch": 10.017953321364452, + "grad_norm": 14.327311515808105, + "learning_rate": 9.998463993616598e-06, + "loss": 6.4414, + "step": 111600 + }, + { + "epoch": 10.02019748653501, + "grad_norm": 19.0643367767334, + "learning_rate": 9.99821464193098e-06, + "loss": 6.1968, + "step": 111625 + }, + { + "epoch": 10.022441651705565, + "grad_norm": 12.141693115234375, + "learning_rate": 9.997965290245362e-06, + "loss": 6.1397, + "step": 111650 + }, + { + "epoch": 10.024685816876122, + "grad_norm": 13.908327102661133, + "learning_rate": 9.997715938559745e-06, + "loss": 6.2098, + "step": 111675 + }, + { + "epoch": 10.02692998204668, + "grad_norm": 12.438508033752441, + "learning_rate": 9.997466586874129e-06, + "loss": 6.2039, + "step": 111700 + }, + { + "epoch": 10.029174147217235, + "grad_norm": 13.993825912475586, + "learning_rate": 9.997217235188511e-06, + "loss": 6.0701, + "step": 111725 + }, + { + "epoch": 10.031418312387792, + "grad_norm": 10.388389587402344, + "learning_rate": 9.996967883502893e-06, + "loss": 6.146, + "step": 111750 + }, + { + "epoch": 10.033662477558348, + "grad_norm": 13.029390335083008, + "learning_rate": 9.996718531817276e-06, + "loss": 6.1132, + "step": 111775 + }, + { + "epoch": 10.035906642728905, + "grad_norm": 12.546586990356445, + "learning_rate": 9.996469180131658e-06, + "loss": 6.1112, + "step": 111800 + }, + { + "epoch": 10.038150807899461, + "grad_norm": 13.342358589172363, + "learning_rate": 9.99621982844604e-06, + "loss": 6.2165, + "step": 111825 + }, + { + "epoch": 10.040394973070018, + "grad_norm": 16.238079071044922, + "learning_rate": 9.995970476760424e-06, + "loss": 6.1088, + "step": 111850 + }, + { + "epoch": 10.042639138240574, + "grad_norm": 15.007585525512695, + "learning_rate": 9.995721125074807e-06, + "loss": 5.9725, + "step": 111875 + }, + { + "epoch": 10.044883303411131, + "grad_norm": 13.566699981689453, + "learning_rate": 9.995471773389189e-06, + "loss": 6.2875, + "step": 111900 + }, + { + "epoch": 10.047127468581687, + "grad_norm": 14.108683586120605, + "learning_rate": 9.995222421703571e-06, + "loss": 6.1723, + "step": 111925 + }, + { + "epoch": 10.049371633752244, + "grad_norm": 14.940693855285645, + "learning_rate": 9.994973070017954e-06, + "loss": 5.8318, + "step": 111950 + }, + { + "epoch": 10.0516157989228, + "grad_norm": 12.215937614440918, + "learning_rate": 9.994723718332336e-06, + "loss": 6.1462, + "step": 111975 + }, + { + "epoch": 10.053859964093357, + "grad_norm": 13.69896125793457, + "learning_rate": 9.99447436664672e-06, + "loss": 6.203, + "step": 112000 + }, + { + "epoch": 10.056104129263915, + "grad_norm": 14.92670726776123, + "learning_rate": 9.994225014961102e-06, + "loss": 6.2639, + "step": 112025 + }, + { + "epoch": 10.05834829443447, + "grad_norm": 12.877148628234863, + "learning_rate": 9.993975663275485e-06, + "loss": 6.2016, + "step": 112050 + }, + { + "epoch": 10.060592459605028, + "grad_norm": 12.46399211883545, + "learning_rate": 9.993726311589867e-06, + "loss": 6.2599, + "step": 112075 + }, + { + "epoch": 10.062836624775583, + "grad_norm": 10.066740989685059, + "learning_rate": 9.993476959904251e-06, + "loss": 6.1133, + "step": 112100 + }, + { + "epoch": 10.06508078994614, + "grad_norm": 12.895662307739258, + "learning_rate": 9.993227608218632e-06, + "loss": 6.1714, + "step": 112125 + }, + { + "epoch": 10.067324955116696, + "grad_norm": 15.309555053710938, + "learning_rate": 9.992978256533014e-06, + "loss": 6.0641, + "step": 112150 + }, + { + "epoch": 10.069569120287253, + "grad_norm": 14.578579902648926, + "learning_rate": 9.992728904847398e-06, + "loss": 6.1692, + "step": 112175 + }, + { + "epoch": 10.071813285457809, + "grad_norm": 12.179104804992676, + "learning_rate": 9.99247955316178e-06, + "loss": 6.0808, + "step": 112200 + }, + { + "epoch": 10.074057450628366, + "grad_norm": 15.884713172912598, + "learning_rate": 9.992230201476163e-06, + "loss": 6.1458, + "step": 112225 + }, + { + "epoch": 10.076301615798922, + "grad_norm": 12.4454984664917, + "learning_rate": 9.991980849790547e-06, + "loss": 6.1938, + "step": 112250 + }, + { + "epoch": 10.07854578096948, + "grad_norm": 12.564933776855469, + "learning_rate": 9.991731498104929e-06, + "loss": 6.0696, + "step": 112275 + }, + { + "epoch": 10.080789946140037, + "grad_norm": 14.480239868164062, + "learning_rate": 9.99148214641931e-06, + "loss": 6.1975, + "step": 112300 + }, + { + "epoch": 10.083034111310592, + "grad_norm": 14.00886058807373, + "learning_rate": 9.991232794733693e-06, + "loss": 6.1253, + "step": 112325 + }, + { + "epoch": 10.08527827648115, + "grad_norm": 14.267656326293945, + "learning_rate": 9.990983443048076e-06, + "loss": 6.33, + "step": 112350 + }, + { + "epoch": 10.087522441651705, + "grad_norm": 12.78085708618164, + "learning_rate": 9.990734091362458e-06, + "loss": 6.0239, + "step": 112375 + }, + { + "epoch": 10.089766606822263, + "grad_norm": 14.63577651977539, + "learning_rate": 9.99048473967684e-06, + "loss": 6.2433, + "step": 112400 + }, + { + "epoch": 10.092010771992818, + "grad_norm": 12.422415733337402, + "learning_rate": 9.990235387991224e-06, + "loss": 6.0998, + "step": 112425 + }, + { + "epoch": 10.094254937163376, + "grad_norm": 13.502301216125488, + "learning_rate": 9.989986036305607e-06, + "loss": 6.1336, + "step": 112450 + }, + { + "epoch": 10.096499102333931, + "grad_norm": 11.31605052947998, + "learning_rate": 9.989736684619989e-06, + "loss": 6.2241, + "step": 112475 + }, + { + "epoch": 10.098743267504489, + "grad_norm": 14.0910062789917, + "learning_rate": 9.989487332934371e-06, + "loss": 6.3576, + "step": 112500 + }, + { + "epoch": 10.100987432675044, + "grad_norm": 12.169644355773926, + "learning_rate": 9.989237981248754e-06, + "loss": 6.1655, + "step": 112525 + }, + { + "epoch": 10.103231597845602, + "grad_norm": 14.307677268981934, + "learning_rate": 9.988988629563136e-06, + "loss": 6.2888, + "step": 112550 + }, + { + "epoch": 10.105475763016157, + "grad_norm": 13.183656692504883, + "learning_rate": 9.98873927787752e-06, + "loss": 5.9802, + "step": 112575 + }, + { + "epoch": 10.107719928186714, + "grad_norm": 12.803454399108887, + "learning_rate": 9.988489926191902e-06, + "loss": 6.2632, + "step": 112600 + }, + { + "epoch": 10.109964093357272, + "grad_norm": 12.438281059265137, + "learning_rate": 9.988240574506285e-06, + "loss": 6.0326, + "step": 112625 + }, + { + "epoch": 10.112208258527827, + "grad_norm": 15.322245597839355, + "learning_rate": 9.987991222820667e-06, + "loss": 6.1757, + "step": 112650 + }, + { + "epoch": 10.114452423698385, + "grad_norm": 16.201675415039062, + "learning_rate": 9.98774187113505e-06, + "loss": 5.9708, + "step": 112675 + }, + { + "epoch": 10.11669658886894, + "grad_norm": 11.93720531463623, + "learning_rate": 9.987492519449432e-06, + "loss": 6.3445, + "step": 112700 + }, + { + "epoch": 10.118940754039498, + "grad_norm": 13.516136169433594, + "learning_rate": 9.987243167763816e-06, + "loss": 6.4381, + "step": 112725 + }, + { + "epoch": 10.121184919210053, + "grad_norm": 13.847302436828613, + "learning_rate": 9.986993816078198e-06, + "loss": 6.1747, + "step": 112750 + }, + { + "epoch": 10.12342908438061, + "grad_norm": 13.054784774780273, + "learning_rate": 9.98674446439258e-06, + "loss": 6.1575, + "step": 112775 + }, + { + "epoch": 10.125673249551166, + "grad_norm": 12.022624015808105, + "learning_rate": 9.986495112706963e-06, + "loss": 6.2219, + "step": 112800 + }, + { + "epoch": 10.127917414721724, + "grad_norm": 11.628305435180664, + "learning_rate": 9.986245761021345e-06, + "loss": 6.2862, + "step": 112825 + }, + { + "epoch": 10.13016157989228, + "grad_norm": 12.511070251464844, + "learning_rate": 9.985996409335727e-06, + "loss": 6.0703, + "step": 112850 + }, + { + "epoch": 10.132405745062837, + "grad_norm": 10.705799102783203, + "learning_rate": 9.98574705765011e-06, + "loss": 6.113, + "step": 112875 + }, + { + "epoch": 10.134649910233394, + "grad_norm": 11.655420303344727, + "learning_rate": 9.985497705964494e-06, + "loss": 6.1784, + "step": 112900 + }, + { + "epoch": 10.13689407540395, + "grad_norm": 13.237956047058105, + "learning_rate": 9.985248354278876e-06, + "loss": 6.2797, + "step": 112925 + }, + { + "epoch": 10.139138240574507, + "grad_norm": 12.380025863647461, + "learning_rate": 9.984999002593258e-06, + "loss": 5.9157, + "step": 112950 + }, + { + "epoch": 10.141382405745063, + "grad_norm": 14.719572067260742, + "learning_rate": 9.984749650907642e-06, + "loss": 6.1245, + "step": 112975 + }, + { + "epoch": 10.14362657091562, + "grad_norm": 17.899499893188477, + "learning_rate": 9.984500299222023e-06, + "loss": 6.2249, + "step": 113000 + }, + { + "epoch": 10.145870736086176, + "grad_norm": 16.6634578704834, + "learning_rate": 9.984250947536405e-06, + "loss": 6.2441, + "step": 113025 + }, + { + "epoch": 10.148114901256733, + "grad_norm": 15.134480476379395, + "learning_rate": 9.984001595850789e-06, + "loss": 6.2663, + "step": 113050 + }, + { + "epoch": 10.150359066427288, + "grad_norm": 11.64604377746582, + "learning_rate": 9.983752244165171e-06, + "loss": 6.4103, + "step": 113075 + }, + { + "epoch": 10.152603231597846, + "grad_norm": 16.69687843322754, + "learning_rate": 9.983502892479554e-06, + "loss": 6.0569, + "step": 113100 + }, + { + "epoch": 10.154847396768401, + "grad_norm": 15.205822944641113, + "learning_rate": 9.983253540793936e-06, + "loss": 6.209, + "step": 113125 + }, + { + "epoch": 10.157091561938959, + "grad_norm": 11.119685173034668, + "learning_rate": 9.98300418910832e-06, + "loss": 6.2417, + "step": 113150 + }, + { + "epoch": 10.159335727109514, + "grad_norm": 13.38746166229248, + "learning_rate": 9.9827548374227e-06, + "loss": 6.1013, + "step": 113175 + }, + { + "epoch": 10.161579892280072, + "grad_norm": 12.423396110534668, + "learning_rate": 9.982505485737085e-06, + "loss": 6.0911, + "step": 113200 + }, + { + "epoch": 10.16382405745063, + "grad_norm": 16.844390869140625, + "learning_rate": 9.982256134051467e-06, + "loss": 6.1792, + "step": 113225 + }, + { + "epoch": 10.166068222621185, + "grad_norm": 15.034219741821289, + "learning_rate": 9.98200678236585e-06, + "loss": 6.3862, + "step": 113250 + }, + { + "epoch": 10.168312387791742, + "grad_norm": 15.031222343444824, + "learning_rate": 9.981757430680232e-06, + "loss": 6.1012, + "step": 113275 + }, + { + "epoch": 10.170556552962298, + "grad_norm": 15.725984573364258, + "learning_rate": 9.981508078994616e-06, + "loss": 6.0234, + "step": 113300 + }, + { + "epoch": 10.172800718132855, + "grad_norm": 15.612773895263672, + "learning_rate": 9.981258727308998e-06, + "loss": 6.4614, + "step": 113325 + }, + { + "epoch": 10.17504488330341, + "grad_norm": 18.53880500793457, + "learning_rate": 9.98100937562338e-06, + "loss": 6.1763, + "step": 113350 + }, + { + "epoch": 10.177289048473968, + "grad_norm": 15.850635528564453, + "learning_rate": 9.980760023937763e-06, + "loss": 6.0238, + "step": 113375 + }, + { + "epoch": 10.179533213644524, + "grad_norm": 15.558518409729004, + "learning_rate": 9.980510672252145e-06, + "loss": 6.2575, + "step": 113400 + }, + { + "epoch": 10.181777378815081, + "grad_norm": 11.94745922088623, + "learning_rate": 9.980261320566527e-06, + "loss": 6.1927, + "step": 113425 + }, + { + "epoch": 10.184021543985637, + "grad_norm": 13.76794719696045, + "learning_rate": 9.980011968880911e-06, + "loss": 6.2686, + "step": 113450 + }, + { + "epoch": 10.186265709156194, + "grad_norm": 14.341629981994629, + "learning_rate": 9.979762617195294e-06, + "loss": 6.2637, + "step": 113475 + }, + { + "epoch": 10.188509874326751, + "grad_norm": 12.407752990722656, + "learning_rate": 9.979513265509676e-06, + "loss": 6.1067, + "step": 113500 + }, + { + "epoch": 10.190754039497307, + "grad_norm": 14.886983871459961, + "learning_rate": 9.979263913824058e-06, + "loss": 6.1316, + "step": 113525 + }, + { + "epoch": 10.192998204667864, + "grad_norm": 11.748578071594238, + "learning_rate": 9.97901456213844e-06, + "loss": 6.2354, + "step": 113550 + }, + { + "epoch": 10.19524236983842, + "grad_norm": 13.954566955566406, + "learning_rate": 9.978765210452823e-06, + "loss": 6.0592, + "step": 113575 + }, + { + "epoch": 10.197486535008977, + "grad_norm": 13.10950756072998, + "learning_rate": 9.978515858767205e-06, + "loss": 6.2996, + "step": 113600 + }, + { + "epoch": 10.199730700179533, + "grad_norm": 12.067079544067383, + "learning_rate": 9.97826650708159e-06, + "loss": 6.0497, + "step": 113625 + }, + { + "epoch": 10.20197486535009, + "grad_norm": 13.47501277923584, + "learning_rate": 9.978017155395971e-06, + "loss": 6.1827, + "step": 113650 + }, + { + "epoch": 10.204219030520646, + "grad_norm": 13.537381172180176, + "learning_rate": 9.977767803710354e-06, + "loss": 6.2253, + "step": 113675 + }, + { + "epoch": 10.206463195691203, + "grad_norm": 14.726213455200195, + "learning_rate": 9.977518452024738e-06, + "loss": 6.0231, + "step": 113700 + }, + { + "epoch": 10.208707360861759, + "grad_norm": 16.5965576171875, + "learning_rate": 9.977269100339118e-06, + "loss": 6.3393, + "step": 113725 + }, + { + "epoch": 10.210951526032316, + "grad_norm": 13.59029483795166, + "learning_rate": 9.9770197486535e-06, + "loss": 6.2676, + "step": 113750 + }, + { + "epoch": 10.213195691202873, + "grad_norm": 14.68735122680664, + "learning_rate": 9.976770396967885e-06, + "loss": 6.2605, + "step": 113775 + }, + { + "epoch": 10.215439856373429, + "grad_norm": 12.594502449035645, + "learning_rate": 9.976521045282267e-06, + "loss": 6.2833, + "step": 113800 + }, + { + "epoch": 10.217684021543986, + "grad_norm": 18.72052574157715, + "learning_rate": 9.97627169359665e-06, + "loss": 6.1827, + "step": 113825 + }, + { + "epoch": 10.219928186714542, + "grad_norm": 11.747774124145508, + "learning_rate": 9.976022341911032e-06, + "loss": 6.2674, + "step": 113850 + }, + { + "epoch": 10.2221723518851, + "grad_norm": 14.752880096435547, + "learning_rate": 9.975772990225416e-06, + "loss": 6.0722, + "step": 113875 + }, + { + "epoch": 10.224416517055655, + "grad_norm": 15.984724998474121, + "learning_rate": 9.975523638539796e-06, + "loss": 6.1498, + "step": 113900 + }, + { + "epoch": 10.226660682226212, + "grad_norm": 12.520922660827637, + "learning_rate": 9.97527428685418e-06, + "loss": 6.2846, + "step": 113925 + }, + { + "epoch": 10.228904847396768, + "grad_norm": 13.062174797058105, + "learning_rate": 9.975024935168563e-06, + "loss": 6.1491, + "step": 113950 + }, + { + "epoch": 10.231149012567325, + "grad_norm": 14.754020690917969, + "learning_rate": 9.974775583482945e-06, + "loss": 6.1286, + "step": 113975 + }, + { + "epoch": 10.23339317773788, + "grad_norm": 15.996007919311523, + "learning_rate": 9.974526231797327e-06, + "loss": 6.0896, + "step": 114000 + }, + { + "epoch": 10.235637342908438, + "grad_norm": 14.496719360351562, + "learning_rate": 9.974276880111711e-06, + "loss": 6.2313, + "step": 114025 + }, + { + "epoch": 10.237881508078994, + "grad_norm": 12.704662322998047, + "learning_rate": 9.974027528426094e-06, + "loss": 6.2709, + "step": 114050 + }, + { + "epoch": 10.240125673249551, + "grad_norm": 14.468791961669922, + "learning_rate": 9.973778176740476e-06, + "loss": 6.266, + "step": 114075 + }, + { + "epoch": 10.242369838420109, + "grad_norm": 12.506538391113281, + "learning_rate": 9.973528825054858e-06, + "loss": 6.3691, + "step": 114100 + }, + { + "epoch": 10.244614003590664, + "grad_norm": 16.753398895263672, + "learning_rate": 9.97327947336924e-06, + "loss": 6.305, + "step": 114125 + }, + { + "epoch": 10.246858168761221, + "grad_norm": 14.69544792175293, + "learning_rate": 9.973030121683623e-06, + "loss": 6.4072, + "step": 114150 + }, + { + "epoch": 10.249102333931777, + "grad_norm": 12.991039276123047, + "learning_rate": 9.972780769998007e-06, + "loss": 5.9238, + "step": 114175 + }, + { + "epoch": 10.251346499102334, + "grad_norm": 14.67293643951416, + "learning_rate": 9.97253141831239e-06, + "loss": 6.2589, + "step": 114200 + }, + { + "epoch": 10.25359066427289, + "grad_norm": 13.363348007202148, + "learning_rate": 9.972282066626772e-06, + "loss": 6.165, + "step": 114225 + }, + { + "epoch": 10.255834829443447, + "grad_norm": 12.911703109741211, + "learning_rate": 9.972032714941154e-06, + "loss": 6.3271, + "step": 114250 + }, + { + "epoch": 10.258078994614003, + "grad_norm": 13.683006286621094, + "learning_rate": 9.971783363255536e-06, + "loss": 5.9943, + "step": 114275 + }, + { + "epoch": 10.26032315978456, + "grad_norm": 14.182135581970215, + "learning_rate": 9.971534011569918e-06, + "loss": 6.1565, + "step": 114300 + }, + { + "epoch": 10.262567324955116, + "grad_norm": 16.789316177368164, + "learning_rate": 9.9712846598843e-06, + "loss": 6.1548, + "step": 114325 + }, + { + "epoch": 10.264811490125673, + "grad_norm": 14.753307342529297, + "learning_rate": 9.971035308198685e-06, + "loss": 6.0833, + "step": 114350 + }, + { + "epoch": 10.26705565529623, + "grad_norm": 16.325212478637695, + "learning_rate": 9.970785956513067e-06, + "loss": 6.1731, + "step": 114375 + }, + { + "epoch": 10.269299820466786, + "grad_norm": 13.438663482666016, + "learning_rate": 9.97053660482745e-06, + "loss": 6.3916, + "step": 114400 + }, + { + "epoch": 10.271543985637344, + "grad_norm": 16.29828453063965, + "learning_rate": 9.970287253141832e-06, + "loss": 6.2169, + "step": 114425 + }, + { + "epoch": 10.2737881508079, + "grad_norm": 14.77005386352539, + "learning_rate": 9.970037901456214e-06, + "loss": 6.1835, + "step": 114450 + }, + { + "epoch": 10.276032315978457, + "grad_norm": 14.235603332519531, + "learning_rate": 9.969788549770596e-06, + "loss": 6.2655, + "step": 114475 + }, + { + "epoch": 10.278276481149012, + "grad_norm": 14.187959671020508, + "learning_rate": 9.96953919808498e-06, + "loss": 5.7808, + "step": 114500 + }, + { + "epoch": 10.28052064631957, + "grad_norm": 14.01728630065918, + "learning_rate": 9.969289846399363e-06, + "loss": 6.0559, + "step": 114525 + }, + { + "epoch": 10.282764811490125, + "grad_norm": 15.471481323242188, + "learning_rate": 9.969040494713745e-06, + "loss": 6.0837, + "step": 114550 + }, + { + "epoch": 10.285008976660682, + "grad_norm": 16.827096939086914, + "learning_rate": 9.968791143028127e-06, + "loss": 6.4433, + "step": 114575 + }, + { + "epoch": 10.287253141831238, + "grad_norm": 12.543929100036621, + "learning_rate": 9.96854179134251e-06, + "loss": 6.4092, + "step": 114600 + }, + { + "epoch": 10.289497307001795, + "grad_norm": 15.608855247497559, + "learning_rate": 9.968292439656892e-06, + "loss": 5.9462, + "step": 114625 + }, + { + "epoch": 10.291741472172351, + "grad_norm": 14.738057136535645, + "learning_rate": 9.968043087971276e-06, + "loss": 5.9475, + "step": 114650 + }, + { + "epoch": 10.293985637342908, + "grad_norm": 12.042741775512695, + "learning_rate": 9.967793736285658e-06, + "loss": 6.2269, + "step": 114675 + }, + { + "epoch": 10.296229802513466, + "grad_norm": 13.772116661071777, + "learning_rate": 9.96754438460004e-06, + "loss": 6.1532, + "step": 114700 + }, + { + "epoch": 10.298473967684021, + "grad_norm": 12.73529052734375, + "learning_rate": 9.967295032914423e-06, + "loss": 6.3237, + "step": 114725 + }, + { + "epoch": 10.300718132854579, + "grad_norm": 12.704668998718262, + "learning_rate": 9.967045681228807e-06, + "loss": 6.3267, + "step": 114750 + }, + { + "epoch": 10.302962298025134, + "grad_norm": 13.686273574829102, + "learning_rate": 9.966796329543188e-06, + "loss": 5.9773, + "step": 114775 + }, + { + "epoch": 10.305206463195692, + "grad_norm": 13.381675720214844, + "learning_rate": 9.966546977857572e-06, + "loss": 6.1189, + "step": 114800 + }, + { + "epoch": 10.307450628366247, + "grad_norm": 12.581282615661621, + "learning_rate": 9.966297626171954e-06, + "loss": 6.2191, + "step": 114825 + }, + { + "epoch": 10.309694793536805, + "grad_norm": 13.113036155700684, + "learning_rate": 9.966048274486336e-06, + "loss": 6.0357, + "step": 114850 + }, + { + "epoch": 10.31193895870736, + "grad_norm": 12.273456573486328, + "learning_rate": 9.965798922800719e-06, + "loss": 6.0374, + "step": 114875 + }, + { + "epoch": 10.314183123877918, + "grad_norm": 13.017513275146484, + "learning_rate": 9.965549571115103e-06, + "loss": 6.3817, + "step": 114900 + }, + { + "epoch": 10.316427289048473, + "grad_norm": 14.95040512084961, + "learning_rate": 9.965300219429485e-06, + "loss": 6.2726, + "step": 114925 + }, + { + "epoch": 10.31867145421903, + "grad_norm": 12.985831260681152, + "learning_rate": 9.965050867743867e-06, + "loss": 5.9988, + "step": 114950 + }, + { + "epoch": 10.320915619389588, + "grad_norm": 13.883319854736328, + "learning_rate": 9.96480151605825e-06, + "loss": 6.2008, + "step": 114975 + }, + { + "epoch": 10.323159784560143, + "grad_norm": 17.3106746673584, + "learning_rate": 9.964552164372632e-06, + "loss": 6.2177, + "step": 115000 + }, + { + "epoch": 10.3254039497307, + "grad_norm": 12.127328872680664, + "learning_rate": 9.964302812687014e-06, + "loss": 6.167, + "step": 115025 + }, + { + "epoch": 10.327648114901256, + "grad_norm": 15.2025728225708, + "learning_rate": 9.964053461001396e-06, + "loss": 6.3196, + "step": 115050 + }, + { + "epoch": 10.329892280071814, + "grad_norm": 16.364208221435547, + "learning_rate": 9.96380410931578e-06, + "loss": 6.1933, + "step": 115075 + }, + { + "epoch": 10.33213644524237, + "grad_norm": 16.258771896362305, + "learning_rate": 9.963554757630163e-06, + "loss": 6.2801, + "step": 115100 + }, + { + "epoch": 10.334380610412927, + "grad_norm": 14.781668663024902, + "learning_rate": 9.963305405944545e-06, + "loss": 6.1033, + "step": 115125 + }, + { + "epoch": 10.336624775583482, + "grad_norm": 13.722407341003418, + "learning_rate": 9.963056054258927e-06, + "loss": 6.2113, + "step": 115150 + }, + { + "epoch": 10.33886894075404, + "grad_norm": 13.578237533569336, + "learning_rate": 9.96280670257331e-06, + "loss": 6.1768, + "step": 115175 + }, + { + "epoch": 10.341113105924595, + "grad_norm": 13.855743408203125, + "learning_rate": 9.962557350887692e-06, + "loss": 6.2057, + "step": 115200 + }, + { + "epoch": 10.343357271095153, + "grad_norm": 13.038068771362305, + "learning_rate": 9.962317973269499e-06, + "loss": 6.167, + "step": 115225 + }, + { + "epoch": 10.34560143626571, + "grad_norm": 12.524468421936035, + "learning_rate": 9.962068621583883e-06, + "loss": 6.0966, + "step": 115250 + }, + { + "epoch": 10.347845601436266, + "grad_norm": 16.348445892333984, + "learning_rate": 9.961819269898265e-06, + "loss": 6.1601, + "step": 115275 + }, + { + "epoch": 10.350089766606823, + "grad_norm": 15.088892936706543, + "learning_rate": 9.961569918212648e-06, + "loss": 6.1614, + "step": 115300 + }, + { + "epoch": 10.352333931777379, + "grad_norm": 12.726110458374023, + "learning_rate": 9.96132056652703e-06, + "loss": 6.2001, + "step": 115325 + }, + { + "epoch": 10.354578096947936, + "grad_norm": 13.195673942565918, + "learning_rate": 9.961071214841414e-06, + "loss": 6.2282, + "step": 115350 + }, + { + "epoch": 10.356822262118492, + "grad_norm": 13.612674713134766, + "learning_rate": 9.960821863155796e-06, + "loss": 6.1393, + "step": 115375 + }, + { + "epoch": 10.359066427289049, + "grad_norm": 13.539885520935059, + "learning_rate": 9.960572511470179e-06, + "loss": 6.1956, + "step": 115400 + }, + { + "epoch": 10.361310592459605, + "grad_norm": 13.306929588317871, + "learning_rate": 9.960323159784561e-06, + "loss": 6.1562, + "step": 115425 + }, + { + "epoch": 10.363554757630162, + "grad_norm": 12.219681739807129, + "learning_rate": 9.960073808098943e-06, + "loss": 6.0576, + "step": 115450 + }, + { + "epoch": 10.365798922800717, + "grad_norm": 11.797956466674805, + "learning_rate": 9.959824456413326e-06, + "loss": 6.3784, + "step": 115475 + }, + { + "epoch": 10.368043087971275, + "grad_norm": 21.49988555908203, + "learning_rate": 9.95957510472771e-06, + "loss": 6.0803, + "step": 115500 + }, + { + "epoch": 10.37028725314183, + "grad_norm": 14.57690143585205, + "learning_rate": 9.959325753042092e-06, + "loss": 5.9113, + "step": 115525 + }, + { + "epoch": 10.372531418312388, + "grad_norm": 13.846901893615723, + "learning_rate": 9.959076401356474e-06, + "loss": 6.3314, + "step": 115550 + }, + { + "epoch": 10.374775583482945, + "grad_norm": 13.823548316955566, + "learning_rate": 9.958827049670856e-06, + "loss": 6.1483, + "step": 115575 + }, + { + "epoch": 10.3770197486535, + "grad_norm": 15.535062789916992, + "learning_rate": 9.958577697985239e-06, + "loss": 6.2809, + "step": 115600 + }, + { + "epoch": 10.379263913824058, + "grad_norm": 14.010692596435547, + "learning_rate": 9.958328346299621e-06, + "loss": 6.0977, + "step": 115625 + }, + { + "epoch": 10.381508078994614, + "grad_norm": 13.891032218933105, + "learning_rate": 9.958078994614005e-06, + "loss": 5.9708, + "step": 115650 + }, + { + "epoch": 10.383752244165171, + "grad_norm": 13.065733909606934, + "learning_rate": 9.957829642928387e-06, + "loss": 6.1592, + "step": 115675 + }, + { + "epoch": 10.385996409335727, + "grad_norm": 12.944583892822266, + "learning_rate": 9.95758029124277e-06, + "loss": 6.1509, + "step": 115700 + }, + { + "epoch": 10.388240574506284, + "grad_norm": 13.492417335510254, + "learning_rate": 9.957330939557152e-06, + "loss": 6.2682, + "step": 115725 + }, + { + "epoch": 10.39048473967684, + "grad_norm": 14.779852867126465, + "learning_rate": 9.957081587871534e-06, + "loss": 6.2824, + "step": 115750 + }, + { + "epoch": 10.392728904847397, + "grad_norm": 15.88010311126709, + "learning_rate": 9.956832236185917e-06, + "loss": 5.993, + "step": 115775 + }, + { + "epoch": 10.394973070017953, + "grad_norm": 13.081055641174316, + "learning_rate": 9.9565828845003e-06, + "loss": 6.0689, + "step": 115800 + }, + { + "epoch": 10.39721723518851, + "grad_norm": 14.527236938476562, + "learning_rate": 9.956333532814683e-06, + "loss": 6.1189, + "step": 115825 + }, + { + "epoch": 10.399461400359066, + "grad_norm": 17.19582176208496, + "learning_rate": 9.956084181129065e-06, + "loss": 6.2367, + "step": 115850 + }, + { + "epoch": 10.401705565529623, + "grad_norm": 13.847161293029785, + "learning_rate": 9.955834829443448e-06, + "loss": 6.1055, + "step": 115875 + }, + { + "epoch": 10.40394973070018, + "grad_norm": 15.594660758972168, + "learning_rate": 9.955585477757832e-06, + "loss": 6.2651, + "step": 115900 + }, + { + "epoch": 10.406193895870736, + "grad_norm": 14.668493270874023, + "learning_rate": 9.955336126072212e-06, + "loss": 6.0696, + "step": 115925 + }, + { + "epoch": 10.408438061041293, + "grad_norm": 15.07514762878418, + "learning_rate": 9.955086774386595e-06, + "loss": 6.233, + "step": 115950 + }, + { + "epoch": 10.410682226211849, + "grad_norm": 15.155996322631836, + "learning_rate": 9.954837422700979e-06, + "loss": 5.9958, + "step": 115975 + }, + { + "epoch": 10.412926391382406, + "grad_norm": 13.55897331237793, + "learning_rate": 9.954588071015361e-06, + "loss": 6.2107, + "step": 116000 + }, + { + "epoch": 10.415170556552962, + "grad_norm": 13.27270793914795, + "learning_rate": 9.954338719329743e-06, + "loss": 6.2474, + "step": 116025 + }, + { + "epoch": 10.41741472172352, + "grad_norm": 13.829033851623535, + "learning_rate": 9.954089367644126e-06, + "loss": 6.3361, + "step": 116050 + }, + { + "epoch": 10.419658886894075, + "grad_norm": 13.81086540222168, + "learning_rate": 9.95384001595851e-06, + "loss": 6.2101, + "step": 116075 + }, + { + "epoch": 10.421903052064632, + "grad_norm": 14.257719993591309, + "learning_rate": 9.95359066427289e-06, + "loss": 6.0195, + "step": 116100 + }, + { + "epoch": 10.424147217235188, + "grad_norm": 13.678173065185547, + "learning_rate": 9.953341312587274e-06, + "loss": 6.1204, + "step": 116125 + }, + { + "epoch": 10.426391382405745, + "grad_norm": 14.727384567260742, + "learning_rate": 9.953091960901657e-06, + "loss": 6.0622, + "step": 116150 + }, + { + "epoch": 10.428635547576302, + "grad_norm": 12.818706512451172, + "learning_rate": 9.952842609216039e-06, + "loss": 6.2012, + "step": 116175 + }, + { + "epoch": 10.430879712746858, + "grad_norm": 12.543864250183105, + "learning_rate": 9.952593257530421e-06, + "loss": 5.9194, + "step": 116200 + }, + { + "epoch": 10.433123877917415, + "grad_norm": 14.991695404052734, + "learning_rate": 9.952343905844805e-06, + "loss": 6.2136, + "step": 116225 + }, + { + "epoch": 10.435368043087971, + "grad_norm": 15.390132904052734, + "learning_rate": 9.952094554159187e-06, + "loss": 6.1833, + "step": 116250 + }, + { + "epoch": 10.437612208258528, + "grad_norm": 13.765616416931152, + "learning_rate": 9.95184520247357e-06, + "loss": 6.079, + "step": 116275 + }, + { + "epoch": 10.439856373429084, + "grad_norm": 14.272748947143555, + "learning_rate": 9.951595850787952e-06, + "loss": 6.1778, + "step": 116300 + }, + { + "epoch": 10.442100538599641, + "grad_norm": 14.169120788574219, + "learning_rate": 9.951346499102334e-06, + "loss": 6.0521, + "step": 116325 + }, + { + "epoch": 10.444344703770197, + "grad_norm": 12.652580261230469, + "learning_rate": 9.951097147416717e-06, + "loss": 6.2952, + "step": 116350 + }, + { + "epoch": 10.446588868940754, + "grad_norm": 13.12724494934082, + "learning_rate": 9.9508477957311e-06, + "loss": 6.0747, + "step": 116375 + }, + { + "epoch": 10.44883303411131, + "grad_norm": 11.728991508483887, + "learning_rate": 9.950598444045483e-06, + "loss": 6.319, + "step": 116400 + }, + { + "epoch": 10.451077199281867, + "grad_norm": 11.804323196411133, + "learning_rate": 9.950349092359865e-06, + "loss": 6.2846, + "step": 116425 + }, + { + "epoch": 10.453321364452425, + "grad_norm": 14.11108112335205, + "learning_rate": 9.950099740674248e-06, + "loss": 6.1032, + "step": 116450 + }, + { + "epoch": 10.45556552962298, + "grad_norm": 14.69686222076416, + "learning_rate": 9.94985038898863e-06, + "loss": 6.0589, + "step": 116475 + }, + { + "epoch": 10.457809694793538, + "grad_norm": 16.468170166015625, + "learning_rate": 9.949601037303012e-06, + "loss": 6.0781, + "step": 116500 + }, + { + "epoch": 10.460053859964093, + "grad_norm": 17.075592041015625, + "learning_rate": 9.949351685617396e-06, + "loss": 6.2627, + "step": 116525 + }, + { + "epoch": 10.46229802513465, + "grad_norm": 15.957704544067383, + "learning_rate": 9.949102333931779e-06, + "loss": 6.0612, + "step": 116550 + }, + { + "epoch": 10.464542190305206, + "grad_norm": 12.78443431854248, + "learning_rate": 9.948852982246161e-06, + "loss": 6.1486, + "step": 116575 + }, + { + "epoch": 10.466786355475763, + "grad_norm": 11.800089836120605, + "learning_rate": 9.948603630560543e-06, + "loss": 6.0618, + "step": 116600 + }, + { + "epoch": 10.469030520646319, + "grad_norm": 13.542703628540039, + "learning_rate": 9.948354278874927e-06, + "loss": 6.0342, + "step": 116625 + }, + { + "epoch": 10.471274685816876, + "grad_norm": 12.015517234802246, + "learning_rate": 9.948104927189308e-06, + "loss": 6.1475, + "step": 116650 + }, + { + "epoch": 10.473518850987432, + "grad_norm": 13.456504821777344, + "learning_rate": 9.94785557550369e-06, + "loss": 6.1783, + "step": 116675 + }, + { + "epoch": 10.47576301615799, + "grad_norm": 14.220084190368652, + "learning_rate": 9.947606223818074e-06, + "loss": 6.2272, + "step": 116700 + }, + { + "epoch": 10.478007181328547, + "grad_norm": 13.529407501220703, + "learning_rate": 9.947356872132457e-06, + "loss": 6.1678, + "step": 116725 + }, + { + "epoch": 10.480251346499102, + "grad_norm": 17.79865837097168, + "learning_rate": 9.947107520446839e-06, + "loss": 6.2161, + "step": 116750 + }, + { + "epoch": 10.48249551166966, + "grad_norm": 18.917356491088867, + "learning_rate": 9.946858168761221e-06, + "loss": 6.0681, + "step": 116775 + }, + { + "epoch": 10.484739676840215, + "grad_norm": 11.92551040649414, + "learning_rate": 9.946608817075605e-06, + "loss": 6.1573, + "step": 116800 + }, + { + "epoch": 10.486983842010773, + "grad_norm": 12.83806037902832, + "learning_rate": 9.946359465389986e-06, + "loss": 6.1094, + "step": 116825 + }, + { + "epoch": 10.489228007181328, + "grad_norm": 15.085115432739258, + "learning_rate": 9.94611011370437e-06, + "loss": 6.1099, + "step": 116850 + }, + { + "epoch": 10.491472172351886, + "grad_norm": 13.066300392150879, + "learning_rate": 9.945860762018752e-06, + "loss": 6.3347, + "step": 116875 + }, + { + "epoch": 10.493716337522441, + "grad_norm": 12.435172080993652, + "learning_rate": 9.945611410333134e-06, + "loss": 6.0933, + "step": 116900 + }, + { + "epoch": 10.495960502692999, + "grad_norm": 14.560708045959473, + "learning_rate": 9.945362058647517e-06, + "loss": 5.9978, + "step": 116925 + }, + { + "epoch": 10.498204667863554, + "grad_norm": 14.643082618713379, + "learning_rate": 9.9451127069619e-06, + "loss": 6.2144, + "step": 116950 + }, + { + "epoch": 10.500448833034111, + "grad_norm": 18.37672233581543, + "learning_rate": 9.944863355276283e-06, + "loss": 6.2167, + "step": 116975 + }, + { + "epoch": 10.502692998204667, + "grad_norm": 13.815579414367676, + "learning_rate": 9.944614003590665e-06, + "loss": 6.2042, + "step": 117000 + }, + { + "epoch": 10.504937163375224, + "grad_norm": 13.714104652404785, + "learning_rate": 9.944364651905048e-06, + "loss": 5.7758, + "step": 117025 + }, + { + "epoch": 10.507181328545782, + "grad_norm": 18.36736488342285, + "learning_rate": 9.94411530021943e-06, + "loss": 6.1936, + "step": 117050 + }, + { + "epoch": 10.509425493716337, + "grad_norm": 14.024682998657227, + "learning_rate": 9.943865948533812e-06, + "loss": 5.9506, + "step": 117075 + }, + { + "epoch": 10.511669658886895, + "grad_norm": 16.64815330505371, + "learning_rate": 9.943616596848196e-06, + "loss": 6.1824, + "step": 117100 + }, + { + "epoch": 10.51391382405745, + "grad_norm": 11.700675010681152, + "learning_rate": 9.943367245162579e-06, + "loss": 6.1748, + "step": 117125 + }, + { + "epoch": 10.516157989228008, + "grad_norm": 16.527620315551758, + "learning_rate": 9.943117893476961e-06, + "loss": 6.0514, + "step": 117150 + }, + { + "epoch": 10.518402154398563, + "grad_norm": 11.594467163085938, + "learning_rate": 9.942868541791343e-06, + "loss": 6.2972, + "step": 117175 + }, + { + "epoch": 10.52064631956912, + "grad_norm": 16.306297302246094, + "learning_rate": 9.942619190105726e-06, + "loss": 6.0664, + "step": 117200 + }, + { + "epoch": 10.522890484739676, + "grad_norm": 13.552149772644043, + "learning_rate": 9.942369838420108e-06, + "loss": 6.221, + "step": 117225 + }, + { + "epoch": 10.525134649910234, + "grad_norm": 10.992222785949707, + "learning_rate": 9.942120486734492e-06, + "loss": 6.1553, + "step": 117250 + }, + { + "epoch": 10.52737881508079, + "grad_norm": 12.035684585571289, + "learning_rate": 9.941871135048874e-06, + "loss": 6.0788, + "step": 117275 + }, + { + "epoch": 10.529622980251347, + "grad_norm": 11.983599662780762, + "learning_rate": 9.941621783363257e-06, + "loss": 6.1415, + "step": 117300 + }, + { + "epoch": 10.531867145421902, + "grad_norm": 14.605071067810059, + "learning_rate": 9.941372431677639e-06, + "loss": 6.1439, + "step": 117325 + }, + { + "epoch": 10.53411131059246, + "grad_norm": 12.795331001281738, + "learning_rate": 9.941123079992021e-06, + "loss": 6.0902, + "step": 117350 + }, + { + "epoch": 10.536355475763017, + "grad_norm": 18.06188201904297, + "learning_rate": 9.940873728306404e-06, + "loss": 6.1616, + "step": 117375 + }, + { + "epoch": 10.538599640933572, + "grad_norm": 12.624467849731445, + "learning_rate": 9.940624376620786e-06, + "loss": 6.1473, + "step": 117400 + }, + { + "epoch": 10.54084380610413, + "grad_norm": 16.0347957611084, + "learning_rate": 9.94037502493517e-06, + "loss": 6.2235, + "step": 117425 + }, + { + "epoch": 10.543087971274685, + "grad_norm": 13.866011619567871, + "learning_rate": 9.940135647316977e-06, + "loss": 6.1357, + "step": 117450 + }, + { + "epoch": 10.545332136445243, + "grad_norm": 17.28778648376465, + "learning_rate": 9.939886295631359e-06, + "loss": 6.3266, + "step": 117475 + }, + { + "epoch": 10.547576301615798, + "grad_norm": 12.925146102905273, + "learning_rate": 9.939636943945741e-06, + "loss": 6.1577, + "step": 117500 + }, + { + "epoch": 10.549820466786356, + "grad_norm": 13.713129043579102, + "learning_rate": 9.939387592260124e-06, + "loss": 6.2848, + "step": 117525 + }, + { + "epoch": 10.552064631956911, + "grad_norm": 14.374227523803711, + "learning_rate": 9.939138240574508e-06, + "loss": 6.1733, + "step": 117550 + }, + { + "epoch": 10.554308797127469, + "grad_norm": 15.747072219848633, + "learning_rate": 9.93888888888889e-06, + "loss": 5.847, + "step": 117575 + }, + { + "epoch": 10.556552962298024, + "grad_norm": 13.97408390045166, + "learning_rate": 9.938639537203272e-06, + "loss": 6.1798, + "step": 117600 + }, + { + "epoch": 10.558797127468582, + "grad_norm": 13.851740837097168, + "learning_rate": 9.938390185517655e-06, + "loss": 6.2765, + "step": 117625 + }, + { + "epoch": 10.561041292639139, + "grad_norm": 13.396052360534668, + "learning_rate": 9.938140833832037e-06, + "loss": 6.012, + "step": 117650 + }, + { + "epoch": 10.563285457809695, + "grad_norm": 14.67442512512207, + "learning_rate": 9.93789148214642e-06, + "loss": 6.2868, + "step": 117675 + }, + { + "epoch": 10.565529622980252, + "grad_norm": 15.549674987792969, + "learning_rate": 9.937642130460803e-06, + "loss": 6.1081, + "step": 117700 + }, + { + "epoch": 10.567773788150808, + "grad_norm": 17.645456314086914, + "learning_rate": 9.937392778775186e-06, + "loss": 6.1605, + "step": 117725 + }, + { + "epoch": 10.570017953321365, + "grad_norm": 13.51762580871582, + "learning_rate": 9.937143427089568e-06, + "loss": 6.3216, + "step": 117750 + }, + { + "epoch": 10.57226211849192, + "grad_norm": 14.550630569458008, + "learning_rate": 9.93689407540395e-06, + "loss": 6.2396, + "step": 117775 + }, + { + "epoch": 10.574506283662478, + "grad_norm": 15.408926010131836, + "learning_rate": 9.936644723718333e-06, + "loss": 6.0738, + "step": 117800 + }, + { + "epoch": 10.576750448833034, + "grad_norm": 11.753130912780762, + "learning_rate": 9.936395372032715e-06, + "loss": 6.046, + "step": 117825 + }, + { + "epoch": 10.57899461400359, + "grad_norm": 11.81749153137207, + "learning_rate": 9.936146020347099e-06, + "loss": 5.9885, + "step": 117850 + }, + { + "epoch": 10.581238779174146, + "grad_norm": 13.109404563903809, + "learning_rate": 9.935896668661481e-06, + "loss": 6.1322, + "step": 117875 + }, + { + "epoch": 10.583482944344704, + "grad_norm": 12.107890129089355, + "learning_rate": 9.935647316975864e-06, + "loss": 6.07, + "step": 117900 + }, + { + "epoch": 10.585727109515261, + "grad_norm": 13.73945426940918, + "learning_rate": 9.935397965290246e-06, + "loss": 6.2391, + "step": 117925 + }, + { + "epoch": 10.587971274685817, + "grad_norm": 14.716211318969727, + "learning_rate": 9.935148613604628e-06, + "loss": 6.2775, + "step": 117950 + }, + { + "epoch": 10.590215439856374, + "grad_norm": 15.493619918823242, + "learning_rate": 9.93489926191901e-06, + "loss": 6.2077, + "step": 117975 + }, + { + "epoch": 10.59245960502693, + "grad_norm": 16.36600685119629, + "learning_rate": 9.934649910233395e-06, + "loss": 5.9724, + "step": 118000 + }, + { + "epoch": 10.594703770197487, + "grad_norm": 13.128405570983887, + "learning_rate": 9.934400558547777e-06, + "loss": 6.1836, + "step": 118025 + }, + { + "epoch": 10.596947935368043, + "grad_norm": 15.246312141418457, + "learning_rate": 9.93415120686216e-06, + "loss": 6.2074, + "step": 118050 + }, + { + "epoch": 10.5991921005386, + "grad_norm": 12.46064281463623, + "learning_rate": 9.933901855176542e-06, + "loss": 6.0235, + "step": 118075 + }, + { + "epoch": 10.601436265709156, + "grad_norm": 13.566872596740723, + "learning_rate": 9.933652503490926e-06, + "loss": 6.2174, + "step": 118100 + }, + { + "epoch": 10.603680430879713, + "grad_norm": 14.47032642364502, + "learning_rate": 9.933403151805308e-06, + "loss": 5.9618, + "step": 118125 + }, + { + "epoch": 10.605924596050269, + "grad_norm": 13.195995330810547, + "learning_rate": 9.933153800119688e-06, + "loss": 6.3318, + "step": 118150 + }, + { + "epoch": 10.608168761220826, + "grad_norm": 14.900130271911621, + "learning_rate": 9.932904448434072e-06, + "loss": 5.9329, + "step": 118175 + }, + { + "epoch": 10.610412926391383, + "grad_norm": 13.867321968078613, + "learning_rate": 9.932655096748455e-06, + "loss": 6.0422, + "step": 118200 + }, + { + "epoch": 10.612657091561939, + "grad_norm": 13.276684761047363, + "learning_rate": 9.932405745062837e-06, + "loss": 6.3878, + "step": 118225 + }, + { + "epoch": 10.614901256732496, + "grad_norm": 13.056026458740234, + "learning_rate": 9.932156393377221e-06, + "loss": 6.1737, + "step": 118250 + }, + { + "epoch": 10.617145421903052, + "grad_norm": 13.517363548278809, + "learning_rate": 9.931907041691603e-06, + "loss": 6.2198, + "step": 118275 + }, + { + "epoch": 10.61938958707361, + "grad_norm": 15.768609046936035, + "learning_rate": 9.931657690005986e-06, + "loss": 6.2369, + "step": 118300 + }, + { + "epoch": 10.621633752244165, + "grad_norm": 14.4987154006958, + "learning_rate": 9.931408338320368e-06, + "loss": 6.1846, + "step": 118325 + }, + { + "epoch": 10.623877917414722, + "grad_norm": 13.461377143859863, + "learning_rate": 9.93115898663475e-06, + "loss": 6.3424, + "step": 118350 + }, + { + "epoch": 10.626122082585278, + "grad_norm": 16.284046173095703, + "learning_rate": 9.930909634949133e-06, + "loss": 6.3522, + "step": 118375 + }, + { + "epoch": 10.628366247755835, + "grad_norm": 16.345245361328125, + "learning_rate": 9.930660283263515e-06, + "loss": 5.9992, + "step": 118400 + }, + { + "epoch": 10.63061041292639, + "grad_norm": 16.488676071166992, + "learning_rate": 9.930410931577899e-06, + "loss": 6.074, + "step": 118425 + }, + { + "epoch": 10.632854578096948, + "grad_norm": 15.575264930725098, + "learning_rate": 9.930161579892281e-06, + "loss": 6.0628, + "step": 118450 + }, + { + "epoch": 10.635098743267504, + "grad_norm": 15.204346656799316, + "learning_rate": 9.929912228206664e-06, + "loss": 6.1056, + "step": 118475 + }, + { + "epoch": 10.637342908438061, + "grad_norm": 14.3408203125, + "learning_rate": 9.929662876521046e-06, + "loss": 6.0571, + "step": 118500 + }, + { + "epoch": 10.639587073608617, + "grad_norm": 13.857388496398926, + "learning_rate": 9.929413524835428e-06, + "loss": 6.255, + "step": 118525 + }, + { + "epoch": 10.641831238779174, + "grad_norm": 12.852628707885742, + "learning_rate": 9.92916417314981e-06, + "loss": 6.0769, + "step": 118550 + }, + { + "epoch": 10.644075403949731, + "grad_norm": 19.384809494018555, + "learning_rate": 9.928914821464195e-06, + "loss": 6.074, + "step": 118575 + }, + { + "epoch": 10.646319569120287, + "grad_norm": 12.229957580566406, + "learning_rate": 9.928665469778577e-06, + "loss": 6.1837, + "step": 118600 + }, + { + "epoch": 10.648563734290844, + "grad_norm": 13.242496490478516, + "learning_rate": 9.92841611809296e-06, + "loss": 6.1775, + "step": 118625 + }, + { + "epoch": 10.6508078994614, + "grad_norm": 15.80776309967041, + "learning_rate": 9.928166766407342e-06, + "loss": 6.104, + "step": 118650 + }, + { + "epoch": 10.653052064631957, + "grad_norm": 13.491728782653809, + "learning_rate": 9.927917414721724e-06, + "loss": 6.2261, + "step": 118675 + }, + { + "epoch": 10.655296229802513, + "grad_norm": 12.689077377319336, + "learning_rate": 9.927668063036106e-06, + "loss": 6.1078, + "step": 118700 + }, + { + "epoch": 10.65754039497307, + "grad_norm": 14.604201316833496, + "learning_rate": 9.92741871135049e-06, + "loss": 6.2564, + "step": 118725 + }, + { + "epoch": 10.659784560143626, + "grad_norm": 13.62071418762207, + "learning_rate": 9.927169359664873e-06, + "loss": 6.1311, + "step": 118750 + }, + { + "epoch": 10.662028725314183, + "grad_norm": 16.2298583984375, + "learning_rate": 9.926920007979255e-06, + "loss": 6.1792, + "step": 118775 + }, + { + "epoch": 10.664272890484739, + "grad_norm": 13.605439186096191, + "learning_rate": 9.926670656293637e-06, + "loss": 6.1637, + "step": 118800 + }, + { + "epoch": 10.666517055655296, + "grad_norm": 13.989283561706543, + "learning_rate": 9.926421304608021e-06, + "loss": 5.8931, + "step": 118825 + }, + { + "epoch": 10.668761220825854, + "grad_norm": 14.748190879821777, + "learning_rate": 9.926171952922402e-06, + "loss": 6.2917, + "step": 118850 + }, + { + "epoch": 10.67100538599641, + "grad_norm": 19.06110954284668, + "learning_rate": 9.925922601236784e-06, + "loss": 6.2527, + "step": 118875 + }, + { + "epoch": 10.673249551166966, + "grad_norm": 11.950432777404785, + "learning_rate": 9.925673249551168e-06, + "loss": 6.2348, + "step": 118900 + }, + { + "epoch": 10.675493716337522, + "grad_norm": 15.00532054901123, + "learning_rate": 9.92542389786555e-06, + "loss": 5.9763, + "step": 118925 + }, + { + "epoch": 10.67773788150808, + "grad_norm": 11.877495765686035, + "learning_rate": 9.925174546179933e-06, + "loss": 6.2102, + "step": 118950 + }, + { + "epoch": 10.679982046678635, + "grad_norm": 14.133099555969238, + "learning_rate": 9.924925194494317e-06, + "loss": 6.1589, + "step": 118975 + }, + { + "epoch": 10.682226211849192, + "grad_norm": 17.78510093688965, + "learning_rate": 9.924675842808699e-06, + "loss": 6.2254, + "step": 119000 + }, + { + "epoch": 10.684470377019748, + "grad_norm": 14.05225944519043, + "learning_rate": 9.92442649112308e-06, + "loss": 6.322, + "step": 119025 + }, + { + "epoch": 10.686714542190305, + "grad_norm": 15.799397468566895, + "learning_rate": 9.924177139437464e-06, + "loss": 6.4684, + "step": 119050 + }, + { + "epoch": 10.688958707360861, + "grad_norm": 14.602740287780762, + "learning_rate": 9.923927787751846e-06, + "loss": 6.1977, + "step": 119075 + }, + { + "epoch": 10.691202872531418, + "grad_norm": 15.242654800415039, + "learning_rate": 9.923678436066228e-06, + "loss": 6.3657, + "step": 119100 + }, + { + "epoch": 10.693447037701976, + "grad_norm": 16.53400230407715, + "learning_rate": 9.92342908438061e-06, + "loss": 6.1767, + "step": 119125 + }, + { + "epoch": 10.695691202872531, + "grad_norm": 12.058356285095215, + "learning_rate": 9.923179732694995e-06, + "loss": 6.1458, + "step": 119150 + }, + { + "epoch": 10.697935368043089, + "grad_norm": 13.491421699523926, + "learning_rate": 9.922930381009377e-06, + "loss": 6.3682, + "step": 119175 + }, + { + "epoch": 10.700179533213644, + "grad_norm": 11.563586235046387, + "learning_rate": 9.92268102932376e-06, + "loss": 6.0593, + "step": 119200 + }, + { + "epoch": 10.702423698384202, + "grad_norm": 13.456372261047363, + "learning_rate": 9.922431677638142e-06, + "loss": 6.2851, + "step": 119225 + }, + { + "epoch": 10.704667863554757, + "grad_norm": 16.898719787597656, + "learning_rate": 9.922182325952524e-06, + "loss": 6.0133, + "step": 119250 + }, + { + "epoch": 10.706912028725315, + "grad_norm": 13.138042449951172, + "learning_rate": 9.921932974266906e-06, + "loss": 6.3107, + "step": 119275 + }, + { + "epoch": 10.70915619389587, + "grad_norm": 16.169727325439453, + "learning_rate": 9.92168362258129e-06, + "loss": 6.2792, + "step": 119300 + }, + { + "epoch": 10.711400359066428, + "grad_norm": 16.607872009277344, + "learning_rate": 9.921434270895673e-06, + "loss": 5.9296, + "step": 119325 + }, + { + "epoch": 10.713644524236983, + "grad_norm": 13.169862747192383, + "learning_rate": 9.921184919210055e-06, + "loss": 6.2852, + "step": 119350 + }, + { + "epoch": 10.71588868940754, + "grad_norm": 14.079209327697754, + "learning_rate": 9.920935567524437e-06, + "loss": 6.0266, + "step": 119375 + }, + { + "epoch": 10.718132854578098, + "grad_norm": 11.576985359191895, + "learning_rate": 9.92068621583882e-06, + "loss": 6.1927, + "step": 119400 + }, + { + "epoch": 10.720377019748653, + "grad_norm": 12.27540397644043, + "learning_rate": 9.920436864153202e-06, + "loss": 6.2469, + "step": 119425 + }, + { + "epoch": 10.72262118491921, + "grad_norm": 14.904681205749512, + "learning_rate": 9.920187512467586e-06, + "loss": 6.1162, + "step": 119450 + }, + { + "epoch": 10.724865350089766, + "grad_norm": 13.197246551513672, + "learning_rate": 9.919948134849393e-06, + "loss": 6.3907, + "step": 119475 + }, + { + "epoch": 10.727109515260324, + "grad_norm": 13.427937507629395, + "learning_rate": 9.919698783163775e-06, + "loss": 6.0815, + "step": 119500 + }, + { + "epoch": 10.72935368043088, + "grad_norm": 13.085028648376465, + "learning_rate": 9.919449431478157e-06, + "loss": 6.1483, + "step": 119525 + }, + { + "epoch": 10.731597845601437, + "grad_norm": 13.53249740600586, + "learning_rate": 9.91920007979254e-06, + "loss": 6.2129, + "step": 119550 + }, + { + "epoch": 10.733842010771992, + "grad_norm": 12.897247314453125, + "learning_rate": 9.918950728106924e-06, + "loss": 6.1513, + "step": 119575 + }, + { + "epoch": 10.73608617594255, + "grad_norm": 12.958684921264648, + "learning_rate": 9.918701376421306e-06, + "loss": 6.273, + "step": 119600 + }, + { + "epoch": 10.738330341113105, + "grad_norm": 13.586064338684082, + "learning_rate": 9.918452024735688e-06, + "loss": 5.9795, + "step": 119625 + }, + { + "epoch": 10.740574506283663, + "grad_norm": 17.706775665283203, + "learning_rate": 9.91820267305007e-06, + "loss": 6.222, + "step": 119650 + }, + { + "epoch": 10.742818671454218, + "grad_norm": 12.976371765136719, + "learning_rate": 9.917953321364453e-06, + "loss": 6.4314, + "step": 119675 + }, + { + "epoch": 10.745062836624776, + "grad_norm": 12.328204154968262, + "learning_rate": 9.917703969678835e-06, + "loss": 6.2052, + "step": 119700 + }, + { + "epoch": 10.747307001795333, + "grad_norm": 12.53798770904541, + "learning_rate": 9.91745461799322e-06, + "loss": 6.0297, + "step": 119725 + }, + { + "epoch": 10.749551166965889, + "grad_norm": 16.501113891601562, + "learning_rate": 9.917205266307602e-06, + "loss": 6.1403, + "step": 119750 + }, + { + "epoch": 10.751795332136446, + "grad_norm": 26.41689109802246, + "learning_rate": 9.916955914621984e-06, + "loss": 6.1677, + "step": 119775 + }, + { + "epoch": 10.754039497307001, + "grad_norm": 16.036880493164062, + "learning_rate": 9.916706562936366e-06, + "loss": 6.1244, + "step": 119800 + }, + { + "epoch": 10.756283662477559, + "grad_norm": 14.187278747558594, + "learning_rate": 9.916457211250749e-06, + "loss": 6.2912, + "step": 119825 + }, + { + "epoch": 10.758527827648114, + "grad_norm": 16.944332122802734, + "learning_rate": 9.916207859565131e-06, + "loss": 6.0705, + "step": 119850 + }, + { + "epoch": 10.760771992818672, + "grad_norm": 11.742531776428223, + "learning_rate": 9.915958507879513e-06, + "loss": 6.1314, + "step": 119875 + }, + { + "epoch": 10.763016157989227, + "grad_norm": 12.26762866973877, + "learning_rate": 9.915709156193897e-06, + "loss": 6.0728, + "step": 119900 + }, + { + "epoch": 10.765260323159785, + "grad_norm": 12.59873104095459, + "learning_rate": 9.91545980450828e-06, + "loss": 6.0799, + "step": 119925 + }, + { + "epoch": 10.76750448833034, + "grad_norm": 13.638697624206543, + "learning_rate": 9.915210452822662e-06, + "loss": 6.3825, + "step": 119950 + }, + { + "epoch": 10.769748653500898, + "grad_norm": 14.814310073852539, + "learning_rate": 9.914961101137044e-06, + "loss": 6.2958, + "step": 119975 + }, + { + "epoch": 10.771992818671453, + "grad_norm": 15.352435111999512, + "learning_rate": 9.914711749451426e-06, + "loss": 6.1957, + "step": 120000 + }, + { + "epoch": 10.77423698384201, + "grad_norm": 16.200122833251953, + "learning_rate": 9.914462397765809e-06, + "loss": 6.0549, + "step": 120025 + }, + { + "epoch": 10.776481149012568, + "grad_norm": 13.533501625061035, + "learning_rate": 9.914213046080193e-06, + "loss": 5.8411, + "step": 120050 + }, + { + "epoch": 10.778725314183124, + "grad_norm": 15.274639129638672, + "learning_rate": 9.913963694394575e-06, + "loss": 6.1312, + "step": 120075 + }, + { + "epoch": 10.780969479353681, + "grad_norm": 13.429688453674316, + "learning_rate": 9.913714342708957e-06, + "loss": 6.179, + "step": 120100 + }, + { + "epoch": 10.783213644524237, + "grad_norm": 14.298039436340332, + "learning_rate": 9.91346499102334e-06, + "loss": 6.3845, + "step": 120125 + }, + { + "epoch": 10.785457809694794, + "grad_norm": 14.596199989318848, + "learning_rate": 9.913215639337724e-06, + "loss": 5.9904, + "step": 120150 + }, + { + "epoch": 10.78770197486535, + "grad_norm": 15.515877723693848, + "learning_rate": 9.912966287652104e-06, + "loss": 6.1367, + "step": 120175 + }, + { + "epoch": 10.789946140035907, + "grad_norm": 15.787416458129883, + "learning_rate": 9.912716935966488e-06, + "loss": 6.3943, + "step": 120200 + }, + { + "epoch": 10.792190305206462, + "grad_norm": 12.936419486999512, + "learning_rate": 9.91246758428087e-06, + "loss": 6.2163, + "step": 120225 + }, + { + "epoch": 10.79443447037702, + "grad_norm": 17.72237777709961, + "learning_rate": 9.912218232595253e-06, + "loss": 6.3122, + "step": 120250 + }, + { + "epoch": 10.796678635547575, + "grad_norm": 13.751188278198242, + "learning_rate": 9.911968880909635e-06, + "loss": 6.3064, + "step": 120275 + }, + { + "epoch": 10.798922800718133, + "grad_norm": 15.093024253845215, + "learning_rate": 9.91171952922402e-06, + "loss": 6.3892, + "step": 120300 + }, + { + "epoch": 10.80116696588869, + "grad_norm": 15.665129661560059, + "learning_rate": 9.911470177538402e-06, + "loss": 6.2148, + "step": 120325 + }, + { + "epoch": 10.803411131059246, + "grad_norm": 11.064018249511719, + "learning_rate": 9.911220825852782e-06, + "loss": 6.1573, + "step": 120350 + }, + { + "epoch": 10.805655296229803, + "grad_norm": 18.01207160949707, + "learning_rate": 9.910971474167166e-06, + "loss": 6.2967, + "step": 120375 + }, + { + "epoch": 10.807899461400359, + "grad_norm": 14.182719230651855, + "learning_rate": 9.910722122481549e-06, + "loss": 6.2043, + "step": 120400 + }, + { + "epoch": 10.810143626570916, + "grad_norm": 14.137421607971191, + "learning_rate": 9.910472770795931e-06, + "loss": 6.2067, + "step": 120425 + }, + { + "epoch": 10.812387791741472, + "grad_norm": 13.279022216796875, + "learning_rate": 9.910223419110315e-06, + "loss": 6.1785, + "step": 120450 + }, + { + "epoch": 10.814631956912029, + "grad_norm": 11.841418266296387, + "learning_rate": 9.909974067424697e-06, + "loss": 5.9113, + "step": 120475 + }, + { + "epoch": 10.816876122082585, + "grad_norm": 15.331753730773926, + "learning_rate": 9.90972471573908e-06, + "loss": 6.2746, + "step": 120500 + }, + { + "epoch": 10.819120287253142, + "grad_norm": 13.106661796569824, + "learning_rate": 9.909475364053462e-06, + "loss": 6.1653, + "step": 120525 + }, + { + "epoch": 10.821364452423698, + "grad_norm": 15.457744598388672, + "learning_rate": 9.909226012367844e-06, + "loss": 6.0217, + "step": 120550 + }, + { + "epoch": 10.823608617594255, + "grad_norm": 14.08447265625, + "learning_rate": 9.908976660682227e-06, + "loss": 6.194, + "step": 120575 + }, + { + "epoch": 10.825852782764812, + "grad_norm": 13.166254997253418, + "learning_rate": 9.908727308996609e-06, + "loss": 6.1876, + "step": 120600 + }, + { + "epoch": 10.828096947935368, + "grad_norm": 12.173356056213379, + "learning_rate": 9.908477957310993e-06, + "loss": 6.3156, + "step": 120625 + }, + { + "epoch": 10.830341113105925, + "grad_norm": 15.50946044921875, + "learning_rate": 9.908228605625375e-06, + "loss": 6.081, + "step": 120650 + }, + { + "epoch": 10.83258527827648, + "grad_norm": 13.432740211486816, + "learning_rate": 9.907979253939757e-06, + "loss": 6.1583, + "step": 120675 + }, + { + "epoch": 10.834829443447038, + "grad_norm": 12.149306297302246, + "learning_rate": 9.90772990225414e-06, + "loss": 5.9709, + "step": 120700 + }, + { + "epoch": 10.837073608617594, + "grad_norm": 12.028841018676758, + "learning_rate": 9.907480550568522e-06, + "loss": 5.9861, + "step": 120725 + }, + { + "epoch": 10.839317773788151, + "grad_norm": 15.118492126464844, + "learning_rate": 9.907231198882904e-06, + "loss": 6.1228, + "step": 120750 + }, + { + "epoch": 10.841561938958707, + "grad_norm": 13.598746299743652, + "learning_rate": 9.906981847197288e-06, + "loss": 6.0697, + "step": 120775 + }, + { + "epoch": 10.843806104129264, + "grad_norm": 13.040824890136719, + "learning_rate": 9.90673249551167e-06, + "loss": 6.297, + "step": 120800 + }, + { + "epoch": 10.84605026929982, + "grad_norm": 11.961688995361328, + "learning_rate": 9.906483143826053e-06, + "loss": 5.9744, + "step": 120825 + }, + { + "epoch": 10.848294434470377, + "grad_norm": 11.973241806030273, + "learning_rate": 9.906233792140435e-06, + "loss": 6.1898, + "step": 120850 + }, + { + "epoch": 10.850538599640934, + "grad_norm": 13.723494529724121, + "learning_rate": 9.905984440454818e-06, + "loss": 6.3314, + "step": 120875 + }, + { + "epoch": 10.85278276481149, + "grad_norm": 14.814408302307129, + "learning_rate": 9.9057350887692e-06, + "loss": 6.2354, + "step": 120900 + }, + { + "epoch": 10.855026929982047, + "grad_norm": 14.389415740966797, + "learning_rate": 9.905485737083584e-06, + "loss": 5.9055, + "step": 120925 + }, + { + "epoch": 10.857271095152603, + "grad_norm": 12.774707794189453, + "learning_rate": 9.905236385397966e-06, + "loss": 6.1759, + "step": 120950 + }, + { + "epoch": 10.85951526032316, + "grad_norm": 15.151957511901855, + "learning_rate": 9.904987033712349e-06, + "loss": 6.2018, + "step": 120975 + }, + { + "epoch": 10.861759425493716, + "grad_norm": 13.917842864990234, + "learning_rate": 9.904737682026731e-06, + "loss": 6.3607, + "step": 121000 + }, + { + "epoch": 10.864003590664273, + "grad_norm": 13.939277648925781, + "learning_rate": 9.904488330341115e-06, + "loss": 6.0766, + "step": 121025 + }, + { + "epoch": 10.866247755834829, + "grad_norm": 12.659622192382812, + "learning_rate": 9.904238978655496e-06, + "loss": 6.3518, + "step": 121050 + }, + { + "epoch": 10.868491921005386, + "grad_norm": 12.335886001586914, + "learning_rate": 9.903989626969878e-06, + "loss": 6.0387, + "step": 121075 + }, + { + "epoch": 10.870736086175942, + "grad_norm": 16.873624801635742, + "learning_rate": 9.903740275284262e-06, + "loss": 6.3686, + "step": 121100 + }, + { + "epoch": 10.8729802513465, + "grad_norm": 15.930216789245605, + "learning_rate": 9.903490923598644e-06, + "loss": 6.0453, + "step": 121125 + }, + { + "epoch": 10.875224416517055, + "grad_norm": 12.669122695922852, + "learning_rate": 9.903241571913027e-06, + "loss": 6.2253, + "step": 121150 + }, + { + "epoch": 10.877468581687612, + "grad_norm": 16.557958602905273, + "learning_rate": 9.90299222022741e-06, + "loss": 6.0185, + "step": 121175 + }, + { + "epoch": 10.87971274685817, + "grad_norm": 18.874666213989258, + "learning_rate": 9.902742868541793e-06, + "loss": 6.0074, + "step": 121200 + }, + { + "epoch": 10.881956912028725, + "grad_norm": 13.984445571899414, + "learning_rate": 9.902493516856174e-06, + "loss": 6.4123, + "step": 121225 + }, + { + "epoch": 10.884201077199283, + "grad_norm": 14.503372192382812, + "learning_rate": 9.902244165170558e-06, + "loss": 5.9756, + "step": 121250 + }, + { + "epoch": 10.886445242369838, + "grad_norm": 13.737139701843262, + "learning_rate": 9.90199481348494e-06, + "loss": 6.1745, + "step": 121275 + }, + { + "epoch": 10.888689407540395, + "grad_norm": 14.08408260345459, + "learning_rate": 9.901745461799322e-06, + "loss": 6.3131, + "step": 121300 + }, + { + "epoch": 10.890933572710951, + "grad_norm": 12.694790840148926, + "learning_rate": 9.901496110113704e-06, + "loss": 6.0039, + "step": 121325 + }, + { + "epoch": 10.893177737881508, + "grad_norm": 15.21885871887207, + "learning_rate": 9.901246758428089e-06, + "loss": 5.9756, + "step": 121350 + }, + { + "epoch": 10.895421903052064, + "grad_norm": 16.513652801513672, + "learning_rate": 9.90099740674247e-06, + "loss": 6.0527, + "step": 121375 + }, + { + "epoch": 10.897666068222621, + "grad_norm": 12.20182991027832, + "learning_rate": 9.900748055056853e-06, + "loss": 6.1776, + "step": 121400 + }, + { + "epoch": 10.899910233393177, + "grad_norm": 15.597916603088379, + "learning_rate": 9.900498703371235e-06, + "loss": 5.9664, + "step": 121425 + }, + { + "epoch": 10.902154398563734, + "grad_norm": 12.903572082519531, + "learning_rate": 9.900249351685618e-06, + "loss": 5.9805, + "step": 121450 + }, + { + "epoch": 10.90439856373429, + "grad_norm": 13.4376802444458, + "learning_rate": 9.9e-06, + "loss": 6.1064, + "step": 121475 + }, + { + "epoch": 10.906642728904847, + "grad_norm": 14.07511043548584, + "learning_rate": 9.899760622381807e-06, + "loss": 6.1211, + "step": 121500 + }, + { + "epoch": 10.908886894075405, + "grad_norm": 14.190185546875, + "learning_rate": 9.899511270696191e-06, + "loss": 6.1758, + "step": 121525 + }, + { + "epoch": 10.91113105924596, + "grad_norm": 18.00996208190918, + "learning_rate": 9.899261919010573e-06, + "loss": 6.4179, + "step": 121550 + }, + { + "epoch": 10.913375224416518, + "grad_norm": 12.422967910766602, + "learning_rate": 9.899012567324956e-06, + "loss": 6.285, + "step": 121575 + }, + { + "epoch": 10.915619389587073, + "grad_norm": 15.260369300842285, + "learning_rate": 9.898763215639338e-06, + "loss": 6.1205, + "step": 121600 + }, + { + "epoch": 10.91786355475763, + "grad_norm": 15.916173934936523, + "learning_rate": 9.898513863953722e-06, + "loss": 6.1257, + "step": 121625 + }, + { + "epoch": 10.920107719928186, + "grad_norm": 14.518957138061523, + "learning_rate": 9.898264512268104e-06, + "loss": 6.0226, + "step": 121650 + }, + { + "epoch": 10.922351885098744, + "grad_norm": 15.555551528930664, + "learning_rate": 9.898015160582487e-06, + "loss": 6.1881, + "step": 121675 + }, + { + "epoch": 10.9245960502693, + "grad_norm": 15.045280456542969, + "learning_rate": 9.897765808896869e-06, + "loss": 6.0691, + "step": 121700 + }, + { + "epoch": 10.926840215439857, + "grad_norm": 11.786077499389648, + "learning_rate": 9.897516457211251e-06, + "loss": 6.1559, + "step": 121725 + }, + { + "epoch": 10.929084380610412, + "grad_norm": 15.685522079467773, + "learning_rate": 9.897267105525634e-06, + "loss": 6.2258, + "step": 121750 + }, + { + "epoch": 10.93132854578097, + "grad_norm": 14.648402214050293, + "learning_rate": 9.897017753840018e-06, + "loss": 6.1985, + "step": 121775 + }, + { + "epoch": 10.933572710951527, + "grad_norm": 12.166013717651367, + "learning_rate": 9.8967684021544e-06, + "loss": 5.9681, + "step": 121800 + }, + { + "epoch": 10.935816876122082, + "grad_norm": 12.715103149414062, + "learning_rate": 9.896519050468782e-06, + "loss": 6.0485, + "step": 121825 + }, + { + "epoch": 10.93806104129264, + "grad_norm": 15.265273094177246, + "learning_rate": 9.896269698783165e-06, + "loss": 6.3514, + "step": 121850 + }, + { + "epoch": 10.940305206463195, + "grad_norm": 14.362995147705078, + "learning_rate": 9.896020347097547e-06, + "loss": 6.3106, + "step": 121875 + }, + { + "epoch": 10.942549371633753, + "grad_norm": 14.647754669189453, + "learning_rate": 9.89577099541193e-06, + "loss": 5.9992, + "step": 121900 + }, + { + "epoch": 10.944793536804308, + "grad_norm": 15.017901420593262, + "learning_rate": 9.895521643726313e-06, + "loss": 6.05, + "step": 121925 + }, + { + "epoch": 10.947037701974866, + "grad_norm": 13.793179512023926, + "learning_rate": 9.895272292040696e-06, + "loss": 6.1394, + "step": 121950 + }, + { + "epoch": 10.949281867145421, + "grad_norm": 15.207568168640137, + "learning_rate": 9.895022940355078e-06, + "loss": 6.0618, + "step": 121975 + }, + { + "epoch": 10.951526032315979, + "grad_norm": 15.759702682495117, + "learning_rate": 9.89477358866946e-06, + "loss": 6.2503, + "step": 122000 + }, + { + "epoch": 10.953770197486534, + "grad_norm": 16.098966598510742, + "learning_rate": 9.894524236983842e-06, + "loss": 6.1027, + "step": 122025 + }, + { + "epoch": 10.956014362657092, + "grad_norm": 13.359086990356445, + "learning_rate": 9.894274885298225e-06, + "loss": 5.9532, + "step": 122050 + }, + { + "epoch": 10.958258527827649, + "grad_norm": 15.300271987915039, + "learning_rate": 9.894025533612607e-06, + "loss": 6.0223, + "step": 122075 + }, + { + "epoch": 10.960502692998205, + "grad_norm": 13.575864791870117, + "learning_rate": 9.893776181926991e-06, + "loss": 6.2936, + "step": 122100 + }, + { + "epoch": 10.962746858168762, + "grad_norm": 16.410354614257812, + "learning_rate": 9.893526830241373e-06, + "loss": 6.1285, + "step": 122125 + }, + { + "epoch": 10.964991023339318, + "grad_norm": 12.694114685058594, + "learning_rate": 9.893277478555756e-06, + "loss": 6.0119, + "step": 122150 + }, + { + "epoch": 10.967235188509875, + "grad_norm": 16.779041290283203, + "learning_rate": 9.89302812687014e-06, + "loss": 6.0757, + "step": 122175 + }, + { + "epoch": 10.96947935368043, + "grad_norm": 15.029654502868652, + "learning_rate": 9.89277877518452e-06, + "loss": 5.9359, + "step": 122200 + }, + { + "epoch": 10.971723518850988, + "grad_norm": 13.742959976196289, + "learning_rate": 9.892529423498903e-06, + "loss": 6.1337, + "step": 122225 + }, + { + "epoch": 10.973967684021543, + "grad_norm": 15.693480491638184, + "learning_rate": 9.892280071813287e-06, + "loss": 6.2954, + "step": 122250 + }, + { + "epoch": 10.9762118491921, + "grad_norm": 17.53208351135254, + "learning_rate": 9.892030720127669e-06, + "loss": 5.9622, + "step": 122275 + }, + { + "epoch": 10.978456014362656, + "grad_norm": 12.904881477355957, + "learning_rate": 9.891781368442051e-06, + "loss": 6.2347, + "step": 122300 + }, + { + "epoch": 10.980700179533214, + "grad_norm": 14.40416145324707, + "learning_rate": 9.891532016756434e-06, + "loss": 6.3727, + "step": 122325 + }, + { + "epoch": 10.982944344703771, + "grad_norm": 13.166825294494629, + "learning_rate": 9.891282665070818e-06, + "loss": 6.3323, + "step": 122350 + }, + { + "epoch": 10.985188509874327, + "grad_norm": 14.086435317993164, + "learning_rate": 9.891033313385198e-06, + "loss": 5.9832, + "step": 122375 + }, + { + "epoch": 10.987432675044884, + "grad_norm": 13.986516952514648, + "learning_rate": 9.890783961699582e-06, + "loss": 6.1579, + "step": 122400 + }, + { + "epoch": 10.98967684021544, + "grad_norm": 17.48889923095703, + "learning_rate": 9.890534610013965e-06, + "loss": 6.2052, + "step": 122425 + }, + { + "epoch": 10.991921005385997, + "grad_norm": 13.429991722106934, + "learning_rate": 9.890285258328347e-06, + "loss": 6.2175, + "step": 122450 + }, + { + "epoch": 10.994165170556553, + "grad_norm": 13.74560546875, + "learning_rate": 9.89003590664273e-06, + "loss": 5.7976, + "step": 122475 + }, + { + "epoch": 10.99640933572711, + "grad_norm": 12.871654510498047, + "learning_rate": 9.889786554957113e-06, + "loss": 6.0276, + "step": 122500 + }, + { + "epoch": 10.998653500897666, + "grad_norm": 14.153260231018066, + "learning_rate": 9.889537203271496e-06, + "loss": 6.2986, + "step": 122525 + }, + { + "epoch": 11.0, + "eval_accuracy": 0.07841826895537778, + "eval_f1_macro": 0.00421093208718805, + "eval_f1_micro": 0.07841826895537778, + "eval_f1_weighted": 0.03725373276471899, + "eval_loss": 7.122722625732422, + "eval_precision_macro": 0.0037487479195924965, + "eval_precision_micro": 0.07841826895537778, + "eval_precision_weighted": 0.029315849284094093, + "eval_recall_macro": 0.008212087389898233, + "eval_recall_micro": 0.07841826895537778, + "eval_recall_weighted": 0.07841826895537778, + "eval_runtime": 129.2269, + "eval_samples_per_second": 405.279, + "eval_steps_per_second": 12.668, + "step": 122540 + }, + { + "epoch": 11.000897666068223, + "grad_norm": 15.212898254394531, + "learning_rate": 9.889287851585876e-06, + "loss": 5.8359, + "step": 122550 + }, + { + "epoch": 11.003141831238779, + "grad_norm": 13.392788887023926, + "learning_rate": 9.88903849990026e-06, + "loss": 5.8947, + "step": 122575 + }, + { + "epoch": 11.005385996409336, + "grad_norm": 12.349957466125488, + "learning_rate": 9.888789148214642e-06, + "loss": 5.6841, + "step": 122600 + }, + { + "epoch": 11.007630161579891, + "grad_norm": 14.262967109680176, + "learning_rate": 9.888539796529025e-06, + "loss": 6.0198, + "step": 122625 + }, + { + "epoch": 11.009874326750449, + "grad_norm": 14.372055053710938, + "learning_rate": 9.888290444843409e-06, + "loss": 5.8354, + "step": 122650 + }, + { + "epoch": 11.012118491921006, + "grad_norm": 11.776667594909668, + "learning_rate": 9.888041093157791e-06, + "loss": 5.9355, + "step": 122675 + }, + { + "epoch": 11.014362657091562, + "grad_norm": 12.665971755981445, + "learning_rate": 9.887791741472173e-06, + "loss": 5.581, + "step": 122700 + }, + { + "epoch": 11.01660682226212, + "grad_norm": 17.657642364501953, + "learning_rate": 9.887542389786556e-06, + "loss": 5.9208, + "step": 122725 + }, + { + "epoch": 11.018850987432675, + "grad_norm": 15.908074378967285, + "learning_rate": 9.887293038100938e-06, + "loss": 5.8855, + "step": 122750 + }, + { + "epoch": 11.021095152603232, + "grad_norm": 13.101197242736816, + "learning_rate": 9.88704368641532e-06, + "loss": 5.851, + "step": 122775 + }, + { + "epoch": 11.023339317773788, + "grad_norm": 12.305944442749023, + "learning_rate": 9.886794334729703e-06, + "loss": 5.9416, + "step": 122800 + }, + { + "epoch": 11.025583482944345, + "grad_norm": 15.744866371154785, + "learning_rate": 9.886544983044087e-06, + "loss": 5.6161, + "step": 122825 + }, + { + "epoch": 11.0278276481149, + "grad_norm": 15.79822826385498, + "learning_rate": 9.886295631358469e-06, + "loss": 5.766, + "step": 122850 + }, + { + "epoch": 11.030071813285458, + "grad_norm": 13.211987495422363, + "learning_rate": 9.886046279672851e-06, + "loss": 5.8505, + "step": 122875 + }, + { + "epoch": 11.032315978456014, + "grad_norm": 12.8656005859375, + "learning_rate": 9.885796927987234e-06, + "loss": 5.8493, + "step": 122900 + }, + { + "epoch": 11.034560143626571, + "grad_norm": 13.379805564880371, + "learning_rate": 9.885547576301616e-06, + "loss": 5.8333, + "step": 122925 + }, + { + "epoch": 11.036804308797127, + "grad_norm": 12.982284545898438, + "learning_rate": 9.885298224615998e-06, + "loss": 5.7443, + "step": 122950 + }, + { + "epoch": 11.039048473967684, + "grad_norm": 16.78384017944336, + "learning_rate": 9.885048872930382e-06, + "loss": 5.9792, + "step": 122975 + }, + { + "epoch": 11.041292639138241, + "grad_norm": 15.546758651733398, + "learning_rate": 9.884799521244765e-06, + "loss": 6.0935, + "step": 123000 + }, + { + "epoch": 11.043536804308797, + "grad_norm": 14.047802925109863, + "learning_rate": 9.884550169559147e-06, + "loss": 5.8151, + "step": 123025 + }, + { + "epoch": 11.045780969479354, + "grad_norm": 12.961902618408203, + "learning_rate": 9.88430081787353e-06, + "loss": 5.847, + "step": 123050 + }, + { + "epoch": 11.04802513464991, + "grad_norm": 16.362041473388672, + "learning_rate": 9.884051466187913e-06, + "loss": 5.7699, + "step": 123075 + }, + { + "epoch": 11.050269299820467, + "grad_norm": 16.58650016784668, + "learning_rate": 9.883802114502294e-06, + "loss": 5.6671, + "step": 123100 + }, + { + "epoch": 11.052513464991023, + "grad_norm": 15.36952018737793, + "learning_rate": 9.883552762816678e-06, + "loss": 5.7657, + "step": 123125 + }, + { + "epoch": 11.05475763016158, + "grad_norm": 14.458913803100586, + "learning_rate": 9.88330341113106e-06, + "loss": 5.7834, + "step": 123150 + }, + { + "epoch": 11.057001795332136, + "grad_norm": 16.863431930541992, + "learning_rate": 9.883054059445443e-06, + "loss": 5.8849, + "step": 123175 + }, + { + "epoch": 11.059245960502693, + "grad_norm": 12.91762924194336, + "learning_rate": 9.882804707759825e-06, + "loss": 5.9842, + "step": 123200 + }, + { + "epoch": 11.061490125673249, + "grad_norm": 16.225927352905273, + "learning_rate": 9.882555356074209e-06, + "loss": 6.0651, + "step": 123225 + }, + { + "epoch": 11.063734290843806, + "grad_norm": 14.064132690429688, + "learning_rate": 9.882306004388591e-06, + "loss": 6.0196, + "step": 123250 + }, + { + "epoch": 11.065978456014363, + "grad_norm": 17.432662963867188, + "learning_rate": 9.882056652702973e-06, + "loss": 5.8587, + "step": 123275 + }, + { + "epoch": 11.068222621184919, + "grad_norm": 13.228371620178223, + "learning_rate": 9.881807301017356e-06, + "loss": 5.8632, + "step": 123300 + }, + { + "epoch": 11.070466786355476, + "grad_norm": 15.044435501098633, + "learning_rate": 9.881557949331738e-06, + "loss": 5.9045, + "step": 123325 + }, + { + "epoch": 11.072710951526032, + "grad_norm": 15.368718147277832, + "learning_rate": 9.88130859764612e-06, + "loss": 5.9634, + "step": 123350 + }, + { + "epoch": 11.07495511669659, + "grad_norm": 13.533321380615234, + "learning_rate": 9.881059245960504e-06, + "loss": 5.8824, + "step": 123375 + }, + { + "epoch": 11.077199281867145, + "grad_norm": 15.21696949005127, + "learning_rate": 9.880809894274887e-06, + "loss": 5.9072, + "step": 123400 + }, + { + "epoch": 11.079443447037702, + "grad_norm": 15.151921272277832, + "learning_rate": 9.880560542589269e-06, + "loss": 5.7469, + "step": 123425 + }, + { + "epoch": 11.081687612208258, + "grad_norm": 15.892813682556152, + "learning_rate": 9.880311190903651e-06, + "loss": 6.1586, + "step": 123450 + }, + { + "epoch": 11.083931777378815, + "grad_norm": 13.086346626281738, + "learning_rate": 9.880061839218034e-06, + "loss": 6.0713, + "step": 123475 + }, + { + "epoch": 11.08617594254937, + "grad_norm": 17.124481201171875, + "learning_rate": 9.879812487532416e-06, + "loss": 5.9767, + "step": 123500 + }, + { + "epoch": 11.088420107719928, + "grad_norm": 16.002729415893555, + "learning_rate": 9.879563135846798e-06, + "loss": 5.9713, + "step": 123525 + }, + { + "epoch": 11.090664272890486, + "grad_norm": 12.886330604553223, + "learning_rate": 9.879313784161182e-06, + "loss": 5.9178, + "step": 123550 + }, + { + "epoch": 11.092908438061041, + "grad_norm": 14.14970874786377, + "learning_rate": 9.87907440654299e-06, + "loss": 5.8717, + "step": 123575 + }, + { + "epoch": 11.095152603231599, + "grad_norm": 14.376486778259277, + "learning_rate": 9.878825054857372e-06, + "loss": 5.7245, + "step": 123600 + }, + { + "epoch": 11.097396768402154, + "grad_norm": 14.082846641540527, + "learning_rate": 9.878575703171754e-06, + "loss": 5.8994, + "step": 123625 + }, + { + "epoch": 11.099640933572712, + "grad_norm": 14.184966087341309, + "learning_rate": 9.878326351486138e-06, + "loss": 5.92, + "step": 123650 + }, + { + "epoch": 11.101885098743267, + "grad_norm": 12.591816902160645, + "learning_rate": 9.87807699980052e-06, + "loss": 6.1603, + "step": 123675 + }, + { + "epoch": 11.104129263913824, + "grad_norm": 17.75528907775879, + "learning_rate": 9.877827648114901e-06, + "loss": 5.9109, + "step": 123700 + }, + { + "epoch": 11.10637342908438, + "grad_norm": 11.46168327331543, + "learning_rate": 9.877578296429285e-06, + "loss": 5.9124, + "step": 123725 + }, + { + "epoch": 11.108617594254937, + "grad_norm": 19.84630012512207, + "learning_rate": 9.877328944743667e-06, + "loss": 5.8248, + "step": 123750 + }, + { + "epoch": 11.110861759425493, + "grad_norm": 12.554841995239258, + "learning_rate": 9.87707959305805e-06, + "loss": 5.8839, + "step": 123775 + }, + { + "epoch": 11.11310592459605, + "grad_norm": 13.288397789001465, + "learning_rate": 9.876830241372432e-06, + "loss": 6.0373, + "step": 123800 + }, + { + "epoch": 11.115350089766606, + "grad_norm": 14.90313720703125, + "learning_rate": 9.876580889686816e-06, + "loss": 5.8834, + "step": 123825 + }, + { + "epoch": 11.117594254937163, + "grad_norm": 15.534248352050781, + "learning_rate": 9.876331538001198e-06, + "loss": 5.9461, + "step": 123850 + }, + { + "epoch": 11.11983842010772, + "grad_norm": 13.726080894470215, + "learning_rate": 9.87608218631558e-06, + "loss": 5.8883, + "step": 123875 + }, + { + "epoch": 11.122082585278276, + "grad_norm": 13.700102806091309, + "learning_rate": 9.875832834629963e-06, + "loss": 5.9406, + "step": 123900 + }, + { + "epoch": 11.124326750448834, + "grad_norm": 14.954806327819824, + "learning_rate": 9.875583482944345e-06, + "loss": 5.7396, + "step": 123925 + }, + { + "epoch": 11.12657091561939, + "grad_norm": 16.102996826171875, + "learning_rate": 9.875334131258727e-06, + "loss": 5.9922, + "step": 123950 + }, + { + "epoch": 11.128815080789947, + "grad_norm": 15.084527015686035, + "learning_rate": 9.875084779573111e-06, + "loss": 6.0405, + "step": 123975 + }, + { + "epoch": 11.131059245960502, + "grad_norm": 11.728472709655762, + "learning_rate": 9.874835427887494e-06, + "loss": 5.9208, + "step": 124000 + }, + { + "epoch": 11.13330341113106, + "grad_norm": 12.733758926391602, + "learning_rate": 9.874586076201876e-06, + "loss": 5.845, + "step": 124025 + }, + { + "epoch": 11.135547576301615, + "grad_norm": 13.397956848144531, + "learning_rate": 9.874336724516258e-06, + "loss": 5.7905, + "step": 124050 + }, + { + "epoch": 11.137791741472173, + "grad_norm": 14.8693265914917, + "learning_rate": 9.87408737283064e-06, + "loss": 5.8779, + "step": 124075 + }, + { + "epoch": 11.140035906642728, + "grad_norm": 15.936904907226562, + "learning_rate": 9.873838021145023e-06, + "loss": 5.7637, + "step": 124100 + }, + { + "epoch": 11.142280071813286, + "grad_norm": 17.124086380004883, + "learning_rate": 9.873588669459407e-06, + "loss": 5.9224, + "step": 124125 + }, + { + "epoch": 11.144524236983843, + "grad_norm": 14.165079116821289, + "learning_rate": 9.87333931777379e-06, + "loss": 5.8362, + "step": 124150 + }, + { + "epoch": 11.146768402154398, + "grad_norm": 13.134766578674316, + "learning_rate": 9.873089966088172e-06, + "loss": 5.7506, + "step": 124175 + }, + { + "epoch": 11.149012567324956, + "grad_norm": 12.287940979003906, + "learning_rate": 9.872840614402554e-06, + "loss": 6.0029, + "step": 124200 + }, + { + "epoch": 11.151256732495511, + "grad_norm": 14.438082695007324, + "learning_rate": 9.872591262716936e-06, + "loss": 5.9507, + "step": 124225 + }, + { + "epoch": 11.153500897666069, + "grad_norm": 17.616241455078125, + "learning_rate": 9.872341911031319e-06, + "loss": 6.1589, + "step": 124250 + }, + { + "epoch": 11.155745062836624, + "grad_norm": 14.602463722229004, + "learning_rate": 9.872092559345701e-06, + "loss": 5.8236, + "step": 124275 + }, + { + "epoch": 11.157989228007182, + "grad_norm": 12.965046882629395, + "learning_rate": 9.871843207660085e-06, + "loss": 5.9908, + "step": 124300 + }, + { + "epoch": 11.160233393177737, + "grad_norm": 17.80219841003418, + "learning_rate": 9.871593855974467e-06, + "loss": 5.9105, + "step": 124325 + }, + { + "epoch": 11.162477558348295, + "grad_norm": 13.008999824523926, + "learning_rate": 9.87134450428885e-06, + "loss": 5.9497, + "step": 124350 + }, + { + "epoch": 11.16472172351885, + "grad_norm": 13.293633460998535, + "learning_rate": 9.871095152603234e-06, + "loss": 5.8647, + "step": 124375 + }, + { + "epoch": 11.166965888689408, + "grad_norm": 12.345253944396973, + "learning_rate": 9.870845800917614e-06, + "loss": 5.543, + "step": 124400 + }, + { + "epoch": 11.169210053859963, + "grad_norm": 16.934114456176758, + "learning_rate": 9.870596449231997e-06, + "loss": 6.048, + "step": 124425 + }, + { + "epoch": 11.17145421903052, + "grad_norm": 13.64240550994873, + "learning_rate": 9.87034709754638e-06, + "loss": 6.3425, + "step": 124450 + }, + { + "epoch": 11.173698384201078, + "grad_norm": 14.493309020996094, + "learning_rate": 9.870097745860763e-06, + "loss": 5.9258, + "step": 124475 + }, + { + "epoch": 11.175942549371634, + "grad_norm": 13.217451095581055, + "learning_rate": 9.869848394175145e-06, + "loss": 5.7256, + "step": 124500 + }, + { + "epoch": 11.178186714542191, + "grad_norm": 14.532031059265137, + "learning_rate": 9.869599042489527e-06, + "loss": 6.0373, + "step": 124525 + }, + { + "epoch": 11.180430879712747, + "grad_norm": 14.89008617401123, + "learning_rate": 9.869349690803911e-06, + "loss": 5.7014, + "step": 124550 + }, + { + "epoch": 11.182675044883304, + "grad_norm": 14.348172187805176, + "learning_rate": 9.869100339118294e-06, + "loss": 5.9616, + "step": 124575 + }, + { + "epoch": 11.18491921005386, + "grad_norm": 13.642805099487305, + "learning_rate": 9.868850987432676e-06, + "loss": 5.8709, + "step": 124600 + }, + { + "epoch": 11.187163375224417, + "grad_norm": 15.032594680786133, + "learning_rate": 9.868601635747058e-06, + "loss": 5.9463, + "step": 124625 + }, + { + "epoch": 11.189407540394972, + "grad_norm": 13.961465835571289, + "learning_rate": 9.86835228406144e-06, + "loss": 5.9567, + "step": 124650 + }, + { + "epoch": 11.19165170556553, + "grad_norm": 14.296107292175293, + "learning_rate": 9.868102932375823e-06, + "loss": 5.9036, + "step": 124675 + }, + { + "epoch": 11.193895870736085, + "grad_norm": 17.761335372924805, + "learning_rate": 9.867853580690207e-06, + "loss": 5.7895, + "step": 124700 + }, + { + "epoch": 11.196140035906643, + "grad_norm": 16.13467788696289, + "learning_rate": 9.86760422900459e-06, + "loss": 5.9433, + "step": 124725 + }, + { + "epoch": 11.1983842010772, + "grad_norm": 13.34069538116455, + "learning_rate": 9.867354877318972e-06, + "loss": 5.7293, + "step": 124750 + }, + { + "epoch": 11.200628366247756, + "grad_norm": 15.597334861755371, + "learning_rate": 9.867105525633354e-06, + "loss": 5.6985, + "step": 124775 + }, + { + "epoch": 11.202872531418313, + "grad_norm": 14.234192848205566, + "learning_rate": 9.866856173947736e-06, + "loss": 5.6985, + "step": 124800 + }, + { + "epoch": 11.205116696588869, + "grad_norm": 13.153831481933594, + "learning_rate": 9.866606822262119e-06, + "loss": 5.8882, + "step": 124825 + }, + { + "epoch": 11.207360861759426, + "grad_norm": 13.53307819366455, + "learning_rate": 9.866357470576503e-06, + "loss": 6.041, + "step": 124850 + }, + { + "epoch": 11.209605026929982, + "grad_norm": 13.132146835327148, + "learning_rate": 9.866108118890885e-06, + "loss": 5.8747, + "step": 124875 + }, + { + "epoch": 11.211849192100539, + "grad_norm": 14.097916603088379, + "learning_rate": 9.865858767205267e-06, + "loss": 5.8982, + "step": 124900 + }, + { + "epoch": 11.214093357271095, + "grad_norm": 12.611454010009766, + "learning_rate": 9.86560941551965e-06, + "loss": 6.0044, + "step": 124925 + }, + { + "epoch": 11.216337522441652, + "grad_norm": 14.551291465759277, + "learning_rate": 9.865360063834032e-06, + "loss": 5.8714, + "step": 124950 + }, + { + "epoch": 11.218581687612208, + "grad_norm": 15.397804260253906, + "learning_rate": 9.865110712148414e-06, + "loss": 6.1706, + "step": 124975 + }, + { + "epoch": 11.220825852782765, + "grad_norm": 12.646705627441406, + "learning_rate": 9.864861360462797e-06, + "loss": 5.9906, + "step": 125000 + }, + { + "epoch": 11.223070017953322, + "grad_norm": 14.542316436767578, + "learning_rate": 9.86461200877718e-06, + "loss": 5.9967, + "step": 125025 + }, + { + "epoch": 11.225314183123878, + "grad_norm": 13.671542167663574, + "learning_rate": 9.864362657091563e-06, + "loss": 5.9579, + "step": 125050 + }, + { + "epoch": 11.227558348294435, + "grad_norm": 16.16288948059082, + "learning_rate": 9.864113305405945e-06, + "loss": 5.7579, + "step": 125075 + }, + { + "epoch": 11.22980251346499, + "grad_norm": 12.793145179748535, + "learning_rate": 9.86386395372033e-06, + "loss": 6.2189, + "step": 125100 + }, + { + "epoch": 11.232046678635548, + "grad_norm": 14.847328186035156, + "learning_rate": 9.86361460203471e-06, + "loss": 5.9293, + "step": 125125 + }, + { + "epoch": 11.234290843806104, + "grad_norm": 15.129499435424805, + "learning_rate": 9.863365250349092e-06, + "loss": 5.8327, + "step": 125150 + }, + { + "epoch": 11.236535008976661, + "grad_norm": 15.379789352416992, + "learning_rate": 9.863115898663476e-06, + "loss": 5.9019, + "step": 125175 + }, + { + "epoch": 11.238779174147217, + "grad_norm": 15.249789237976074, + "learning_rate": 9.862866546977858e-06, + "loss": 5.812, + "step": 125200 + }, + { + "epoch": 11.241023339317774, + "grad_norm": 15.54255485534668, + "learning_rate": 9.86261719529224e-06, + "loss": 6.1654, + "step": 125225 + }, + { + "epoch": 11.24326750448833, + "grad_norm": 11.456687927246094, + "learning_rate": 9.862367843606623e-06, + "loss": 5.927, + "step": 125250 + }, + { + "epoch": 11.245511669658887, + "grad_norm": 16.47123908996582, + "learning_rate": 9.862118491921007e-06, + "loss": 5.5382, + "step": 125275 + }, + { + "epoch": 11.247755834829443, + "grad_norm": 16.65957260131836, + "learning_rate": 9.861869140235388e-06, + "loss": 5.9048, + "step": 125300 + }, + { + "epoch": 11.25, + "grad_norm": 13.372590065002441, + "learning_rate": 9.861619788549772e-06, + "loss": 6.0946, + "step": 125325 + }, + { + "epoch": 11.252244165170557, + "grad_norm": 13.988485336303711, + "learning_rate": 9.861370436864154e-06, + "loss": 5.5501, + "step": 125350 + }, + { + "epoch": 11.254488330341113, + "grad_norm": 13.645984649658203, + "learning_rate": 9.861121085178536e-06, + "loss": 6.0288, + "step": 125375 + }, + { + "epoch": 11.25673249551167, + "grad_norm": 12.339776992797852, + "learning_rate": 9.860871733492919e-06, + "loss": 5.9573, + "step": 125400 + }, + { + "epoch": 11.258976660682226, + "grad_norm": 14.2048978805542, + "learning_rate": 9.860622381807303e-06, + "loss": 5.9552, + "step": 125425 + }, + { + "epoch": 11.261220825852783, + "grad_norm": 17.28348731994629, + "learning_rate": 9.860373030121685e-06, + "loss": 6.0258, + "step": 125450 + }, + { + "epoch": 11.263464991023339, + "grad_norm": 13.389452934265137, + "learning_rate": 9.860123678436067e-06, + "loss": 5.8245, + "step": 125475 + }, + { + "epoch": 11.265709156193896, + "grad_norm": 14.454455375671387, + "learning_rate": 9.85987432675045e-06, + "loss": 5.7089, + "step": 125500 + }, + { + "epoch": 11.267953321364452, + "grad_norm": 13.237222671508789, + "learning_rate": 9.859624975064832e-06, + "loss": 6.003, + "step": 125525 + }, + { + "epoch": 11.27019748653501, + "grad_norm": 17.59484100341797, + "learning_rate": 9.859375623379214e-06, + "loss": 5.9393, + "step": 125550 + }, + { + "epoch": 11.272441651705565, + "grad_norm": 12.634407043457031, + "learning_rate": 9.859126271693598e-06, + "loss": 5.8913, + "step": 125575 + }, + { + "epoch": 11.274685816876122, + "grad_norm": 19.76247215270996, + "learning_rate": 9.85887692000798e-06, + "loss": 5.7723, + "step": 125600 + }, + { + "epoch": 11.276929982046678, + "grad_norm": 14.224422454833984, + "learning_rate": 9.858627568322363e-06, + "loss": 6.0629, + "step": 125625 + }, + { + "epoch": 11.279174147217235, + "grad_norm": 15.188237190246582, + "learning_rate": 9.858378216636745e-06, + "loss": 6.0577, + "step": 125650 + }, + { + "epoch": 11.281418312387792, + "grad_norm": 11.745467185974121, + "learning_rate": 9.858128864951128e-06, + "loss": 5.8021, + "step": 125675 + }, + { + "epoch": 11.283662477558348, + "grad_norm": 17.212411880493164, + "learning_rate": 9.85787951326551e-06, + "loss": 5.8724, + "step": 125700 + }, + { + "epoch": 11.285906642728905, + "grad_norm": 14.631608009338379, + "learning_rate": 9.857630161579894e-06, + "loss": 6.1246, + "step": 125725 + }, + { + "epoch": 11.288150807899461, + "grad_norm": 12.126426696777344, + "learning_rate": 9.857380809894276e-06, + "loss": 5.773, + "step": 125750 + }, + { + "epoch": 11.290394973070018, + "grad_norm": 13.64586353302002, + "learning_rate": 9.857131458208659e-06, + "loss": 5.6806, + "step": 125775 + }, + { + "epoch": 11.292639138240574, + "grad_norm": 17.381065368652344, + "learning_rate": 9.85688210652304e-06, + "loss": 5.8563, + "step": 125800 + }, + { + "epoch": 11.294883303411131, + "grad_norm": 15.127209663391113, + "learning_rate": 9.856632754837423e-06, + "loss": 6.0025, + "step": 125825 + }, + { + "epoch": 11.297127468581687, + "grad_norm": 12.973708152770996, + "learning_rate": 9.856383403151805e-06, + "loss": 5.9695, + "step": 125850 + }, + { + "epoch": 11.299371633752244, + "grad_norm": 17.04529571533203, + "learning_rate": 9.856134051466188e-06, + "loss": 5.8071, + "step": 125875 + }, + { + "epoch": 11.3016157989228, + "grad_norm": 12.302536964416504, + "learning_rate": 9.855884699780572e-06, + "loss": 6.0006, + "step": 125900 + }, + { + "epoch": 11.303859964093357, + "grad_norm": 14.892513275146484, + "learning_rate": 9.855635348094954e-06, + "loss": 5.8266, + "step": 125925 + }, + { + "epoch": 11.306104129263915, + "grad_norm": 16.231876373291016, + "learning_rate": 9.855385996409336e-06, + "loss": 6.0417, + "step": 125950 + }, + { + "epoch": 11.30834829443447, + "grad_norm": 13.794641494750977, + "learning_rate": 9.855136644723719e-06, + "loss": 5.9235, + "step": 125975 + }, + { + "epoch": 11.310592459605028, + "grad_norm": 16.7657527923584, + "learning_rate": 9.854887293038101e-06, + "loss": 5.9162, + "step": 126000 + }, + { + "epoch": 11.312836624775583, + "grad_norm": 16.810075759887695, + "learning_rate": 9.854637941352483e-06, + "loss": 5.7959, + "step": 126025 + }, + { + "epoch": 11.31508078994614, + "grad_norm": 13.627239227294922, + "learning_rate": 9.854398563734292e-06, + "loss": 5.9111, + "step": 126050 + }, + { + "epoch": 11.317324955116696, + "grad_norm": 16.178874969482422, + "learning_rate": 9.854149212048674e-06, + "loss": 5.8401, + "step": 126075 + }, + { + "epoch": 11.319569120287253, + "grad_norm": 12.87552261352539, + "learning_rate": 9.853899860363057e-06, + "loss": 5.764, + "step": 126100 + }, + { + "epoch": 11.321813285457809, + "grad_norm": 14.028192520141602, + "learning_rate": 9.853650508677439e-06, + "loss": 5.8667, + "step": 126125 + }, + { + "epoch": 11.324057450628366, + "grad_norm": 14.874176025390625, + "learning_rate": 9.853401156991821e-06, + "loss": 5.7662, + "step": 126150 + }, + { + "epoch": 11.326301615798922, + "grad_norm": 13.889017105102539, + "learning_rate": 9.853151805306205e-06, + "loss": 5.9471, + "step": 126175 + }, + { + "epoch": 11.32854578096948, + "grad_norm": 15.182404518127441, + "learning_rate": 9.852902453620588e-06, + "loss": 5.6486, + "step": 126200 + }, + { + "epoch": 11.330789946140037, + "grad_norm": 14.584778785705566, + "learning_rate": 9.85265310193497e-06, + "loss": 5.9839, + "step": 126225 + }, + { + "epoch": 11.333034111310592, + "grad_norm": 15.410819053649902, + "learning_rate": 9.852403750249352e-06, + "loss": 5.8751, + "step": 126250 + }, + { + "epoch": 11.33527827648115, + "grad_norm": 11.31413745880127, + "learning_rate": 9.852154398563735e-06, + "loss": 5.9799, + "step": 126275 + }, + { + "epoch": 11.337522441651705, + "grad_norm": 13.530692100524902, + "learning_rate": 9.851905046878117e-06, + "loss": 5.9754, + "step": 126300 + }, + { + "epoch": 11.339766606822263, + "grad_norm": 13.032711029052734, + "learning_rate": 9.851655695192501e-06, + "loss": 5.8762, + "step": 126325 + }, + { + "epoch": 11.342010771992818, + "grad_norm": 15.266511917114258, + "learning_rate": 9.851406343506883e-06, + "loss": 5.7719, + "step": 126350 + }, + { + "epoch": 11.344254937163376, + "grad_norm": 14.284220695495605, + "learning_rate": 9.851156991821266e-06, + "loss": 5.9163, + "step": 126375 + }, + { + "epoch": 11.346499102333931, + "grad_norm": 13.69585132598877, + "learning_rate": 9.850907640135648e-06, + "loss": 6.1553, + "step": 126400 + }, + { + "epoch": 11.348743267504489, + "grad_norm": 12.982365608215332, + "learning_rate": 9.850658288450032e-06, + "loss": 5.8471, + "step": 126425 + }, + { + "epoch": 11.350987432675044, + "grad_norm": 15.92010498046875, + "learning_rate": 9.850408936764412e-06, + "loss": 5.8269, + "step": 126450 + }, + { + "epoch": 11.353231597845602, + "grad_norm": 12.647523880004883, + "learning_rate": 9.850159585078796e-06, + "loss": 5.9637, + "step": 126475 + }, + { + "epoch": 11.355475763016157, + "grad_norm": 15.820829391479492, + "learning_rate": 9.849910233393179e-06, + "loss": 6.0371, + "step": 126500 + }, + { + "epoch": 11.357719928186714, + "grad_norm": 14.620039939880371, + "learning_rate": 9.849660881707561e-06, + "loss": 6.0158, + "step": 126525 + }, + { + "epoch": 11.359964093357272, + "grad_norm": 17.230876922607422, + "learning_rate": 9.849411530021943e-06, + "loss": 5.8067, + "step": 126550 + }, + { + "epoch": 11.362208258527827, + "grad_norm": 13.390742301940918, + "learning_rate": 9.849162178336327e-06, + "loss": 5.8281, + "step": 126575 + }, + { + "epoch": 11.364452423698385, + "grad_norm": 13.604199409484863, + "learning_rate": 9.84891282665071e-06, + "loss": 6.1484, + "step": 126600 + }, + { + "epoch": 11.36669658886894, + "grad_norm": 12.335336685180664, + "learning_rate": 9.84866347496509e-06, + "loss": 6.25, + "step": 126625 + }, + { + "epoch": 11.368940754039498, + "grad_norm": 14.617795944213867, + "learning_rate": 9.848414123279474e-06, + "loss": 5.8935, + "step": 126650 + }, + { + "epoch": 11.371184919210053, + "grad_norm": 14.274748802185059, + "learning_rate": 9.848164771593857e-06, + "loss": 6.0461, + "step": 126675 + }, + { + "epoch": 11.37342908438061, + "grad_norm": 13.899663925170898, + "learning_rate": 9.847915419908239e-06, + "loss": 5.9911, + "step": 126700 + }, + { + "epoch": 11.375673249551166, + "grad_norm": 16.821319580078125, + "learning_rate": 9.847666068222621e-06, + "loss": 6.0641, + "step": 126725 + }, + { + "epoch": 11.377917414721724, + "grad_norm": 13.745955467224121, + "learning_rate": 9.847416716537005e-06, + "loss": 5.9323, + "step": 126750 + }, + { + "epoch": 11.38016157989228, + "grad_norm": 13.912734985351562, + "learning_rate": 9.847167364851388e-06, + "loss": 5.8728, + "step": 126775 + }, + { + "epoch": 11.382405745062837, + "grad_norm": 18.9571475982666, + "learning_rate": 9.84691801316577e-06, + "loss": 6.0201, + "step": 126800 + }, + { + "epoch": 11.384649910233394, + "grad_norm": 14.552946090698242, + "learning_rate": 9.846668661480152e-06, + "loss": 5.7592, + "step": 126825 + }, + { + "epoch": 11.38689407540395, + "grad_norm": 14.39069652557373, + "learning_rate": 9.846419309794535e-06, + "loss": 5.8344, + "step": 126850 + }, + { + "epoch": 11.389138240574507, + "grad_norm": 13.273011207580566, + "learning_rate": 9.846169958108917e-06, + "loss": 5.941, + "step": 126875 + }, + { + "epoch": 11.391382405745063, + "grad_norm": 14.529260635375977, + "learning_rate": 9.845920606423301e-06, + "loss": 5.9838, + "step": 126900 + }, + { + "epoch": 11.39362657091562, + "grad_norm": 13.721771240234375, + "learning_rate": 9.845671254737683e-06, + "loss": 5.963, + "step": 126925 + }, + { + "epoch": 11.395870736086176, + "grad_norm": 13.420525550842285, + "learning_rate": 9.845421903052066e-06, + "loss": 6.0681, + "step": 126950 + }, + { + "epoch": 11.398114901256733, + "grad_norm": 16.306365966796875, + "learning_rate": 9.845172551366448e-06, + "loss": 5.9739, + "step": 126975 + }, + { + "epoch": 11.400359066427288, + "grad_norm": 14.219463348388672, + "learning_rate": 9.84492319968083e-06, + "loss": 6.0515, + "step": 127000 + }, + { + "epoch": 11.402603231597846, + "grad_norm": 13.490338325500488, + "learning_rate": 9.844673847995213e-06, + "loss": 5.9366, + "step": 127025 + }, + { + "epoch": 11.404847396768401, + "grad_norm": 12.955401420593262, + "learning_rate": 9.844424496309597e-06, + "loss": 6.2951, + "step": 127050 + }, + { + "epoch": 11.407091561938959, + "grad_norm": 13.305789947509766, + "learning_rate": 9.844175144623979e-06, + "loss": 5.9769, + "step": 127075 + }, + { + "epoch": 11.409335727109514, + "grad_norm": 15.462806701660156, + "learning_rate": 9.843925792938361e-06, + "loss": 5.9885, + "step": 127100 + }, + { + "epoch": 11.411579892280072, + "grad_norm": 12.316009521484375, + "learning_rate": 9.843676441252743e-06, + "loss": 6.0318, + "step": 127125 + }, + { + "epoch": 11.41382405745063, + "grad_norm": 15.317309379577637, + "learning_rate": 9.843427089567126e-06, + "loss": 5.9847, + "step": 127150 + }, + { + "epoch": 11.416068222621185, + "grad_norm": 15.434735298156738, + "learning_rate": 9.843177737881508e-06, + "loss": 5.8057, + "step": 127175 + }, + { + "epoch": 11.418312387791742, + "grad_norm": 14.205634117126465, + "learning_rate": 9.842928386195892e-06, + "loss": 5.8851, + "step": 127200 + }, + { + "epoch": 11.420556552962298, + "grad_norm": 15.88529109954834, + "learning_rate": 9.842679034510274e-06, + "loss": 6.1385, + "step": 127225 + }, + { + "epoch": 11.422800718132855, + "grad_norm": 12.818830490112305, + "learning_rate": 9.842429682824657e-06, + "loss": 5.8764, + "step": 127250 + }, + { + "epoch": 11.42504488330341, + "grad_norm": 13.838766098022461, + "learning_rate": 9.842180331139039e-06, + "loss": 5.7255, + "step": 127275 + }, + { + "epoch": 11.427289048473968, + "grad_norm": 13.359016418457031, + "learning_rate": 9.841930979453423e-06, + "loss": 5.9946, + "step": 127300 + }, + { + "epoch": 11.429533213644524, + "grad_norm": 12.766145706176758, + "learning_rate": 9.841681627767804e-06, + "loss": 5.7699, + "step": 127325 + }, + { + "epoch": 11.431777378815081, + "grad_norm": 15.919157028198242, + "learning_rate": 9.841432276082186e-06, + "loss": 5.939, + "step": 127350 + }, + { + "epoch": 11.434021543985637, + "grad_norm": 18.525442123413086, + "learning_rate": 9.84118292439657e-06, + "loss": 6.0043, + "step": 127375 + }, + { + "epoch": 11.436265709156194, + "grad_norm": 15.561758041381836, + "learning_rate": 9.840933572710952e-06, + "loss": 5.8729, + "step": 127400 + }, + { + "epoch": 11.438509874326751, + "grad_norm": 12.97663402557373, + "learning_rate": 9.840684221025335e-06, + "loss": 6.0081, + "step": 127425 + }, + { + "epoch": 11.440754039497307, + "grad_norm": 12.481552124023438, + "learning_rate": 9.840434869339717e-06, + "loss": 5.8643, + "step": 127450 + }, + { + "epoch": 11.442998204667864, + "grad_norm": 14.673423767089844, + "learning_rate": 9.840185517654101e-06, + "loss": 6.0321, + "step": 127475 + }, + { + "epoch": 11.44524236983842, + "grad_norm": 18.14725112915039, + "learning_rate": 9.839936165968482e-06, + "loss": 5.939, + "step": 127500 + }, + { + "epoch": 11.447486535008977, + "grad_norm": 15.632257461547852, + "learning_rate": 9.839686814282866e-06, + "loss": 6.0494, + "step": 127525 + }, + { + "epoch": 11.449730700179533, + "grad_norm": 14.369179725646973, + "learning_rate": 9.839437462597248e-06, + "loss": 5.8936, + "step": 127550 + }, + { + "epoch": 11.45197486535009, + "grad_norm": 11.884199142456055, + "learning_rate": 9.83918811091163e-06, + "loss": 5.8163, + "step": 127575 + }, + { + "epoch": 11.454219030520646, + "grad_norm": 14.40982437133789, + "learning_rate": 9.838938759226013e-06, + "loss": 5.8177, + "step": 127600 + }, + { + "epoch": 11.456463195691203, + "grad_norm": 17.859952926635742, + "learning_rate": 9.838689407540397e-06, + "loss": 5.9529, + "step": 127625 + }, + { + "epoch": 11.458707360861759, + "grad_norm": 14.152206420898438, + "learning_rate": 9.838440055854779e-06, + "loss": 6.1828, + "step": 127650 + }, + { + "epoch": 11.460951526032316, + "grad_norm": 13.148721694946289, + "learning_rate": 9.838190704169161e-06, + "loss": 5.9295, + "step": 127675 + }, + { + "epoch": 11.463195691202873, + "grad_norm": 14.188822746276855, + "learning_rate": 9.837941352483544e-06, + "loss": 5.7993, + "step": 127700 + }, + { + "epoch": 11.465439856373429, + "grad_norm": 14.74708366394043, + "learning_rate": 9.837692000797926e-06, + "loss": 5.7915, + "step": 127725 + }, + { + "epoch": 11.467684021543986, + "grad_norm": 14.290468215942383, + "learning_rate": 9.837442649112308e-06, + "loss": 5.7662, + "step": 127750 + }, + { + "epoch": 11.469928186714542, + "grad_norm": 14.026281356811523, + "learning_rate": 9.837193297426692e-06, + "loss": 5.868, + "step": 127775 + }, + { + "epoch": 11.4721723518851, + "grad_norm": 14.358498573303223, + "learning_rate": 9.836943945741074e-06, + "loss": 5.7076, + "step": 127800 + }, + { + "epoch": 11.474416517055655, + "grad_norm": 13.73015022277832, + "learning_rate": 9.836694594055457e-06, + "loss": 6.0983, + "step": 127825 + }, + { + "epoch": 11.476660682226212, + "grad_norm": 15.767241477966309, + "learning_rate": 9.836445242369839e-06, + "loss": 5.8287, + "step": 127850 + }, + { + "epoch": 11.478904847396768, + "grad_norm": 14.733309745788574, + "learning_rate": 9.836195890684221e-06, + "loss": 5.9779, + "step": 127875 + }, + { + "epoch": 11.481149012567325, + "grad_norm": 13.169815063476562, + "learning_rate": 9.835946538998604e-06, + "loss": 6.0315, + "step": 127900 + }, + { + "epoch": 11.48339317773788, + "grad_norm": 13.677334785461426, + "learning_rate": 9.835697187312988e-06, + "loss": 5.9274, + "step": 127925 + }, + { + "epoch": 11.485637342908438, + "grad_norm": 14.570043563842773, + "learning_rate": 9.83544783562737e-06, + "loss": 6.0479, + "step": 127950 + }, + { + "epoch": 11.487881508078994, + "grad_norm": 13.242156028747559, + "learning_rate": 9.835198483941752e-06, + "loss": 5.835, + "step": 127975 + }, + { + "epoch": 11.490125673249551, + "grad_norm": 14.710835456848145, + "learning_rate": 9.834949132256135e-06, + "loss": 5.7253, + "step": 128000 + }, + { + "epoch": 11.492369838420109, + "grad_norm": 17.66818618774414, + "learning_rate": 9.834699780570519e-06, + "loss": 5.9674, + "step": 128025 + }, + { + "epoch": 11.494614003590664, + "grad_norm": 12.317059516906738, + "learning_rate": 9.8344504288849e-06, + "loss": 6.0388, + "step": 128050 + }, + { + "epoch": 11.496858168761221, + "grad_norm": 15.668967247009277, + "learning_rate": 9.834201077199282e-06, + "loss": 5.9945, + "step": 128075 + }, + { + "epoch": 11.499102333931777, + "grad_norm": 14.849610328674316, + "learning_rate": 9.833951725513666e-06, + "loss": 5.8912, + "step": 128100 + }, + { + "epoch": 11.501346499102334, + "grad_norm": 13.273598670959473, + "learning_rate": 9.833702373828048e-06, + "loss": 5.9109, + "step": 128125 + }, + { + "epoch": 11.50359066427289, + "grad_norm": 13.650513648986816, + "learning_rate": 9.83345302214243e-06, + "loss": 5.9975, + "step": 128150 + }, + { + "epoch": 11.505834829443447, + "grad_norm": 16.465913772583008, + "learning_rate": 9.833203670456813e-06, + "loss": 6.0035, + "step": 128175 + }, + { + "epoch": 11.508078994614003, + "grad_norm": 16.6304931640625, + "learning_rate": 9.832954318771197e-06, + "loss": 5.9483, + "step": 128200 + }, + { + "epoch": 11.51032315978456, + "grad_norm": 17.1883487701416, + "learning_rate": 9.832704967085577e-06, + "loss": 6.0261, + "step": 128225 + }, + { + "epoch": 11.512567324955116, + "grad_norm": 13.983701705932617, + "learning_rate": 9.832455615399961e-06, + "loss": 6.1917, + "step": 128250 + }, + { + "epoch": 11.514811490125673, + "grad_norm": 14.729023933410645, + "learning_rate": 9.832206263714344e-06, + "loss": 5.7718, + "step": 128275 + }, + { + "epoch": 11.517055655296229, + "grad_norm": 15.37313175201416, + "learning_rate": 9.831956912028726e-06, + "loss": 6.1356, + "step": 128300 + }, + { + "epoch": 11.519299820466786, + "grad_norm": 19.848289489746094, + "learning_rate": 9.831707560343108e-06, + "loss": 6.0835, + "step": 128325 + }, + { + "epoch": 11.521543985637344, + "grad_norm": 13.947011947631836, + "learning_rate": 9.831458208657492e-06, + "loss": 5.9214, + "step": 128350 + }, + { + "epoch": 11.5237881508079, + "grad_norm": 12.193292617797852, + "learning_rate": 9.831208856971875e-06, + "loss": 5.9236, + "step": 128375 + }, + { + "epoch": 11.526032315978457, + "grad_norm": 14.484532356262207, + "learning_rate": 9.830959505286257e-06, + "loss": 5.8822, + "step": 128400 + }, + { + "epoch": 11.528276481149012, + "grad_norm": 15.214570999145508, + "learning_rate": 9.83071015360064e-06, + "loss": 5.7944, + "step": 128425 + }, + { + "epoch": 11.53052064631957, + "grad_norm": 19.060611724853516, + "learning_rate": 9.830460801915021e-06, + "loss": 6.0967, + "step": 128450 + }, + { + "epoch": 11.532764811490125, + "grad_norm": 15.777983665466309, + "learning_rate": 9.830211450229404e-06, + "loss": 5.9308, + "step": 128475 + }, + { + "epoch": 11.535008976660682, + "grad_norm": 11.433914184570312, + "learning_rate": 9.829962098543788e-06, + "loss": 6.1132, + "step": 128500 + }, + { + "epoch": 11.537253141831238, + "grad_norm": 12.3540678024292, + "learning_rate": 9.82971274685817e-06, + "loss": 5.6705, + "step": 128525 + }, + { + "epoch": 11.539497307001795, + "grad_norm": 12.835166931152344, + "learning_rate": 9.829463395172552e-06, + "loss": 6.0939, + "step": 128550 + }, + { + "epoch": 11.541741472172351, + "grad_norm": 17.505268096923828, + "learning_rate": 9.829214043486935e-06, + "loss": 5.8351, + "step": 128575 + }, + { + "epoch": 11.543985637342908, + "grad_norm": 16.056100845336914, + "learning_rate": 9.828964691801317e-06, + "loss": 5.8872, + "step": 128600 + }, + { + "epoch": 11.546229802513466, + "grad_norm": 13.819441795349121, + "learning_rate": 9.8287153401157e-06, + "loss": 5.9857, + "step": 128625 + }, + { + "epoch": 11.548473967684021, + "grad_norm": 14.413067817687988, + "learning_rate": 9.828465988430083e-06, + "loss": 6.1577, + "step": 128650 + }, + { + "epoch": 11.550718132854579, + "grad_norm": 18.851558685302734, + "learning_rate": 9.828216636744466e-06, + "loss": 5.894, + "step": 128675 + }, + { + "epoch": 11.552962298025134, + "grad_norm": 14.744344711303711, + "learning_rate": 9.827967285058848e-06, + "loss": 6.0826, + "step": 128700 + }, + { + "epoch": 11.555206463195692, + "grad_norm": 14.080327033996582, + "learning_rate": 9.82771793337323e-06, + "loss": 6.0989, + "step": 128725 + }, + { + "epoch": 11.557450628366247, + "grad_norm": 14.322946548461914, + "learning_rate": 9.827468581687613e-06, + "loss": 6.1997, + "step": 128750 + }, + { + "epoch": 11.559694793536805, + "grad_norm": 14.608235359191895, + "learning_rate": 9.827219230001995e-06, + "loss": 5.9201, + "step": 128775 + }, + { + "epoch": 11.56193895870736, + "grad_norm": 16.501602172851562, + "learning_rate": 9.826969878316377e-06, + "loss": 5.7382, + "step": 128800 + }, + { + "epoch": 11.564183123877918, + "grad_norm": 13.807903289794922, + "learning_rate": 9.826720526630761e-06, + "loss": 6.0576, + "step": 128825 + }, + { + "epoch": 11.566427289048473, + "grad_norm": 16.0841007232666, + "learning_rate": 9.826471174945144e-06, + "loss": 6.141, + "step": 128850 + }, + { + "epoch": 11.56867145421903, + "grad_norm": 18.35209846496582, + "learning_rate": 9.826221823259526e-06, + "loss": 5.5488, + "step": 128875 + }, + { + "epoch": 11.570915619389588, + "grad_norm": 13.43196964263916, + "learning_rate": 9.82597247157391e-06, + "loss": 5.96, + "step": 128900 + }, + { + "epoch": 11.573159784560143, + "grad_norm": 15.072882652282715, + "learning_rate": 9.82572311988829e-06, + "loss": 5.7822, + "step": 128925 + }, + { + "epoch": 11.5754039497307, + "grad_norm": 15.374707221984863, + "learning_rate": 9.825473768202673e-06, + "loss": 6.1704, + "step": 128950 + }, + { + "epoch": 11.577648114901256, + "grad_norm": 14.486473083496094, + "learning_rate": 9.825224416517057e-06, + "loss": 5.8797, + "step": 128975 + }, + { + "epoch": 11.579892280071814, + "grad_norm": 15.697866439819336, + "learning_rate": 9.82497506483144e-06, + "loss": 6.1261, + "step": 129000 + }, + { + "epoch": 11.58213644524237, + "grad_norm": 13.823222160339355, + "learning_rate": 9.824725713145822e-06, + "loss": 5.822, + "step": 129025 + }, + { + "epoch": 11.584380610412927, + "grad_norm": 13.845605850219727, + "learning_rate": 9.824476361460204e-06, + "loss": 6.0494, + "step": 129050 + }, + { + "epoch": 11.586624775583482, + "grad_norm": 15.052549362182617, + "learning_rate": 9.824227009774588e-06, + "loss": 6.2945, + "step": 129075 + }, + { + "epoch": 11.58886894075404, + "grad_norm": 17.034177780151367, + "learning_rate": 9.823977658088968e-06, + "loss": 5.8404, + "step": 129100 + }, + { + "epoch": 11.591113105924595, + "grad_norm": 13.179086685180664, + "learning_rate": 9.823728306403352e-06, + "loss": 5.8025, + "step": 129125 + }, + { + "epoch": 11.593357271095153, + "grad_norm": 12.794045448303223, + "learning_rate": 9.823478954717735e-06, + "loss": 6.0506, + "step": 129150 + }, + { + "epoch": 11.59560143626571, + "grad_norm": 13.613144874572754, + "learning_rate": 9.823229603032117e-06, + "loss": 5.9809, + "step": 129175 + }, + { + "epoch": 11.597845601436266, + "grad_norm": 13.063044548034668, + "learning_rate": 9.822990225413924e-06, + "loss": 6.1317, + "step": 129200 + }, + { + "epoch": 11.600089766606823, + "grad_norm": 13.952818870544434, + "learning_rate": 9.822740873728306e-06, + "loss": 5.946, + "step": 129225 + }, + { + "epoch": 11.602333931777379, + "grad_norm": 13.413491249084473, + "learning_rate": 9.82249152204269e-06, + "loss": 5.9171, + "step": 129250 + }, + { + "epoch": 11.604578096947936, + "grad_norm": 15.776820182800293, + "learning_rate": 9.822242170357073e-06, + "loss": 5.8295, + "step": 129275 + }, + { + "epoch": 11.606822262118492, + "grad_norm": 15.671939849853516, + "learning_rate": 9.821992818671455e-06, + "loss": 5.9452, + "step": 129300 + }, + { + "epoch": 11.609066427289049, + "grad_norm": 23.4184627532959, + "learning_rate": 9.821743466985837e-06, + "loss": 5.8101, + "step": 129325 + }, + { + "epoch": 11.611310592459605, + "grad_norm": 16.60793685913086, + "learning_rate": 9.82149411530022e-06, + "loss": 6.1279, + "step": 129350 + }, + { + "epoch": 11.613554757630162, + "grad_norm": 14.941667556762695, + "learning_rate": 9.821244763614602e-06, + "loss": 5.8756, + "step": 129375 + }, + { + "epoch": 11.615798922800717, + "grad_norm": 13.170215606689453, + "learning_rate": 9.820995411928986e-06, + "loss": 5.9525, + "step": 129400 + }, + { + "epoch": 11.618043087971275, + "grad_norm": 14.32033920288086, + "learning_rate": 9.820746060243368e-06, + "loss": 5.7162, + "step": 129425 + }, + { + "epoch": 11.62028725314183, + "grad_norm": 12.858738899230957, + "learning_rate": 9.82049670855775e-06, + "loss": 5.729, + "step": 129450 + }, + { + "epoch": 11.622531418312388, + "grad_norm": 12.517889976501465, + "learning_rate": 9.820247356872133e-06, + "loss": 5.8802, + "step": 129475 + }, + { + "epoch": 11.624775583482945, + "grad_norm": 13.266039848327637, + "learning_rate": 9.819998005186517e-06, + "loss": 5.9158, + "step": 129500 + }, + { + "epoch": 11.6270197486535, + "grad_norm": 16.1972599029541, + "learning_rate": 9.8197486535009e-06, + "loss": 5.7791, + "step": 129525 + }, + { + "epoch": 11.629263913824058, + "grad_norm": 15.574183464050293, + "learning_rate": 9.81949930181528e-06, + "loss": 6.0754, + "step": 129550 + }, + { + "epoch": 11.631508078994614, + "grad_norm": 13.987920761108398, + "learning_rate": 9.819249950129664e-06, + "loss": 5.8287, + "step": 129575 + }, + { + "epoch": 11.633752244165171, + "grad_norm": 12.650153160095215, + "learning_rate": 9.819000598444046e-06, + "loss": 6.198, + "step": 129600 + }, + { + "epoch": 11.635996409335727, + "grad_norm": 10.38704776763916, + "learning_rate": 9.818751246758429e-06, + "loss": 6.0436, + "step": 129625 + }, + { + "epoch": 11.638240574506284, + "grad_norm": 12.797455787658691, + "learning_rate": 9.818501895072813e-06, + "loss": 5.803, + "step": 129650 + }, + { + "epoch": 11.64048473967684, + "grad_norm": 15.48508358001709, + "learning_rate": 9.818252543387195e-06, + "loss": 6.0798, + "step": 129675 + }, + { + "epoch": 11.642728904847397, + "grad_norm": 14.108202934265137, + "learning_rate": 9.818003191701577e-06, + "loss": 6.0158, + "step": 129700 + }, + { + "epoch": 11.644973070017953, + "grad_norm": 15.555758476257324, + "learning_rate": 9.81775384001596e-06, + "loss": 6.032, + "step": 129725 + }, + { + "epoch": 11.64721723518851, + "grad_norm": 17.26096534729004, + "learning_rate": 9.817504488330342e-06, + "loss": 6.0016, + "step": 129750 + }, + { + "epoch": 11.649461400359066, + "grad_norm": 16.362857818603516, + "learning_rate": 9.817255136644724e-06, + "loss": 5.9215, + "step": 129775 + }, + { + "epoch": 11.651705565529623, + "grad_norm": 22.504039764404297, + "learning_rate": 9.817005784959106e-06, + "loss": 5.9222, + "step": 129800 + }, + { + "epoch": 11.65394973070018, + "grad_norm": 16.172208786010742, + "learning_rate": 9.81675643327349e-06, + "loss": 6.0003, + "step": 129825 + }, + { + "epoch": 11.656193895870736, + "grad_norm": 14.470597267150879, + "learning_rate": 9.816507081587873e-06, + "loss": 6.0292, + "step": 129850 + }, + { + "epoch": 11.658438061041293, + "grad_norm": 14.386902809143066, + "learning_rate": 9.816257729902255e-06, + "loss": 5.895, + "step": 129875 + }, + { + "epoch": 11.660682226211849, + "grad_norm": 14.409761428833008, + "learning_rate": 9.816008378216637e-06, + "loss": 5.9267, + "step": 129900 + }, + { + "epoch": 11.662926391382406, + "grad_norm": 11.768684387207031, + "learning_rate": 9.81575902653102e-06, + "loss": 5.8998, + "step": 129925 + }, + { + "epoch": 11.665170556552962, + "grad_norm": 12.624908447265625, + "learning_rate": 9.815509674845402e-06, + "loss": 5.7131, + "step": 129950 + }, + { + "epoch": 11.66741472172352, + "grad_norm": 13.08301067352295, + "learning_rate": 9.815260323159786e-06, + "loss": 6.0649, + "step": 129975 + }, + { + "epoch": 11.669658886894075, + "grad_norm": 12.430055618286133, + "learning_rate": 9.815010971474168e-06, + "loss": 5.8667, + "step": 130000 + }, + { + "epoch": 11.671903052064632, + "grad_norm": 19.05857276916504, + "learning_rate": 9.81476161978855e-06, + "loss": 5.9344, + "step": 130025 + }, + { + "epoch": 11.674147217235188, + "grad_norm": 15.122430801391602, + "learning_rate": 9.814512268102933e-06, + "loss": 6.1564, + "step": 130050 + }, + { + "epoch": 11.676391382405745, + "grad_norm": 15.101093292236328, + "learning_rate": 9.814262916417315e-06, + "loss": 6.0982, + "step": 130075 + }, + { + "epoch": 11.678635547576302, + "grad_norm": 18.703399658203125, + "learning_rate": 9.814013564731698e-06, + "loss": 5.9417, + "step": 130100 + }, + { + "epoch": 11.680879712746858, + "grad_norm": 19.688764572143555, + "learning_rate": 9.813764213046082e-06, + "loss": 6.1693, + "step": 130125 + }, + { + "epoch": 11.683123877917415, + "grad_norm": 13.335134506225586, + "learning_rate": 9.813514861360464e-06, + "loss": 6.0026, + "step": 130150 + }, + { + "epoch": 11.685368043087971, + "grad_norm": 14.732147216796875, + "learning_rate": 9.813265509674846e-06, + "loss": 5.9032, + "step": 130175 + }, + { + "epoch": 11.687612208258528, + "grad_norm": 13.935994148254395, + "learning_rate": 9.813016157989229e-06, + "loss": 5.7656, + "step": 130200 + }, + { + "epoch": 11.689856373429084, + "grad_norm": 16.88021469116211, + "learning_rate": 9.812766806303613e-06, + "loss": 5.9498, + "step": 130225 + }, + { + "epoch": 11.692100538599641, + "grad_norm": 15.333001136779785, + "learning_rate": 9.812517454617993e-06, + "loss": 6.0729, + "step": 130250 + }, + { + "epoch": 11.694344703770197, + "grad_norm": 15.298799514770508, + "learning_rate": 9.812268102932376e-06, + "loss": 5.74, + "step": 130275 + }, + { + "epoch": 11.696588868940754, + "grad_norm": 14.564105033874512, + "learning_rate": 9.81201875124676e-06, + "loss": 6.0592, + "step": 130300 + }, + { + "epoch": 11.69883303411131, + "grad_norm": 16.027870178222656, + "learning_rate": 9.811769399561142e-06, + "loss": 6.0389, + "step": 130325 + }, + { + "epoch": 11.701077199281867, + "grad_norm": 13.580489158630371, + "learning_rate": 9.811520047875524e-06, + "loss": 5.798, + "step": 130350 + }, + { + "epoch": 11.703321364452425, + "grad_norm": 13.720290184020996, + "learning_rate": 9.811270696189908e-06, + "loss": 5.997, + "step": 130375 + }, + { + "epoch": 11.70556552962298, + "grad_norm": 18.13821792602539, + "learning_rate": 9.81102134450429e-06, + "loss": 6.0726, + "step": 130400 + }, + { + "epoch": 11.707809694793538, + "grad_norm": 14.271677017211914, + "learning_rate": 9.810771992818671e-06, + "loss": 6.0482, + "step": 130425 + }, + { + "epoch": 11.710053859964093, + "grad_norm": 13.456311225891113, + "learning_rate": 9.810522641133055e-06, + "loss": 5.9087, + "step": 130450 + }, + { + "epoch": 11.71229802513465, + "grad_norm": 19.27644920349121, + "learning_rate": 9.810273289447437e-06, + "loss": 6.113, + "step": 130475 + }, + { + "epoch": 11.714542190305206, + "grad_norm": 15.427023887634277, + "learning_rate": 9.81002393776182e-06, + "loss": 5.9002, + "step": 130500 + }, + { + "epoch": 11.716786355475763, + "grad_norm": 15.558968544006348, + "learning_rate": 9.809774586076202e-06, + "loss": 6.018, + "step": 130525 + }, + { + "epoch": 11.719030520646319, + "grad_norm": 15.17101001739502, + "learning_rate": 9.809525234390586e-06, + "loss": 5.9266, + "step": 130550 + }, + { + "epoch": 11.721274685816876, + "grad_norm": 15.879241943359375, + "learning_rate": 9.809275882704968e-06, + "loss": 6.0337, + "step": 130575 + }, + { + "epoch": 11.723518850987432, + "grad_norm": 16.440380096435547, + "learning_rate": 9.80902653101935e-06, + "loss": 5.6231, + "step": 130600 + }, + { + "epoch": 11.72576301615799, + "grad_norm": 14.79899787902832, + "learning_rate": 9.808777179333733e-06, + "loss": 5.8067, + "step": 130625 + }, + { + "epoch": 11.728007181328547, + "grad_norm": 16.46668243408203, + "learning_rate": 9.808527827648115e-06, + "loss": 5.8573, + "step": 130650 + }, + { + "epoch": 11.730251346499102, + "grad_norm": 16.35871696472168, + "learning_rate": 9.808278475962498e-06, + "loss": 5.9117, + "step": 130675 + }, + { + "epoch": 11.73249551166966, + "grad_norm": 14.304941177368164, + "learning_rate": 9.808029124276882e-06, + "loss": 5.969, + "step": 130700 + }, + { + "epoch": 11.734739676840215, + "grad_norm": 14.380688667297363, + "learning_rate": 9.807779772591264e-06, + "loss": 6.1611, + "step": 130725 + }, + { + "epoch": 11.736983842010773, + "grad_norm": 15.340079307556152, + "learning_rate": 9.807530420905646e-06, + "loss": 5.8115, + "step": 130750 + }, + { + "epoch": 11.739228007181328, + "grad_norm": 17.547624588012695, + "learning_rate": 9.807281069220029e-06, + "loss": 5.79, + "step": 130775 + }, + { + "epoch": 11.741472172351886, + "grad_norm": 12.46426773071289, + "learning_rate": 9.807031717534411e-06, + "loss": 5.7347, + "step": 130800 + }, + { + "epoch": 11.743716337522441, + "grad_norm": 13.332784652709961, + "learning_rate": 9.806782365848793e-06, + "loss": 5.8665, + "step": 130825 + }, + { + "epoch": 11.745960502692999, + "grad_norm": 13.770901679992676, + "learning_rate": 9.806533014163177e-06, + "loss": 5.9419, + "step": 130850 + }, + { + "epoch": 11.748204667863554, + "grad_norm": 14.693373680114746, + "learning_rate": 9.80628366247756e-06, + "loss": 6.0126, + "step": 130875 + }, + { + "epoch": 11.750448833034111, + "grad_norm": 14.20588493347168, + "learning_rate": 9.806034310791942e-06, + "loss": 5.9057, + "step": 130900 + }, + { + "epoch": 11.752692998204667, + "grad_norm": 14.703716278076172, + "learning_rate": 9.805784959106324e-06, + "loss": 5.9641, + "step": 130925 + }, + { + "epoch": 11.754937163375224, + "grad_norm": 12.514618873596191, + "learning_rate": 9.805535607420707e-06, + "loss": 5.9068, + "step": 130950 + }, + { + "epoch": 11.757181328545782, + "grad_norm": 13.409035682678223, + "learning_rate": 9.805286255735089e-06, + "loss": 6.0656, + "step": 130975 + }, + { + "epoch": 11.759425493716337, + "grad_norm": 13.08777141571045, + "learning_rate": 9.805036904049471e-06, + "loss": 5.9153, + "step": 131000 + }, + { + "epoch": 11.761669658886895, + "grad_norm": 13.908531188964844, + "learning_rate": 9.804787552363855e-06, + "loss": 5.9089, + "step": 131025 + }, + { + "epoch": 11.76391382405745, + "grad_norm": 14.76535701751709, + "learning_rate": 9.804538200678237e-06, + "loss": 5.9183, + "step": 131050 + }, + { + "epoch": 11.766157989228008, + "grad_norm": 16.21272850036621, + "learning_rate": 9.80428884899262e-06, + "loss": 6.0774, + "step": 131075 + }, + { + "epoch": 11.768402154398563, + "grad_norm": 13.544709205627441, + "learning_rate": 9.804039497307004e-06, + "loss": 6.1335, + "step": 131100 + }, + { + "epoch": 11.77064631956912, + "grad_norm": 15.927101135253906, + "learning_rate": 9.803790145621386e-06, + "loss": 6.0849, + "step": 131125 + }, + { + "epoch": 11.772890484739676, + "grad_norm": 13.942158699035645, + "learning_rate": 9.803540793935767e-06, + "loss": 5.8914, + "step": 131150 + }, + { + "epoch": 11.775134649910234, + "grad_norm": 16.05362892150879, + "learning_rate": 9.80329144225015e-06, + "loss": 5.6891, + "step": 131175 + }, + { + "epoch": 11.77737881508079, + "grad_norm": 15.940606117248535, + "learning_rate": 9.803052064631958e-06, + "loss": 5.9145, + "step": 131200 + }, + { + "epoch": 11.779622980251347, + "grad_norm": 12.547348976135254, + "learning_rate": 9.80280271294634e-06, + "loss": 5.9287, + "step": 131225 + }, + { + "epoch": 11.781867145421902, + "grad_norm": 12.409605979919434, + "learning_rate": 9.802553361260722e-06, + "loss": 5.9962, + "step": 131250 + }, + { + "epoch": 11.78411131059246, + "grad_norm": 13.664761543273926, + "learning_rate": 9.802304009575105e-06, + "loss": 6.015, + "step": 131275 + }, + { + "epoch": 11.786355475763017, + "grad_norm": 16.092342376708984, + "learning_rate": 9.802054657889489e-06, + "loss": 6.0117, + "step": 131300 + }, + { + "epoch": 11.788599640933572, + "grad_norm": 15.840426445007324, + "learning_rate": 9.801805306203871e-06, + "loss": 5.8278, + "step": 131325 + }, + { + "epoch": 11.79084380610413, + "grad_norm": 17.78968048095703, + "learning_rate": 9.801555954518253e-06, + "loss": 5.9776, + "step": 131350 + }, + { + "epoch": 11.793087971274685, + "grad_norm": 12.440229415893555, + "learning_rate": 9.801306602832637e-06, + "loss": 5.8429, + "step": 131375 + }, + { + "epoch": 11.795332136445243, + "grad_norm": 14.07314395904541, + "learning_rate": 9.801057251147018e-06, + "loss": 5.9121, + "step": 131400 + }, + { + "epoch": 11.797576301615798, + "grad_norm": 15.238982200622559, + "learning_rate": 9.8008078994614e-06, + "loss": 6.1437, + "step": 131425 + }, + { + "epoch": 11.799820466786356, + "grad_norm": 13.064303398132324, + "learning_rate": 9.800558547775784e-06, + "loss": 5.6882, + "step": 131450 + }, + { + "epoch": 11.802064631956911, + "grad_norm": 15.065181732177734, + "learning_rate": 9.800309196090167e-06, + "loss": 5.9717, + "step": 131475 + }, + { + "epoch": 11.804308797127469, + "grad_norm": 13.83913516998291, + "learning_rate": 9.800059844404549e-06, + "loss": 5.8994, + "step": 131500 + }, + { + "epoch": 11.806552962298024, + "grad_norm": 14.86341667175293, + "learning_rate": 9.799810492718931e-06, + "loss": 5.7711, + "step": 131525 + }, + { + "epoch": 11.808797127468582, + "grad_norm": 13.883647918701172, + "learning_rate": 9.799561141033315e-06, + "loss": 5.9794, + "step": 131550 + }, + { + "epoch": 11.811041292639139, + "grad_norm": 13.931697845458984, + "learning_rate": 9.799311789347696e-06, + "loss": 5.9489, + "step": 131575 + }, + { + "epoch": 11.813285457809695, + "grad_norm": 13.888588905334473, + "learning_rate": 9.79906243766208e-06, + "loss": 5.9692, + "step": 131600 + }, + { + "epoch": 11.815529622980252, + "grad_norm": 13.862592697143555, + "learning_rate": 9.798813085976462e-06, + "loss": 5.9364, + "step": 131625 + }, + { + "epoch": 11.817773788150808, + "grad_norm": 16.083770751953125, + "learning_rate": 9.798563734290844e-06, + "loss": 5.7371, + "step": 131650 + }, + { + "epoch": 11.820017953321365, + "grad_norm": 15.862616539001465, + "learning_rate": 9.798314382605227e-06, + "loss": 6.0259, + "step": 131675 + }, + { + "epoch": 11.82226211849192, + "grad_norm": 13.465217590332031, + "learning_rate": 9.79806503091961e-06, + "loss": 6.0594, + "step": 131700 + }, + { + "epoch": 11.824506283662478, + "grad_norm": 14.821805000305176, + "learning_rate": 9.797815679233993e-06, + "loss": 6.0321, + "step": 131725 + }, + { + "epoch": 11.826750448833034, + "grad_norm": 17.48271369934082, + "learning_rate": 9.797566327548374e-06, + "loss": 6.0142, + "step": 131750 + }, + { + "epoch": 11.82899461400359, + "grad_norm": 15.63685131072998, + "learning_rate": 9.797316975862758e-06, + "loss": 6.1929, + "step": 131775 + }, + { + "epoch": 11.831238779174146, + "grad_norm": 13.767817497253418, + "learning_rate": 9.79706762417714e-06, + "loss": 5.8603, + "step": 131800 + }, + { + "epoch": 11.833482944344704, + "grad_norm": 14.645332336425781, + "learning_rate": 9.796818272491522e-06, + "loss": 6.046, + "step": 131825 + }, + { + "epoch": 11.835727109515261, + "grad_norm": 14.702397346496582, + "learning_rate": 9.796568920805906e-06, + "loss": 5.6706, + "step": 131850 + }, + { + "epoch": 11.837971274685817, + "grad_norm": 19.311664581298828, + "learning_rate": 9.796319569120289e-06, + "loss": 5.7398, + "step": 131875 + }, + { + "epoch": 11.840215439856374, + "grad_norm": 14.060702323913574, + "learning_rate": 9.796070217434671e-06, + "loss": 5.9629, + "step": 131900 + }, + { + "epoch": 11.84245960502693, + "grad_norm": 13.574694633483887, + "learning_rate": 9.795820865749053e-06, + "loss": 6.0142, + "step": 131925 + }, + { + "epoch": 11.844703770197487, + "grad_norm": 21.719146728515625, + "learning_rate": 9.795571514063436e-06, + "loss": 6.0363, + "step": 131950 + }, + { + "epoch": 11.846947935368043, + "grad_norm": 16.26080322265625, + "learning_rate": 9.795322162377818e-06, + "loss": 5.8088, + "step": 131975 + }, + { + "epoch": 11.8491921005386, + "grad_norm": 16.407922744750977, + "learning_rate": 9.7950728106922e-06, + "loss": 5.6741, + "step": 132000 + }, + { + "epoch": 11.851436265709156, + "grad_norm": 14.207199096679688, + "learning_rate": 9.794823459006584e-06, + "loss": 5.9701, + "step": 132025 + }, + { + "epoch": 11.853680430879713, + "grad_norm": 13.623016357421875, + "learning_rate": 9.794574107320967e-06, + "loss": 5.8201, + "step": 132050 + }, + { + "epoch": 11.855924596050269, + "grad_norm": 13.295695304870605, + "learning_rate": 9.794324755635349e-06, + "loss": 5.9442, + "step": 132075 + }, + { + "epoch": 11.858168761220826, + "grad_norm": 17.544954299926758, + "learning_rate": 9.794075403949731e-06, + "loss": 6.1584, + "step": 132100 + }, + { + "epoch": 11.860412926391383, + "grad_norm": 15.225937843322754, + "learning_rate": 9.793826052264114e-06, + "loss": 5.9831, + "step": 132125 + }, + { + "epoch": 11.862657091561939, + "grad_norm": 11.624756813049316, + "learning_rate": 9.793576700578496e-06, + "loss": 5.8045, + "step": 132150 + }, + { + "epoch": 11.864901256732496, + "grad_norm": 15.450072288513184, + "learning_rate": 9.79332734889288e-06, + "loss": 5.9453, + "step": 132175 + }, + { + "epoch": 11.867145421903052, + "grad_norm": 15.383034706115723, + "learning_rate": 9.793077997207262e-06, + "loss": 5.9774, + "step": 132200 + }, + { + "epoch": 11.86938958707361, + "grad_norm": 17.899967193603516, + "learning_rate": 9.792828645521645e-06, + "loss": 6.0049, + "step": 132225 + }, + { + "epoch": 11.871633752244165, + "grad_norm": 14.574652671813965, + "learning_rate": 9.792579293836027e-06, + "loss": 6.001, + "step": 132250 + }, + { + "epoch": 11.873877917414722, + "grad_norm": 15.429688453674316, + "learning_rate": 9.792329942150409e-06, + "loss": 5.7289, + "step": 132275 + }, + { + "epoch": 11.876122082585278, + "grad_norm": 17.024333953857422, + "learning_rate": 9.792080590464791e-06, + "loss": 5.9235, + "step": 132300 + }, + { + "epoch": 11.878366247755835, + "grad_norm": 15.049125671386719, + "learning_rate": 9.791831238779175e-06, + "loss": 6.1199, + "step": 132325 + }, + { + "epoch": 11.88061041292639, + "grad_norm": 16.00591468811035, + "learning_rate": 9.791581887093558e-06, + "loss": 5.8454, + "step": 132350 + }, + { + "epoch": 11.882854578096948, + "grad_norm": 15.98033332824707, + "learning_rate": 9.79133253540794e-06, + "loss": 5.8318, + "step": 132375 + }, + { + "epoch": 11.885098743267504, + "grad_norm": 13.024565696716309, + "learning_rate": 9.791083183722322e-06, + "loss": 5.8234, + "step": 132400 + }, + { + "epoch": 11.887342908438061, + "grad_norm": 14.190882682800293, + "learning_rate": 9.790833832036706e-06, + "loss": 6.001, + "step": 132425 + }, + { + "epoch": 11.889587073608617, + "grad_norm": 23.203216552734375, + "learning_rate": 9.790584480351087e-06, + "loss": 5.8768, + "step": 132450 + }, + { + "epoch": 11.891831238779174, + "grad_norm": 13.257373809814453, + "learning_rate": 9.79033512866547e-06, + "loss": 6.0386, + "step": 132475 + }, + { + "epoch": 11.894075403949731, + "grad_norm": 15.487828254699707, + "learning_rate": 9.790085776979853e-06, + "loss": 5.8654, + "step": 132500 + }, + { + "epoch": 11.896319569120287, + "grad_norm": 18.710222244262695, + "learning_rate": 9.789836425294236e-06, + "loss": 5.9928, + "step": 132525 + }, + { + "epoch": 11.898563734290844, + "grad_norm": 14.640504837036133, + "learning_rate": 9.789587073608618e-06, + "loss": 5.8864, + "step": 132550 + }, + { + "epoch": 11.9008078994614, + "grad_norm": 15.182076454162598, + "learning_rate": 9.789337721923002e-06, + "loss": 5.9158, + "step": 132575 + }, + { + "epoch": 11.903052064631957, + "grad_norm": 12.686132431030273, + "learning_rate": 9.789088370237384e-06, + "loss": 6.01, + "step": 132600 + }, + { + "epoch": 11.905296229802513, + "grad_norm": 15.563446998596191, + "learning_rate": 9.788839018551765e-06, + "loss": 5.7122, + "step": 132625 + }, + { + "epoch": 11.90754039497307, + "grad_norm": 12.951163291931152, + "learning_rate": 9.788589666866149e-06, + "loss": 5.9332, + "step": 132650 + }, + { + "epoch": 11.909784560143626, + "grad_norm": 13.948718070983887, + "learning_rate": 9.788340315180531e-06, + "loss": 5.9077, + "step": 132675 + }, + { + "epoch": 11.912028725314183, + "grad_norm": 16.038402557373047, + "learning_rate": 9.788090963494914e-06, + "loss": 5.9619, + "step": 132700 + }, + { + "epoch": 11.914272890484739, + "grad_norm": 13.437244415283203, + "learning_rate": 9.787841611809296e-06, + "loss": 6.0823, + "step": 132725 + }, + { + "epoch": 11.916517055655296, + "grad_norm": 13.00064754486084, + "learning_rate": 9.78759226012368e-06, + "loss": 6.1443, + "step": 132750 + }, + { + "epoch": 11.918761220825854, + "grad_norm": 15.990331649780273, + "learning_rate": 9.787342908438062e-06, + "loss": 5.868, + "step": 132775 + }, + { + "epoch": 11.92100538599641, + "grad_norm": 17.354145050048828, + "learning_rate": 9.787093556752445e-06, + "loss": 6.0455, + "step": 132800 + }, + { + "epoch": 11.923249551166966, + "grad_norm": 15.30614185333252, + "learning_rate": 9.786844205066827e-06, + "loss": 6.1431, + "step": 132825 + }, + { + "epoch": 11.925493716337522, + "grad_norm": 14.401947975158691, + "learning_rate": 9.78659485338121e-06, + "loss": 5.9745, + "step": 132850 + }, + { + "epoch": 11.92773788150808, + "grad_norm": 14.394043922424316, + "learning_rate": 9.786345501695592e-06, + "loss": 5.9026, + "step": 132875 + }, + { + "epoch": 11.929982046678635, + "grad_norm": 14.00729751586914, + "learning_rate": 9.786096150009976e-06, + "loss": 6.0602, + "step": 132900 + }, + { + "epoch": 11.932226211849192, + "grad_norm": 17.76215171813965, + "learning_rate": 9.785846798324358e-06, + "loss": 6.0787, + "step": 132925 + }, + { + "epoch": 11.934470377019748, + "grad_norm": 13.368648529052734, + "learning_rate": 9.78559744663874e-06, + "loss": 5.9393, + "step": 132950 + }, + { + "epoch": 11.936714542190305, + "grad_norm": 12.47648811340332, + "learning_rate": 9.785348094953122e-06, + "loss": 5.92, + "step": 132975 + }, + { + "epoch": 11.938958707360861, + "grad_norm": 15.799373626708984, + "learning_rate": 9.785098743267505e-06, + "loss": 5.8883, + "step": 133000 + }, + { + "epoch": 11.941202872531418, + "grad_norm": 14.57004451751709, + "learning_rate": 9.784849391581887e-06, + "loss": 5.9923, + "step": 133025 + }, + { + "epoch": 11.943447037701976, + "grad_norm": 13.806967735290527, + "learning_rate": 9.784600039896271e-06, + "loss": 5.8635, + "step": 133050 + }, + { + "epoch": 11.945691202872531, + "grad_norm": 14.353920936584473, + "learning_rate": 9.784350688210653e-06, + "loss": 5.63, + "step": 133075 + }, + { + "epoch": 11.947935368043089, + "grad_norm": 15.037919044494629, + "learning_rate": 9.784101336525036e-06, + "loss": 5.7928, + "step": 133100 + }, + { + "epoch": 11.950179533213644, + "grad_norm": 14.661815643310547, + "learning_rate": 9.783851984839418e-06, + "loss": 5.8673, + "step": 133125 + }, + { + "epoch": 11.952423698384202, + "grad_norm": 14.98070240020752, + "learning_rate": 9.783602633153802e-06, + "loss": 5.6933, + "step": 133150 + }, + { + "epoch": 11.954667863554757, + "grad_norm": 15.767733573913574, + "learning_rate": 9.783353281468183e-06, + "loss": 5.8571, + "step": 133175 + }, + { + "epoch": 11.956912028725315, + "grad_norm": 13.968584060668945, + "learning_rate": 9.783103929782567e-06, + "loss": 5.9011, + "step": 133200 + }, + { + "epoch": 11.95915619389587, + "grad_norm": 15.705707550048828, + "learning_rate": 9.782854578096949e-06, + "loss": 5.7449, + "step": 133225 + }, + { + "epoch": 11.961400359066428, + "grad_norm": 14.348743438720703, + "learning_rate": 9.782605226411331e-06, + "loss": 5.9241, + "step": 133250 + }, + { + "epoch": 11.963644524236983, + "grad_norm": 13.363097190856934, + "learning_rate": 9.782355874725714e-06, + "loss": 5.8494, + "step": 133275 + }, + { + "epoch": 11.96588868940754, + "grad_norm": 20.265270233154297, + "learning_rate": 9.782106523040098e-06, + "loss": 6.0202, + "step": 133300 + }, + { + "epoch": 11.968132854578098, + "grad_norm": 13.712265014648438, + "learning_rate": 9.78185717135448e-06, + "loss": 6.0872, + "step": 133325 + }, + { + "epoch": 11.970377019748653, + "grad_norm": 14.04157829284668, + "learning_rate": 9.78160781966886e-06, + "loss": 6.0906, + "step": 133350 + }, + { + "epoch": 11.97262118491921, + "grad_norm": 18.052515029907227, + "learning_rate": 9.781358467983245e-06, + "loss": 5.7585, + "step": 133375 + }, + { + "epoch": 11.974865350089766, + "grad_norm": 20.344430923461914, + "learning_rate": 9.781109116297627e-06, + "loss": 6.0009, + "step": 133400 + }, + { + "epoch": 11.977109515260324, + "grad_norm": 15.734578132629395, + "learning_rate": 9.780869738679434e-06, + "loss": 5.9213, + "step": 133425 + }, + { + "epoch": 11.97935368043088, + "grad_norm": 15.867990493774414, + "learning_rate": 9.780620386993816e-06, + "loss": 6.0418, + "step": 133450 + }, + { + "epoch": 11.981597845601437, + "grad_norm": 12.050469398498535, + "learning_rate": 9.780371035308199e-06, + "loss": 6.0616, + "step": 133475 + }, + { + "epoch": 11.983842010771992, + "grad_norm": 14.993341445922852, + "learning_rate": 9.780121683622583e-06, + "loss": 6.0648, + "step": 133500 + }, + { + "epoch": 11.98608617594255, + "grad_norm": 15.26055908203125, + "learning_rate": 9.779872331936965e-06, + "loss": 6.0021, + "step": 133525 + }, + { + "epoch": 11.988330341113105, + "grad_norm": 12.966731071472168, + "learning_rate": 9.779622980251347e-06, + "loss": 5.892, + "step": 133550 + }, + { + "epoch": 11.990574506283663, + "grad_norm": 14.8916597366333, + "learning_rate": 9.779373628565731e-06, + "loss": 5.812, + "step": 133575 + }, + { + "epoch": 11.992818671454218, + "grad_norm": 21.85711097717285, + "learning_rate": 9.779124276880112e-06, + "loss": 6.0171, + "step": 133600 + }, + { + "epoch": 11.995062836624776, + "grad_norm": 13.135345458984375, + "learning_rate": 9.778874925194494e-06, + "loss": 5.9183, + "step": 133625 + }, + { + "epoch": 11.997307001795333, + "grad_norm": 14.138036727905273, + "learning_rate": 9.778625573508878e-06, + "loss": 5.9329, + "step": 133650 + }, + { + "epoch": 11.999551166965889, + "grad_norm": 14.722801208496094, + "learning_rate": 9.77837622182326e-06, + "loss": 5.8924, + "step": 133675 + }, + { + "epoch": 12.0, + "eval_accuracy": 0.07857101941840261, + "eval_f1_macro": 0.004751604903760213, + "eval_f1_micro": 0.07857101941840261, + "eval_f1_weighted": 0.039321394015216574, + "eval_loss": 7.042420864105225, + "eval_precision_macro": 0.004310522505317231, + "eval_precision_micro": 0.07857101941840261, + "eval_precision_weighted": 0.03144795546720536, + "eval_recall_macro": 0.008911900434914488, + "eval_recall_micro": 0.07857101941840261, + "eval_recall_weighted": 0.07857101941840261, + "eval_runtime": 127.3882, + "eval_samples_per_second": 411.129, + "eval_steps_per_second": 12.85, + "step": 133680 + }, + { + "epoch": 12.001795332136446, + "grad_norm": 13.248446464538574, + "learning_rate": 9.778126870137643e-06, + "loss": 5.7746, + "step": 133700 + }, + { + "epoch": 12.004039497307001, + "grad_norm": 12.091355323791504, + "learning_rate": 9.777877518452025e-06, + "loss": 5.6359, + "step": 133725 + }, + { + "epoch": 12.006283662477559, + "grad_norm": 17.11530113220215, + "learning_rate": 9.777628166766409e-06, + "loss": 5.6066, + "step": 133750 + }, + { + "epoch": 12.008527827648114, + "grad_norm": 13.28249454498291, + "learning_rate": 9.77737881508079e-06, + "loss": 5.7361, + "step": 133775 + }, + { + "epoch": 12.010771992818672, + "grad_norm": 14.332635879516602, + "learning_rate": 9.777129463395174e-06, + "loss": 5.7622, + "step": 133800 + }, + { + "epoch": 12.013016157989227, + "grad_norm": 16.244461059570312, + "learning_rate": 9.776880111709556e-06, + "loss": 5.3409, + "step": 133825 + }, + { + "epoch": 12.015260323159785, + "grad_norm": 14.885889053344727, + "learning_rate": 9.776630760023938e-06, + "loss": 5.3391, + "step": 133850 + }, + { + "epoch": 12.01750448833034, + "grad_norm": 16.75574493408203, + "learning_rate": 9.77638140833832e-06, + "loss": 5.5702, + "step": 133875 + }, + { + "epoch": 12.019748653500898, + "grad_norm": 15.099706649780273, + "learning_rate": 9.776132056652705e-06, + "loss": 5.6734, + "step": 133900 + }, + { + "epoch": 12.021992818671455, + "grad_norm": 15.86748218536377, + "learning_rate": 9.775882704967087e-06, + "loss": 5.5891, + "step": 133925 + }, + { + "epoch": 12.02423698384201, + "grad_norm": 13.737422943115234, + "learning_rate": 9.77563335328147e-06, + "loss": 5.7564, + "step": 133950 + }, + { + "epoch": 12.026481149012568, + "grad_norm": 15.38515567779541, + "learning_rate": 9.775384001595852e-06, + "loss": 5.7221, + "step": 133975 + }, + { + "epoch": 12.028725314183124, + "grad_norm": 15.930129051208496, + "learning_rate": 9.775134649910234e-06, + "loss": 5.5523, + "step": 134000 + }, + { + "epoch": 12.030969479353681, + "grad_norm": 13.602152824401855, + "learning_rate": 9.774885298224616e-06, + "loss": 5.5803, + "step": 134025 + }, + { + "epoch": 12.033213644524237, + "grad_norm": 12.880447387695312, + "learning_rate": 9.774635946539e-06, + "loss": 5.8186, + "step": 134050 + }, + { + "epoch": 12.035457809694794, + "grad_norm": 15.052387237548828, + "learning_rate": 9.774386594853383e-06, + "loss": 5.6781, + "step": 134075 + }, + { + "epoch": 12.03770197486535, + "grad_norm": 14.516335487365723, + "learning_rate": 9.774137243167765e-06, + "loss": 5.3927, + "step": 134100 + }, + { + "epoch": 12.039946140035907, + "grad_norm": 16.510507583618164, + "learning_rate": 9.773887891482147e-06, + "loss": 5.7494, + "step": 134125 + }, + { + "epoch": 12.042190305206462, + "grad_norm": 13.177630424499512, + "learning_rate": 9.77363853979653e-06, + "loss": 5.5302, + "step": 134150 + }, + { + "epoch": 12.04443447037702, + "grad_norm": 14.512594223022461, + "learning_rate": 9.773389188110912e-06, + "loss": 5.4538, + "step": 134175 + }, + { + "epoch": 12.046678635547575, + "grad_norm": 17.004169464111328, + "learning_rate": 9.773139836425294e-06, + "loss": 5.6478, + "step": 134200 + }, + { + "epoch": 12.048922800718133, + "grad_norm": 16.421777725219727, + "learning_rate": 9.772890484739678e-06, + "loss": 5.6585, + "step": 134225 + }, + { + "epoch": 12.05116696588869, + "grad_norm": 14.186623573303223, + "learning_rate": 9.77264113305406e-06, + "loss": 5.5568, + "step": 134250 + }, + { + "epoch": 12.053411131059246, + "grad_norm": 14.711870193481445, + "learning_rate": 9.772391781368443e-06, + "loss": 5.8191, + "step": 134275 + }, + { + "epoch": 12.055655296229803, + "grad_norm": 14.938262939453125, + "learning_rate": 9.772142429682827e-06, + "loss": 5.6246, + "step": 134300 + }, + { + "epoch": 12.057899461400359, + "grad_norm": 14.07133674621582, + "learning_rate": 9.771893077997207e-06, + "loss": 5.5251, + "step": 134325 + }, + { + "epoch": 12.060143626570916, + "grad_norm": 12.5813627243042, + "learning_rate": 9.77164372631159e-06, + "loss": 5.6565, + "step": 134350 + }, + { + "epoch": 12.062387791741472, + "grad_norm": 11.810647964477539, + "learning_rate": 9.771394374625974e-06, + "loss": 5.5219, + "step": 134375 + }, + { + "epoch": 12.064631956912029, + "grad_norm": 15.389547348022461, + "learning_rate": 9.771145022940356e-06, + "loss": 5.7236, + "step": 134400 + }, + { + "epoch": 12.066876122082585, + "grad_norm": 15.095398902893066, + "learning_rate": 9.770895671254738e-06, + "loss": 5.7598, + "step": 134425 + }, + { + "epoch": 12.069120287253142, + "grad_norm": 16.687095642089844, + "learning_rate": 9.77064631956912e-06, + "loss": 5.6812, + "step": 134450 + }, + { + "epoch": 12.071364452423698, + "grad_norm": 15.579635620117188, + "learning_rate": 9.770396967883505e-06, + "loss": 5.5776, + "step": 134475 + }, + { + "epoch": 12.073608617594255, + "grad_norm": 13.322521209716797, + "learning_rate": 9.770147616197885e-06, + "loss": 5.7761, + "step": 134500 + }, + { + "epoch": 12.075852782764812, + "grad_norm": 18.972665786743164, + "learning_rate": 9.76989826451227e-06, + "loss": 5.7086, + "step": 134525 + }, + { + "epoch": 12.078096947935368, + "grad_norm": 13.737403869628906, + "learning_rate": 9.769648912826652e-06, + "loss": 5.5642, + "step": 134550 + }, + { + "epoch": 12.080341113105925, + "grad_norm": 14.255826950073242, + "learning_rate": 9.769399561141034e-06, + "loss": 5.7296, + "step": 134575 + }, + { + "epoch": 12.08258527827648, + "grad_norm": 18.433141708374023, + "learning_rate": 9.769150209455416e-06, + "loss": 5.4119, + "step": 134600 + }, + { + "epoch": 12.084829443447038, + "grad_norm": 14.284860610961914, + "learning_rate": 9.7689008577698e-06, + "loss": 5.6426, + "step": 134625 + }, + { + "epoch": 12.087073608617594, + "grad_norm": 15.803447723388672, + "learning_rate": 9.768651506084183e-06, + "loss": 5.6333, + "step": 134650 + }, + { + "epoch": 12.089317773788151, + "grad_norm": 13.901741981506348, + "learning_rate": 9.768402154398565e-06, + "loss": 5.8579, + "step": 134675 + }, + { + "epoch": 12.091561938958707, + "grad_norm": 13.868571281433105, + "learning_rate": 9.768152802712947e-06, + "loss": 5.7243, + "step": 134700 + }, + { + "epoch": 12.093806104129264, + "grad_norm": 16.46580696105957, + "learning_rate": 9.76790345102733e-06, + "loss": 5.3719, + "step": 134725 + }, + { + "epoch": 12.09605026929982, + "grad_norm": 17.874752044677734, + "learning_rate": 9.767654099341712e-06, + "loss": 5.7231, + "step": 134750 + }, + { + "epoch": 12.098294434470377, + "grad_norm": 15.408415794372559, + "learning_rate": 9.767404747656096e-06, + "loss": 5.7192, + "step": 134775 + }, + { + "epoch": 12.100538599640933, + "grad_norm": 15.868102073669434, + "learning_rate": 9.767155395970478e-06, + "loss": 5.5739, + "step": 134800 + }, + { + "epoch": 12.10278276481149, + "grad_norm": 15.645434379577637, + "learning_rate": 9.76690604428486e-06, + "loss": 5.5428, + "step": 134825 + }, + { + "epoch": 12.105026929982047, + "grad_norm": 14.934269905090332, + "learning_rate": 9.766656692599243e-06, + "loss": 5.789, + "step": 134850 + }, + { + "epoch": 12.107271095152603, + "grad_norm": 12.491634368896484, + "learning_rate": 9.766407340913625e-06, + "loss": 5.6534, + "step": 134875 + }, + { + "epoch": 12.10951526032316, + "grad_norm": 12.913652420043945, + "learning_rate": 9.766157989228007e-06, + "loss": 5.8001, + "step": 134900 + }, + { + "epoch": 12.111759425493716, + "grad_norm": 15.724586486816406, + "learning_rate": 9.76590863754239e-06, + "loss": 5.6265, + "step": 134925 + }, + { + "epoch": 12.114003590664273, + "grad_norm": 14.107572555541992, + "learning_rate": 9.765659285856774e-06, + "loss": 5.6769, + "step": 134950 + }, + { + "epoch": 12.116247755834829, + "grad_norm": 11.737385749816895, + "learning_rate": 9.765409934171156e-06, + "loss": 5.5518, + "step": 134975 + }, + { + "epoch": 12.118491921005386, + "grad_norm": 15.043304443359375, + "learning_rate": 9.765160582485538e-06, + "loss": 5.6947, + "step": 135000 + }, + { + "epoch": 12.120736086175942, + "grad_norm": 14.350622177124023, + "learning_rate": 9.76491123079992e-06, + "loss": 5.6442, + "step": 135025 + }, + { + "epoch": 12.1229802513465, + "grad_norm": 13.080719947814941, + "learning_rate": 9.764661879114303e-06, + "loss": 5.5025, + "step": 135050 + }, + { + "epoch": 12.125224416517055, + "grad_norm": 12.63123607635498, + "learning_rate": 9.764412527428685e-06, + "loss": 5.5282, + "step": 135075 + }, + { + "epoch": 12.127468581687612, + "grad_norm": 13.457511901855469, + "learning_rate": 9.76416317574307e-06, + "loss": 5.7491, + "step": 135100 + }, + { + "epoch": 12.12971274685817, + "grad_norm": 13.38658332824707, + "learning_rate": 9.763913824057452e-06, + "loss": 5.6307, + "step": 135125 + }, + { + "epoch": 12.131956912028725, + "grad_norm": 14.796300888061523, + "learning_rate": 9.763664472371834e-06, + "loss": 5.573, + "step": 135150 + }, + { + "epoch": 12.134201077199283, + "grad_norm": 14.366106986999512, + "learning_rate": 9.763415120686216e-06, + "loss": 5.7598, + "step": 135175 + }, + { + "epoch": 12.136445242369838, + "grad_norm": 13.848299980163574, + "learning_rate": 9.763165769000599e-06, + "loss": 5.3992, + "step": 135200 + }, + { + "epoch": 12.138689407540395, + "grad_norm": 15.783818244934082, + "learning_rate": 9.762916417314981e-06, + "loss": 5.6869, + "step": 135225 + }, + { + "epoch": 12.140933572710951, + "grad_norm": 19.032743453979492, + "learning_rate": 9.762667065629365e-06, + "loss": 5.8399, + "step": 135250 + }, + { + "epoch": 12.143177737881508, + "grad_norm": 13.406451225280762, + "learning_rate": 9.762417713943747e-06, + "loss": 5.7158, + "step": 135275 + }, + { + "epoch": 12.145421903052064, + "grad_norm": 16.164812088012695, + "learning_rate": 9.76216836225813e-06, + "loss": 5.4829, + "step": 135300 + }, + { + "epoch": 12.147666068222621, + "grad_norm": 16.028228759765625, + "learning_rate": 9.761919010572512e-06, + "loss": 5.8434, + "step": 135325 + }, + { + "epoch": 12.149910233393177, + "grad_norm": 12.066994667053223, + "learning_rate": 9.761669658886896e-06, + "loss": 5.5929, + "step": 135350 + }, + { + "epoch": 12.152154398563734, + "grad_norm": 14.656852722167969, + "learning_rate": 9.761420307201277e-06, + "loss": 5.7619, + "step": 135375 + }, + { + "epoch": 12.15439856373429, + "grad_norm": 16.257986068725586, + "learning_rate": 9.76117095551566e-06, + "loss": 5.7646, + "step": 135400 + }, + { + "epoch": 12.156642728904847, + "grad_norm": 16.003536224365234, + "learning_rate": 9.760921603830043e-06, + "loss": 5.8252, + "step": 135425 + }, + { + "epoch": 12.158886894075405, + "grad_norm": 14.273369789123535, + "learning_rate": 9.760672252144425e-06, + "loss": 5.6535, + "step": 135450 + }, + { + "epoch": 12.16113105924596, + "grad_norm": 12.896692276000977, + "learning_rate": 9.760422900458807e-06, + "loss": 5.5449, + "step": 135475 + }, + { + "epoch": 12.163375224416518, + "grad_norm": 15.281362533569336, + "learning_rate": 9.760173548773192e-06, + "loss": 5.6457, + "step": 135500 + }, + { + "epoch": 12.165619389587073, + "grad_norm": 12.2943115234375, + "learning_rate": 9.759924197087574e-06, + "loss": 5.5322, + "step": 135525 + }, + { + "epoch": 12.16786355475763, + "grad_norm": 14.149646759033203, + "learning_rate": 9.759674845401954e-06, + "loss": 5.7475, + "step": 135550 + }, + { + "epoch": 12.170107719928186, + "grad_norm": 14.456539154052734, + "learning_rate": 9.759425493716338e-06, + "loss": 5.6795, + "step": 135575 + }, + { + "epoch": 12.172351885098744, + "grad_norm": 22.570648193359375, + "learning_rate": 9.75917614203072e-06, + "loss": 5.7921, + "step": 135600 + }, + { + "epoch": 12.1745960502693, + "grad_norm": 18.244979858398438, + "learning_rate": 9.758926790345103e-06, + "loss": 5.668, + "step": 135625 + }, + { + "epoch": 12.176840215439857, + "grad_norm": 13.245718955993652, + "learning_rate": 9.758677438659485e-06, + "loss": 5.6854, + "step": 135650 + }, + { + "epoch": 12.179084380610412, + "grad_norm": 15.4839506149292, + "learning_rate": 9.75842808697387e-06, + "loss": 5.4856, + "step": 135675 + }, + { + "epoch": 12.18132854578097, + "grad_norm": 15.424006462097168, + "learning_rate": 9.758178735288252e-06, + "loss": 5.6279, + "step": 135700 + }, + { + "epoch": 12.183572710951527, + "grad_norm": 14.504141807556152, + "learning_rate": 9.757929383602634e-06, + "loss": 5.3934, + "step": 135725 + }, + { + "epoch": 12.185816876122082, + "grad_norm": 14.590744018554688, + "learning_rate": 9.757680031917016e-06, + "loss": 5.628, + "step": 135750 + }, + { + "epoch": 12.18806104129264, + "grad_norm": 14.668487548828125, + "learning_rate": 9.757430680231399e-06, + "loss": 5.6626, + "step": 135775 + }, + { + "epoch": 12.190305206463195, + "grad_norm": 13.036996841430664, + "learning_rate": 9.757181328545781e-06, + "loss": 5.7097, + "step": 135800 + }, + { + "epoch": 12.192549371633753, + "grad_norm": 13.548688888549805, + "learning_rate": 9.756931976860165e-06, + "loss": 5.7612, + "step": 135825 + }, + { + "epoch": 12.194793536804308, + "grad_norm": 14.488332748413086, + "learning_rate": 9.756682625174547e-06, + "loss": 5.5938, + "step": 135850 + }, + { + "epoch": 12.197037701974866, + "grad_norm": 17.19993019104004, + "learning_rate": 9.75643327348893e-06, + "loss": 5.7144, + "step": 135875 + }, + { + "epoch": 12.199281867145421, + "grad_norm": 15.529732704162598, + "learning_rate": 9.756183921803312e-06, + "loss": 5.4673, + "step": 135900 + }, + { + "epoch": 12.201526032315979, + "grad_norm": 12.792205810546875, + "learning_rate": 9.755934570117694e-06, + "loss": 5.6137, + "step": 135925 + }, + { + "epoch": 12.203770197486534, + "grad_norm": 13.403780937194824, + "learning_rate": 9.755685218432077e-06, + "loss": 5.7136, + "step": 135950 + }, + { + "epoch": 12.206014362657092, + "grad_norm": 17.082687377929688, + "learning_rate": 9.75543586674646e-06, + "loss": 5.5161, + "step": 135975 + }, + { + "epoch": 12.208258527827649, + "grad_norm": 14.468817710876465, + "learning_rate": 9.755186515060843e-06, + "loss": 5.7546, + "step": 136000 + }, + { + "epoch": 12.210502692998205, + "grad_norm": 13.22601318359375, + "learning_rate": 9.754937163375225e-06, + "loss": 5.893, + "step": 136025 + }, + { + "epoch": 12.212746858168762, + "grad_norm": 13.839679718017578, + "learning_rate": 9.754687811689608e-06, + "loss": 5.6907, + "step": 136050 + }, + { + "epoch": 12.214991023339318, + "grad_norm": 14.110713958740234, + "learning_rate": 9.754438460003992e-06, + "loss": 5.8458, + "step": 136075 + }, + { + "epoch": 12.217235188509875, + "grad_norm": 17.277788162231445, + "learning_rate": 9.754189108318372e-06, + "loss": 5.7485, + "step": 136100 + }, + { + "epoch": 12.21947935368043, + "grad_norm": 12.46101188659668, + "learning_rate": 9.753939756632756e-06, + "loss": 5.5512, + "step": 136125 + }, + { + "epoch": 12.221723518850988, + "grad_norm": 12.648404121398926, + "learning_rate": 9.753690404947138e-06, + "loss": 5.6332, + "step": 136150 + }, + { + "epoch": 12.223967684021543, + "grad_norm": 13.59543228149414, + "learning_rate": 9.75344105326152e-06, + "loss": 5.7723, + "step": 136175 + }, + { + "epoch": 12.2262118491921, + "grad_norm": 16.405441284179688, + "learning_rate": 9.753191701575903e-06, + "loss": 5.9119, + "step": 136200 + }, + { + "epoch": 12.228456014362656, + "grad_norm": 15.75175952911377, + "learning_rate": 9.75295232395771e-06, + "loss": 5.7018, + "step": 136225 + }, + { + "epoch": 12.230700179533214, + "grad_norm": 15.124077796936035, + "learning_rate": 9.752702972272094e-06, + "loss": 5.571, + "step": 136250 + }, + { + "epoch": 12.23294434470377, + "grad_norm": 15.55730152130127, + "learning_rate": 9.752453620586476e-06, + "loss": 5.502, + "step": 136275 + }, + { + "epoch": 12.235188509874327, + "grad_norm": 13.594555854797363, + "learning_rate": 9.752204268900859e-06, + "loss": 5.8008, + "step": 136300 + }, + { + "epoch": 12.237432675044884, + "grad_norm": 15.298013687133789, + "learning_rate": 9.751954917215241e-06, + "loss": 5.653, + "step": 136325 + }, + { + "epoch": 12.23967684021544, + "grad_norm": 14.690682411193848, + "learning_rate": 9.751705565529623e-06, + "loss": 5.3973, + "step": 136350 + }, + { + "epoch": 12.241921005385997, + "grad_norm": 15.516347885131836, + "learning_rate": 9.751456213844006e-06, + "loss": 5.7421, + "step": 136375 + }, + { + "epoch": 12.244165170556553, + "grad_norm": 12.571380615234375, + "learning_rate": 9.75120686215839e-06, + "loss": 5.5989, + "step": 136400 + }, + { + "epoch": 12.24640933572711, + "grad_norm": 14.265933990478516, + "learning_rate": 9.750957510472772e-06, + "loss": 5.7073, + "step": 136425 + }, + { + "epoch": 12.248653500897666, + "grad_norm": 13.082076072692871, + "learning_rate": 9.750708158787154e-06, + "loss": 5.8264, + "step": 136450 + }, + { + "epoch": 12.250897666068223, + "grad_norm": 14.81555461883545, + "learning_rate": 9.750458807101537e-06, + "loss": 5.591, + "step": 136475 + }, + { + "epoch": 12.253141831238779, + "grad_norm": 16.632421493530273, + "learning_rate": 9.75020945541592e-06, + "loss": 5.9399, + "step": 136500 + }, + { + "epoch": 12.255385996409336, + "grad_norm": 15.533607482910156, + "learning_rate": 9.749960103730301e-06, + "loss": 5.9233, + "step": 136525 + }, + { + "epoch": 12.257630161579891, + "grad_norm": 16.467599868774414, + "learning_rate": 9.749710752044684e-06, + "loss": 5.9078, + "step": 136550 + }, + { + "epoch": 12.259874326750449, + "grad_norm": 15.75776195526123, + "learning_rate": 9.749461400359068e-06, + "loss": 5.5188, + "step": 136575 + }, + { + "epoch": 12.262118491921006, + "grad_norm": 15.39986801147461, + "learning_rate": 9.74921204867345e-06, + "loss": 5.729, + "step": 136600 + }, + { + "epoch": 12.264362657091562, + "grad_norm": 14.168042182922363, + "learning_rate": 9.748962696987832e-06, + "loss": 5.8507, + "step": 136625 + }, + { + "epoch": 12.26660682226212, + "grad_norm": 16.346120834350586, + "learning_rate": 9.748713345302215e-06, + "loss": 5.528, + "step": 136650 + }, + { + "epoch": 12.268850987432675, + "grad_norm": 18.575546264648438, + "learning_rate": 9.748463993616599e-06, + "loss": 5.5662, + "step": 136675 + }, + { + "epoch": 12.271095152603232, + "grad_norm": 15.883482933044434, + "learning_rate": 9.74821464193098e-06, + "loss": 5.5944, + "step": 136700 + }, + { + "epoch": 12.273339317773788, + "grad_norm": 14.598809242248535, + "learning_rate": 9.747965290245363e-06, + "loss": 5.6581, + "step": 136725 + }, + { + "epoch": 12.275583482944345, + "grad_norm": 16.437833786010742, + "learning_rate": 9.747715938559745e-06, + "loss": 5.8452, + "step": 136750 + }, + { + "epoch": 12.2778276481149, + "grad_norm": 14.201944351196289, + "learning_rate": 9.747466586874128e-06, + "loss": 5.8418, + "step": 136775 + }, + { + "epoch": 12.280071813285458, + "grad_norm": 13.846593856811523, + "learning_rate": 9.74721723518851e-06, + "loss": 5.9064, + "step": 136800 + }, + { + "epoch": 12.282315978456014, + "grad_norm": 18.23011589050293, + "learning_rate": 9.746967883502894e-06, + "loss": 5.5382, + "step": 136825 + }, + { + "epoch": 12.284560143626571, + "grad_norm": 15.363862991333008, + "learning_rate": 9.746718531817276e-06, + "loss": 5.5983, + "step": 136850 + }, + { + "epoch": 12.286804308797127, + "grad_norm": 17.836854934692383, + "learning_rate": 9.746469180131659e-06, + "loss": 5.8166, + "step": 136875 + }, + { + "epoch": 12.289048473967684, + "grad_norm": 20.809858322143555, + "learning_rate": 9.746219828446041e-06, + "loss": 5.6677, + "step": 136900 + }, + { + "epoch": 12.291292639138241, + "grad_norm": 12.510653495788574, + "learning_rate": 9.745970476760423e-06, + "loss": 5.7099, + "step": 136925 + }, + { + "epoch": 12.293536804308797, + "grad_norm": 15.864949226379395, + "learning_rate": 9.745721125074806e-06, + "loss": 5.7875, + "step": 136950 + }, + { + "epoch": 12.295780969479354, + "grad_norm": 19.627277374267578, + "learning_rate": 9.74547177338919e-06, + "loss": 5.6747, + "step": 136975 + }, + { + "epoch": 12.29802513464991, + "grad_norm": 19.871028900146484, + "learning_rate": 9.745222421703572e-06, + "loss": 5.6648, + "step": 137000 + }, + { + "epoch": 12.300269299820467, + "grad_norm": 16.971019744873047, + "learning_rate": 9.744973070017954e-06, + "loss": 5.5594, + "step": 137025 + }, + { + "epoch": 12.302513464991023, + "grad_norm": 11.44542407989502, + "learning_rate": 9.744723718332337e-06, + "loss": 5.8078, + "step": 137050 + }, + { + "epoch": 12.30475763016158, + "grad_norm": 14.369965553283691, + "learning_rate": 9.744474366646719e-06, + "loss": 5.7874, + "step": 137075 + }, + { + "epoch": 12.307001795332136, + "grad_norm": 14.486089706420898, + "learning_rate": 9.744225014961101e-06, + "loss": 5.7273, + "step": 137100 + }, + { + "epoch": 12.309245960502693, + "grad_norm": 16.559322357177734, + "learning_rate": 9.743975663275485e-06, + "loss": 5.8436, + "step": 137125 + }, + { + "epoch": 12.311490125673249, + "grad_norm": 18.098052978515625, + "learning_rate": 9.743726311589868e-06, + "loss": 5.5923, + "step": 137150 + }, + { + "epoch": 12.313734290843806, + "grad_norm": 14.006628036499023, + "learning_rate": 9.74347695990425e-06, + "loss": 5.7822, + "step": 137175 + }, + { + "epoch": 12.315978456014363, + "grad_norm": 12.355735778808594, + "learning_rate": 9.743227608218632e-06, + "loss": 5.6594, + "step": 137200 + }, + { + "epoch": 12.318222621184919, + "grad_norm": 12.502506256103516, + "learning_rate": 9.742978256533015e-06, + "loss": 5.7795, + "step": 137225 + }, + { + "epoch": 12.320466786355476, + "grad_norm": 15.076539993286133, + "learning_rate": 9.742728904847397e-06, + "loss": 5.5204, + "step": 137250 + }, + { + "epoch": 12.322710951526032, + "grad_norm": 18.186193466186523, + "learning_rate": 9.74247955316178e-06, + "loss": 5.906, + "step": 137275 + }, + { + "epoch": 12.32495511669659, + "grad_norm": 14.654407501220703, + "learning_rate": 9.742230201476163e-06, + "loss": 5.8155, + "step": 137300 + }, + { + "epoch": 12.327199281867145, + "grad_norm": 15.645171165466309, + "learning_rate": 9.741980849790546e-06, + "loss": 5.5706, + "step": 137325 + }, + { + "epoch": 12.329443447037702, + "grad_norm": 14.411921501159668, + "learning_rate": 9.741731498104928e-06, + "loss": 5.5562, + "step": 137350 + }, + { + "epoch": 12.331687612208258, + "grad_norm": 14.574870109558105, + "learning_rate": 9.74148214641931e-06, + "loss": 5.9173, + "step": 137375 + }, + { + "epoch": 12.333931777378815, + "grad_norm": 13.315103530883789, + "learning_rate": 9.741232794733692e-06, + "loss": 5.8516, + "step": 137400 + }, + { + "epoch": 12.33617594254937, + "grad_norm": 14.161166191101074, + "learning_rate": 9.740983443048075e-06, + "loss": 5.7104, + "step": 137425 + }, + { + "epoch": 12.338420107719928, + "grad_norm": 14.801436424255371, + "learning_rate": 9.740734091362459e-06, + "loss": 5.6644, + "step": 137450 + }, + { + "epoch": 12.340664272890486, + "grad_norm": 14.06878662109375, + "learning_rate": 9.740484739676841e-06, + "loss": 5.8649, + "step": 137475 + }, + { + "epoch": 12.342908438061041, + "grad_norm": 14.951578140258789, + "learning_rate": 9.740235387991223e-06, + "loss": 5.7707, + "step": 137500 + }, + { + "epoch": 12.345152603231599, + "grad_norm": 14.895669937133789, + "learning_rate": 9.739986036305606e-06, + "loss": 5.7205, + "step": 137525 + }, + { + "epoch": 12.347396768402154, + "grad_norm": 15.832048416137695, + "learning_rate": 9.73973668461999e-06, + "loss": 5.5355, + "step": 137550 + }, + { + "epoch": 12.349640933572712, + "grad_norm": 13.574796676635742, + "learning_rate": 9.739487332934372e-06, + "loss": 5.8832, + "step": 137575 + }, + { + "epoch": 12.351885098743267, + "grad_norm": 15.809988021850586, + "learning_rate": 9.739237981248754e-06, + "loss": 5.6653, + "step": 137600 + }, + { + "epoch": 12.354129263913824, + "grad_norm": 21.182876586914062, + "learning_rate": 9.738988629563137e-06, + "loss": 5.733, + "step": 137625 + }, + { + "epoch": 12.35637342908438, + "grad_norm": 12.443145751953125, + "learning_rate": 9.738739277877519e-06, + "loss": 5.5994, + "step": 137650 + }, + { + "epoch": 12.358617594254937, + "grad_norm": 15.830723762512207, + "learning_rate": 9.738489926191901e-06, + "loss": 5.6466, + "step": 137675 + }, + { + "epoch": 12.360861759425493, + "grad_norm": 16.346309661865234, + "learning_rate": 9.738240574506285e-06, + "loss": 5.8467, + "step": 137700 + }, + { + "epoch": 12.36310592459605, + "grad_norm": 16.36995506286621, + "learning_rate": 9.737991222820668e-06, + "loss": 5.543, + "step": 137725 + }, + { + "epoch": 12.365350089766606, + "grad_norm": 14.534601211547852, + "learning_rate": 9.73774187113505e-06, + "loss": 5.7152, + "step": 137750 + }, + { + "epoch": 12.367594254937163, + "grad_norm": 14.13437557220459, + "learning_rate": 9.737492519449432e-06, + "loss": 5.7789, + "step": 137775 + }, + { + "epoch": 12.36983842010772, + "grad_norm": 14.985627174377441, + "learning_rate": 9.737243167763815e-06, + "loss": 5.5079, + "step": 137800 + }, + { + "epoch": 12.372082585278276, + "grad_norm": 16.42649269104004, + "learning_rate": 9.736993816078197e-06, + "loss": 5.4485, + "step": 137825 + }, + { + "epoch": 12.374326750448834, + "grad_norm": 16.710174560546875, + "learning_rate": 9.736744464392581e-06, + "loss": 5.6813, + "step": 137850 + }, + { + "epoch": 12.37657091561939, + "grad_norm": 13.784011840820312, + "learning_rate": 9.736495112706963e-06, + "loss": 5.4357, + "step": 137875 + }, + { + "epoch": 12.378815080789947, + "grad_norm": 16.659461975097656, + "learning_rate": 9.736245761021346e-06, + "loss": 5.9187, + "step": 137900 + }, + { + "epoch": 12.381059245960502, + "grad_norm": 17.906381607055664, + "learning_rate": 9.735996409335728e-06, + "loss": 5.5928, + "step": 137925 + }, + { + "epoch": 12.38330341113106, + "grad_norm": 15.616056442260742, + "learning_rate": 9.73574705765011e-06, + "loss": 5.8294, + "step": 137950 + }, + { + "epoch": 12.385547576301615, + "grad_norm": 15.66157054901123, + "learning_rate": 9.735497705964493e-06, + "loss": 5.8657, + "step": 137975 + }, + { + "epoch": 12.387791741472173, + "grad_norm": 17.13243293762207, + "learning_rate": 9.735248354278875e-06, + "loss": 5.7999, + "step": 138000 + }, + { + "epoch": 12.390035906642728, + "grad_norm": 14.834726333618164, + "learning_rate": 9.734999002593259e-06, + "loss": 5.8053, + "step": 138025 + }, + { + "epoch": 12.392280071813286, + "grad_norm": 14.78522777557373, + "learning_rate": 9.734749650907641e-06, + "loss": 5.743, + "step": 138050 + }, + { + "epoch": 12.394524236983843, + "grad_norm": 18.77862548828125, + "learning_rate": 9.734500299222023e-06, + "loss": 6.013, + "step": 138075 + }, + { + "epoch": 12.396768402154398, + "grad_norm": 15.129013061523438, + "learning_rate": 9.734250947536406e-06, + "loss": 5.7997, + "step": 138100 + }, + { + "epoch": 12.399012567324956, + "grad_norm": 17.551259994506836, + "learning_rate": 9.734001595850788e-06, + "loss": 5.4252, + "step": 138125 + }, + { + "epoch": 12.401256732495511, + "grad_norm": 11.660561561584473, + "learning_rate": 9.73375224416517e-06, + "loss": 5.621, + "step": 138150 + }, + { + "epoch": 12.403500897666069, + "grad_norm": 14.785981178283691, + "learning_rate": 9.733502892479554e-06, + "loss": 5.9063, + "step": 138175 + }, + { + "epoch": 12.405745062836624, + "grad_norm": 14.178849220275879, + "learning_rate": 9.733253540793937e-06, + "loss": 5.8017, + "step": 138200 + }, + { + "epoch": 12.407989228007182, + "grad_norm": 14.008167266845703, + "learning_rate": 9.733004189108319e-06, + "loss": 5.8263, + "step": 138225 + }, + { + "epoch": 12.410233393177737, + "grad_norm": 13.254127502441406, + "learning_rate": 9.732754837422701e-06, + "loss": 5.7226, + "step": 138250 + }, + { + "epoch": 12.412477558348295, + "grad_norm": 16.845895767211914, + "learning_rate": 9.732505485737085e-06, + "loss": 5.8391, + "step": 138275 + }, + { + "epoch": 12.41472172351885, + "grad_norm": 14.714625358581543, + "learning_rate": 9.732256134051466e-06, + "loss": 5.6155, + "step": 138300 + }, + { + "epoch": 12.416965888689408, + "grad_norm": 20.076818466186523, + "learning_rate": 9.73200678236585e-06, + "loss": 5.6432, + "step": 138325 + }, + { + "epoch": 12.419210053859963, + "grad_norm": 15.93281364440918, + "learning_rate": 9.731757430680232e-06, + "loss": 5.5638, + "step": 138350 + }, + { + "epoch": 12.42145421903052, + "grad_norm": 15.553938865661621, + "learning_rate": 9.731508078994615e-06, + "loss": 5.4428, + "step": 138375 + }, + { + "epoch": 12.423698384201078, + "grad_norm": 13.639921188354492, + "learning_rate": 9.731258727308997e-06, + "loss": 5.5325, + "step": 138400 + }, + { + "epoch": 12.425942549371634, + "grad_norm": 16.604063034057617, + "learning_rate": 9.731009375623381e-06, + "loss": 5.7936, + "step": 138425 + }, + { + "epoch": 12.428186714542191, + "grad_norm": 15.146505355834961, + "learning_rate": 9.730760023937763e-06, + "loss": 5.906, + "step": 138450 + }, + { + "epoch": 12.430430879712747, + "grad_norm": 15.210261344909668, + "learning_rate": 9.730510672252144e-06, + "loss": 5.8933, + "step": 138475 + }, + { + "epoch": 12.432675044883304, + "grad_norm": 12.923754692077637, + "learning_rate": 9.730261320566528e-06, + "loss": 5.7721, + "step": 138500 + }, + { + "epoch": 12.43491921005386, + "grad_norm": 13.457086563110352, + "learning_rate": 9.73001196888091e-06, + "loss": 5.6557, + "step": 138525 + }, + { + "epoch": 12.437163375224417, + "grad_norm": 16.674964904785156, + "learning_rate": 9.729772591262717e-06, + "loss": 5.8391, + "step": 138550 + }, + { + "epoch": 12.439407540394972, + "grad_norm": 15.061531066894531, + "learning_rate": 9.7295232395771e-06, + "loss": 5.7508, + "step": 138575 + }, + { + "epoch": 12.44165170556553, + "grad_norm": 15.790905952453613, + "learning_rate": 9.729273887891484e-06, + "loss": 5.9213, + "step": 138600 + }, + { + "epoch": 12.443895870736085, + "grad_norm": 19.79812240600586, + "learning_rate": 9.729024536205866e-06, + "loss": 5.6774, + "step": 138625 + }, + { + "epoch": 12.446140035906643, + "grad_norm": 15.828672409057617, + "learning_rate": 9.728775184520248e-06, + "loss": 5.5499, + "step": 138650 + }, + { + "epoch": 12.4483842010772, + "grad_norm": 15.094359397888184, + "learning_rate": 9.72852583283463e-06, + "loss": 5.6265, + "step": 138675 + }, + { + "epoch": 12.450628366247756, + "grad_norm": 19.03557586669922, + "learning_rate": 9.728276481149014e-06, + "loss": 5.6257, + "step": 138700 + }, + { + "epoch": 12.452872531418313, + "grad_norm": 16.305973052978516, + "learning_rate": 9.728027129463395e-06, + "loss": 5.6848, + "step": 138725 + }, + { + "epoch": 12.455116696588869, + "grad_norm": 16.66585350036621, + "learning_rate": 9.727777777777777e-06, + "loss": 5.7733, + "step": 138750 + }, + { + "epoch": 12.457360861759426, + "grad_norm": 15.140485763549805, + "learning_rate": 9.727528426092161e-06, + "loss": 5.6608, + "step": 138775 + }, + { + "epoch": 12.459605026929982, + "grad_norm": 14.275444984436035, + "learning_rate": 9.727279074406544e-06, + "loss": 5.8865, + "step": 138800 + }, + { + "epoch": 12.461849192100539, + "grad_norm": 14.587714195251465, + "learning_rate": 9.727029722720926e-06, + "loss": 5.4752, + "step": 138825 + }, + { + "epoch": 12.464093357271095, + "grad_norm": 16.323909759521484, + "learning_rate": 9.72678037103531e-06, + "loss": 5.8384, + "step": 138850 + }, + { + "epoch": 12.466337522441652, + "grad_norm": 13.420914649963379, + "learning_rate": 9.726531019349692e-06, + "loss": 5.6222, + "step": 138875 + }, + { + "epoch": 12.468581687612208, + "grad_norm": 14.348862648010254, + "learning_rate": 9.726281667664073e-06, + "loss": 5.6179, + "step": 138900 + }, + { + "epoch": 12.470825852782765, + "grad_norm": 13.619343757629395, + "learning_rate": 9.726032315978457e-06, + "loss": 5.8838, + "step": 138925 + }, + { + "epoch": 12.473070017953322, + "grad_norm": 14.514681816101074, + "learning_rate": 9.72578296429284e-06, + "loss": 5.8439, + "step": 138950 + }, + { + "epoch": 12.475314183123878, + "grad_norm": 15.820856094360352, + "learning_rate": 9.725533612607222e-06, + "loss": 5.7529, + "step": 138975 + }, + { + "epoch": 12.477558348294435, + "grad_norm": 16.739185333251953, + "learning_rate": 9.725284260921604e-06, + "loss": 5.8292, + "step": 139000 + }, + { + "epoch": 12.47980251346499, + "grad_norm": 14.020853996276855, + "learning_rate": 9.725034909235988e-06, + "loss": 5.7265, + "step": 139025 + }, + { + "epoch": 12.482046678635548, + "grad_norm": 15.927620887756348, + "learning_rate": 9.72478555755037e-06, + "loss": 5.5788, + "step": 139050 + }, + { + "epoch": 12.484290843806104, + "grad_norm": 14.164593696594238, + "learning_rate": 9.724536205864753e-06, + "loss": 5.7609, + "step": 139075 + }, + { + "epoch": 12.486535008976661, + "grad_norm": 15.417964935302734, + "learning_rate": 9.724286854179135e-06, + "loss": 5.831, + "step": 139100 + }, + { + "epoch": 12.488779174147217, + "grad_norm": 12.01565933227539, + "learning_rate": 9.724037502493517e-06, + "loss": 5.8757, + "step": 139125 + }, + { + "epoch": 12.491023339317774, + "grad_norm": 13.318023681640625, + "learning_rate": 9.7237881508079e-06, + "loss": 5.8876, + "step": 139150 + }, + { + "epoch": 12.49326750448833, + "grad_norm": 14.92900562286377, + "learning_rate": 9.723538799122284e-06, + "loss": 5.7813, + "step": 139175 + }, + { + "epoch": 12.495511669658887, + "grad_norm": 16.131237030029297, + "learning_rate": 9.723289447436666e-06, + "loss": 5.8245, + "step": 139200 + }, + { + "epoch": 12.497755834829443, + "grad_norm": 17.417388916015625, + "learning_rate": 9.723040095751048e-06, + "loss": 5.4846, + "step": 139225 + }, + { + "epoch": 12.5, + "grad_norm": 13.054971694946289, + "learning_rate": 9.72279074406543e-06, + "loss": 5.7173, + "step": 139250 + }, + { + "epoch": 12.502244165170557, + "grad_norm": 13.886519432067871, + "learning_rate": 9.722541392379813e-06, + "loss": 5.756, + "step": 139275 + }, + { + "epoch": 12.504488330341113, + "grad_norm": 15.390425682067871, + "learning_rate": 9.722292040694195e-06, + "loss": 5.921, + "step": 139300 + }, + { + "epoch": 12.50673249551167, + "grad_norm": 12.962925910949707, + "learning_rate": 9.72204268900858e-06, + "loss": 5.5664, + "step": 139325 + }, + { + "epoch": 12.508976660682226, + "grad_norm": 14.887640953063965, + "learning_rate": 9.721793337322961e-06, + "loss": 5.8344, + "step": 139350 + }, + { + "epoch": 12.511220825852783, + "grad_norm": 15.829704284667969, + "learning_rate": 9.721543985637344e-06, + "loss": 5.7972, + "step": 139375 + }, + { + "epoch": 12.513464991023339, + "grad_norm": 19.445804595947266, + "learning_rate": 9.721294633951726e-06, + "loss": 5.8823, + "step": 139400 + }, + { + "epoch": 12.515709156193896, + "grad_norm": 15.598150253295898, + "learning_rate": 9.72104528226611e-06, + "loss": 5.7458, + "step": 139425 + }, + { + "epoch": 12.517953321364452, + "grad_norm": 12.132575988769531, + "learning_rate": 9.72079593058049e-06, + "loss": 5.8612, + "step": 139450 + }, + { + "epoch": 12.52019748653501, + "grad_norm": 14.706637382507324, + "learning_rate": 9.720546578894873e-06, + "loss": 5.7178, + "step": 139475 + }, + { + "epoch": 12.522441651705565, + "grad_norm": 15.17214298248291, + "learning_rate": 9.720297227209257e-06, + "loss": 5.7213, + "step": 139500 + }, + { + "epoch": 12.524685816876122, + "grad_norm": 14.147696495056152, + "learning_rate": 9.72004787552364e-06, + "loss": 5.5581, + "step": 139525 + }, + { + "epoch": 12.526929982046678, + "grad_norm": 18.000207901000977, + "learning_rate": 9.719798523838022e-06, + "loss": 5.9623, + "step": 139550 + }, + { + "epoch": 12.529174147217235, + "grad_norm": 15.866022109985352, + "learning_rate": 9.719549172152406e-06, + "loss": 5.9367, + "step": 139575 + }, + { + "epoch": 12.531418312387792, + "grad_norm": 14.63575267791748, + "learning_rate": 9.719299820466788e-06, + "loss": 5.808, + "step": 139600 + }, + { + "epoch": 12.533662477558348, + "grad_norm": 18.450841903686523, + "learning_rate": 9.719050468781169e-06, + "loss": 5.6651, + "step": 139625 + }, + { + "epoch": 12.535906642728905, + "grad_norm": 15.142062187194824, + "learning_rate": 9.718801117095553e-06, + "loss": 5.8124, + "step": 139650 + }, + { + "epoch": 12.538150807899461, + "grad_norm": 15.502577781677246, + "learning_rate": 9.718551765409935e-06, + "loss": 5.504, + "step": 139675 + }, + { + "epoch": 12.540394973070018, + "grad_norm": 14.538687705993652, + "learning_rate": 9.718302413724317e-06, + "loss": 5.8076, + "step": 139700 + }, + { + "epoch": 12.542639138240574, + "grad_norm": 17.905941009521484, + "learning_rate": 9.7180530620387e-06, + "loss": 5.7307, + "step": 139725 + }, + { + "epoch": 12.544883303411131, + "grad_norm": 16.76826286315918, + "learning_rate": 9.717803710353084e-06, + "loss": 5.7771, + "step": 139750 + }, + { + "epoch": 12.547127468581687, + "grad_norm": 16.49610710144043, + "learning_rate": 9.717554358667466e-06, + "loss": 5.8553, + "step": 139775 + }, + { + "epoch": 12.549371633752244, + "grad_norm": 18.589702606201172, + "learning_rate": 9.717305006981848e-06, + "loss": 5.7742, + "step": 139800 + }, + { + "epoch": 12.5516157989228, + "grad_norm": 14.892868995666504, + "learning_rate": 9.71705565529623e-06, + "loss": 5.6315, + "step": 139825 + }, + { + "epoch": 12.553859964093357, + "grad_norm": 14.115604400634766, + "learning_rate": 9.716806303610613e-06, + "loss": 6.1186, + "step": 139850 + }, + { + "epoch": 12.556104129263915, + "grad_norm": 14.707093238830566, + "learning_rate": 9.716556951924995e-06, + "loss": 5.7673, + "step": 139875 + }, + { + "epoch": 12.55834829443447, + "grad_norm": 17.00351905822754, + "learning_rate": 9.71630760023938e-06, + "loss": 5.487, + "step": 139900 + }, + { + "epoch": 12.560592459605028, + "grad_norm": 15.989154815673828, + "learning_rate": 9.716058248553762e-06, + "loss": 5.953, + "step": 139925 + }, + { + "epoch": 12.562836624775583, + "grad_norm": 14.642463684082031, + "learning_rate": 9.715808896868144e-06, + "loss": 5.7573, + "step": 139950 + }, + { + "epoch": 12.56508078994614, + "grad_norm": 12.601354598999023, + "learning_rate": 9.715559545182526e-06, + "loss": 5.8254, + "step": 139975 + }, + { + "epoch": 12.567324955116696, + "grad_norm": 14.673994064331055, + "learning_rate": 9.715310193496908e-06, + "loss": 5.6232, + "step": 140000 + }, + { + "epoch": 12.569569120287253, + "grad_norm": 16.059783935546875, + "learning_rate": 9.71506084181129e-06, + "loss": 5.8037, + "step": 140025 + }, + { + "epoch": 12.571813285457809, + "grad_norm": 17.668296813964844, + "learning_rate": 9.714811490125675e-06, + "loss": 5.5321, + "step": 140050 + }, + { + "epoch": 12.574057450628366, + "grad_norm": 16.645030975341797, + "learning_rate": 9.714562138440057e-06, + "loss": 5.5875, + "step": 140075 + }, + { + "epoch": 12.576301615798922, + "grad_norm": 15.241878509521484, + "learning_rate": 9.71431278675444e-06, + "loss": 5.8061, + "step": 140100 + }, + { + "epoch": 12.57854578096948, + "grad_norm": 17.37699317932129, + "learning_rate": 9.714063435068822e-06, + "loss": 5.343, + "step": 140125 + }, + { + "epoch": 12.580789946140037, + "grad_norm": 16.28099250793457, + "learning_rate": 9.713814083383204e-06, + "loss": 5.685, + "step": 140150 + }, + { + "epoch": 12.583034111310592, + "grad_norm": 15.477314949035645, + "learning_rate": 9.713564731697586e-06, + "loss": 5.775, + "step": 140175 + }, + { + "epoch": 12.58527827648115, + "grad_norm": 14.1270170211792, + "learning_rate": 9.713315380011969e-06, + "loss": 5.6911, + "step": 140200 + }, + { + "epoch": 12.587522441651705, + "grad_norm": 14.68868350982666, + "learning_rate": 9.713066028326353e-06, + "loss": 5.7871, + "step": 140225 + }, + { + "epoch": 12.589766606822263, + "grad_norm": 15.953790664672852, + "learning_rate": 9.712816676640735e-06, + "loss": 5.6211, + "step": 140250 + }, + { + "epoch": 12.592010771992818, + "grad_norm": 14.18967342376709, + "learning_rate": 9.712567324955117e-06, + "loss": 5.782, + "step": 140275 + }, + { + "epoch": 12.594254937163376, + "grad_norm": 18.137561798095703, + "learning_rate": 9.712317973269501e-06, + "loss": 5.8663, + "step": 140300 + }, + { + "epoch": 12.596499102333931, + "grad_norm": 15.623364448547363, + "learning_rate": 9.712068621583882e-06, + "loss": 5.9233, + "step": 140325 + }, + { + "epoch": 12.598743267504489, + "grad_norm": 15.326069831848145, + "learning_rate": 9.711819269898264e-06, + "loss": 6.023, + "step": 140350 + }, + { + "epoch": 12.600987432675044, + "grad_norm": 20.15094757080078, + "learning_rate": 9.711569918212648e-06, + "loss": 5.669, + "step": 140375 + }, + { + "epoch": 12.603231597845602, + "grad_norm": 12.031243324279785, + "learning_rate": 9.71132056652703e-06, + "loss": 5.7283, + "step": 140400 + }, + { + "epoch": 12.605475763016159, + "grad_norm": 15.641345024108887, + "learning_rate": 9.711071214841413e-06, + "loss": 5.7768, + "step": 140425 + }, + { + "epoch": 12.607719928186714, + "grad_norm": 16.216819763183594, + "learning_rate": 9.710821863155795e-06, + "loss": 5.7037, + "step": 140450 + }, + { + "epoch": 12.609964093357272, + "grad_norm": 13.778680801391602, + "learning_rate": 9.71057251147018e-06, + "loss": 5.7594, + "step": 140475 + }, + { + "epoch": 12.612208258527827, + "grad_norm": 13.909587860107422, + "learning_rate": 9.71032315978456e-06, + "loss": 5.6637, + "step": 140500 + }, + { + "epoch": 12.614452423698385, + "grad_norm": 17.037723541259766, + "learning_rate": 9.710073808098944e-06, + "loss": 5.8174, + "step": 140525 + }, + { + "epoch": 12.61669658886894, + "grad_norm": 14.9293212890625, + "learning_rate": 9.709824456413326e-06, + "loss": 5.7889, + "step": 140550 + }, + { + "epoch": 12.618940754039498, + "grad_norm": 16.491926193237305, + "learning_rate": 9.709575104727709e-06, + "loss": 5.7652, + "step": 140575 + }, + { + "epoch": 12.621184919210053, + "grad_norm": 15.619288444519043, + "learning_rate": 9.70932575304209e-06, + "loss": 5.8686, + "step": 140600 + }, + { + "epoch": 12.62342908438061, + "grad_norm": 17.662519454956055, + "learning_rate": 9.709076401356475e-06, + "loss": 5.8409, + "step": 140625 + }, + { + "epoch": 12.625673249551166, + "grad_norm": 12.82396125793457, + "learning_rate": 9.708827049670857e-06, + "loss": 5.7404, + "step": 140650 + }, + { + "epoch": 12.627917414721724, + "grad_norm": 22.759765625, + "learning_rate": 9.708587672052664e-06, + "loss": 5.7066, + "step": 140675 + }, + { + "epoch": 12.63016157989228, + "grad_norm": 15.516670227050781, + "learning_rate": 9.708338320367046e-06, + "loss": 5.7, + "step": 140700 + }, + { + "epoch": 12.632405745062837, + "grad_norm": 16.628158569335938, + "learning_rate": 9.708088968681429e-06, + "loss": 5.5843, + "step": 140725 + }, + { + "epoch": 12.634649910233394, + "grad_norm": 14.155272483825684, + "learning_rate": 9.707839616995813e-06, + "loss": 5.8775, + "step": 140750 + }, + { + "epoch": 12.63689407540395, + "grad_norm": 13.070432662963867, + "learning_rate": 9.707590265310193e-06, + "loss": 5.7522, + "step": 140775 + }, + { + "epoch": 12.639138240574507, + "grad_norm": 14.269660949707031, + "learning_rate": 9.707340913624577e-06, + "loss": 5.8269, + "step": 140800 + }, + { + "epoch": 12.641382405745063, + "grad_norm": 17.565357208251953, + "learning_rate": 9.70709156193896e-06, + "loss": 5.825, + "step": 140825 + }, + { + "epoch": 12.64362657091562, + "grad_norm": 13.69836139678955, + "learning_rate": 9.706842210253342e-06, + "loss": 5.5226, + "step": 140850 + }, + { + "epoch": 12.645870736086176, + "grad_norm": 16.054731369018555, + "learning_rate": 9.706592858567724e-06, + "loss": 5.782, + "step": 140875 + }, + { + "epoch": 12.648114901256733, + "grad_norm": 16.99522590637207, + "learning_rate": 9.706343506882108e-06, + "loss": 5.7517, + "step": 140900 + }, + { + "epoch": 12.650359066427288, + "grad_norm": 14.689205169677734, + "learning_rate": 9.70609415519649e-06, + "loss": 5.7361, + "step": 140925 + }, + { + "epoch": 12.652603231597846, + "grad_norm": 16.862340927124023, + "learning_rate": 9.705844803510871e-06, + "loss": 5.6802, + "step": 140950 + }, + { + "epoch": 12.654847396768401, + "grad_norm": 15.49255084991455, + "learning_rate": 9.705595451825255e-06, + "loss": 5.493, + "step": 140975 + }, + { + "epoch": 12.657091561938959, + "grad_norm": 15.0742826461792, + "learning_rate": 9.705346100139638e-06, + "loss": 5.7825, + "step": 141000 + }, + { + "epoch": 12.659335727109514, + "grad_norm": 15.00446891784668, + "learning_rate": 9.70509674845402e-06, + "loss": 5.8061, + "step": 141025 + }, + { + "epoch": 12.661579892280072, + "grad_norm": 17.772262573242188, + "learning_rate": 9.704847396768404e-06, + "loss": 5.7452, + "step": 141050 + }, + { + "epoch": 12.66382405745063, + "grad_norm": 14.327622413635254, + "learning_rate": 9.704598045082786e-06, + "loss": 5.4575, + "step": 141075 + }, + { + "epoch": 12.666068222621185, + "grad_norm": 15.610748291015625, + "learning_rate": 9.704348693397169e-06, + "loss": 5.8439, + "step": 141100 + }, + { + "epoch": 12.668312387791742, + "grad_norm": 16.770219802856445, + "learning_rate": 9.704099341711551e-06, + "loss": 5.5478, + "step": 141125 + }, + { + "epoch": 12.670556552962298, + "grad_norm": 15.701597213745117, + "learning_rate": 9.703849990025933e-06, + "loss": 5.7496, + "step": 141150 + }, + { + "epoch": 12.672800718132855, + "grad_norm": 14.923449516296387, + "learning_rate": 9.703600638340316e-06, + "loss": 5.748, + "step": 141175 + }, + { + "epoch": 12.67504488330341, + "grad_norm": 14.599117279052734, + "learning_rate": 9.703351286654698e-06, + "loss": 5.606, + "step": 141200 + }, + { + "epoch": 12.677289048473968, + "grad_norm": 17.295616149902344, + "learning_rate": 9.703101934969082e-06, + "loss": 5.7697, + "step": 141225 + }, + { + "epoch": 12.679533213644524, + "grad_norm": 14.8917236328125, + "learning_rate": 9.702852583283464e-06, + "loss": 5.8439, + "step": 141250 + }, + { + "epoch": 12.681777378815081, + "grad_norm": 14.673590660095215, + "learning_rate": 9.702603231597846e-06, + "loss": 5.796, + "step": 141275 + }, + { + "epoch": 12.684021543985637, + "grad_norm": 17.726486206054688, + "learning_rate": 9.702353879912229e-06, + "loss": 5.9905, + "step": 141300 + }, + { + "epoch": 12.686265709156194, + "grad_norm": 14.306629180908203, + "learning_rate": 9.702104528226611e-06, + "loss": 5.5789, + "step": 141325 + }, + { + "epoch": 12.688509874326751, + "grad_norm": 13.40650749206543, + "learning_rate": 9.701855176540993e-06, + "loss": 5.7008, + "step": 141350 + }, + { + "epoch": 12.690754039497307, + "grad_norm": 15.95039176940918, + "learning_rate": 9.701605824855377e-06, + "loss": 5.9305, + "step": 141375 + }, + { + "epoch": 12.692998204667864, + "grad_norm": 15.428372383117676, + "learning_rate": 9.70135647316976e-06, + "loss": 5.8858, + "step": 141400 + }, + { + "epoch": 12.69524236983842, + "grad_norm": 16.04150390625, + "learning_rate": 9.701107121484142e-06, + "loss": 5.3789, + "step": 141425 + }, + { + "epoch": 12.697486535008977, + "grad_norm": 14.12529468536377, + "learning_rate": 9.700857769798524e-06, + "loss": 5.9206, + "step": 141450 + }, + { + "epoch": 12.699730700179533, + "grad_norm": 16.857452392578125, + "learning_rate": 9.700608418112907e-06, + "loss": 5.4145, + "step": 141475 + }, + { + "epoch": 12.70197486535009, + "grad_norm": 15.065543174743652, + "learning_rate": 9.700359066427289e-06, + "loss": 5.7645, + "step": 141500 + }, + { + "epoch": 12.704219030520646, + "grad_norm": 14.72140884399414, + "learning_rate": 9.700109714741673e-06, + "loss": 5.6259, + "step": 141525 + }, + { + "epoch": 12.706463195691203, + "grad_norm": 12.177007675170898, + "learning_rate": 9.699860363056055e-06, + "loss": 5.9296, + "step": 141550 + }, + { + "epoch": 12.708707360861759, + "grad_norm": 12.88140869140625, + "learning_rate": 9.699611011370438e-06, + "loss": 5.9318, + "step": 141575 + }, + { + "epoch": 12.710951526032316, + "grad_norm": 17.81228256225586, + "learning_rate": 9.69936165968482e-06, + "loss": 5.7567, + "step": 141600 + }, + { + "epoch": 12.713195691202873, + "grad_norm": 14.76459789276123, + "learning_rate": 9.699112307999204e-06, + "loss": 5.6464, + "step": 141625 + }, + { + "epoch": 12.715439856373429, + "grad_norm": 12.91917610168457, + "learning_rate": 9.698862956313585e-06, + "loss": 5.7825, + "step": 141650 + }, + { + "epoch": 12.717684021543986, + "grad_norm": 13.78593635559082, + "learning_rate": 9.698613604627967e-06, + "loss": 5.6655, + "step": 141675 + }, + { + "epoch": 12.719928186714542, + "grad_norm": 14.805804252624512, + "learning_rate": 9.698364252942351e-06, + "loss": 5.8688, + "step": 141700 + }, + { + "epoch": 12.7221723518851, + "grad_norm": 13.251453399658203, + "learning_rate": 9.698114901256733e-06, + "loss": 5.8173, + "step": 141725 + }, + { + "epoch": 12.724416517055655, + "grad_norm": 16.405315399169922, + "learning_rate": 9.697865549571116e-06, + "loss": 5.6823, + "step": 141750 + }, + { + "epoch": 12.726660682226212, + "grad_norm": 17.475780487060547, + "learning_rate": 9.6976161978855e-06, + "loss": 5.5683, + "step": 141775 + }, + { + "epoch": 12.728904847396768, + "grad_norm": 16.22955894470215, + "learning_rate": 9.697366846199882e-06, + "loss": 5.5144, + "step": 141800 + }, + { + "epoch": 12.731149012567325, + "grad_norm": 16.22881507873535, + "learning_rate": 9.697117494514263e-06, + "loss": 6.0185, + "step": 141825 + }, + { + "epoch": 12.73339317773788, + "grad_norm": 19.80466651916504, + "learning_rate": 9.696868142828647e-06, + "loss": 5.9306, + "step": 141850 + }, + { + "epoch": 12.735637342908438, + "grad_norm": 15.19687271118164, + "learning_rate": 9.696618791143029e-06, + "loss": 5.974, + "step": 141875 + }, + { + "epoch": 12.737881508078996, + "grad_norm": 17.12420654296875, + "learning_rate": 9.696369439457411e-06, + "loss": 5.6059, + "step": 141900 + }, + { + "epoch": 12.740125673249551, + "grad_norm": 17.95290756225586, + "learning_rate": 9.696120087771793e-06, + "loss": 5.7321, + "step": 141925 + }, + { + "epoch": 12.742369838420109, + "grad_norm": 15.153387069702148, + "learning_rate": 9.695870736086177e-06, + "loss": 5.5763, + "step": 141950 + }, + { + "epoch": 12.744614003590664, + "grad_norm": 15.574457168579102, + "learning_rate": 9.69562138440056e-06, + "loss": 5.822, + "step": 141975 + }, + { + "epoch": 12.746858168761221, + "grad_norm": 16.600297927856445, + "learning_rate": 9.695372032714942e-06, + "loss": 5.5613, + "step": 142000 + }, + { + "epoch": 12.749102333931777, + "grad_norm": 14.858553886413574, + "learning_rate": 9.695122681029324e-06, + "loss": 5.8377, + "step": 142025 + }, + { + "epoch": 12.751346499102334, + "grad_norm": 13.09264850616455, + "learning_rate": 9.694873329343707e-06, + "loss": 5.4541, + "step": 142050 + }, + { + "epoch": 12.75359066427289, + "grad_norm": 14.466107368469238, + "learning_rate": 9.694623977658089e-06, + "loss": 5.7328, + "step": 142075 + }, + { + "epoch": 12.755834829443447, + "grad_norm": 14.733787536621094, + "learning_rate": 9.694374625972473e-06, + "loss": 5.8893, + "step": 142100 + }, + { + "epoch": 12.758078994614003, + "grad_norm": 16.82180404663086, + "learning_rate": 9.694125274286855e-06, + "loss": 5.3864, + "step": 142125 + }, + { + "epoch": 12.76032315978456, + "grad_norm": 13.847810745239258, + "learning_rate": 9.693875922601238e-06, + "loss": 5.8891, + "step": 142150 + }, + { + "epoch": 12.762567324955116, + "grad_norm": 13.6124267578125, + "learning_rate": 9.69362657091562e-06, + "loss": 5.5863, + "step": 142175 + }, + { + "epoch": 12.764811490125673, + "grad_norm": 17.84532928466797, + "learning_rate": 9.693377219230002e-06, + "loss": 5.708, + "step": 142200 + }, + { + "epoch": 12.767055655296229, + "grad_norm": 15.949077606201172, + "learning_rate": 9.693127867544385e-06, + "loss": 5.67, + "step": 142225 + }, + { + "epoch": 12.769299820466786, + "grad_norm": 15.524420738220215, + "learning_rate": 9.692878515858769e-06, + "loss": 5.9889, + "step": 142250 + }, + { + "epoch": 12.771543985637344, + "grad_norm": 15.76369857788086, + "learning_rate": 9.692629164173151e-06, + "loss": 5.6819, + "step": 142275 + }, + { + "epoch": 12.7737881508079, + "grad_norm": 16.180543899536133, + "learning_rate": 9.692379812487533e-06, + "loss": 5.8237, + "step": 142300 + }, + { + "epoch": 12.776032315978457, + "grad_norm": 14.745806694030762, + "learning_rate": 9.692130460801916e-06, + "loss": 5.5193, + "step": 142325 + }, + { + "epoch": 12.778276481149012, + "grad_norm": 17.874963760375977, + "learning_rate": 9.691881109116298e-06, + "loss": 5.5999, + "step": 142350 + }, + { + "epoch": 12.78052064631957, + "grad_norm": 17.274417877197266, + "learning_rate": 9.69163175743068e-06, + "loss": 5.6984, + "step": 142375 + }, + { + "epoch": 12.782764811490125, + "grad_norm": 18.130849838256836, + "learning_rate": 9.691382405745063e-06, + "loss": 5.8263, + "step": 142400 + }, + { + "epoch": 12.785008976660682, + "grad_norm": 15.909831047058105, + "learning_rate": 9.691133054059447e-06, + "loss": 5.9899, + "step": 142425 + }, + { + "epoch": 12.787253141831238, + "grad_norm": 15.40202522277832, + "learning_rate": 9.690883702373829e-06, + "loss": 5.7761, + "step": 142450 + }, + { + "epoch": 12.789497307001795, + "grad_norm": 14.98423957824707, + "learning_rate": 9.690634350688211e-06, + "loss": 5.8884, + "step": 142475 + }, + { + "epoch": 12.791741472172351, + "grad_norm": 23.54323387145996, + "learning_rate": 9.690384999002595e-06, + "loss": 5.7653, + "step": 142500 + }, + { + "epoch": 12.793985637342908, + "grad_norm": 14.450077056884766, + "learning_rate": 9.690135647316978e-06, + "loss": 5.8861, + "step": 142525 + }, + { + "epoch": 12.796229802513466, + "grad_norm": 21.980024337768555, + "learning_rate": 9.689886295631358e-06, + "loss": 5.8342, + "step": 142550 + }, + { + "epoch": 12.798473967684021, + "grad_norm": 15.687006950378418, + "learning_rate": 9.689636943945742e-06, + "loss": 5.6942, + "step": 142575 + }, + { + "epoch": 12.800718132854579, + "grad_norm": 16.36006736755371, + "learning_rate": 9.689387592260124e-06, + "loss": 5.4893, + "step": 142600 + }, + { + "epoch": 12.802962298025134, + "grad_norm": 14.716980934143066, + "learning_rate": 9.689138240574507e-06, + "loss": 5.7608, + "step": 142625 + }, + { + "epoch": 12.805206463195692, + "grad_norm": 15.73531723022461, + "learning_rate": 9.688888888888889e-06, + "loss": 5.7459, + "step": 142650 + }, + { + "epoch": 12.807450628366247, + "grad_norm": 15.102977752685547, + "learning_rate": 9.688639537203273e-06, + "loss": 5.7592, + "step": 142675 + }, + { + "epoch": 12.809694793536805, + "grad_norm": 15.507452964782715, + "learning_rate": 9.688390185517655e-06, + "loss": 5.8459, + "step": 142700 + }, + { + "epoch": 12.81193895870736, + "grad_norm": 16.50343894958496, + "learning_rate": 9.688140833832038e-06, + "loss": 5.7949, + "step": 142725 + }, + { + "epoch": 12.814183123877918, + "grad_norm": 17.479536056518555, + "learning_rate": 9.68789148214642e-06, + "loss": 5.8255, + "step": 142750 + }, + { + "epoch": 12.816427289048473, + "grad_norm": 12.287793159484863, + "learning_rate": 9.687642130460802e-06, + "loss": 5.8693, + "step": 142775 + }, + { + "epoch": 12.81867145421903, + "grad_norm": 14.643245697021484, + "learning_rate": 9.687392778775185e-06, + "loss": 5.8424, + "step": 142800 + }, + { + "epoch": 12.820915619389588, + "grad_norm": 15.506651878356934, + "learning_rate": 9.687143427089569e-06, + "loss": 5.8224, + "step": 142825 + }, + { + "epoch": 12.823159784560143, + "grad_norm": 14.61227798461914, + "learning_rate": 9.686894075403951e-06, + "loss": 5.6636, + "step": 142850 + }, + { + "epoch": 12.8254039497307, + "grad_norm": 15.96293830871582, + "learning_rate": 9.686644723718333e-06, + "loss": 5.9353, + "step": 142875 + }, + { + "epoch": 12.827648114901256, + "grad_norm": 13.681604385375977, + "learning_rate": 9.686395372032716e-06, + "loss": 5.9304, + "step": 142900 + }, + { + "epoch": 12.829892280071814, + "grad_norm": 16.116010665893555, + "learning_rate": 9.686146020347098e-06, + "loss": 5.7908, + "step": 142925 + }, + { + "epoch": 12.83213644524237, + "grad_norm": 17.137815475463867, + "learning_rate": 9.68589666866148e-06, + "loss": 5.6462, + "step": 142950 + }, + { + "epoch": 12.834380610412927, + "grad_norm": 16.1744441986084, + "learning_rate": 9.685647316975864e-06, + "loss": 5.7035, + "step": 142975 + }, + { + "epoch": 12.836624775583482, + "grad_norm": 21.031757354736328, + "learning_rate": 9.685397965290247e-06, + "loss": 5.823, + "step": 143000 + }, + { + "epoch": 12.83886894075404, + "grad_norm": 15.580668449401855, + "learning_rate": 9.685158587672054e-06, + "loss": 5.7911, + "step": 143025 + }, + { + "epoch": 12.841113105924595, + "grad_norm": 15.164033889770508, + "learning_rate": 9.684909235986436e-06, + "loss": 5.7772, + "step": 143050 + }, + { + "epoch": 12.843357271095153, + "grad_norm": 16.49911880493164, + "learning_rate": 9.684659884300818e-06, + "loss": 5.5971, + "step": 143075 + }, + { + "epoch": 12.84560143626571, + "grad_norm": 19.363229751586914, + "learning_rate": 9.684410532615202e-06, + "loss": 5.6486, + "step": 143100 + }, + { + "epoch": 12.847845601436266, + "grad_norm": 15.81507396697998, + "learning_rate": 9.684161180929585e-06, + "loss": 5.7941, + "step": 143125 + }, + { + "epoch": 12.850089766606823, + "grad_norm": 17.790964126586914, + "learning_rate": 9.683911829243967e-06, + "loss": 5.78, + "step": 143150 + }, + { + "epoch": 12.852333931777379, + "grad_norm": 16.72043800354004, + "learning_rate": 9.68366247755835e-06, + "loss": 5.8456, + "step": 143175 + }, + { + "epoch": 12.854578096947936, + "grad_norm": 15.323291778564453, + "learning_rate": 9.683413125872731e-06, + "loss": 5.8839, + "step": 143200 + }, + { + "epoch": 12.856822262118492, + "grad_norm": 17.12639808654785, + "learning_rate": 9.683163774187114e-06, + "loss": 5.9068, + "step": 143225 + }, + { + "epoch": 12.859066427289049, + "grad_norm": 16.698474884033203, + "learning_rate": 9.682914422501498e-06, + "loss": 5.9126, + "step": 143250 + }, + { + "epoch": 12.861310592459605, + "grad_norm": 14.785530090332031, + "learning_rate": 9.68266507081588e-06, + "loss": 5.4551, + "step": 143275 + }, + { + "epoch": 12.863554757630162, + "grad_norm": 17.440704345703125, + "learning_rate": 9.682415719130262e-06, + "loss": 5.8563, + "step": 143300 + }, + { + "epoch": 12.865798922800717, + "grad_norm": 17.021989822387695, + "learning_rate": 9.682166367444645e-06, + "loss": 5.7901, + "step": 143325 + }, + { + "epoch": 12.868043087971275, + "grad_norm": 15.535030364990234, + "learning_rate": 9.681917015759027e-06, + "loss": 5.5444, + "step": 143350 + }, + { + "epoch": 12.87028725314183, + "grad_norm": 16.653667449951172, + "learning_rate": 9.68166766407341e-06, + "loss": 5.7232, + "step": 143375 + }, + { + "epoch": 12.872531418312388, + "grad_norm": 18.316993713378906, + "learning_rate": 9.681418312387792e-06, + "loss": 5.9911, + "step": 143400 + }, + { + "epoch": 12.874775583482945, + "grad_norm": 15.05341625213623, + "learning_rate": 9.681168960702176e-06, + "loss": 5.9162, + "step": 143425 + }, + { + "epoch": 12.8770197486535, + "grad_norm": 14.03790283203125, + "learning_rate": 9.680919609016558e-06, + "loss": 5.8184, + "step": 143450 + }, + { + "epoch": 12.879263913824058, + "grad_norm": 15.071718215942383, + "learning_rate": 9.68067025733094e-06, + "loss": 5.5646, + "step": 143475 + }, + { + "epoch": 12.881508078994614, + "grad_norm": 18.935482025146484, + "learning_rate": 9.680420905645323e-06, + "loss": 5.7582, + "step": 143500 + }, + { + "epoch": 12.883752244165171, + "grad_norm": 16.40807342529297, + "learning_rate": 9.680171553959705e-06, + "loss": 5.5064, + "step": 143525 + }, + { + "epoch": 12.885996409335727, + "grad_norm": 15.792996406555176, + "learning_rate": 9.679922202274087e-06, + "loss": 5.8263, + "step": 143550 + }, + { + "epoch": 12.888240574506284, + "grad_norm": 18.028345108032227, + "learning_rate": 9.679672850588471e-06, + "loss": 5.8621, + "step": 143575 + }, + { + "epoch": 12.89048473967684, + "grad_norm": 13.5221586227417, + "learning_rate": 9.679423498902854e-06, + "loss": 5.868, + "step": 143600 + }, + { + "epoch": 12.892728904847397, + "grad_norm": 14.191381454467773, + "learning_rate": 9.679174147217236e-06, + "loss": 5.7475, + "step": 143625 + }, + { + "epoch": 12.894973070017953, + "grad_norm": 18.03630256652832, + "learning_rate": 9.678924795531618e-06, + "loss": 5.7424, + "step": 143650 + }, + { + "epoch": 12.89721723518851, + "grad_norm": 15.891719818115234, + "learning_rate": 9.678675443846e-06, + "loss": 5.694, + "step": 143675 + }, + { + "epoch": 12.899461400359066, + "grad_norm": 15.134607315063477, + "learning_rate": 9.678426092160383e-06, + "loss": 5.7772, + "step": 143700 + }, + { + "epoch": 12.901705565529623, + "grad_norm": 16.7781925201416, + "learning_rate": 9.678176740474767e-06, + "loss": 5.7041, + "step": 143725 + }, + { + "epoch": 12.90394973070018, + "grad_norm": 19.21622657775879, + "learning_rate": 9.67792738878915e-06, + "loss": 5.7222, + "step": 143750 + }, + { + "epoch": 12.906193895870736, + "grad_norm": 18.34974479675293, + "learning_rate": 9.677678037103532e-06, + "loss": 5.8886, + "step": 143775 + }, + { + "epoch": 12.908438061041293, + "grad_norm": 15.413179397583008, + "learning_rate": 9.677428685417914e-06, + "loss": 6.005, + "step": 143800 + }, + { + "epoch": 12.910682226211849, + "grad_norm": 12.522608757019043, + "learning_rate": 9.677179333732298e-06, + "loss": 5.6975, + "step": 143825 + }, + { + "epoch": 12.912926391382406, + "grad_norm": 16.829296112060547, + "learning_rate": 9.676929982046678e-06, + "loss": 5.6783, + "step": 143850 + }, + { + "epoch": 12.915170556552962, + "grad_norm": 14.035555839538574, + "learning_rate": 9.676680630361062e-06, + "loss": 5.6741, + "step": 143875 + }, + { + "epoch": 12.91741472172352, + "grad_norm": 14.664019584655762, + "learning_rate": 9.676431278675445e-06, + "loss": 5.6496, + "step": 143900 + }, + { + "epoch": 12.919658886894075, + "grad_norm": 18.074682235717773, + "learning_rate": 9.676181926989827e-06, + "loss": 5.6887, + "step": 143925 + }, + { + "epoch": 12.921903052064632, + "grad_norm": 16.644330978393555, + "learning_rate": 9.67593257530421e-06, + "loss": 5.777, + "step": 143950 + }, + { + "epoch": 12.924147217235188, + "grad_norm": 18.54749870300293, + "learning_rate": 9.675683223618593e-06, + "loss": 5.7229, + "step": 143975 + }, + { + "epoch": 12.926391382405745, + "grad_norm": 13.625261306762695, + "learning_rate": 9.675433871932976e-06, + "loss": 5.7798, + "step": 144000 + }, + { + "epoch": 12.928635547576302, + "grad_norm": 13.388113975524902, + "learning_rate": 9.675184520247358e-06, + "loss": 5.7477, + "step": 144025 + }, + { + "epoch": 12.930879712746858, + "grad_norm": 13.304505348205566, + "learning_rate": 9.67493516856174e-06, + "loss": 5.8867, + "step": 144050 + }, + { + "epoch": 12.933123877917415, + "grad_norm": 13.646252632141113, + "learning_rate": 9.674685816876123e-06, + "loss": 5.6803, + "step": 144075 + }, + { + "epoch": 12.935368043087971, + "grad_norm": 14.93198299407959, + "learning_rate": 9.674436465190505e-06, + "loss": 5.6633, + "step": 144100 + }, + { + "epoch": 12.937612208258528, + "grad_norm": 15.404502868652344, + "learning_rate": 9.674187113504887e-06, + "loss": 5.8463, + "step": 144125 + }, + { + "epoch": 12.939856373429084, + "grad_norm": 17.494138717651367, + "learning_rate": 9.673937761819271e-06, + "loss": 5.6162, + "step": 144150 + }, + { + "epoch": 12.942100538599641, + "grad_norm": 15.19752025604248, + "learning_rate": 9.673688410133654e-06, + "loss": 5.605, + "step": 144175 + }, + { + "epoch": 12.944344703770197, + "grad_norm": 13.780790328979492, + "learning_rate": 9.673439058448036e-06, + "loss": 5.7209, + "step": 144200 + }, + { + "epoch": 12.946588868940754, + "grad_norm": 17.852476119995117, + "learning_rate": 9.673189706762418e-06, + "loss": 5.5461, + "step": 144225 + }, + { + "epoch": 12.94883303411131, + "grad_norm": 15.266759872436523, + "learning_rate": 9.6729403550768e-06, + "loss": 5.743, + "step": 144250 + }, + { + "epoch": 12.951077199281867, + "grad_norm": 13.101790428161621, + "learning_rate": 9.672691003391183e-06, + "loss": 5.6037, + "step": 144275 + }, + { + "epoch": 12.953321364452425, + "grad_norm": 18.164888381958008, + "learning_rate": 9.672441651705567e-06, + "loss": 6.0292, + "step": 144300 + }, + { + "epoch": 12.95556552962298, + "grad_norm": 15.49859619140625, + "learning_rate": 9.67219230001995e-06, + "loss": 5.8148, + "step": 144325 + }, + { + "epoch": 12.957809694793538, + "grad_norm": 14.296930313110352, + "learning_rate": 9.671942948334332e-06, + "loss": 5.4898, + "step": 144350 + }, + { + "epoch": 12.960053859964093, + "grad_norm": 15.575675964355469, + "learning_rate": 9.671693596648714e-06, + "loss": 5.7782, + "step": 144375 + }, + { + "epoch": 12.96229802513465, + "grad_norm": 15.873454093933105, + "learning_rate": 9.671444244963096e-06, + "loss": 5.7621, + "step": 144400 + }, + { + "epoch": 12.964542190305206, + "grad_norm": 13.737582206726074, + "learning_rate": 9.671194893277479e-06, + "loss": 5.6798, + "step": 144425 + }, + { + "epoch": 12.966786355475763, + "grad_norm": 12.965108871459961, + "learning_rate": 9.670945541591863e-06, + "loss": 5.7007, + "step": 144450 + }, + { + "epoch": 12.969030520646319, + "grad_norm": 16.085796356201172, + "learning_rate": 9.670696189906245e-06, + "loss": 5.9604, + "step": 144475 + }, + { + "epoch": 12.971274685816876, + "grad_norm": 13.732763290405273, + "learning_rate": 9.670446838220627e-06, + "loss": 5.5597, + "step": 144500 + }, + { + "epoch": 12.973518850987432, + "grad_norm": 18.106760025024414, + "learning_rate": 9.67019748653501e-06, + "loss": 5.5559, + "step": 144525 + }, + { + "epoch": 12.97576301615799, + "grad_norm": 14.373555183410645, + "learning_rate": 9.669948134849393e-06, + "loss": 5.9162, + "step": 144550 + }, + { + "epoch": 12.978007181328547, + "grad_norm": 16.81574821472168, + "learning_rate": 9.669698783163774e-06, + "loss": 6.0083, + "step": 144575 + }, + { + "epoch": 12.980251346499102, + "grad_norm": 13.484031677246094, + "learning_rate": 9.669449431478158e-06, + "loss": 5.7839, + "step": 144600 + }, + { + "epoch": 12.98249551166966, + "grad_norm": 14.512414932250977, + "learning_rate": 9.66920007979254e-06, + "loss": 5.7958, + "step": 144625 + }, + { + "epoch": 12.984739676840215, + "grad_norm": 17.671977996826172, + "learning_rate": 9.668950728106923e-06, + "loss": 5.9329, + "step": 144650 + }, + { + "epoch": 12.986983842010773, + "grad_norm": 19.066638946533203, + "learning_rate": 9.668701376421305e-06, + "loss": 5.6506, + "step": 144675 + }, + { + "epoch": 12.989228007181328, + "grad_norm": 15.247946739196777, + "learning_rate": 9.668452024735689e-06, + "loss": 5.9267, + "step": 144700 + }, + { + "epoch": 12.991472172351886, + "grad_norm": 15.145872116088867, + "learning_rate": 9.668202673050071e-06, + "loss": 5.7325, + "step": 144725 + }, + { + "epoch": 12.993716337522441, + "grad_norm": 17.67262840270996, + "learning_rate": 9.667953321364452e-06, + "loss": 5.5712, + "step": 144750 + }, + { + "epoch": 12.995960502692999, + "grad_norm": 16.779090881347656, + "learning_rate": 9.667703969678836e-06, + "loss": 5.6303, + "step": 144775 + }, + { + "epoch": 12.998204667863554, + "grad_norm": 15.146683692932129, + "learning_rate": 9.667454617993218e-06, + "loss": 5.7688, + "step": 144800 + }, + { + "epoch": 13.0, + "eval_accuracy": 0.07788364233479082, + "eval_f1_macro": 0.0055951041585835665, + "eval_f1_micro": 0.07788364233479082, + "eval_f1_weighted": 0.04109505208998605, + "eval_loss": 6.954439640045166, + "eval_precision_macro": 0.005136959503217181, + "eval_precision_micro": 0.07788364233479082, + "eval_precision_weighted": 0.03337007711608134, + "eval_recall_macro": 0.010135004065458067, + "eval_recall_micro": 0.07788364233479082, + "eval_recall_weighted": 0.07788364233479082, + "eval_runtime": 128.125, + "eval_samples_per_second": 408.765, + "eval_steps_per_second": 12.777, + "step": 144820 + }, + { + "epoch": 13.000448833034111, + "grad_norm": 18.231826782226562, + "learning_rate": 9.6672052663076e-06, + "loss": 5.7371, + "step": 144825 + }, + { + "epoch": 13.002692998204667, + "grad_norm": 15.793295860290527, + "learning_rate": 9.666955914621983e-06, + "loss": 5.3369, + "step": 144850 + }, + { + "epoch": 13.004937163375224, + "grad_norm": 17.597307205200195, + "learning_rate": 9.666706562936367e-06, + "loss": 5.4195, + "step": 144875 + }, + { + "epoch": 13.007181328545782, + "grad_norm": 12.768475532531738, + "learning_rate": 9.66645721125075e-06, + "loss": 5.6755, + "step": 144900 + }, + { + "epoch": 13.009425493716337, + "grad_norm": 15.05236530303955, + "learning_rate": 9.666207859565132e-06, + "loss": 5.5353, + "step": 144925 + }, + { + "epoch": 13.011669658886895, + "grad_norm": 15.081932067871094, + "learning_rate": 9.665958507879514e-06, + "loss": 5.4726, + "step": 144950 + }, + { + "epoch": 13.01391382405745, + "grad_norm": 14.348904609680176, + "learning_rate": 9.665709156193896e-06, + "loss": 5.4623, + "step": 144975 + }, + { + "epoch": 13.016157989228008, + "grad_norm": 20.055374145507812, + "learning_rate": 9.665459804508279e-06, + "loss": 5.3055, + "step": 145000 + }, + { + "epoch": 13.018402154398563, + "grad_norm": 14.734442710876465, + "learning_rate": 9.665210452822663e-06, + "loss": 5.5533, + "step": 145025 + }, + { + "epoch": 13.02064631956912, + "grad_norm": 15.459388732910156, + "learning_rate": 9.664961101137045e-06, + "loss": 5.5123, + "step": 145050 + }, + { + "epoch": 13.022890484739676, + "grad_norm": 18.471710205078125, + "learning_rate": 9.664711749451427e-06, + "loss": 5.4369, + "step": 145075 + }, + { + "epoch": 13.025134649910234, + "grad_norm": 14.595921516418457, + "learning_rate": 9.66446239776581e-06, + "loss": 5.5004, + "step": 145100 + }, + { + "epoch": 13.02737881508079, + "grad_norm": 14.965012550354004, + "learning_rate": 9.664213046080192e-06, + "loss": 5.3384, + "step": 145125 + }, + { + "epoch": 13.029622980251347, + "grad_norm": 14.35052490234375, + "learning_rate": 9.663963694394574e-06, + "loss": 5.6549, + "step": 145150 + }, + { + "epoch": 13.031867145421902, + "grad_norm": 13.527505874633789, + "learning_rate": 9.663714342708958e-06, + "loss": 5.5034, + "step": 145175 + }, + { + "epoch": 13.03411131059246, + "grad_norm": 17.962316513061523, + "learning_rate": 9.66346499102334e-06, + "loss": 5.4801, + "step": 145200 + }, + { + "epoch": 13.036355475763017, + "grad_norm": 13.505860328674316, + "learning_rate": 9.663215639337723e-06, + "loss": 5.4534, + "step": 145225 + }, + { + "epoch": 13.038599640933572, + "grad_norm": 16.35609245300293, + "learning_rate": 9.662966287652105e-06, + "loss": 5.3502, + "step": 145250 + }, + { + "epoch": 13.04084380610413, + "grad_norm": 15.171685218811035, + "learning_rate": 9.662716935966487e-06, + "loss": 5.3954, + "step": 145275 + }, + { + "epoch": 13.043087971274685, + "grad_norm": 15.980904579162598, + "learning_rate": 9.66246758428087e-06, + "loss": 5.4751, + "step": 145300 + }, + { + "epoch": 13.045332136445243, + "grad_norm": 16.67418098449707, + "learning_rate": 9.662218232595254e-06, + "loss": 5.5155, + "step": 145325 + }, + { + "epoch": 13.047576301615798, + "grad_norm": 14.143266677856445, + "learning_rate": 9.661968880909636e-06, + "loss": 5.3487, + "step": 145350 + }, + { + "epoch": 13.049820466786356, + "grad_norm": 14.833431243896484, + "learning_rate": 9.661719529224018e-06, + "loss": 5.631, + "step": 145375 + }, + { + "epoch": 13.052064631956911, + "grad_norm": 15.713119506835938, + "learning_rate": 9.6614701775384e-06, + "loss": 5.218, + "step": 145400 + }, + { + "epoch": 13.054308797127469, + "grad_norm": 14.440738677978516, + "learning_rate": 9.661220825852785e-06, + "loss": 5.3695, + "step": 145425 + }, + { + "epoch": 13.056552962298024, + "grad_norm": 15.749710083007812, + "learning_rate": 9.660971474167165e-06, + "loss": 5.456, + "step": 145450 + }, + { + "epoch": 13.058797127468582, + "grad_norm": 19.786771774291992, + "learning_rate": 9.660722122481548e-06, + "loss": 5.6024, + "step": 145475 + }, + { + "epoch": 13.061041292639139, + "grad_norm": 16.08129119873047, + "learning_rate": 9.660472770795932e-06, + "loss": 5.5036, + "step": 145500 + }, + { + "epoch": 13.063285457809695, + "grad_norm": 18.057031631469727, + "learning_rate": 9.660223419110314e-06, + "loss": 5.3839, + "step": 145525 + }, + { + "epoch": 13.065529622980252, + "grad_norm": 14.714485168457031, + "learning_rate": 9.659974067424696e-06, + "loss": 5.4404, + "step": 145550 + }, + { + "epoch": 13.067773788150808, + "grad_norm": 14.846445083618164, + "learning_rate": 9.659724715739079e-06, + "loss": 5.5239, + "step": 145575 + }, + { + "epoch": 13.070017953321365, + "grad_norm": 12.310763359069824, + "learning_rate": 9.659475364053463e-06, + "loss": 5.5419, + "step": 145600 + }, + { + "epoch": 13.07226211849192, + "grad_norm": 18.06586265563965, + "learning_rate": 9.659226012367843e-06, + "loss": 5.6004, + "step": 145625 + }, + { + "epoch": 13.074506283662478, + "grad_norm": 15.223028182983398, + "learning_rate": 9.658976660682227e-06, + "loss": 5.6415, + "step": 145650 + }, + { + "epoch": 13.076750448833034, + "grad_norm": 16.006895065307617, + "learning_rate": 9.65872730899661e-06, + "loss": 5.5052, + "step": 145675 + }, + { + "epoch": 13.07899461400359, + "grad_norm": 14.27542781829834, + "learning_rate": 9.658477957310992e-06, + "loss": 5.2916, + "step": 145700 + }, + { + "epoch": 13.081238779174146, + "grad_norm": 12.879544258117676, + "learning_rate": 9.658228605625374e-06, + "loss": 5.5206, + "step": 145725 + }, + { + "epoch": 13.083482944344704, + "grad_norm": 14.858102798461914, + "learning_rate": 9.657979253939758e-06, + "loss": 5.4097, + "step": 145750 + }, + { + "epoch": 13.085727109515261, + "grad_norm": 14.359781265258789, + "learning_rate": 9.65772990225414e-06, + "loss": 5.5165, + "step": 145775 + }, + { + "epoch": 13.087971274685817, + "grad_norm": 11.81078815460205, + "learning_rate": 9.657480550568523e-06, + "loss": 5.6214, + "step": 145800 + }, + { + "epoch": 13.090215439856374, + "grad_norm": 18.87162208557129, + "learning_rate": 9.657231198882905e-06, + "loss": 5.5746, + "step": 145825 + }, + { + "epoch": 13.09245960502693, + "grad_norm": 13.291383743286133, + "learning_rate": 9.656981847197287e-06, + "loss": 5.564, + "step": 145850 + }, + { + "epoch": 13.094703770197487, + "grad_norm": 15.023479461669922, + "learning_rate": 9.65673249551167e-06, + "loss": 5.5445, + "step": 145875 + }, + { + "epoch": 13.096947935368043, + "grad_norm": 18.165000915527344, + "learning_rate": 9.656483143826054e-06, + "loss": 5.4151, + "step": 145900 + }, + { + "epoch": 13.0991921005386, + "grad_norm": 16.949996948242188, + "learning_rate": 9.656233792140436e-06, + "loss": 5.3607, + "step": 145925 + }, + { + "epoch": 13.101436265709156, + "grad_norm": 15.651041030883789, + "learning_rate": 9.655984440454818e-06, + "loss": 5.4111, + "step": 145950 + }, + { + "epoch": 13.103680430879713, + "grad_norm": 15.13811206817627, + "learning_rate": 9.6557350887692e-06, + "loss": 5.3334, + "step": 145975 + }, + { + "epoch": 13.105924596050269, + "grad_norm": 14.583745002746582, + "learning_rate": 9.655485737083583e-06, + "loss": 5.4831, + "step": 146000 + }, + { + "epoch": 13.108168761220826, + "grad_norm": 23.117284774780273, + "learning_rate": 9.655236385397965e-06, + "loss": 5.4296, + "step": 146025 + }, + { + "epoch": 13.110412926391382, + "grad_norm": 16.196800231933594, + "learning_rate": 9.65498703371235e-06, + "loss": 5.0988, + "step": 146050 + }, + { + "epoch": 13.112657091561939, + "grad_norm": 15.906505584716797, + "learning_rate": 9.654737682026732e-06, + "loss": 5.3935, + "step": 146075 + }, + { + "epoch": 13.114901256732496, + "grad_norm": 14.090576171875, + "learning_rate": 9.654488330341114e-06, + "loss": 5.4485, + "step": 146100 + }, + { + "epoch": 13.117145421903052, + "grad_norm": 17.5083065032959, + "learning_rate": 9.654238978655496e-06, + "loss": 5.2433, + "step": 146125 + }, + { + "epoch": 13.11938958707361, + "grad_norm": 16.7033634185791, + "learning_rate": 9.65398962696988e-06, + "loss": 5.3433, + "step": 146150 + }, + { + "epoch": 13.121633752244165, + "grad_norm": 16.249237060546875, + "learning_rate": 9.653740275284261e-06, + "loss": 5.5261, + "step": 146175 + }, + { + "epoch": 13.123877917414722, + "grad_norm": 13.530921936035156, + "learning_rate": 9.653490923598643e-06, + "loss": 5.657, + "step": 146200 + }, + { + "epoch": 13.126122082585278, + "grad_norm": 15.148859977722168, + "learning_rate": 9.653241571913027e-06, + "loss": 5.3625, + "step": 146225 + }, + { + "epoch": 13.128366247755835, + "grad_norm": 14.897529602050781, + "learning_rate": 9.65299222022741e-06, + "loss": 5.3904, + "step": 146250 + }, + { + "epoch": 13.13061041292639, + "grad_norm": 16.55928611755371, + "learning_rate": 9.652742868541792e-06, + "loss": 5.4968, + "step": 146275 + }, + { + "epoch": 13.132854578096948, + "grad_norm": 12.534045219421387, + "learning_rate": 9.652493516856176e-06, + "loss": 5.6474, + "step": 146300 + }, + { + "epoch": 13.135098743267504, + "grad_norm": 16.633716583251953, + "learning_rate": 9.652244165170558e-06, + "loss": 5.5787, + "step": 146325 + }, + { + "epoch": 13.137342908438061, + "grad_norm": 16.04751968383789, + "learning_rate": 9.651994813484939e-06, + "loss": 5.5855, + "step": 146350 + }, + { + "epoch": 13.139587073608618, + "grad_norm": 14.621399879455566, + "learning_rate": 9.651745461799323e-06, + "loss": 5.6001, + "step": 146375 + }, + { + "epoch": 13.141831238779174, + "grad_norm": 15.568477630615234, + "learning_rate": 9.651496110113705e-06, + "loss": 5.5063, + "step": 146400 + }, + { + "epoch": 13.144075403949731, + "grad_norm": 16.78246307373047, + "learning_rate": 9.651246758428088e-06, + "loss": 5.4387, + "step": 146425 + }, + { + "epoch": 13.146319569120287, + "grad_norm": 15.601312637329102, + "learning_rate": 9.651007380809894e-06, + "loss": 5.4797, + "step": 146450 + }, + { + "epoch": 13.148563734290844, + "grad_norm": 14.80150318145752, + "learning_rate": 9.650758029124277e-06, + "loss": 5.41, + "step": 146475 + }, + { + "epoch": 13.1508078994614, + "grad_norm": 16.118614196777344, + "learning_rate": 9.65050867743866e-06, + "loss": 5.4497, + "step": 146500 + }, + { + "epoch": 13.153052064631957, + "grad_norm": 15.911327362060547, + "learning_rate": 9.650259325753043e-06, + "loss": 5.3905, + "step": 146525 + }, + { + "epoch": 13.155296229802513, + "grad_norm": 13.994759559631348, + "learning_rate": 9.650009974067425e-06, + "loss": 5.4481, + "step": 146550 + }, + { + "epoch": 13.15754039497307, + "grad_norm": 16.898496627807617, + "learning_rate": 9.649760622381808e-06, + "loss": 5.4961, + "step": 146575 + }, + { + "epoch": 13.159784560143626, + "grad_norm": 14.925539016723633, + "learning_rate": 9.64951127069619e-06, + "loss": 5.3886, + "step": 146600 + }, + { + "epoch": 13.162028725314183, + "grad_norm": 15.706470489501953, + "learning_rate": 9.649261919010572e-06, + "loss": 5.4551, + "step": 146625 + }, + { + "epoch": 13.164272890484739, + "grad_norm": 14.174884796142578, + "learning_rate": 9.649012567324956e-06, + "loss": 5.6163, + "step": 146650 + }, + { + "epoch": 13.166517055655296, + "grad_norm": 22.547208786010742, + "learning_rate": 9.648763215639339e-06, + "loss": 5.5455, + "step": 146675 + }, + { + "epoch": 13.168761220825854, + "grad_norm": 19.982240676879883, + "learning_rate": 9.648513863953721e-06, + "loss": 5.5616, + "step": 146700 + }, + { + "epoch": 13.17100538599641, + "grad_norm": 18.432525634765625, + "learning_rate": 9.648264512268103e-06, + "loss": 5.4449, + "step": 146725 + }, + { + "epoch": 13.173249551166966, + "grad_norm": 13.927005767822266, + "learning_rate": 9.648015160582487e-06, + "loss": 5.4437, + "step": 146750 + }, + { + "epoch": 13.175493716337522, + "grad_norm": 15.922321319580078, + "learning_rate": 9.647765808896868e-06, + "loss": 5.6245, + "step": 146775 + }, + { + "epoch": 13.17773788150808, + "grad_norm": 15.828764915466309, + "learning_rate": 9.647516457211252e-06, + "loss": 5.5269, + "step": 146800 + }, + { + "epoch": 13.179982046678635, + "grad_norm": 15.654434204101562, + "learning_rate": 9.647267105525634e-06, + "loss": 5.441, + "step": 146825 + }, + { + "epoch": 13.182226211849192, + "grad_norm": 14.56989860534668, + "learning_rate": 9.647017753840017e-06, + "loss": 5.4945, + "step": 146850 + }, + { + "epoch": 13.184470377019748, + "grad_norm": 15.797466278076172, + "learning_rate": 9.646768402154399e-06, + "loss": 5.5293, + "step": 146875 + }, + { + "epoch": 13.186714542190305, + "grad_norm": 19.87085723876953, + "learning_rate": 9.646519050468783e-06, + "loss": 5.2745, + "step": 146900 + }, + { + "epoch": 13.188958707360861, + "grad_norm": 14.680383682250977, + "learning_rate": 9.646269698783165e-06, + "loss": 5.4469, + "step": 146925 + }, + { + "epoch": 13.191202872531418, + "grad_norm": 17.25788688659668, + "learning_rate": 9.646020347097546e-06, + "loss": 5.4597, + "step": 146950 + }, + { + "epoch": 13.193447037701976, + "grad_norm": 18.375015258789062, + "learning_rate": 9.64577099541193e-06, + "loss": 5.4729, + "step": 146975 + }, + { + "epoch": 13.195691202872531, + "grad_norm": 15.275649070739746, + "learning_rate": 9.645521643726312e-06, + "loss": 5.4473, + "step": 147000 + }, + { + "epoch": 13.197935368043089, + "grad_norm": 16.432605743408203, + "learning_rate": 9.645272292040695e-06, + "loss": 5.3835, + "step": 147025 + }, + { + "epoch": 13.200179533213644, + "grad_norm": 15.237411499023438, + "learning_rate": 9.645022940355079e-06, + "loss": 5.4977, + "step": 147050 + }, + { + "epoch": 13.202423698384202, + "grad_norm": 17.344728469848633, + "learning_rate": 9.64477358866946e-06, + "loss": 5.6213, + "step": 147075 + }, + { + "epoch": 13.204667863554757, + "grad_norm": 18.0125789642334, + "learning_rate": 9.644524236983843e-06, + "loss": 5.4226, + "step": 147100 + }, + { + "epoch": 13.206912028725315, + "grad_norm": 16.570682525634766, + "learning_rate": 9.644274885298225e-06, + "loss": 5.4935, + "step": 147125 + }, + { + "epoch": 13.20915619389587, + "grad_norm": 15.612373352050781, + "learning_rate": 9.644025533612608e-06, + "loss": 5.5238, + "step": 147150 + }, + { + "epoch": 13.211400359066428, + "grad_norm": 15.526009559631348, + "learning_rate": 9.64377618192699e-06, + "loss": 5.36, + "step": 147175 + }, + { + "epoch": 13.213644524236983, + "grad_norm": 16.30401611328125, + "learning_rate": 9.643526830241372e-06, + "loss": 5.5085, + "step": 147200 + }, + { + "epoch": 13.21588868940754, + "grad_norm": 15.855030059814453, + "learning_rate": 9.643277478555756e-06, + "loss": 5.618, + "step": 147225 + }, + { + "epoch": 13.218132854578098, + "grad_norm": 16.846342086791992, + "learning_rate": 9.643028126870139e-06, + "loss": 5.4785, + "step": 147250 + }, + { + "epoch": 13.220377019748653, + "grad_norm": 16.05097770690918, + "learning_rate": 9.642778775184521e-06, + "loss": 5.5132, + "step": 147275 + }, + { + "epoch": 13.22262118491921, + "grad_norm": 14.975716590881348, + "learning_rate": 9.642529423498903e-06, + "loss": 5.5109, + "step": 147300 + }, + { + "epoch": 13.224865350089766, + "grad_norm": 16.114887237548828, + "learning_rate": 9.642280071813286e-06, + "loss": 5.5207, + "step": 147325 + }, + { + "epoch": 13.227109515260324, + "grad_norm": 14.992668151855469, + "learning_rate": 9.642030720127668e-06, + "loss": 5.0959, + "step": 147350 + }, + { + "epoch": 13.22935368043088, + "grad_norm": 17.85599708557129, + "learning_rate": 9.641781368442052e-06, + "loss": 5.7022, + "step": 147375 + }, + { + "epoch": 13.231597845601437, + "grad_norm": 14.51505184173584, + "learning_rate": 9.641532016756434e-06, + "loss": 5.2449, + "step": 147400 + }, + { + "epoch": 13.233842010771992, + "grad_norm": 13.528204917907715, + "learning_rate": 9.641282665070817e-06, + "loss": 5.3599, + "step": 147425 + }, + { + "epoch": 13.23608617594255, + "grad_norm": 14.565969467163086, + "learning_rate": 9.641033313385199e-06, + "loss": 5.401, + "step": 147450 + }, + { + "epoch": 13.238330341113105, + "grad_norm": 16.65776252746582, + "learning_rate": 9.640783961699583e-06, + "loss": 5.5358, + "step": 147475 + }, + { + "epoch": 13.240574506283663, + "grad_norm": 17.09721565246582, + "learning_rate": 9.640534610013964e-06, + "loss": 5.393, + "step": 147500 + }, + { + "epoch": 13.242818671454218, + "grad_norm": 18.270320892333984, + "learning_rate": 9.640285258328348e-06, + "loss": 5.5966, + "step": 147525 + }, + { + "epoch": 13.245062836624776, + "grad_norm": 13.89951229095459, + "learning_rate": 9.64003590664273e-06, + "loss": 5.3564, + "step": 147550 + }, + { + "epoch": 13.247307001795333, + "grad_norm": 16.090715408325195, + "learning_rate": 9.639786554957112e-06, + "loss": 5.6104, + "step": 147575 + }, + { + "epoch": 13.249551166965889, + "grad_norm": 14.783391952514648, + "learning_rate": 9.639537203271495e-06, + "loss": 5.5089, + "step": 147600 + }, + { + "epoch": 13.251795332136446, + "grad_norm": 18.60951042175293, + "learning_rate": 9.639287851585879e-06, + "loss": 5.4326, + "step": 147625 + }, + { + "epoch": 13.254039497307001, + "grad_norm": 16.04306983947754, + "learning_rate": 9.639038499900261e-06, + "loss": 5.4814, + "step": 147650 + }, + { + "epoch": 13.256283662477559, + "grad_norm": 17.212804794311523, + "learning_rate": 9.638789148214641e-06, + "loss": 5.7155, + "step": 147675 + }, + { + "epoch": 13.258527827648114, + "grad_norm": 16.49783706665039, + "learning_rate": 9.638539796529026e-06, + "loss": 5.4529, + "step": 147700 + }, + { + "epoch": 13.260771992818672, + "grad_norm": 15.732194900512695, + "learning_rate": 9.638290444843408e-06, + "loss": 5.5747, + "step": 147725 + }, + { + "epoch": 13.263016157989227, + "grad_norm": 18.35817527770996, + "learning_rate": 9.63804109315779e-06, + "loss": 5.5457, + "step": 147750 + }, + { + "epoch": 13.265260323159785, + "grad_norm": 16.77217674255371, + "learning_rate": 9.637791741472174e-06, + "loss": 5.6703, + "step": 147775 + }, + { + "epoch": 13.26750448833034, + "grad_norm": 16.804336547851562, + "learning_rate": 9.637542389786556e-06, + "loss": 5.5986, + "step": 147800 + }, + { + "epoch": 13.269748653500898, + "grad_norm": 15.095252990722656, + "learning_rate": 9.637293038100939e-06, + "loss": 5.7636, + "step": 147825 + }, + { + "epoch": 13.271992818671453, + "grad_norm": 14.808623313903809, + "learning_rate": 9.637043686415321e-06, + "loss": 5.5157, + "step": 147850 + }, + { + "epoch": 13.27423698384201, + "grad_norm": 14.917145729064941, + "learning_rate": 9.636794334729703e-06, + "loss": 5.4665, + "step": 147875 + }, + { + "epoch": 13.276481149012568, + "grad_norm": 14.901209831237793, + "learning_rate": 9.636544983044086e-06, + "loss": 5.6339, + "step": 147900 + }, + { + "epoch": 13.278725314183124, + "grad_norm": 13.604937553405762, + "learning_rate": 9.636295631358468e-06, + "loss": 5.4845, + "step": 147925 + }, + { + "epoch": 13.280969479353681, + "grad_norm": 13.590177536010742, + "learning_rate": 9.636046279672852e-06, + "loss": 5.4466, + "step": 147950 + }, + { + "epoch": 13.283213644524237, + "grad_norm": 16.74699592590332, + "learning_rate": 9.635796927987234e-06, + "loss": 5.7419, + "step": 147975 + }, + { + "epoch": 13.285457809694794, + "grad_norm": 15.241861343383789, + "learning_rate": 9.635547576301617e-06, + "loss": 5.6814, + "step": 148000 + }, + { + "epoch": 13.28770197486535, + "grad_norm": 14.222386360168457, + "learning_rate": 9.635298224615999e-06, + "loss": 5.4367, + "step": 148025 + }, + { + "epoch": 13.289946140035907, + "grad_norm": 18.11845588684082, + "learning_rate": 9.635048872930381e-06, + "loss": 5.5398, + "step": 148050 + }, + { + "epoch": 13.292190305206462, + "grad_norm": 15.607288360595703, + "learning_rate": 9.634799521244764e-06, + "loss": 5.6776, + "step": 148075 + }, + { + "epoch": 13.29443447037702, + "grad_norm": 18.583467483520508, + "learning_rate": 9.634550169559148e-06, + "loss": 5.5688, + "step": 148100 + }, + { + "epoch": 13.296678635547575, + "grad_norm": 16.80494499206543, + "learning_rate": 9.63430081787353e-06, + "loss": 5.575, + "step": 148125 + }, + { + "epoch": 13.298922800718133, + "grad_norm": 18.84225845336914, + "learning_rate": 9.634051466187912e-06, + "loss": 5.4476, + "step": 148150 + }, + { + "epoch": 13.30116696588869, + "grad_norm": 17.969282150268555, + "learning_rate": 9.633802114502295e-06, + "loss": 5.4793, + "step": 148175 + }, + { + "epoch": 13.303411131059246, + "grad_norm": 16.36315155029297, + "learning_rate": 9.633552762816677e-06, + "loss": 5.3751, + "step": 148200 + }, + { + "epoch": 13.305655296229803, + "grad_norm": 15.374359130859375, + "learning_rate": 9.63330341113106e-06, + "loss": 5.512, + "step": 148225 + }, + { + "epoch": 13.307899461400359, + "grad_norm": 16.466617584228516, + "learning_rate": 9.633054059445443e-06, + "loss": 5.3136, + "step": 148250 + }, + { + "epoch": 13.310143626570916, + "grad_norm": 19.335994720458984, + "learning_rate": 9.632804707759826e-06, + "loss": 5.4395, + "step": 148275 + }, + { + "epoch": 13.312387791741472, + "grad_norm": 15.52336597442627, + "learning_rate": 9.632555356074208e-06, + "loss": 5.4318, + "step": 148300 + }, + { + "epoch": 13.314631956912029, + "grad_norm": 18.83104705810547, + "learning_rate": 9.63230600438859e-06, + "loss": 5.4986, + "step": 148325 + }, + { + "epoch": 13.316876122082585, + "grad_norm": 15.552826881408691, + "learning_rate": 9.632056652702974e-06, + "loss": 5.5161, + "step": 148350 + }, + { + "epoch": 13.319120287253142, + "grad_norm": 15.689159393310547, + "learning_rate": 9.631807301017355e-06, + "loss": 5.5556, + "step": 148375 + }, + { + "epoch": 13.321364452423698, + "grad_norm": 16.357587814331055, + "learning_rate": 9.631557949331737e-06, + "loss": 5.2513, + "step": 148400 + }, + { + "epoch": 13.323608617594255, + "grad_norm": 15.378134727478027, + "learning_rate": 9.631308597646121e-06, + "loss": 5.6983, + "step": 148425 + }, + { + "epoch": 13.325852782764812, + "grad_norm": 16.004358291625977, + "learning_rate": 9.631059245960503e-06, + "loss": 5.5304, + "step": 148450 + }, + { + "epoch": 13.328096947935368, + "grad_norm": 12.83535099029541, + "learning_rate": 9.630809894274886e-06, + "loss": 5.508, + "step": 148475 + }, + { + "epoch": 13.330341113105925, + "grad_norm": 15.281318664550781, + "learning_rate": 9.63056054258927e-06, + "loss": 5.4138, + "step": 148500 + }, + { + "epoch": 13.33258527827648, + "grad_norm": 16.002849578857422, + "learning_rate": 9.630311190903652e-06, + "loss": 5.7175, + "step": 148525 + }, + { + "epoch": 13.334829443447038, + "grad_norm": 14.343585968017578, + "learning_rate": 9.630061839218033e-06, + "loss": 5.4541, + "step": 148550 + }, + { + "epoch": 13.337073608617594, + "grad_norm": 18.30499267578125, + "learning_rate": 9.629812487532417e-06, + "loss": 5.6047, + "step": 148575 + }, + { + "epoch": 13.339317773788151, + "grad_norm": 19.122909545898438, + "learning_rate": 9.629563135846799e-06, + "loss": 5.4481, + "step": 148600 + }, + { + "epoch": 13.341561938958707, + "grad_norm": 15.703609466552734, + "learning_rate": 9.629313784161181e-06, + "loss": 5.8556, + "step": 148625 + }, + { + "epoch": 13.343806104129264, + "grad_norm": 14.119596481323242, + "learning_rate": 9.629064432475564e-06, + "loss": 5.6451, + "step": 148650 + }, + { + "epoch": 13.34605026929982, + "grad_norm": 20.03814125061035, + "learning_rate": 9.628815080789948e-06, + "loss": 5.4021, + "step": 148675 + }, + { + "epoch": 13.348294434470377, + "grad_norm": 16.260644912719727, + "learning_rate": 9.62856572910433e-06, + "loss": 5.3834, + "step": 148700 + }, + { + "epoch": 13.350538599640934, + "grad_norm": 15.582694053649902, + "learning_rate": 9.628316377418712e-06, + "loss": 5.5814, + "step": 148725 + }, + { + "epoch": 13.35278276481149, + "grad_norm": 18.95960235595703, + "learning_rate": 9.628067025733095e-06, + "loss": 5.6735, + "step": 148750 + }, + { + "epoch": 13.355026929982047, + "grad_norm": 17.226388931274414, + "learning_rate": 9.627817674047477e-06, + "loss": 5.6464, + "step": 148775 + }, + { + "epoch": 13.357271095152603, + "grad_norm": 15.002761840820312, + "learning_rate": 9.62756832236186e-06, + "loss": 5.546, + "step": 148800 + }, + { + "epoch": 13.35951526032316, + "grad_norm": 14.528529167175293, + "learning_rate": 9.627328944743666e-06, + "loss": 5.5039, + "step": 148825 + }, + { + "epoch": 13.361759425493716, + "grad_norm": 16.58690071105957, + "learning_rate": 9.62707959305805e-06, + "loss": 5.604, + "step": 148850 + }, + { + "epoch": 13.364003590664273, + "grad_norm": 17.054683685302734, + "learning_rate": 9.626830241372433e-06, + "loss": 5.7418, + "step": 148875 + }, + { + "epoch": 13.366247755834829, + "grad_norm": 19.84734344482422, + "learning_rate": 9.626580889686815e-06, + "loss": 5.6458, + "step": 148900 + }, + { + "epoch": 13.368491921005386, + "grad_norm": 15.38659954071045, + "learning_rate": 9.626331538001197e-06, + "loss": 5.8949, + "step": 148925 + }, + { + "epoch": 13.370736086175942, + "grad_norm": 19.852710723876953, + "learning_rate": 9.626082186315581e-06, + "loss": 5.475, + "step": 148950 + }, + { + "epoch": 13.3729802513465, + "grad_norm": 16.649383544921875, + "learning_rate": 9.625832834629964e-06, + "loss": 5.6491, + "step": 148975 + }, + { + "epoch": 13.375224416517055, + "grad_norm": 15.158028602600098, + "learning_rate": 9.625583482944346e-06, + "loss": 5.5308, + "step": 149000 + }, + { + "epoch": 13.377468581687612, + "grad_norm": 14.895779609680176, + "learning_rate": 9.625334131258728e-06, + "loss": 5.4717, + "step": 149025 + }, + { + "epoch": 13.37971274685817, + "grad_norm": 15.487646102905273, + "learning_rate": 9.62508477957311e-06, + "loss": 5.7692, + "step": 149050 + }, + { + "epoch": 13.381956912028725, + "grad_norm": 14.067713737487793, + "learning_rate": 9.624835427887493e-06, + "loss": 5.4349, + "step": 149075 + }, + { + "epoch": 13.384201077199283, + "grad_norm": 20.407201766967773, + "learning_rate": 9.624586076201877e-06, + "loss": 5.5395, + "step": 149100 + }, + { + "epoch": 13.386445242369838, + "grad_norm": 17.46686553955078, + "learning_rate": 9.624336724516259e-06, + "loss": 5.3869, + "step": 149125 + }, + { + "epoch": 13.388689407540395, + "grad_norm": 14.52600383758545, + "learning_rate": 9.624087372830641e-06, + "loss": 5.543, + "step": 149150 + }, + { + "epoch": 13.390933572710951, + "grad_norm": 15.914323806762695, + "learning_rate": 9.623838021145024e-06, + "loss": 5.662, + "step": 149175 + }, + { + "epoch": 13.393177737881508, + "grad_norm": 14.56016731262207, + "learning_rate": 9.623588669459406e-06, + "loss": 5.7623, + "step": 149200 + }, + { + "epoch": 13.395421903052064, + "grad_norm": 13.763602256774902, + "learning_rate": 9.623339317773788e-06, + "loss": 5.3713, + "step": 149225 + }, + { + "epoch": 13.397666068222621, + "grad_norm": 18.16983985900879, + "learning_rate": 9.623089966088172e-06, + "loss": 5.3376, + "step": 149250 + }, + { + "epoch": 13.399910233393177, + "grad_norm": 17.237436294555664, + "learning_rate": 9.622840614402555e-06, + "loss": 5.56, + "step": 149275 + }, + { + "epoch": 13.402154398563734, + "grad_norm": 11.771121978759766, + "learning_rate": 9.622591262716937e-06, + "loss": 5.5909, + "step": 149300 + }, + { + "epoch": 13.40439856373429, + "grad_norm": 15.98208999633789, + "learning_rate": 9.62234191103132e-06, + "loss": 5.3012, + "step": 149325 + }, + { + "epoch": 13.406642728904847, + "grad_norm": 14.631118774414062, + "learning_rate": 9.622092559345702e-06, + "loss": 5.4527, + "step": 149350 + }, + { + "epoch": 13.408886894075405, + "grad_norm": 15.705517768859863, + "learning_rate": 9.621843207660084e-06, + "loss": 5.6037, + "step": 149375 + }, + { + "epoch": 13.41113105924596, + "grad_norm": 13.14966106414795, + "learning_rate": 9.621593855974466e-06, + "loss": 5.5125, + "step": 149400 + }, + { + "epoch": 13.413375224416518, + "grad_norm": 15.244400978088379, + "learning_rate": 9.62134450428885e-06, + "loss": 5.6226, + "step": 149425 + }, + { + "epoch": 13.415619389587073, + "grad_norm": 16.860734939575195, + "learning_rate": 9.621095152603233e-06, + "loss": 5.4667, + "step": 149450 + }, + { + "epoch": 13.41786355475763, + "grad_norm": 15.110878944396973, + "learning_rate": 9.620845800917615e-06, + "loss": 5.4635, + "step": 149475 + }, + { + "epoch": 13.420107719928186, + "grad_norm": 21.834131240844727, + "learning_rate": 9.620596449231999e-06, + "loss": 5.3708, + "step": 149500 + }, + { + "epoch": 13.422351885098744, + "grad_norm": 16.543611526489258, + "learning_rate": 9.62034709754638e-06, + "loss": 5.402, + "step": 149525 + }, + { + "epoch": 13.4245960502693, + "grad_norm": 16.8908748626709, + "learning_rate": 9.620097745860762e-06, + "loss": 5.77, + "step": 149550 + }, + { + "epoch": 13.426840215439857, + "grad_norm": 16.543777465820312, + "learning_rate": 9.619848394175146e-06, + "loss": 5.2662, + "step": 149575 + }, + { + "epoch": 13.429084380610412, + "grad_norm": 13.482545852661133, + "learning_rate": 9.619599042489528e-06, + "loss": 5.718, + "step": 149600 + }, + { + "epoch": 13.43132854578097, + "grad_norm": 14.670124053955078, + "learning_rate": 9.61934969080391e-06, + "loss": 5.6215, + "step": 149625 + }, + { + "epoch": 13.433572710951527, + "grad_norm": 13.739845275878906, + "learning_rate": 9.619100339118293e-06, + "loss": 5.7465, + "step": 149650 + }, + { + "epoch": 13.435816876122082, + "grad_norm": 16.7188663482666, + "learning_rate": 9.618850987432677e-06, + "loss": 5.3897, + "step": 149675 + }, + { + "epoch": 13.43806104129264, + "grad_norm": 14.88851261138916, + "learning_rate": 9.618601635747057e-06, + "loss": 5.3676, + "step": 149700 + }, + { + "epoch": 13.440305206463195, + "grad_norm": 13.626384735107422, + "learning_rate": 9.618352284061441e-06, + "loss": 5.4703, + "step": 149725 + }, + { + "epoch": 13.442549371633753, + "grad_norm": 15.13778018951416, + "learning_rate": 9.618102932375824e-06, + "loss": 5.4934, + "step": 149750 + }, + { + "epoch": 13.444793536804308, + "grad_norm": 20.517763137817383, + "learning_rate": 9.617853580690206e-06, + "loss": 5.3516, + "step": 149775 + }, + { + "epoch": 13.447037701974866, + "grad_norm": 16.660736083984375, + "learning_rate": 9.617604229004588e-06, + "loss": 5.7043, + "step": 149800 + }, + { + "epoch": 13.449281867145421, + "grad_norm": 14.873493194580078, + "learning_rate": 9.617354877318972e-06, + "loss": 5.4944, + "step": 149825 + }, + { + "epoch": 13.451526032315979, + "grad_norm": 15.337474822998047, + "learning_rate": 9.617105525633355e-06, + "loss": 5.4722, + "step": 149850 + }, + { + "epoch": 13.453770197486534, + "grad_norm": 13.954229354858398, + "learning_rate": 9.616856173947735e-06, + "loss": 5.5602, + "step": 149875 + }, + { + "epoch": 13.456014362657092, + "grad_norm": 16.372013092041016, + "learning_rate": 9.61660682226212e-06, + "loss": 5.3297, + "step": 149900 + }, + { + "epoch": 13.458258527827649, + "grad_norm": 16.04224967956543, + "learning_rate": 9.616357470576502e-06, + "loss": 5.2821, + "step": 149925 + }, + { + "epoch": 13.460502692998205, + "grad_norm": 16.85519027709961, + "learning_rate": 9.616108118890884e-06, + "loss": 5.3109, + "step": 149950 + }, + { + "epoch": 13.462746858168762, + "grad_norm": 16.50066566467285, + "learning_rate": 9.615858767205268e-06, + "loss": 5.8438, + "step": 149975 + }, + { + "epoch": 13.464991023339318, + "grad_norm": 14.056427955627441, + "learning_rate": 9.61560941551965e-06, + "loss": 5.1796, + "step": 150000 + }, + { + "epoch": 13.467235188509875, + "grad_norm": 15.412620544433594, + "learning_rate": 9.615360063834033e-06, + "loss": 5.4284, + "step": 150025 + }, + { + "epoch": 13.46947935368043, + "grad_norm": 13.953865051269531, + "learning_rate": 9.615110712148415e-06, + "loss": 5.4138, + "step": 150050 + }, + { + "epoch": 13.471723518850988, + "grad_norm": 15.346589088439941, + "learning_rate": 9.614861360462797e-06, + "loss": 5.5642, + "step": 150075 + }, + { + "epoch": 13.473967684021543, + "grad_norm": 15.249682426452637, + "learning_rate": 9.61461200877718e-06, + "loss": 5.474, + "step": 150100 + }, + { + "epoch": 13.4762118491921, + "grad_norm": 17.74197769165039, + "learning_rate": 9.614362657091562e-06, + "loss": 5.5612, + "step": 150125 + }, + { + "epoch": 13.478456014362656, + "grad_norm": 15.279068946838379, + "learning_rate": 9.614113305405946e-06, + "loss": 5.6055, + "step": 150150 + }, + { + "epoch": 13.480700179533214, + "grad_norm": 14.993330001831055, + "learning_rate": 9.613863953720328e-06, + "loss": 5.7324, + "step": 150175 + }, + { + "epoch": 13.48294434470377, + "grad_norm": 14.351774215698242, + "learning_rate": 9.61361460203471e-06, + "loss": 5.3661, + "step": 150200 + }, + { + "epoch": 13.485188509874327, + "grad_norm": 14.315024375915527, + "learning_rate": 9.613365250349093e-06, + "loss": 5.5387, + "step": 150225 + }, + { + "epoch": 13.487432675044884, + "grad_norm": 14.305253028869629, + "learning_rate": 9.613115898663475e-06, + "loss": 5.3233, + "step": 150250 + }, + { + "epoch": 13.48967684021544, + "grad_norm": 15.829625129699707, + "learning_rate": 9.612866546977857e-06, + "loss": 5.1167, + "step": 150275 + }, + { + "epoch": 13.491921005385997, + "grad_norm": 13.267358779907227, + "learning_rate": 9.612617195292242e-06, + "loss": 5.49, + "step": 150300 + }, + { + "epoch": 13.494165170556553, + "grad_norm": 17.121196746826172, + "learning_rate": 9.612367843606624e-06, + "loss": 5.5634, + "step": 150325 + }, + { + "epoch": 13.49640933572711, + "grad_norm": 14.200715065002441, + "learning_rate": 9.612118491921006e-06, + "loss": 5.5089, + "step": 150350 + }, + { + "epoch": 13.498653500897666, + "grad_norm": 16.63199806213379, + "learning_rate": 9.611869140235388e-06, + "loss": 5.6738, + "step": 150375 + }, + { + "epoch": 13.500897666068223, + "grad_norm": 14.408029556274414, + "learning_rate": 9.61161978854977e-06, + "loss": 5.3308, + "step": 150400 + }, + { + "epoch": 13.503141831238779, + "grad_norm": 19.10359764099121, + "learning_rate": 9.611370436864153e-06, + "loss": 5.3505, + "step": 150425 + }, + { + "epoch": 13.505385996409336, + "grad_norm": 18.029834747314453, + "learning_rate": 9.611121085178537e-06, + "loss": 5.3906, + "step": 150450 + }, + { + "epoch": 13.507630161579891, + "grad_norm": 16.874252319335938, + "learning_rate": 9.61087173349292e-06, + "loss": 5.4002, + "step": 150475 + }, + { + "epoch": 13.509874326750449, + "grad_norm": 14.376928329467773, + "learning_rate": 9.610622381807302e-06, + "loss": 5.4488, + "step": 150500 + }, + { + "epoch": 13.512118491921004, + "grad_norm": 18.941558837890625, + "learning_rate": 9.610373030121684e-06, + "loss": 5.6315, + "step": 150525 + }, + { + "epoch": 13.514362657091562, + "grad_norm": 15.894304275512695, + "learning_rate": 9.610123678436068e-06, + "loss": 5.4833, + "step": 150550 + }, + { + "epoch": 13.51660682226212, + "grad_norm": 16.56924819946289, + "learning_rate": 9.609874326750449e-06, + "loss": 5.291, + "step": 150575 + }, + { + "epoch": 13.518850987432675, + "grad_norm": 15.179162979125977, + "learning_rate": 9.609624975064831e-06, + "loss": 5.4682, + "step": 150600 + }, + { + "epoch": 13.521095152603232, + "grad_norm": 14.844849586486816, + "learning_rate": 9.609375623379215e-06, + "loss": 5.5074, + "step": 150625 + }, + { + "epoch": 13.523339317773788, + "grad_norm": 16.836650848388672, + "learning_rate": 9.609126271693597e-06, + "loss": 5.5038, + "step": 150650 + }, + { + "epoch": 13.525583482944345, + "grad_norm": 16.258453369140625, + "learning_rate": 9.60887692000798e-06, + "loss": 5.5358, + "step": 150675 + }, + { + "epoch": 13.5278276481149, + "grad_norm": 16.263093948364258, + "learning_rate": 9.608627568322364e-06, + "loss": 5.5109, + "step": 150700 + }, + { + "epoch": 13.530071813285458, + "grad_norm": 15.737496376037598, + "learning_rate": 9.608378216636746e-06, + "loss": 5.5329, + "step": 150725 + }, + { + "epoch": 13.532315978456014, + "grad_norm": 14.089518547058105, + "learning_rate": 9.608128864951128e-06, + "loss": 5.5358, + "step": 150750 + }, + { + "epoch": 13.534560143626571, + "grad_norm": 14.662657737731934, + "learning_rate": 9.60787951326551e-06, + "loss": 5.5294, + "step": 150775 + }, + { + "epoch": 13.536804308797127, + "grad_norm": 17.088605880737305, + "learning_rate": 9.607630161579893e-06, + "loss": 5.6299, + "step": 150800 + }, + { + "epoch": 13.539048473967684, + "grad_norm": 15.889634132385254, + "learning_rate": 9.607380809894275e-06, + "loss": 5.4746, + "step": 150825 + }, + { + "epoch": 13.541292639138241, + "grad_norm": 18.3951358795166, + "learning_rate": 9.607131458208658e-06, + "loss": 5.4741, + "step": 150850 + }, + { + "epoch": 13.543536804308797, + "grad_norm": 15.961281776428223, + "learning_rate": 9.606882106523042e-06, + "loss": 5.6935, + "step": 150875 + }, + { + "epoch": 13.545780969479354, + "grad_norm": 15.102862358093262, + "learning_rate": 9.606632754837424e-06, + "loss": 5.4953, + "step": 150900 + }, + { + "epoch": 13.54802513464991, + "grad_norm": 16.98381805419922, + "learning_rate": 9.606383403151806e-06, + "loss": 5.3914, + "step": 150925 + }, + { + "epoch": 13.550269299820467, + "grad_norm": 13.73013687133789, + "learning_rate": 9.606144025533613e-06, + "loss": 5.7316, + "step": 150950 + }, + { + "epoch": 13.552513464991023, + "grad_norm": 17.095537185668945, + "learning_rate": 9.605894673847997e-06, + "loss": 5.5648, + "step": 150975 + }, + { + "epoch": 13.55475763016158, + "grad_norm": 16.49323081970215, + "learning_rate": 9.60564532216238e-06, + "loss": 5.3716, + "step": 151000 + }, + { + "epoch": 13.557001795332136, + "grad_norm": 18.961179733276367, + "learning_rate": 9.60539597047676e-06, + "loss": 5.6222, + "step": 151025 + }, + { + "epoch": 13.559245960502693, + "grad_norm": 19.0511531829834, + "learning_rate": 9.605146618791144e-06, + "loss": 5.4722, + "step": 151050 + }, + { + "epoch": 13.561490125673249, + "grad_norm": 15.309853553771973, + "learning_rate": 9.604897267105526e-06, + "loss": 5.7061, + "step": 151075 + }, + { + "epoch": 13.563734290843806, + "grad_norm": 17.058563232421875, + "learning_rate": 9.604647915419909e-06, + "loss": 5.43, + "step": 151100 + }, + { + "epoch": 13.565978456014363, + "grad_norm": 15.830438613891602, + "learning_rate": 9.604398563734291e-06, + "loss": 5.4404, + "step": 151125 + }, + { + "epoch": 13.568222621184919, + "grad_norm": 15.207098007202148, + "learning_rate": 9.604149212048675e-06, + "loss": 5.7742, + "step": 151150 + }, + { + "epoch": 13.570466786355476, + "grad_norm": 14.497210502624512, + "learning_rate": 9.603899860363057e-06, + "loss": 5.5971, + "step": 151175 + }, + { + "epoch": 13.572710951526032, + "grad_norm": 17.32406234741211, + "learning_rate": 9.60365050867744e-06, + "loss": 5.5876, + "step": 151200 + }, + { + "epoch": 13.57495511669659, + "grad_norm": 12.646613121032715, + "learning_rate": 9.603401156991822e-06, + "loss": 5.5798, + "step": 151225 + }, + { + "epoch": 13.577199281867145, + "grad_norm": 16.36904525756836, + "learning_rate": 9.603151805306204e-06, + "loss": 5.3128, + "step": 151250 + }, + { + "epoch": 13.579443447037702, + "grad_norm": 15.169769287109375, + "learning_rate": 9.602902453620587e-06, + "loss": 5.4107, + "step": 151275 + }, + { + "epoch": 13.581687612208258, + "grad_norm": 13.089876174926758, + "learning_rate": 9.60265310193497e-06, + "loss": 5.6423, + "step": 151300 + }, + { + "epoch": 13.583931777378815, + "grad_norm": 14.196114540100098, + "learning_rate": 9.602403750249353e-06, + "loss": 5.4357, + "step": 151325 + }, + { + "epoch": 13.58617594254937, + "grad_norm": 14.649672508239746, + "learning_rate": 9.602154398563735e-06, + "loss": 5.5489, + "step": 151350 + }, + { + "epoch": 13.588420107719928, + "grad_norm": 18.87257194519043, + "learning_rate": 9.601905046878118e-06, + "loss": 5.5, + "step": 151375 + }, + { + "epoch": 13.590664272890486, + "grad_norm": 15.42525577545166, + "learning_rate": 9.6016556951925e-06, + "loss": 5.5545, + "step": 151400 + }, + { + "epoch": 13.592908438061041, + "grad_norm": 16.942808151245117, + "learning_rate": 9.601406343506882e-06, + "loss": 5.4586, + "step": 151425 + }, + { + "epoch": 13.595152603231599, + "grad_norm": 16.647531509399414, + "learning_rate": 9.601156991821266e-06, + "loss": 5.3245, + "step": 151450 + }, + { + "epoch": 13.597396768402154, + "grad_norm": 14.173992156982422, + "learning_rate": 9.600907640135649e-06, + "loss": 5.711, + "step": 151475 + }, + { + "epoch": 13.599640933572712, + "grad_norm": 16.461742401123047, + "learning_rate": 9.600658288450031e-06, + "loss": 5.2751, + "step": 151500 + }, + { + "epoch": 13.601885098743267, + "grad_norm": 17.876510620117188, + "learning_rate": 9.600408936764413e-06, + "loss": 5.6849, + "step": 151525 + }, + { + "epoch": 13.604129263913824, + "grad_norm": 16.344606399536133, + "learning_rate": 9.600159585078795e-06, + "loss": 5.3743, + "step": 151550 + }, + { + "epoch": 13.60637342908438, + "grad_norm": 15.676910400390625, + "learning_rate": 9.599910233393178e-06, + "loss": 5.5657, + "step": 151575 + }, + { + "epoch": 13.608617594254937, + "grad_norm": 14.981542587280273, + "learning_rate": 9.59966088170756e-06, + "loss": 5.6011, + "step": 151600 + }, + { + "epoch": 13.610861759425493, + "grad_norm": 16.574251174926758, + "learning_rate": 9.599411530021944e-06, + "loss": 5.5218, + "step": 151625 + }, + { + "epoch": 13.61310592459605, + "grad_norm": 13.53555679321289, + "learning_rate": 9.599162178336326e-06, + "loss": 5.6555, + "step": 151650 + }, + { + "epoch": 13.615350089766606, + "grad_norm": 16.615676879882812, + "learning_rate": 9.598912826650709e-06, + "loss": 5.5729, + "step": 151675 + }, + { + "epoch": 13.617594254937163, + "grad_norm": 16.256532669067383, + "learning_rate": 9.598663474965093e-06, + "loss": 5.5252, + "step": 151700 + }, + { + "epoch": 13.61983842010772, + "grad_norm": 19.74225616455078, + "learning_rate": 9.598414123279473e-06, + "loss": 5.5124, + "step": 151725 + }, + { + "epoch": 13.622082585278276, + "grad_norm": 15.223608016967773, + "learning_rate": 9.598164771593856e-06, + "loss": 5.7439, + "step": 151750 + }, + { + "epoch": 13.624326750448834, + "grad_norm": 14.881782531738281, + "learning_rate": 9.59791541990824e-06, + "loss": 5.2821, + "step": 151775 + }, + { + "epoch": 13.62657091561939, + "grad_norm": 18.974342346191406, + "learning_rate": 9.597666068222622e-06, + "loss": 5.8043, + "step": 151800 + }, + { + "epoch": 13.628815080789947, + "grad_norm": 16.556711196899414, + "learning_rate": 9.597416716537004e-06, + "loss": 5.623, + "step": 151825 + }, + { + "epoch": 13.631059245960502, + "grad_norm": 12.282398223876953, + "learning_rate": 9.597167364851387e-06, + "loss": 5.3452, + "step": 151850 + }, + { + "epoch": 13.63330341113106, + "grad_norm": 12.791778564453125, + "learning_rate": 9.59691801316577e-06, + "loss": 5.3455, + "step": 151875 + }, + { + "epoch": 13.635547576301615, + "grad_norm": 16.010351181030273, + "learning_rate": 9.596668661480151e-06, + "loss": 5.7012, + "step": 151900 + }, + { + "epoch": 13.637791741472173, + "grad_norm": 14.03262996673584, + "learning_rate": 9.596419309794535e-06, + "loss": 5.6088, + "step": 151925 + }, + { + "epoch": 13.640035906642728, + "grad_norm": 17.389862060546875, + "learning_rate": 9.596169958108918e-06, + "loss": 5.4312, + "step": 151950 + }, + { + "epoch": 13.642280071813286, + "grad_norm": 15.275642395019531, + "learning_rate": 9.5959206064233e-06, + "loss": 5.6072, + "step": 151975 + }, + { + "epoch": 13.644524236983841, + "grad_norm": 13.004257202148438, + "learning_rate": 9.595671254737682e-06, + "loss": 5.5367, + "step": 152000 + }, + { + "epoch": 13.646768402154398, + "grad_norm": 13.956135749816895, + "learning_rate": 9.595421903052066e-06, + "loss": 5.6157, + "step": 152025 + }, + { + "epoch": 13.649012567324956, + "grad_norm": 15.474238395690918, + "learning_rate": 9.595172551366449e-06, + "loss": 5.3488, + "step": 152050 + }, + { + "epoch": 13.651256732495511, + "grad_norm": 19.955753326416016, + "learning_rate": 9.594923199680831e-06, + "loss": 5.5986, + "step": 152075 + }, + { + "epoch": 13.653500897666069, + "grad_norm": 14.492996215820312, + "learning_rate": 9.594673847995213e-06, + "loss": 5.5082, + "step": 152100 + }, + { + "epoch": 13.655745062836624, + "grad_norm": 15.702445983886719, + "learning_rate": 9.594424496309596e-06, + "loss": 5.631, + "step": 152125 + }, + { + "epoch": 13.657989228007182, + "grad_norm": 15.66208553314209, + "learning_rate": 9.594175144623978e-06, + "loss": 5.3818, + "step": 152150 + }, + { + "epoch": 13.660233393177737, + "grad_norm": 16.97306251525879, + "learning_rate": 9.593925792938362e-06, + "loss": 5.5327, + "step": 152175 + }, + { + "epoch": 13.662477558348295, + "grad_norm": 15.952521324157715, + "learning_rate": 9.593676441252744e-06, + "loss": 5.5639, + "step": 152200 + }, + { + "epoch": 13.66472172351885, + "grad_norm": 15.838451385498047, + "learning_rate": 9.593427089567126e-06, + "loss": 5.9062, + "step": 152225 + }, + { + "epoch": 13.666965888689408, + "grad_norm": 24.11509132385254, + "learning_rate": 9.593177737881509e-06, + "loss": 5.671, + "step": 152250 + }, + { + "epoch": 13.669210053859963, + "grad_norm": 16.618865966796875, + "learning_rate": 9.592928386195891e-06, + "loss": 5.6142, + "step": 152275 + }, + { + "epoch": 13.67145421903052, + "grad_norm": 14.063215255737305, + "learning_rate": 9.592679034510273e-06, + "loss": 5.4208, + "step": 152300 + }, + { + "epoch": 13.673698384201078, + "grad_norm": 16.624778747558594, + "learning_rate": 9.592429682824656e-06, + "loss": 5.5355, + "step": 152325 + }, + { + "epoch": 13.675942549371634, + "grad_norm": 17.987918853759766, + "learning_rate": 9.59218033113904e-06, + "loss": 5.8189, + "step": 152350 + }, + { + "epoch": 13.678186714542191, + "grad_norm": 15.597108840942383, + "learning_rate": 9.591930979453422e-06, + "loss": 5.6246, + "step": 152375 + }, + { + "epoch": 13.680430879712747, + "grad_norm": 14.44497299194336, + "learning_rate": 9.591681627767804e-06, + "loss": 5.689, + "step": 152400 + }, + { + "epoch": 13.682675044883304, + "grad_norm": 17.467369079589844, + "learning_rate": 9.591432276082188e-06, + "loss": 5.5943, + "step": 152425 + }, + { + "epoch": 13.68491921005386, + "grad_norm": 16.21368408203125, + "learning_rate": 9.591182924396569e-06, + "loss": 5.7093, + "step": 152450 + }, + { + "epoch": 13.687163375224417, + "grad_norm": 16.64864730834961, + "learning_rate": 9.590933572710951e-06, + "loss": 5.5559, + "step": 152475 + }, + { + "epoch": 13.689407540394972, + "grad_norm": 16.94707489013672, + "learning_rate": 9.590684221025335e-06, + "loss": 5.8666, + "step": 152500 + }, + { + "epoch": 13.69165170556553, + "grad_norm": 17.684894561767578, + "learning_rate": 9.590434869339718e-06, + "loss": 5.6459, + "step": 152525 + }, + { + "epoch": 13.693895870736085, + "grad_norm": 16.1064453125, + "learning_rate": 9.5901855176541e-06, + "loss": 5.6763, + "step": 152550 + }, + { + "epoch": 13.696140035906643, + "grad_norm": 15.153531074523926, + "learning_rate": 9.589936165968482e-06, + "loss": 5.5343, + "step": 152575 + }, + { + "epoch": 13.6983842010772, + "grad_norm": 16.423269271850586, + "learning_rate": 9.589686814282866e-06, + "loss": 5.585, + "step": 152600 + }, + { + "epoch": 13.700628366247756, + "grad_norm": 16.922122955322266, + "learning_rate": 9.589437462597247e-06, + "loss": 5.5904, + "step": 152625 + }, + { + "epoch": 13.702872531418313, + "grad_norm": 15.93065071105957, + "learning_rate": 9.589188110911631e-06, + "loss": 5.7246, + "step": 152650 + }, + { + "epoch": 13.705116696588869, + "grad_norm": 15.031031608581543, + "learning_rate": 9.588938759226013e-06, + "loss": 5.6628, + "step": 152675 + }, + { + "epoch": 13.707360861759426, + "grad_norm": 17.46678352355957, + "learning_rate": 9.588689407540396e-06, + "loss": 5.3883, + "step": 152700 + }, + { + "epoch": 13.709605026929982, + "grad_norm": 15.467307090759277, + "learning_rate": 9.588440055854778e-06, + "loss": 5.3485, + "step": 152725 + }, + { + "epoch": 13.711849192100539, + "grad_norm": 16.205341339111328, + "learning_rate": 9.588190704169162e-06, + "loss": 5.5256, + "step": 152750 + }, + { + "epoch": 13.714093357271095, + "grad_norm": 16.831336975097656, + "learning_rate": 9.587941352483544e-06, + "loss": 5.4897, + "step": 152775 + }, + { + "epoch": 13.716337522441652, + "grad_norm": 13.606795310974121, + "learning_rate": 9.587692000797927e-06, + "loss": 5.3185, + "step": 152800 + }, + { + "epoch": 13.718581687612208, + "grad_norm": 17.476890563964844, + "learning_rate": 9.587442649112309e-06, + "loss": 5.532, + "step": 152825 + }, + { + "epoch": 13.720825852782765, + "grad_norm": 19.247718811035156, + "learning_rate": 9.587193297426691e-06, + "loss": 5.5716, + "step": 152850 + }, + { + "epoch": 13.723070017953322, + "grad_norm": 14.288253784179688, + "learning_rate": 9.586943945741073e-06, + "loss": 5.7384, + "step": 152875 + }, + { + "epoch": 13.725314183123878, + "grad_norm": 15.7893648147583, + "learning_rate": 9.586694594055457e-06, + "loss": 5.3781, + "step": 152900 + }, + { + "epoch": 13.727558348294435, + "grad_norm": 17.843019485473633, + "learning_rate": 9.58644524236984e-06, + "loss": 5.5996, + "step": 152925 + }, + { + "epoch": 13.72980251346499, + "grad_norm": 16.92028045654297, + "learning_rate": 9.586195890684222e-06, + "loss": 5.4916, + "step": 152950 + }, + { + "epoch": 13.732046678635548, + "grad_norm": 17.55585479736328, + "learning_rate": 9.585946538998604e-06, + "loss": 5.5506, + "step": 152975 + }, + { + "epoch": 13.734290843806104, + "grad_norm": 17.256319046020508, + "learning_rate": 9.585697187312987e-06, + "loss": 5.4687, + "step": 153000 + }, + { + "epoch": 13.736535008976661, + "grad_norm": 13.717172622680664, + "learning_rate": 9.585447835627369e-06, + "loss": 5.6969, + "step": 153025 + }, + { + "epoch": 13.738779174147217, + "grad_norm": 17.094188690185547, + "learning_rate": 9.585198483941751e-06, + "loss": 5.4301, + "step": 153050 + }, + { + "epoch": 13.741023339317774, + "grad_norm": 16.435319900512695, + "learning_rate": 9.584949132256135e-06, + "loss": 5.6004, + "step": 153075 + }, + { + "epoch": 13.74326750448833, + "grad_norm": 15.598689079284668, + "learning_rate": 9.584699780570518e-06, + "loss": 5.6173, + "step": 153100 + }, + { + "epoch": 13.745511669658887, + "grad_norm": 14.668725967407227, + "learning_rate": 9.5844504288849e-06, + "loss": 5.6585, + "step": 153125 + }, + { + "epoch": 13.747755834829443, + "grad_norm": 15.771841049194336, + "learning_rate": 9.584201077199282e-06, + "loss": 5.6478, + "step": 153150 + }, + { + "epoch": 13.75, + "grad_norm": 15.718084335327148, + "learning_rate": 9.583951725513665e-06, + "loss": 5.5847, + "step": 153175 + }, + { + "epoch": 13.752244165170557, + "grad_norm": 19.757898330688477, + "learning_rate": 9.583702373828047e-06, + "loss": 5.3682, + "step": 153200 + }, + { + "epoch": 13.754488330341113, + "grad_norm": 15.479236602783203, + "learning_rate": 9.583453022142431e-06, + "loss": 5.4267, + "step": 153225 + }, + { + "epoch": 13.75673249551167, + "grad_norm": 15.613765716552734, + "learning_rate": 9.583203670456813e-06, + "loss": 5.4033, + "step": 153250 + }, + { + "epoch": 13.758976660682226, + "grad_norm": 15.72481632232666, + "learning_rate": 9.582954318771196e-06, + "loss": 5.6399, + "step": 153275 + }, + { + "epoch": 13.761220825852783, + "grad_norm": 15.825852394104004, + "learning_rate": 9.582704967085578e-06, + "loss": 5.6266, + "step": 153300 + }, + { + "epoch": 13.763464991023339, + "grad_norm": 14.591440200805664, + "learning_rate": 9.58245561539996e-06, + "loss": 5.4213, + "step": 153325 + }, + { + "epoch": 13.765709156193896, + "grad_norm": 16.221202850341797, + "learning_rate": 9.582206263714343e-06, + "loss": 5.7567, + "step": 153350 + }, + { + "epoch": 13.767953321364452, + "grad_norm": 16.334259033203125, + "learning_rate": 9.581956912028727e-06, + "loss": 5.5446, + "step": 153375 + }, + { + "epoch": 13.77019748653501, + "grad_norm": 15.799407958984375, + "learning_rate": 9.581707560343109e-06, + "loss": 5.5578, + "step": 153400 + }, + { + "epoch": 13.772441651705565, + "grad_norm": 18.73996353149414, + "learning_rate": 9.581458208657491e-06, + "loss": 5.5168, + "step": 153425 + }, + { + "epoch": 13.774685816876122, + "grad_norm": 15.678482055664062, + "learning_rate": 9.581208856971874e-06, + "loss": 5.8249, + "step": 153450 + }, + { + "epoch": 13.776929982046678, + "grad_norm": 14.410717010498047, + "learning_rate": 9.580959505286258e-06, + "loss": 5.6135, + "step": 153475 + }, + { + "epoch": 13.779174147217235, + "grad_norm": 15.198614120483398, + "learning_rate": 9.580710153600638e-06, + "loss": 5.6133, + "step": 153500 + }, + { + "epoch": 13.781418312387792, + "grad_norm": 16.093894958496094, + "learning_rate": 9.580460801915022e-06, + "loss": 5.3283, + "step": 153525 + }, + { + "epoch": 13.783662477558348, + "grad_norm": 16.99262809753418, + "learning_rate": 9.580211450229404e-06, + "loss": 5.493, + "step": 153550 + }, + { + "epoch": 13.785906642728905, + "grad_norm": 17.34305763244629, + "learning_rate": 9.579962098543787e-06, + "loss": 5.6267, + "step": 153575 + }, + { + "epoch": 13.788150807899461, + "grad_norm": 17.965438842773438, + "learning_rate": 9.579712746858169e-06, + "loss": 5.588, + "step": 153600 + }, + { + "epoch": 13.790394973070018, + "grad_norm": 14.656606674194336, + "learning_rate": 9.579463395172553e-06, + "loss": 5.5146, + "step": 153625 + }, + { + "epoch": 13.792639138240574, + "grad_norm": 17.711448669433594, + "learning_rate": 9.579214043486935e-06, + "loss": 5.577, + "step": 153650 + }, + { + "epoch": 13.794883303411131, + "grad_norm": 14.55700397491455, + "learning_rate": 9.578964691801316e-06, + "loss": 5.3327, + "step": 153675 + }, + { + "epoch": 13.797127468581687, + "grad_norm": 16.971094131469727, + "learning_rate": 9.5787153401157e-06, + "loss": 5.7678, + "step": 153700 + }, + { + "epoch": 13.799371633752244, + "grad_norm": 13.622431755065918, + "learning_rate": 9.578465988430082e-06, + "loss": 5.5973, + "step": 153725 + }, + { + "epoch": 13.8016157989228, + "grad_norm": 17.654787063598633, + "learning_rate": 9.578216636744465e-06, + "loss": 5.6879, + "step": 153750 + }, + { + "epoch": 13.803859964093357, + "grad_norm": 17.2679500579834, + "learning_rate": 9.577967285058849e-06, + "loss": 5.6939, + "step": 153775 + }, + { + "epoch": 13.806104129263915, + "grad_norm": 13.15770435333252, + "learning_rate": 9.577717933373231e-06, + "loss": 5.4166, + "step": 153800 + }, + { + "epoch": 13.80834829443447, + "grad_norm": 14.174090385437012, + "learning_rate": 9.577468581687613e-06, + "loss": 5.7418, + "step": 153825 + }, + { + "epoch": 13.810592459605028, + "grad_norm": 16.44097900390625, + "learning_rate": 9.577219230001996e-06, + "loss": 5.5816, + "step": 153850 + }, + { + "epoch": 13.812836624775583, + "grad_norm": 17.097108840942383, + "learning_rate": 9.576969878316378e-06, + "loss": 5.4945, + "step": 153875 + }, + { + "epoch": 13.81508078994614, + "grad_norm": 15.535455703735352, + "learning_rate": 9.57672052663076e-06, + "loss": 5.7566, + "step": 153900 + }, + { + "epoch": 13.817324955116696, + "grad_norm": 14.49013900756836, + "learning_rate": 9.576471174945143e-06, + "loss": 5.8457, + "step": 153925 + }, + { + "epoch": 13.819569120287253, + "grad_norm": 14.948716163635254, + "learning_rate": 9.576221823259527e-06, + "loss": 5.6463, + "step": 153950 + }, + { + "epoch": 13.821813285457809, + "grad_norm": 15.84876537322998, + "learning_rate": 9.575972471573909e-06, + "loss": 5.7053, + "step": 153975 + }, + { + "epoch": 13.824057450628366, + "grad_norm": 19.649368286132812, + "learning_rate": 9.575723119888291e-06, + "loss": 5.6373, + "step": 154000 + }, + { + "epoch": 13.826301615798922, + "grad_norm": 16.56194496154785, + "learning_rate": 9.575473768202674e-06, + "loss": 5.582, + "step": 154025 + }, + { + "epoch": 13.82854578096948, + "grad_norm": 15.858565330505371, + "learning_rate": 9.575224416517056e-06, + "loss": 5.3874, + "step": 154050 + }, + { + "epoch": 13.830789946140037, + "grad_norm": 17.842483520507812, + "learning_rate": 9.574975064831438e-06, + "loss": 5.6472, + "step": 154075 + }, + { + "epoch": 13.833034111310592, + "grad_norm": 12.659928321838379, + "learning_rate": 9.574725713145822e-06, + "loss": 5.4167, + "step": 154100 + }, + { + "epoch": 13.83527827648115, + "grad_norm": 19.611196517944336, + "learning_rate": 9.574476361460205e-06, + "loss": 5.6118, + "step": 154125 + }, + { + "epoch": 13.837522441651705, + "grad_norm": 13.564214706420898, + "learning_rate": 9.574227009774587e-06, + "loss": 5.4495, + "step": 154150 + }, + { + "epoch": 13.839766606822263, + "grad_norm": 17.776180267333984, + "learning_rate": 9.57397765808897e-06, + "loss": 5.6612, + "step": 154175 + }, + { + "epoch": 13.842010771992818, + "grad_norm": 17.766162872314453, + "learning_rate": 9.573728306403353e-06, + "loss": 5.7723, + "step": 154200 + }, + { + "epoch": 13.844254937163376, + "grad_norm": 15.12885856628418, + "learning_rate": 9.573478954717734e-06, + "loss": 5.5131, + "step": 154225 + }, + { + "epoch": 13.846499102333931, + "grad_norm": 16.82329750061035, + "learning_rate": 9.573229603032118e-06, + "loss": 5.6929, + "step": 154250 + }, + { + "epoch": 13.848743267504489, + "grad_norm": 19.06825828552246, + "learning_rate": 9.5729802513465e-06, + "loss": 5.4721, + "step": 154275 + }, + { + "epoch": 13.850987432675044, + "grad_norm": 14.902549743652344, + "learning_rate": 9.572730899660882e-06, + "loss": 5.7136, + "step": 154300 + }, + { + "epoch": 13.853231597845602, + "grad_norm": 17.127317428588867, + "learning_rate": 9.572481547975265e-06, + "loss": 5.586, + "step": 154325 + }, + { + "epoch": 13.855475763016159, + "grad_norm": 16.51497459411621, + "learning_rate": 9.572232196289649e-06, + "loss": 5.5842, + "step": 154350 + }, + { + "epoch": 13.857719928186714, + "grad_norm": 16.753013610839844, + "learning_rate": 9.571982844604031e-06, + "loss": 5.779, + "step": 154375 + }, + { + "epoch": 13.859964093357272, + "grad_norm": 17.85586166381836, + "learning_rate": 9.571733492918412e-06, + "loss": 5.3992, + "step": 154400 + }, + { + "epoch": 13.862208258527827, + "grad_norm": 14.878508567810059, + "learning_rate": 9.571484141232796e-06, + "loss": 5.6156, + "step": 154425 + }, + { + "epoch": 13.864452423698385, + "grad_norm": 15.05975341796875, + "learning_rate": 9.571234789547178e-06, + "loss": 5.6634, + "step": 154450 + }, + { + "epoch": 13.86669658886894, + "grad_norm": 19.072690963745117, + "learning_rate": 9.57098543786156e-06, + "loss": 5.7403, + "step": 154475 + }, + { + "epoch": 13.868940754039498, + "grad_norm": 15.67124080657959, + "learning_rate": 9.570736086175944e-06, + "loss": 5.3994, + "step": 154500 + }, + { + "epoch": 13.871184919210053, + "grad_norm": 17.462961196899414, + "learning_rate": 9.570486734490327e-06, + "loss": 5.5706, + "step": 154525 + }, + { + "epoch": 13.87342908438061, + "grad_norm": 16.719072341918945, + "learning_rate": 9.570237382804709e-06, + "loss": 5.5007, + "step": 154550 + }, + { + "epoch": 13.875673249551166, + "grad_norm": 17.03680419921875, + "learning_rate": 9.569998005186516e-06, + "loss": 5.7057, + "step": 154575 + }, + { + "epoch": 13.877917414721724, + "grad_norm": 18.08436393737793, + "learning_rate": 9.569748653500898e-06, + "loss": 5.4623, + "step": 154600 + }, + { + "epoch": 13.88016157989228, + "grad_norm": 16.3358211517334, + "learning_rate": 9.569499301815282e-06, + "loss": 5.5766, + "step": 154625 + }, + { + "epoch": 13.882405745062837, + "grad_norm": 14.078011512756348, + "learning_rate": 9.569249950129663e-06, + "loss": 5.4514, + "step": 154650 + }, + { + "epoch": 13.884649910233394, + "grad_norm": 14.523893356323242, + "learning_rate": 9.569000598444045e-06, + "loss": 5.7536, + "step": 154675 + }, + { + "epoch": 13.88689407540395, + "grad_norm": 14.998448371887207, + "learning_rate": 9.56875124675843e-06, + "loss": 5.4826, + "step": 154700 + }, + { + "epoch": 13.889138240574507, + "grad_norm": 14.853082656860352, + "learning_rate": 9.568501895072812e-06, + "loss": 5.5654, + "step": 154725 + }, + { + "epoch": 13.891382405745063, + "grad_norm": 18.435461044311523, + "learning_rate": 9.568252543387194e-06, + "loss": 5.5667, + "step": 154750 + }, + { + "epoch": 13.89362657091562, + "grad_norm": 14.845237731933594, + "learning_rate": 9.568003191701576e-06, + "loss": 5.6528, + "step": 154775 + }, + { + "epoch": 13.895870736086176, + "grad_norm": 16.554302215576172, + "learning_rate": 9.56775384001596e-06, + "loss": 5.6484, + "step": 154800 + }, + { + "epoch": 13.898114901256733, + "grad_norm": 14.431796073913574, + "learning_rate": 9.56750448833034e-06, + "loss": 5.5558, + "step": 154825 + }, + { + "epoch": 13.900359066427288, + "grad_norm": 13.53636646270752, + "learning_rate": 9.567255136644725e-06, + "loss": 5.5008, + "step": 154850 + }, + { + "epoch": 13.902603231597846, + "grad_norm": 15.390881538391113, + "learning_rate": 9.567005784959107e-06, + "loss": 5.4231, + "step": 154875 + }, + { + "epoch": 13.904847396768401, + "grad_norm": 16.52496910095215, + "learning_rate": 9.56675643327349e-06, + "loss": 5.4995, + "step": 154900 + }, + { + "epoch": 13.907091561938959, + "grad_norm": 16.435300827026367, + "learning_rate": 9.566507081587872e-06, + "loss": 5.5853, + "step": 154925 + }, + { + "epoch": 13.909335727109514, + "grad_norm": 14.132766723632812, + "learning_rate": 9.566257729902256e-06, + "loss": 5.5485, + "step": 154950 + }, + { + "epoch": 13.911579892280072, + "grad_norm": 15.367147445678711, + "learning_rate": 9.566008378216638e-06, + "loss": 5.5319, + "step": 154975 + }, + { + "epoch": 13.91382405745063, + "grad_norm": 15.24611759185791, + "learning_rate": 9.56575902653102e-06, + "loss": 5.5446, + "step": 155000 + }, + { + "epoch": 13.916068222621185, + "grad_norm": 14.316851615905762, + "learning_rate": 9.565509674845403e-06, + "loss": 5.4749, + "step": 155025 + }, + { + "epoch": 13.918312387791742, + "grad_norm": 15.599210739135742, + "learning_rate": 9.565260323159785e-06, + "loss": 5.6669, + "step": 155050 + }, + { + "epoch": 13.920556552962298, + "grad_norm": 18.68669891357422, + "learning_rate": 9.565010971474167e-06, + "loss": 5.6711, + "step": 155075 + }, + { + "epoch": 13.922800718132855, + "grad_norm": 16.16222381591797, + "learning_rate": 9.564761619788551e-06, + "loss": 5.6083, + "step": 155100 + }, + { + "epoch": 13.92504488330341, + "grad_norm": 13.896261215209961, + "learning_rate": 9.564512268102934e-06, + "loss": 5.584, + "step": 155125 + }, + { + "epoch": 13.927289048473968, + "grad_norm": 16.723670959472656, + "learning_rate": 9.564262916417316e-06, + "loss": 5.4945, + "step": 155150 + }, + { + "epoch": 13.929533213644524, + "grad_norm": 15.177323341369629, + "learning_rate": 9.564013564731698e-06, + "loss": 5.5581, + "step": 155175 + }, + { + "epoch": 13.931777378815081, + "grad_norm": 18.757944107055664, + "learning_rate": 9.56376421304608e-06, + "loss": 5.7053, + "step": 155200 + }, + { + "epoch": 13.934021543985637, + "grad_norm": 15.62706470489502, + "learning_rate": 9.563514861360463e-06, + "loss": 5.408, + "step": 155225 + }, + { + "epoch": 13.936265709156194, + "grad_norm": 14.868744850158691, + "learning_rate": 9.563265509674847e-06, + "loss": 5.5373, + "step": 155250 + }, + { + "epoch": 13.938509874326751, + "grad_norm": 15.01173210144043, + "learning_rate": 9.56301615798923e-06, + "loss": 5.387, + "step": 155275 + }, + { + "epoch": 13.940754039497307, + "grad_norm": 16.027429580688477, + "learning_rate": 9.562766806303612e-06, + "loss": 5.7409, + "step": 155300 + }, + { + "epoch": 13.942998204667864, + "grad_norm": 16.360088348388672, + "learning_rate": 9.562517454617994e-06, + "loss": 5.5909, + "step": 155325 + }, + { + "epoch": 13.94524236983842, + "grad_norm": 15.439250946044922, + "learning_rate": 9.562268102932376e-06, + "loss": 5.3695, + "step": 155350 + }, + { + "epoch": 13.947486535008977, + "grad_norm": 19.92350959777832, + "learning_rate": 9.562018751246759e-06, + "loss": 5.6707, + "step": 155375 + }, + { + "epoch": 13.949730700179533, + "grad_norm": 13.748856544494629, + "learning_rate": 9.56176939956114e-06, + "loss": 5.8322, + "step": 155400 + }, + { + "epoch": 13.95197486535009, + "grad_norm": 15.063067436218262, + "learning_rate": 9.561520047875525e-06, + "loss": 5.6322, + "step": 155425 + }, + { + "epoch": 13.954219030520646, + "grad_norm": 15.115462303161621, + "learning_rate": 9.561270696189907e-06, + "loss": 5.6425, + "step": 155450 + }, + { + "epoch": 13.956463195691203, + "grad_norm": 16.493406295776367, + "learning_rate": 9.56102134450429e-06, + "loss": 5.5752, + "step": 155475 + }, + { + "epoch": 13.958707360861759, + "grad_norm": 14.604622840881348, + "learning_rate": 9.560771992818672e-06, + "loss": 5.4943, + "step": 155500 + }, + { + "epoch": 13.960951526032316, + "grad_norm": 14.810802459716797, + "learning_rate": 9.560522641133056e-06, + "loss": 5.7899, + "step": 155525 + }, + { + "epoch": 13.963195691202873, + "grad_norm": 13.28005599975586, + "learning_rate": 9.560273289447436e-06, + "loss": 5.5495, + "step": 155550 + }, + { + "epoch": 13.965439856373429, + "grad_norm": 15.001934051513672, + "learning_rate": 9.56002393776182e-06, + "loss": 5.7588, + "step": 155575 + }, + { + "epoch": 13.967684021543986, + "grad_norm": 17.20914649963379, + "learning_rate": 9.559774586076203e-06, + "loss": 5.5365, + "step": 155600 + }, + { + "epoch": 13.969928186714542, + "grad_norm": 15.940528869628906, + "learning_rate": 9.559525234390585e-06, + "loss": 5.6014, + "step": 155625 + }, + { + "epoch": 13.9721723518851, + "grad_norm": 24.432334899902344, + "learning_rate": 9.559275882704967e-06, + "loss": 5.4454, + "step": 155650 + }, + { + "epoch": 13.974416517055655, + "grad_norm": 17.147981643676758, + "learning_rate": 9.559026531019351e-06, + "loss": 5.5776, + "step": 155675 + }, + { + "epoch": 13.976660682226212, + "grad_norm": 17.508270263671875, + "learning_rate": 9.558777179333734e-06, + "loss": 5.5445, + "step": 155700 + }, + { + "epoch": 13.978904847396768, + "grad_norm": 17.78807258605957, + "learning_rate": 9.558527827648116e-06, + "loss": 5.6602, + "step": 155725 + }, + { + "epoch": 13.981149012567325, + "grad_norm": 16.174495697021484, + "learning_rate": 9.558278475962498e-06, + "loss": 5.5108, + "step": 155750 + }, + { + "epoch": 13.98339317773788, + "grad_norm": 13.3116455078125, + "learning_rate": 9.55802912427688e-06, + "loss": 5.5139, + "step": 155775 + }, + { + "epoch": 13.985637342908438, + "grad_norm": 17.61614990234375, + "learning_rate": 9.557779772591263e-06, + "loss": 5.2558, + "step": 155800 + }, + { + "epoch": 13.987881508078996, + "grad_norm": 14.00696086883545, + "learning_rate": 9.557530420905647e-06, + "loss": 5.3932, + "step": 155825 + }, + { + "epoch": 13.990125673249551, + "grad_norm": 15.56527042388916, + "learning_rate": 9.55728106922003e-06, + "loss": 5.8017, + "step": 155850 + }, + { + "epoch": 13.992369838420109, + "grad_norm": 14.734479904174805, + "learning_rate": 9.557031717534412e-06, + "loss": 5.4716, + "step": 155875 + }, + { + "epoch": 13.994614003590664, + "grad_norm": 14.133214950561523, + "learning_rate": 9.556782365848794e-06, + "loss": 5.7114, + "step": 155900 + }, + { + "epoch": 13.996858168761221, + "grad_norm": 15.86937427520752, + "learning_rate": 9.556533014163176e-06, + "loss": 5.8527, + "step": 155925 + }, + { + "epoch": 13.999102333931777, + "grad_norm": 16.85582733154297, + "learning_rate": 9.556283662477559e-06, + "loss": 5.4682, + "step": 155950 + }, + { + "epoch": 14.0, + "eval_accuracy": 0.07696713955664178, + "eval_f1_macro": 0.005927461828031584, + "eval_f1_micro": 0.07696713955664178, + "eval_f1_weighted": 0.041322337900408936, + "eval_loss": 6.924584865570068, + "eval_precision_macro": 0.0054766675608644184, + "eval_precision_micro": 0.07696713955664178, + "eval_precision_weighted": 0.03358631234811179, + "eval_recall_macro": 0.0103462963530426, + "eval_recall_micro": 0.07696713955664178, + "eval_recall_weighted": 0.07696713955664178, + "eval_runtime": 130.1916, + "eval_samples_per_second": 402.276, + "eval_steps_per_second": 12.574, + "step": 155960 + }, + { + "epoch": 14.001346499102334, + "grad_norm": 17.16093635559082, + "learning_rate": 9.556034310791943e-06, + "loss": 5.4885, + "step": 155975 + }, + { + "epoch": 14.00359066427289, + "grad_norm": 16.44987678527832, + "learning_rate": 9.555784959106325e-06, + "loss": 5.3854, + "step": 156000 + }, + { + "epoch": 14.005834829443447, + "grad_norm": 14.460708618164062, + "learning_rate": 9.555535607420707e-06, + "loss": 5.13, + "step": 156025 + }, + { + "epoch": 14.008078994614003, + "grad_norm": 16.15176010131836, + "learning_rate": 9.55528625573509e-06, + "loss": 5.3465, + "step": 156050 + }, + { + "epoch": 14.01032315978456, + "grad_norm": 16.333547592163086, + "learning_rate": 9.555036904049472e-06, + "loss": 5.25, + "step": 156075 + }, + { + "epoch": 14.012567324955116, + "grad_norm": 16.4566593170166, + "learning_rate": 9.554787552363854e-06, + "loss": 4.9938, + "step": 156100 + }, + { + "epoch": 14.014811490125673, + "grad_norm": 14.520027160644531, + "learning_rate": 9.554538200678236e-06, + "loss": 5.3458, + "step": 156125 + }, + { + "epoch": 14.01705565529623, + "grad_norm": 15.679475784301758, + "learning_rate": 9.55428884899262e-06, + "loss": 4.9904, + "step": 156150 + }, + { + "epoch": 14.019299820466786, + "grad_norm": 15.651152610778809, + "learning_rate": 9.554039497307003e-06, + "loss": 5.1307, + "step": 156175 + }, + { + "epoch": 14.021543985637344, + "grad_norm": 16.770883560180664, + "learning_rate": 9.553790145621385e-06, + "loss": 5.2518, + "step": 156200 + }, + { + "epoch": 14.0237881508079, + "grad_norm": 14.40566635131836, + "learning_rate": 9.553540793935767e-06, + "loss": 5.2497, + "step": 156225 + }, + { + "epoch": 14.026032315978457, + "grad_norm": 18.29364013671875, + "learning_rate": 9.55329144225015e-06, + "loss": 5.2196, + "step": 156250 + }, + { + "epoch": 14.028276481149012, + "grad_norm": 17.020051956176758, + "learning_rate": 9.553042090564532e-06, + "loss": 5.6597, + "step": 156275 + }, + { + "epoch": 14.03052064631957, + "grad_norm": 15.581949234008789, + "learning_rate": 9.552792738878916e-06, + "loss": 5.2405, + "step": 156300 + }, + { + "epoch": 14.032764811490125, + "grad_norm": 14.819197654724121, + "learning_rate": 9.552543387193298e-06, + "loss": 5.4162, + "step": 156325 + }, + { + "epoch": 14.035008976660682, + "grad_norm": 16.841630935668945, + "learning_rate": 9.55229403550768e-06, + "loss": 5.3475, + "step": 156350 + }, + { + "epoch": 14.037253141831238, + "grad_norm": 17.920883178710938, + "learning_rate": 9.552044683822063e-06, + "loss": 5.207, + "step": 156375 + }, + { + "epoch": 14.039497307001795, + "grad_norm": 16.37273406982422, + "learning_rate": 9.551795332136447e-06, + "loss": 5.1662, + "step": 156400 + }, + { + "epoch": 14.041741472172351, + "grad_norm": 16.82791519165039, + "learning_rate": 9.551545980450828e-06, + "loss": 5.312, + "step": 156425 + }, + { + "epoch": 14.043985637342908, + "grad_norm": 14.922860145568848, + "learning_rate": 9.551296628765212e-06, + "loss": 5.3914, + "step": 156450 + }, + { + "epoch": 14.046229802513466, + "grad_norm": 17.104127883911133, + "learning_rate": 9.551047277079594e-06, + "loss": 5.2359, + "step": 156475 + }, + { + "epoch": 14.048473967684021, + "grad_norm": 14.581120491027832, + "learning_rate": 9.550797925393976e-06, + "loss": 5.3228, + "step": 156500 + }, + { + "epoch": 14.050718132854579, + "grad_norm": 14.891237258911133, + "learning_rate": 9.550548573708359e-06, + "loss": 5.2559, + "step": 156525 + }, + { + "epoch": 14.052962298025134, + "grad_norm": 16.4041690826416, + "learning_rate": 9.550299222022743e-06, + "loss": 5.388, + "step": 156550 + }, + { + "epoch": 14.055206463195692, + "grad_norm": 16.148473739624023, + "learning_rate": 9.550049870337125e-06, + "loss": 5.1086, + "step": 156575 + }, + { + "epoch": 14.057450628366247, + "grad_norm": 14.260366439819336, + "learning_rate": 9.549800518651506e-06, + "loss": 5.2246, + "step": 156600 + }, + { + "epoch": 14.059694793536805, + "grad_norm": 17.306711196899414, + "learning_rate": 9.54955116696589e-06, + "loss": 5.2856, + "step": 156625 + }, + { + "epoch": 14.06193895870736, + "grad_norm": 19.780563354492188, + "learning_rate": 9.549301815280272e-06, + "loss": 5.4812, + "step": 156650 + }, + { + "epoch": 14.064183123877918, + "grad_norm": 15.392431259155273, + "learning_rate": 9.549052463594654e-06, + "loss": 5.2787, + "step": 156675 + }, + { + "epoch": 14.066427289048473, + "grad_norm": 17.376441955566406, + "learning_rate": 9.548803111909038e-06, + "loss": 5.3872, + "step": 156700 + }, + { + "epoch": 14.06867145421903, + "grad_norm": 16.009504318237305, + "learning_rate": 9.54855376022342e-06, + "loss": 5.4015, + "step": 156725 + }, + { + "epoch": 14.070915619389588, + "grad_norm": 17.79023551940918, + "learning_rate": 9.548304408537803e-06, + "loss": 4.9718, + "step": 156750 + }, + { + "epoch": 14.073159784560143, + "grad_norm": 18.029008865356445, + "learning_rate": 9.548055056852185e-06, + "loss": 5.0925, + "step": 156775 + }, + { + "epoch": 14.0754039497307, + "grad_norm": 16.95403480529785, + "learning_rate": 9.547805705166567e-06, + "loss": 5.3974, + "step": 156800 + }, + { + "epoch": 14.077648114901256, + "grad_norm": 17.076059341430664, + "learning_rate": 9.54755635348095e-06, + "loss": 5.4333, + "step": 156825 + }, + { + "epoch": 14.079892280071814, + "grad_norm": 14.49602222442627, + "learning_rate": 9.547307001795332e-06, + "loss": 5.2866, + "step": 156850 + }, + { + "epoch": 14.08213644524237, + "grad_norm": 15.842586517333984, + "learning_rate": 9.547057650109716e-06, + "loss": 5.0517, + "step": 156875 + }, + { + "epoch": 14.084380610412927, + "grad_norm": 18.980850219726562, + "learning_rate": 9.546808298424098e-06, + "loss": 5.2091, + "step": 156900 + }, + { + "epoch": 14.086624775583482, + "grad_norm": 16.472694396972656, + "learning_rate": 9.54655894673848e-06, + "loss": 5.1264, + "step": 156925 + }, + { + "epoch": 14.08886894075404, + "grad_norm": 14.912381172180176, + "learning_rate": 9.546309595052863e-06, + "loss": 5.2864, + "step": 156950 + }, + { + "epoch": 14.091113105924595, + "grad_norm": 15.726831436157227, + "learning_rate": 9.546060243367245e-06, + "loss": 5.3297, + "step": 156975 + }, + { + "epoch": 14.093357271095153, + "grad_norm": 15.604096412658691, + "learning_rate": 9.545810891681628e-06, + "loss": 5.2171, + "step": 157000 + }, + { + "epoch": 14.09560143626571, + "grad_norm": 20.159015655517578, + "learning_rate": 9.545561539996012e-06, + "loss": 5.3332, + "step": 157025 + }, + { + "epoch": 14.097845601436266, + "grad_norm": 17.9102783203125, + "learning_rate": 9.545312188310394e-06, + "loss": 5.355, + "step": 157050 + }, + { + "epoch": 14.100089766606823, + "grad_norm": 17.161792755126953, + "learning_rate": 9.545062836624776e-06, + "loss": 5.0986, + "step": 157075 + }, + { + "epoch": 14.102333931777379, + "grad_norm": 14.158895492553711, + "learning_rate": 9.544813484939159e-06, + "loss": 5.2245, + "step": 157100 + }, + { + "epoch": 14.104578096947936, + "grad_norm": 19.526302337646484, + "learning_rate": 9.544564133253541e-06, + "loss": 5.269, + "step": 157125 + }, + { + "epoch": 14.106822262118492, + "grad_norm": 14.187182426452637, + "learning_rate": 9.544314781567923e-06, + "loss": 5.3849, + "step": 157150 + }, + { + "epoch": 14.109066427289049, + "grad_norm": 19.216447830200195, + "learning_rate": 9.544065429882307e-06, + "loss": 5.4143, + "step": 157175 + }, + { + "epoch": 14.111310592459605, + "grad_norm": 15.73067855834961, + "learning_rate": 9.54381607819669e-06, + "loss": 5.2055, + "step": 157200 + }, + { + "epoch": 14.113554757630162, + "grad_norm": 14.61352825164795, + "learning_rate": 9.543566726511072e-06, + "loss": 5.3879, + "step": 157225 + }, + { + "epoch": 14.115798922800717, + "grad_norm": 14.743112564086914, + "learning_rate": 9.543317374825454e-06, + "loss": 5.2817, + "step": 157250 + }, + { + "epoch": 14.118043087971275, + "grad_norm": 16.146055221557617, + "learning_rate": 9.543068023139838e-06, + "loss": 5.206, + "step": 157275 + }, + { + "epoch": 14.12028725314183, + "grad_norm": 15.376167297363281, + "learning_rate": 9.54281867145422e-06, + "loss": 5.348, + "step": 157300 + }, + { + "epoch": 14.122531418312388, + "grad_norm": 15.15182876586914, + "learning_rate": 9.542569319768601e-06, + "loss": 5.2014, + "step": 157325 + }, + { + "epoch": 14.124775583482945, + "grad_norm": 15.86288070678711, + "learning_rate": 9.542319968082985e-06, + "loss": 5.1946, + "step": 157350 + }, + { + "epoch": 14.1270197486535, + "grad_norm": 20.725339889526367, + "learning_rate": 9.542070616397368e-06, + "loss": 5.4813, + "step": 157375 + }, + { + "epoch": 14.129263913824058, + "grad_norm": 14.7671480178833, + "learning_rate": 9.54182126471175e-06, + "loss": 5.3665, + "step": 157400 + }, + { + "epoch": 14.131508078994614, + "grad_norm": 15.890522003173828, + "learning_rate": 9.541571913026134e-06, + "loss": 5.2585, + "step": 157425 + }, + { + "epoch": 14.133752244165171, + "grad_norm": 16.154521942138672, + "learning_rate": 9.541322561340516e-06, + "loss": 5.3486, + "step": 157450 + }, + { + "epoch": 14.135996409335727, + "grad_norm": 14.84773063659668, + "learning_rate": 9.541073209654898e-06, + "loss": 5.3431, + "step": 157475 + }, + { + "epoch": 14.138240574506284, + "grad_norm": 21.367311477661133, + "learning_rate": 9.540833832036705e-06, + "loss": 5.3004, + "step": 157500 + }, + { + "epoch": 14.14048473967684, + "grad_norm": 18.042604446411133, + "learning_rate": 9.540584480351088e-06, + "loss": 5.2587, + "step": 157525 + }, + { + "epoch": 14.142728904847397, + "grad_norm": 14.884490013122559, + "learning_rate": 9.540335128665472e-06, + "loss": 5.3061, + "step": 157550 + }, + { + "epoch": 14.144973070017953, + "grad_norm": 17.413820266723633, + "learning_rate": 9.540085776979852e-06, + "loss": 5.3598, + "step": 157575 + }, + { + "epoch": 14.14721723518851, + "grad_norm": 15.643328666687012, + "learning_rate": 9.539836425294235e-06, + "loss": 4.9289, + "step": 157600 + }, + { + "epoch": 14.149461400359067, + "grad_norm": 18.35971450805664, + "learning_rate": 9.539587073608619e-06, + "loss": 5.3849, + "step": 157625 + }, + { + "epoch": 14.151705565529623, + "grad_norm": 18.07598304748535, + "learning_rate": 9.539337721923001e-06, + "loss": 5.0919, + "step": 157650 + }, + { + "epoch": 14.15394973070018, + "grad_norm": 16.12154769897461, + "learning_rate": 9.539088370237383e-06, + "loss": 5.149, + "step": 157675 + }, + { + "epoch": 14.156193895870736, + "grad_norm": 17.337013244628906, + "learning_rate": 9.538839018551767e-06, + "loss": 5.6022, + "step": 157700 + }, + { + "epoch": 14.158438061041293, + "grad_norm": 18.79071617126465, + "learning_rate": 9.53858966686615e-06, + "loss": 5.3411, + "step": 157725 + }, + { + "epoch": 14.160682226211849, + "grad_norm": 13.962294578552246, + "learning_rate": 9.53834031518053e-06, + "loss": 5.3994, + "step": 157750 + }, + { + "epoch": 14.162926391382406, + "grad_norm": 18.941368103027344, + "learning_rate": 9.538090963494914e-06, + "loss": 5.0545, + "step": 157775 + }, + { + "epoch": 14.165170556552962, + "grad_norm": 15.486512184143066, + "learning_rate": 9.537841611809297e-06, + "loss": 5.3661, + "step": 157800 + }, + { + "epoch": 14.16741472172352, + "grad_norm": 22.79514503479004, + "learning_rate": 9.537592260123679e-06, + "loss": 5.4505, + "step": 157825 + }, + { + "epoch": 14.169658886894075, + "grad_norm": 16.46045684814453, + "learning_rate": 9.537342908438061e-06, + "loss": 5.3594, + "step": 157850 + }, + { + "epoch": 14.171903052064632, + "grad_norm": 18.744827270507812, + "learning_rate": 9.537093556752445e-06, + "loss": 5.2031, + "step": 157875 + }, + { + "epoch": 14.174147217235188, + "grad_norm": 15.290898323059082, + "learning_rate": 9.536844205066828e-06, + "loss": 5.3152, + "step": 157900 + }, + { + "epoch": 14.176391382405745, + "grad_norm": 14.999969482421875, + "learning_rate": 9.53659485338121e-06, + "loss": 5.5023, + "step": 157925 + }, + { + "epoch": 14.178635547576302, + "grad_norm": 15.673391342163086, + "learning_rate": 9.536345501695592e-06, + "loss": 5.252, + "step": 157950 + }, + { + "epoch": 14.180879712746858, + "grad_norm": 18.78653907775879, + "learning_rate": 9.536096150009975e-06, + "loss": 5.3046, + "step": 157975 + }, + { + "epoch": 14.183123877917415, + "grad_norm": 16.506446838378906, + "learning_rate": 9.535846798324357e-06, + "loss": 5.1279, + "step": 158000 + }, + { + "epoch": 14.185368043087971, + "grad_norm": 14.34045124053955, + "learning_rate": 9.53559744663874e-06, + "loss": 5.1233, + "step": 158025 + }, + { + "epoch": 14.187612208258528, + "grad_norm": 14.993566513061523, + "learning_rate": 9.535348094953123e-06, + "loss": 5.3519, + "step": 158050 + }, + { + "epoch": 14.189856373429084, + "grad_norm": 14.74329948425293, + "learning_rate": 9.535098743267505e-06, + "loss": 5.4584, + "step": 158075 + }, + { + "epoch": 14.192100538599641, + "grad_norm": 16.156084060668945, + "learning_rate": 9.534849391581888e-06, + "loss": 5.2885, + "step": 158100 + }, + { + "epoch": 14.194344703770197, + "grad_norm": 13.78682804107666, + "learning_rate": 9.53460003989627e-06, + "loss": 5.0434, + "step": 158125 + }, + { + "epoch": 14.196588868940754, + "grad_norm": 16.63880157470703, + "learning_rate": 9.534350688210652e-06, + "loss": 5.3586, + "step": 158150 + }, + { + "epoch": 14.19883303411131, + "grad_norm": 19.436073303222656, + "learning_rate": 9.534101336525036e-06, + "loss": 5.4467, + "step": 158175 + }, + { + "epoch": 14.201077199281867, + "grad_norm": 15.746299743652344, + "learning_rate": 9.533851984839419e-06, + "loss": 5.3796, + "step": 158200 + }, + { + "epoch": 14.203321364452425, + "grad_norm": 16.30963706970215, + "learning_rate": 9.533602633153801e-06, + "loss": 5.2, + "step": 158225 + }, + { + "epoch": 14.20556552962298, + "grad_norm": 15.981695175170898, + "learning_rate": 9.533353281468183e-06, + "loss": 5.265, + "step": 158250 + }, + { + "epoch": 14.207809694793538, + "grad_norm": 16.02344512939453, + "learning_rate": 9.533103929782566e-06, + "loss": 5.0536, + "step": 158275 + }, + { + "epoch": 14.210053859964093, + "grad_norm": 16.596450805664062, + "learning_rate": 9.532854578096948e-06, + "loss": 5.4836, + "step": 158300 + }, + { + "epoch": 14.21229802513465, + "grad_norm": 22.07461929321289, + "learning_rate": 9.53260522641133e-06, + "loss": 5.2251, + "step": 158325 + }, + { + "epoch": 14.214542190305206, + "grad_norm": 15.024316787719727, + "learning_rate": 9.532355874725714e-06, + "loss": 5.3703, + "step": 158350 + }, + { + "epoch": 14.216786355475763, + "grad_norm": 14.182096481323242, + "learning_rate": 9.532106523040097e-06, + "loss": 5.5307, + "step": 158375 + }, + { + "epoch": 14.219030520646319, + "grad_norm": 17.63218879699707, + "learning_rate": 9.531857171354479e-06, + "loss": 5.5119, + "step": 158400 + }, + { + "epoch": 14.221274685816876, + "grad_norm": 15.41295051574707, + "learning_rate": 9.531607819668863e-06, + "loss": 5.2022, + "step": 158425 + }, + { + "epoch": 14.223518850987432, + "grad_norm": 17.623414993286133, + "learning_rate": 9.531358467983244e-06, + "loss": 5.1672, + "step": 158450 + }, + { + "epoch": 14.22576301615799, + "grad_norm": 24.131412506103516, + "learning_rate": 9.531109116297626e-06, + "loss": 5.1371, + "step": 158475 + }, + { + "epoch": 14.228007181328545, + "grad_norm": 15.078749656677246, + "learning_rate": 9.53085976461201e-06, + "loss": 5.2713, + "step": 158500 + }, + { + "epoch": 14.230251346499102, + "grad_norm": 15.216717720031738, + "learning_rate": 9.530610412926392e-06, + "loss": 5.4771, + "step": 158525 + }, + { + "epoch": 14.23249551166966, + "grad_norm": 17.963253021240234, + "learning_rate": 9.530361061240775e-06, + "loss": 5.2889, + "step": 158550 + }, + { + "epoch": 14.234739676840215, + "grad_norm": 16.23155975341797, + "learning_rate": 9.530111709555157e-06, + "loss": 5.277, + "step": 158575 + }, + { + "epoch": 14.236983842010773, + "grad_norm": 14.911420822143555, + "learning_rate": 9.529862357869541e-06, + "loss": 5.3084, + "step": 158600 + }, + { + "epoch": 14.239228007181328, + "grad_norm": 22.39423179626465, + "learning_rate": 9.529613006183922e-06, + "loss": 5.3141, + "step": 158625 + }, + { + "epoch": 14.241472172351886, + "grad_norm": 15.31244945526123, + "learning_rate": 9.529363654498306e-06, + "loss": 5.4235, + "step": 158650 + }, + { + "epoch": 14.243716337522441, + "grad_norm": 16.548431396484375, + "learning_rate": 9.529114302812688e-06, + "loss": 5.1487, + "step": 158675 + }, + { + "epoch": 14.245960502692999, + "grad_norm": 14.028298377990723, + "learning_rate": 9.52886495112707e-06, + "loss": 5.2769, + "step": 158700 + }, + { + "epoch": 14.248204667863554, + "grad_norm": 15.968683242797852, + "learning_rate": 9.528615599441452e-06, + "loss": 5.2751, + "step": 158725 + }, + { + "epoch": 14.250448833034111, + "grad_norm": 17.6341495513916, + "learning_rate": 9.528366247755836e-06, + "loss": 5.3946, + "step": 158750 + }, + { + "epoch": 14.252692998204667, + "grad_norm": 14.646106719970703, + "learning_rate": 9.528116896070219e-06, + "loss": 5.3318, + "step": 158775 + }, + { + "epoch": 14.254937163375224, + "grad_norm": 17.254413604736328, + "learning_rate": 9.527867544384601e-06, + "loss": 5.6097, + "step": 158800 + }, + { + "epoch": 14.257181328545782, + "grad_norm": 17.076129913330078, + "learning_rate": 9.527618192698983e-06, + "loss": 5.2446, + "step": 158825 + }, + { + "epoch": 14.259425493716337, + "grad_norm": 15.090773582458496, + "learning_rate": 9.527368841013366e-06, + "loss": 5.3608, + "step": 158850 + }, + { + "epoch": 14.261669658886895, + "grad_norm": 18.245376586914062, + "learning_rate": 9.527119489327748e-06, + "loss": 5.4001, + "step": 158875 + }, + { + "epoch": 14.26391382405745, + "grad_norm": 17.119417190551758, + "learning_rate": 9.526870137642132e-06, + "loss": 5.2084, + "step": 158900 + }, + { + "epoch": 14.266157989228008, + "grad_norm": 15.903617858886719, + "learning_rate": 9.526620785956514e-06, + "loss": 5.167, + "step": 158925 + }, + { + "epoch": 14.268402154398563, + "grad_norm": 16.769731521606445, + "learning_rate": 9.526371434270897e-06, + "loss": 5.2965, + "step": 158950 + }, + { + "epoch": 14.27064631956912, + "grad_norm": 15.96749210357666, + "learning_rate": 9.526122082585279e-06, + "loss": 5.5266, + "step": 158975 + }, + { + "epoch": 14.272890484739676, + "grad_norm": 15.830154418945312, + "learning_rate": 9.525872730899661e-06, + "loss": 5.2788, + "step": 159000 + }, + { + "epoch": 14.275134649910234, + "grad_norm": 14.641607284545898, + "learning_rate": 9.525623379214044e-06, + "loss": 5.4059, + "step": 159025 + }, + { + "epoch": 14.27737881508079, + "grad_norm": 17.329418182373047, + "learning_rate": 9.525374027528426e-06, + "loss": 5.4151, + "step": 159050 + }, + { + "epoch": 14.279622980251347, + "grad_norm": 14.035932540893555, + "learning_rate": 9.52512467584281e-06, + "loss": 5.4883, + "step": 159075 + }, + { + "epoch": 14.281867145421902, + "grad_norm": 15.991098403930664, + "learning_rate": 9.524875324157192e-06, + "loss": 5.424, + "step": 159100 + }, + { + "epoch": 14.28411131059246, + "grad_norm": 18.02419662475586, + "learning_rate": 9.524625972471575e-06, + "loss": 5.5231, + "step": 159125 + }, + { + "epoch": 14.286355475763017, + "grad_norm": 17.784555435180664, + "learning_rate": 9.524376620785959e-06, + "loss": 5.3852, + "step": 159150 + }, + { + "epoch": 14.288599640933572, + "grad_norm": 16.47865104675293, + "learning_rate": 9.52412726910034e-06, + "loss": 5.4194, + "step": 159175 + }, + { + "epoch": 14.29084380610413, + "grad_norm": 15.00224781036377, + "learning_rate": 9.523877917414722e-06, + "loss": 5.135, + "step": 159200 + }, + { + "epoch": 14.293087971274685, + "grad_norm": 16.886489868164062, + "learning_rate": 9.523628565729106e-06, + "loss": 5.2214, + "step": 159225 + }, + { + "epoch": 14.295332136445243, + "grad_norm": 21.706125259399414, + "learning_rate": 9.523379214043488e-06, + "loss": 5.3049, + "step": 159250 + }, + { + "epoch": 14.297576301615798, + "grad_norm": 15.553619384765625, + "learning_rate": 9.52312986235787e-06, + "loss": 5.6723, + "step": 159275 + }, + { + "epoch": 14.299820466786356, + "grad_norm": 16.243051528930664, + "learning_rate": 9.522880510672253e-06, + "loss": 5.4957, + "step": 159300 + }, + { + "epoch": 14.302064631956911, + "grad_norm": 15.52599048614502, + "learning_rate": 9.522631158986637e-06, + "loss": 5.2377, + "step": 159325 + }, + { + "epoch": 14.304308797127469, + "grad_norm": 17.481504440307617, + "learning_rate": 9.522381807301017e-06, + "loss": 5.2605, + "step": 159350 + }, + { + "epoch": 14.306552962298024, + "grad_norm": 17.14206314086914, + "learning_rate": 9.522132455615401e-06, + "loss": 5.3692, + "step": 159375 + }, + { + "epoch": 14.308797127468582, + "grad_norm": 13.696553230285645, + "learning_rate": 9.521883103929783e-06, + "loss": 5.325, + "step": 159400 + }, + { + "epoch": 14.311041292639139, + "grad_norm": 16.455236434936523, + "learning_rate": 9.521633752244166e-06, + "loss": 5.3483, + "step": 159425 + }, + { + "epoch": 14.313285457809695, + "grad_norm": 14.090852737426758, + "learning_rate": 9.521384400558548e-06, + "loss": 5.2961, + "step": 159450 + }, + { + "epoch": 14.315529622980252, + "grad_norm": 16.76572608947754, + "learning_rate": 9.521135048872932e-06, + "loss": 5.2962, + "step": 159475 + }, + { + "epoch": 14.317773788150808, + "grad_norm": 18.157939910888672, + "learning_rate": 9.520885697187314e-06, + "loss": 5.331, + "step": 159500 + }, + { + "epoch": 14.320017953321365, + "grad_norm": 18.368717193603516, + "learning_rate": 9.520636345501697e-06, + "loss": 5.3903, + "step": 159525 + }, + { + "epoch": 14.32226211849192, + "grad_norm": 18.11522674560547, + "learning_rate": 9.520386993816079e-06, + "loss": 5.4004, + "step": 159550 + }, + { + "epoch": 14.324506283662478, + "grad_norm": 15.495644569396973, + "learning_rate": 9.520137642130461e-06, + "loss": 5.277, + "step": 159575 + }, + { + "epoch": 14.326750448833034, + "grad_norm": 16.84779930114746, + "learning_rate": 9.519888290444844e-06, + "loss": 5.2982, + "step": 159600 + }, + { + "epoch": 14.32899461400359, + "grad_norm": 14.091211318969727, + "learning_rate": 9.519638938759228e-06, + "loss": 5.1793, + "step": 159625 + }, + { + "epoch": 14.331238779174146, + "grad_norm": 14.480620384216309, + "learning_rate": 9.51938958707361e-06, + "loss": 5.3742, + "step": 159650 + }, + { + "epoch": 14.333482944344704, + "grad_norm": 15.457980155944824, + "learning_rate": 9.519140235387992e-06, + "loss": 5.3113, + "step": 159675 + }, + { + "epoch": 14.335727109515261, + "grad_norm": 18.0935001373291, + "learning_rate": 9.518890883702375e-06, + "loss": 5.3542, + "step": 159700 + }, + { + "epoch": 14.337971274685817, + "grad_norm": 16.546337127685547, + "learning_rate": 9.518641532016757e-06, + "loss": 5.2601, + "step": 159725 + }, + { + "epoch": 14.340215439856374, + "grad_norm": 15.728120803833008, + "learning_rate": 9.51839218033114e-06, + "loss": 5.3544, + "step": 159750 + }, + { + "epoch": 14.34245960502693, + "grad_norm": 19.73622703552246, + "learning_rate": 9.518142828645522e-06, + "loss": 5.2934, + "step": 159775 + }, + { + "epoch": 14.344703770197487, + "grad_norm": 14.060446739196777, + "learning_rate": 9.517903451027329e-06, + "loss": 5.4858, + "step": 159800 + }, + { + "epoch": 14.346947935368043, + "grad_norm": 16.54719352722168, + "learning_rate": 9.517654099341713e-06, + "loss": 5.387, + "step": 159825 + }, + { + "epoch": 14.3491921005386, + "grad_norm": 15.319219589233398, + "learning_rate": 9.517404747656095e-06, + "loss": 5.5092, + "step": 159850 + }, + { + "epoch": 14.351436265709156, + "grad_norm": 15.912354469299316, + "learning_rate": 9.517155395970477e-06, + "loss": 5.1403, + "step": 159875 + }, + { + "epoch": 14.353680430879713, + "grad_norm": 16.043657302856445, + "learning_rate": 9.516906044284861e-06, + "loss": 5.7028, + "step": 159900 + }, + { + "epoch": 14.355924596050269, + "grad_norm": 19.17892074584961, + "learning_rate": 9.516656692599244e-06, + "loss": 5.5961, + "step": 159925 + }, + { + "epoch": 14.358168761220826, + "grad_norm": 14.987529754638672, + "learning_rate": 9.516407340913624e-06, + "loss": 5.2273, + "step": 159950 + }, + { + "epoch": 14.360412926391382, + "grad_norm": 21.784236907958984, + "learning_rate": 9.516157989228008e-06, + "loss": 5.1411, + "step": 159975 + }, + { + "epoch": 14.362657091561939, + "grad_norm": 15.405196189880371, + "learning_rate": 9.51590863754239e-06, + "loss": 5.305, + "step": 160000 + }, + { + "epoch": 14.364901256732496, + "grad_norm": 15.533830642700195, + "learning_rate": 9.515659285856773e-06, + "loss": 4.9979, + "step": 160025 + }, + { + "epoch": 14.367145421903052, + "grad_norm": 15.692192077636719, + "learning_rate": 9.515409934171155e-06, + "loss": 5.3571, + "step": 160050 + }, + { + "epoch": 14.36938958707361, + "grad_norm": 14.480134963989258, + "learning_rate": 9.515160582485539e-06, + "loss": 5.2122, + "step": 160075 + }, + { + "epoch": 14.371633752244165, + "grad_norm": 17.456323623657227, + "learning_rate": 9.514911230799921e-06, + "loss": 5.3524, + "step": 160100 + }, + { + "epoch": 14.373877917414722, + "grad_norm": 15.993956565856934, + "learning_rate": 9.514661879114304e-06, + "loss": 5.3511, + "step": 160125 + }, + { + "epoch": 14.376122082585278, + "grad_norm": 17.145471572875977, + "learning_rate": 9.514412527428686e-06, + "loss": 5.3511, + "step": 160150 + }, + { + "epoch": 14.378366247755835, + "grad_norm": 15.665968894958496, + "learning_rate": 9.514163175743068e-06, + "loss": 5.0083, + "step": 160175 + }, + { + "epoch": 14.38061041292639, + "grad_norm": 16.92049217224121, + "learning_rate": 9.51391382405745e-06, + "loss": 5.2561, + "step": 160200 + }, + { + "epoch": 14.382854578096948, + "grad_norm": 16.218505859375, + "learning_rate": 9.513664472371835e-06, + "loss": 5.4337, + "step": 160225 + }, + { + "epoch": 14.385098743267504, + "grad_norm": 16.69339370727539, + "learning_rate": 9.513415120686217e-06, + "loss": 5.3724, + "step": 160250 + }, + { + "epoch": 14.387342908438061, + "grad_norm": 17.51691246032715, + "learning_rate": 9.5131657690006e-06, + "loss": 5.345, + "step": 160275 + }, + { + "epoch": 14.389587073608618, + "grad_norm": 16.886600494384766, + "learning_rate": 9.512916417314982e-06, + "loss": 5.1472, + "step": 160300 + }, + { + "epoch": 14.391831238779174, + "grad_norm": 18.320457458496094, + "learning_rate": 9.512667065629364e-06, + "loss": 5.1556, + "step": 160325 + }, + { + "epoch": 14.394075403949731, + "grad_norm": 15.056193351745605, + "learning_rate": 9.512417713943746e-06, + "loss": 5.5359, + "step": 160350 + }, + { + "epoch": 14.396319569120287, + "grad_norm": 16.669921875, + "learning_rate": 9.51216836225813e-06, + "loss": 5.2984, + "step": 160375 + }, + { + "epoch": 14.398563734290844, + "grad_norm": 16.2537784576416, + "learning_rate": 9.511919010572513e-06, + "loss": 5.1857, + "step": 160400 + }, + { + "epoch": 14.4008078994614, + "grad_norm": 17.53745460510254, + "learning_rate": 9.511669658886895e-06, + "loss": 5.3148, + "step": 160425 + }, + { + "epoch": 14.403052064631957, + "grad_norm": 14.051669120788574, + "learning_rate": 9.511420307201277e-06, + "loss": 5.298, + "step": 160450 + }, + { + "epoch": 14.405296229802513, + "grad_norm": 17.351972579956055, + "learning_rate": 9.511170955515661e-06, + "loss": 5.2602, + "step": 160475 + }, + { + "epoch": 14.40754039497307, + "grad_norm": 16.41145896911621, + "learning_rate": 9.510921603830042e-06, + "loss": 5.2406, + "step": 160500 + }, + { + "epoch": 14.409784560143626, + "grad_norm": 15.643902778625488, + "learning_rate": 9.510672252144424e-06, + "loss": 5.3113, + "step": 160525 + }, + { + "epoch": 14.412028725314183, + "grad_norm": 18.53550910949707, + "learning_rate": 9.510422900458808e-06, + "loss": 5.3132, + "step": 160550 + }, + { + "epoch": 14.414272890484739, + "grad_norm": 17.446273803710938, + "learning_rate": 9.51017354877319e-06, + "loss": 5.3066, + "step": 160575 + }, + { + "epoch": 14.416517055655296, + "grad_norm": 15.58436393737793, + "learning_rate": 9.509924197087573e-06, + "loss": 5.2881, + "step": 160600 + }, + { + "epoch": 14.418761220825854, + "grad_norm": 15.654513359069824, + "learning_rate": 9.509674845401957e-06, + "loss": 5.6182, + "step": 160625 + }, + { + "epoch": 14.42100538599641, + "grad_norm": 14.550281524658203, + "learning_rate": 9.50942549371634e-06, + "loss": 5.275, + "step": 160650 + }, + { + "epoch": 14.423249551166966, + "grad_norm": 16.106351852416992, + "learning_rate": 9.50917614203072e-06, + "loss": 5.3544, + "step": 160675 + }, + { + "epoch": 14.425493716337522, + "grad_norm": 17.398447036743164, + "learning_rate": 9.508926790345104e-06, + "loss": 5.2716, + "step": 160700 + }, + { + "epoch": 14.42773788150808, + "grad_norm": 17.21194839477539, + "learning_rate": 9.508677438659486e-06, + "loss": 5.1718, + "step": 160725 + }, + { + "epoch": 14.429982046678635, + "grad_norm": 13.640111923217773, + "learning_rate": 9.508428086973868e-06, + "loss": 5.3814, + "step": 160750 + }, + { + "epoch": 14.432226211849192, + "grad_norm": 16.722043991088867, + "learning_rate": 9.50817873528825e-06, + "loss": 5.0215, + "step": 160775 + }, + { + "epoch": 14.434470377019748, + "grad_norm": 14.574141502380371, + "learning_rate": 9.507929383602635e-06, + "loss": 5.3412, + "step": 160800 + }, + { + "epoch": 14.436714542190305, + "grad_norm": 16.160091400146484, + "learning_rate": 9.507680031917017e-06, + "loss": 5.4069, + "step": 160825 + }, + { + "epoch": 14.438958707360861, + "grad_norm": 15.614630699157715, + "learning_rate": 9.5074306802314e-06, + "loss": 5.4781, + "step": 160850 + }, + { + "epoch": 14.441202872531418, + "grad_norm": 16.082372665405273, + "learning_rate": 9.507181328545782e-06, + "loss": 5.1346, + "step": 160875 + }, + { + "epoch": 14.443447037701976, + "grad_norm": 16.46169662475586, + "learning_rate": 9.506931976860164e-06, + "loss": 5.4458, + "step": 160900 + }, + { + "epoch": 14.445691202872531, + "grad_norm": 15.649592399597168, + "learning_rate": 9.506682625174546e-06, + "loss": 5.3737, + "step": 160925 + }, + { + "epoch": 14.447935368043089, + "grad_norm": 15.183234214782715, + "learning_rate": 9.50643327348893e-06, + "loss": 5.393, + "step": 160950 + }, + { + "epoch": 14.450179533213644, + "grad_norm": 16.511816024780273, + "learning_rate": 9.506183921803313e-06, + "loss": 5.4419, + "step": 160975 + }, + { + "epoch": 14.452423698384202, + "grad_norm": 16.380977630615234, + "learning_rate": 9.505934570117695e-06, + "loss": 5.4162, + "step": 161000 + }, + { + "epoch": 14.454667863554757, + "grad_norm": 18.617021560668945, + "learning_rate": 9.505685218432077e-06, + "loss": 5.5036, + "step": 161025 + }, + { + "epoch": 14.456912028725315, + "grad_norm": 17.67745590209961, + "learning_rate": 9.50543586674646e-06, + "loss": 5.3541, + "step": 161050 + }, + { + "epoch": 14.45915619389587, + "grad_norm": 19.152278900146484, + "learning_rate": 9.505186515060842e-06, + "loss": 5.3709, + "step": 161075 + }, + { + "epoch": 14.461400359066428, + "grad_norm": 17.563005447387695, + "learning_rate": 9.504937163375226e-06, + "loss": 5.5498, + "step": 161100 + }, + { + "epoch": 14.463644524236983, + "grad_norm": 16.0556640625, + "learning_rate": 9.504687811689608e-06, + "loss": 5.4085, + "step": 161125 + }, + { + "epoch": 14.46588868940754, + "grad_norm": 18.094985961914062, + "learning_rate": 9.50443846000399e-06, + "loss": 5.4463, + "step": 161150 + }, + { + "epoch": 14.468132854578098, + "grad_norm": 15.021489143371582, + "learning_rate": 9.504189108318373e-06, + "loss": 5.4204, + "step": 161175 + }, + { + "epoch": 14.470377019748653, + "grad_norm": 14.525598526000977, + "learning_rate": 9.503939756632755e-06, + "loss": 5.4067, + "step": 161200 + }, + { + "epoch": 14.47262118491921, + "grad_norm": 16.954050064086914, + "learning_rate": 9.503690404947138e-06, + "loss": 5.3049, + "step": 161225 + }, + { + "epoch": 14.474865350089766, + "grad_norm": 13.196468353271484, + "learning_rate": 9.50344105326152e-06, + "loss": 5.3772, + "step": 161250 + }, + { + "epoch": 14.477109515260324, + "grad_norm": 16.92797088623047, + "learning_rate": 9.503191701575904e-06, + "loss": 5.3866, + "step": 161275 + }, + { + "epoch": 14.47935368043088, + "grad_norm": 14.784772872924805, + "learning_rate": 9.502942349890286e-06, + "loss": 5.4075, + "step": 161300 + }, + { + "epoch": 14.481597845601437, + "grad_norm": 15.56276798248291, + "learning_rate": 9.502692998204668e-06, + "loss": 5.4956, + "step": 161325 + }, + { + "epoch": 14.483842010771992, + "grad_norm": 16.08914566040039, + "learning_rate": 9.502443646519052e-06, + "loss": 5.3459, + "step": 161350 + }, + { + "epoch": 14.48608617594255, + "grad_norm": 17.63920021057129, + "learning_rate": 9.502194294833433e-06, + "loss": 5.6548, + "step": 161375 + }, + { + "epoch": 14.488330341113105, + "grad_norm": 16.58810806274414, + "learning_rate": 9.501944943147815e-06, + "loss": 5.1864, + "step": 161400 + }, + { + "epoch": 14.490574506283663, + "grad_norm": 16.459754943847656, + "learning_rate": 9.5016955914622e-06, + "loss": 5.6663, + "step": 161425 + }, + { + "epoch": 14.492818671454218, + "grad_norm": 17.480100631713867, + "learning_rate": 9.501446239776582e-06, + "loss": 5.4106, + "step": 161450 + }, + { + "epoch": 14.495062836624776, + "grad_norm": 13.783245086669922, + "learning_rate": 9.501196888090964e-06, + "loss": 5.3684, + "step": 161475 + }, + { + "epoch": 14.497307001795333, + "grad_norm": 16.676607131958008, + "learning_rate": 9.500947536405346e-06, + "loss": 5.5879, + "step": 161500 + }, + { + "epoch": 14.499551166965889, + "grad_norm": 17.728593826293945, + "learning_rate": 9.50069818471973e-06, + "loss": 5.369, + "step": 161525 + }, + { + "epoch": 14.501795332136446, + "grad_norm": 14.935882568359375, + "learning_rate": 9.500448833034111e-06, + "loss": 5.5501, + "step": 161550 + }, + { + "epoch": 14.504039497307001, + "grad_norm": 21.657564163208008, + "learning_rate": 9.500199481348495e-06, + "loss": 5.557, + "step": 161575 + }, + { + "epoch": 14.506283662477559, + "grad_norm": 18.03963851928711, + "learning_rate": 9.499950129662877e-06, + "loss": 5.5685, + "step": 161600 + }, + { + "epoch": 14.508527827648114, + "grad_norm": 14.002265930175781, + "learning_rate": 9.49970077797726e-06, + "loss": 5.3213, + "step": 161625 + }, + { + "epoch": 14.510771992818672, + "grad_norm": 15.098649978637695, + "learning_rate": 9.499451426291642e-06, + "loss": 5.3074, + "step": 161650 + }, + { + "epoch": 14.513016157989227, + "grad_norm": 22.103168487548828, + "learning_rate": 9.499202074606026e-06, + "loss": 5.0841, + "step": 161675 + }, + { + "epoch": 14.515260323159785, + "grad_norm": 14.726582527160645, + "learning_rate": 9.498952722920408e-06, + "loss": 5.2489, + "step": 161700 + }, + { + "epoch": 14.51750448833034, + "grad_norm": 15.288349151611328, + "learning_rate": 9.49870337123479e-06, + "loss": 5.3009, + "step": 161725 + }, + { + "epoch": 14.519748653500898, + "grad_norm": 16.814189910888672, + "learning_rate": 9.498454019549173e-06, + "loss": 5.3311, + "step": 161750 + }, + { + "epoch": 14.521992818671453, + "grad_norm": 19.485898971557617, + "learning_rate": 9.498204667863555e-06, + "loss": 5.3551, + "step": 161775 + }, + { + "epoch": 14.52423698384201, + "grad_norm": 17.59235382080078, + "learning_rate": 9.497955316177938e-06, + "loss": 5.4472, + "step": 161800 + }, + { + "epoch": 14.526481149012568, + "grad_norm": 17.289857864379883, + "learning_rate": 9.497705964492322e-06, + "loss": 5.4011, + "step": 161825 + }, + { + "epoch": 14.528725314183124, + "grad_norm": 16.56238555908203, + "learning_rate": 9.497456612806704e-06, + "loss": 5.4712, + "step": 161850 + }, + { + "epoch": 14.530969479353681, + "grad_norm": 17.723167419433594, + "learning_rate": 9.497207261121086e-06, + "loss": 5.3797, + "step": 161875 + }, + { + "epoch": 14.533213644524237, + "grad_norm": 14.662124633789062, + "learning_rate": 9.496967883502893e-06, + "loss": 5.3155, + "step": 161900 + }, + { + "epoch": 14.535457809694794, + "grad_norm": 15.32670783996582, + "learning_rate": 9.496718531817275e-06, + "loss": 5.4498, + "step": 161925 + }, + { + "epoch": 14.53770197486535, + "grad_norm": 14.553692817687988, + "learning_rate": 9.49646918013166e-06, + "loss": 5.4926, + "step": 161950 + }, + { + "epoch": 14.539946140035907, + "grad_norm": 17.557086944580078, + "learning_rate": 9.496219828446042e-06, + "loss": 5.2811, + "step": 161975 + }, + { + "epoch": 14.542190305206462, + "grad_norm": 20.869953155517578, + "learning_rate": 9.495970476760424e-06, + "loss": 5.4124, + "step": 162000 + }, + { + "epoch": 14.54443447037702, + "grad_norm": 15.286640167236328, + "learning_rate": 9.495721125074806e-06, + "loss": 5.3535, + "step": 162025 + }, + { + "epoch": 14.546678635547575, + "grad_norm": 17.727201461791992, + "learning_rate": 9.495471773389189e-06, + "loss": 5.464, + "step": 162050 + }, + { + "epoch": 14.548922800718133, + "grad_norm": 16.869529724121094, + "learning_rate": 9.495222421703571e-06, + "loss": 5.3391, + "step": 162075 + }, + { + "epoch": 14.55116696588869, + "grad_norm": 17.728357315063477, + "learning_rate": 9.494973070017955e-06, + "loss": 5.0386, + "step": 162100 + }, + { + "epoch": 14.553411131059246, + "grad_norm": 15.947720527648926, + "learning_rate": 9.494723718332337e-06, + "loss": 5.2351, + "step": 162125 + }, + { + "epoch": 14.555655296229803, + "grad_norm": 16.145923614501953, + "learning_rate": 9.49447436664672e-06, + "loss": 5.2614, + "step": 162150 + }, + { + "epoch": 14.557899461400359, + "grad_norm": 16.050382614135742, + "learning_rate": 9.494225014961102e-06, + "loss": 5.3878, + "step": 162175 + }, + { + "epoch": 14.560143626570916, + "grad_norm": 17.562992095947266, + "learning_rate": 9.493975663275484e-06, + "loss": 5.6309, + "step": 162200 + }, + { + "epoch": 14.562387791741472, + "grad_norm": 14.661507606506348, + "learning_rate": 9.493726311589867e-06, + "loss": 5.4362, + "step": 162225 + }, + { + "epoch": 14.564631956912029, + "grad_norm": 15.55665111541748, + "learning_rate": 9.493476959904249e-06, + "loss": 5.1726, + "step": 162250 + }, + { + "epoch": 14.566876122082585, + "grad_norm": 15.776119232177734, + "learning_rate": 9.493227608218633e-06, + "loss": 5.4931, + "step": 162275 + }, + { + "epoch": 14.569120287253142, + "grad_norm": 15.494625091552734, + "learning_rate": 9.492978256533015e-06, + "loss": 5.2744, + "step": 162300 + }, + { + "epoch": 14.571364452423698, + "grad_norm": 18.22601318359375, + "learning_rate": 9.492728904847398e-06, + "loss": 5.1651, + "step": 162325 + }, + { + "epoch": 14.573608617594255, + "grad_norm": 16.547094345092773, + "learning_rate": 9.49247955316178e-06, + "loss": 5.391, + "step": 162350 + }, + { + "epoch": 14.575852782764812, + "grad_norm": 16.696739196777344, + "learning_rate": 9.492230201476162e-06, + "loss": 5.2946, + "step": 162375 + }, + { + "epoch": 14.578096947935368, + "grad_norm": 16.50330924987793, + "learning_rate": 9.491980849790545e-06, + "loss": 5.5194, + "step": 162400 + }, + { + "epoch": 14.580341113105925, + "grad_norm": 19.810165405273438, + "learning_rate": 9.491731498104929e-06, + "loss": 5.3959, + "step": 162425 + }, + { + "epoch": 14.58258527827648, + "grad_norm": 18.02105712890625, + "learning_rate": 9.491482146419311e-06, + "loss": 5.4578, + "step": 162450 + }, + { + "epoch": 14.584829443447038, + "grad_norm": 16.59722900390625, + "learning_rate": 9.491232794733693e-06, + "loss": 5.3082, + "step": 162475 + }, + { + "epoch": 14.587073608617594, + "grad_norm": 17.9363956451416, + "learning_rate": 9.490983443048076e-06, + "loss": 5.0533, + "step": 162500 + }, + { + "epoch": 14.589317773788151, + "grad_norm": 13.742971420288086, + "learning_rate": 9.490734091362458e-06, + "loss": 5.3808, + "step": 162525 + }, + { + "epoch": 14.591561938958707, + "grad_norm": 19.353734970092773, + "learning_rate": 9.49048473967684e-06, + "loss": 5.3984, + "step": 162550 + }, + { + "epoch": 14.593806104129264, + "grad_norm": 15.49664306640625, + "learning_rate": 9.490235387991224e-06, + "loss": 5.6294, + "step": 162575 + }, + { + "epoch": 14.59605026929982, + "grad_norm": 16.565107345581055, + "learning_rate": 9.489986036305606e-06, + "loss": 5.5032, + "step": 162600 + }, + { + "epoch": 14.598294434470377, + "grad_norm": 22.452491760253906, + "learning_rate": 9.489736684619989e-06, + "loss": 5.3778, + "step": 162625 + }, + { + "epoch": 14.600538599640934, + "grad_norm": 16.216073989868164, + "learning_rate": 9.489487332934371e-06, + "loss": 5.3095, + "step": 162650 + }, + { + "epoch": 14.60278276481149, + "grad_norm": 17.551969528198242, + "learning_rate": 9.489237981248755e-06, + "loss": 5.5024, + "step": 162675 + }, + { + "epoch": 14.605026929982047, + "grad_norm": 17.276947021484375, + "learning_rate": 9.488988629563136e-06, + "loss": 5.3806, + "step": 162700 + }, + { + "epoch": 14.607271095152603, + "grad_norm": 20.427837371826172, + "learning_rate": 9.48873927787752e-06, + "loss": 5.2564, + "step": 162725 + }, + { + "epoch": 14.60951526032316, + "grad_norm": 15.772135734558105, + "learning_rate": 9.488489926191902e-06, + "loss": 5.2935, + "step": 162750 + }, + { + "epoch": 14.611759425493716, + "grad_norm": 17.812986373901367, + "learning_rate": 9.488240574506284e-06, + "loss": 5.3438, + "step": 162775 + }, + { + "epoch": 14.614003590664273, + "grad_norm": 17.913217544555664, + "learning_rate": 9.487991222820667e-06, + "loss": 5.5781, + "step": 162800 + }, + { + "epoch": 14.616247755834829, + "grad_norm": 20.429182052612305, + "learning_rate": 9.48774187113505e-06, + "loss": 5.2086, + "step": 162825 + }, + { + "epoch": 14.618491921005386, + "grad_norm": 17.85345458984375, + "learning_rate": 9.487502493516858e-06, + "loss": 5.7361, + "step": 162850 + }, + { + "epoch": 14.620736086175942, + "grad_norm": 15.645843505859375, + "learning_rate": 9.48725314183124e-06, + "loss": 5.2507, + "step": 162875 + }, + { + "epoch": 14.6229802513465, + "grad_norm": 14.686118125915527, + "learning_rate": 9.487003790145622e-06, + "loss": 5.5511, + "step": 162900 + }, + { + "epoch": 14.625224416517055, + "grad_norm": 13.406709671020508, + "learning_rate": 9.486754438460005e-06, + "loss": 5.1407, + "step": 162925 + }, + { + "epoch": 14.627468581687612, + "grad_norm": 14.963677406311035, + "learning_rate": 9.486505086774387e-06, + "loss": 5.5053, + "step": 162950 + }, + { + "epoch": 14.62971274685817, + "grad_norm": 15.879207611083984, + "learning_rate": 9.48625573508877e-06, + "loss": 5.5303, + "step": 162975 + }, + { + "epoch": 14.631956912028725, + "grad_norm": 19.69736099243164, + "learning_rate": 9.486006383403152e-06, + "loss": 5.2975, + "step": 163000 + }, + { + "epoch": 14.634201077199283, + "grad_norm": 19.19990348815918, + "learning_rate": 9.485757031717536e-06, + "loss": 5.4046, + "step": 163025 + }, + { + "epoch": 14.636445242369838, + "grad_norm": 14.73696231842041, + "learning_rate": 9.485507680031918e-06, + "loss": 5.2135, + "step": 163050 + }, + { + "epoch": 14.638689407540395, + "grad_norm": 18.19021224975586, + "learning_rate": 9.4852583283463e-06, + "loss": 5.2304, + "step": 163075 + }, + { + "epoch": 14.640933572710951, + "grad_norm": 15.420568466186523, + "learning_rate": 9.485008976660684e-06, + "loss": 5.3439, + "step": 163100 + }, + { + "epoch": 14.643177737881508, + "grad_norm": 14.037897109985352, + "learning_rate": 9.484759624975065e-06, + "loss": 5.2997, + "step": 163125 + }, + { + "epoch": 14.645421903052064, + "grad_norm": 16.741178512573242, + "learning_rate": 9.484510273289447e-06, + "loss": 5.3335, + "step": 163150 + }, + { + "epoch": 14.647666068222621, + "grad_norm": 18.467439651489258, + "learning_rate": 9.484260921603831e-06, + "loss": 5.0904, + "step": 163175 + }, + { + "epoch": 14.649910233393177, + "grad_norm": 17.658313751220703, + "learning_rate": 9.484011569918213e-06, + "loss": 5.2278, + "step": 163200 + }, + { + "epoch": 14.652154398563734, + "grad_norm": 18.524879455566406, + "learning_rate": 9.483762218232596e-06, + "loss": 5.3503, + "step": 163225 + }, + { + "epoch": 14.65439856373429, + "grad_norm": 15.536983489990234, + "learning_rate": 9.483512866546978e-06, + "loss": 5.3544, + "step": 163250 + }, + { + "epoch": 14.656642728904847, + "grad_norm": 18.35321807861328, + "learning_rate": 9.483263514861362e-06, + "loss": 5.2786, + "step": 163275 + }, + { + "epoch": 14.658886894075405, + "grad_norm": 16.475738525390625, + "learning_rate": 9.483014163175743e-06, + "loss": 5.1734, + "step": 163300 + }, + { + "epoch": 14.66113105924596, + "grad_norm": 16.294170379638672, + "learning_rate": 9.482764811490127e-06, + "loss": 5.4228, + "step": 163325 + }, + { + "epoch": 14.663375224416518, + "grad_norm": 18.98794174194336, + "learning_rate": 9.482515459804509e-06, + "loss": 5.2703, + "step": 163350 + }, + { + "epoch": 14.665619389587073, + "grad_norm": 15.569275856018066, + "learning_rate": 9.482266108118891e-06, + "loss": 5.4101, + "step": 163375 + }, + { + "epoch": 14.66786355475763, + "grad_norm": 16.106678009033203, + "learning_rate": 9.482016756433274e-06, + "loss": 5.3971, + "step": 163400 + }, + { + "epoch": 14.670107719928186, + "grad_norm": 18.045381546020508, + "learning_rate": 9.481767404747658e-06, + "loss": 5.4757, + "step": 163425 + }, + { + "epoch": 14.672351885098744, + "grad_norm": 18.283164978027344, + "learning_rate": 9.48151805306204e-06, + "loss": 5.228, + "step": 163450 + }, + { + "epoch": 14.6745960502693, + "grad_norm": 17.111713409423828, + "learning_rate": 9.481268701376422e-06, + "loss": 5.1433, + "step": 163475 + }, + { + "epoch": 14.676840215439857, + "grad_norm": 17.257291793823242, + "learning_rate": 9.481019349690805e-06, + "loss": 5.4056, + "step": 163500 + }, + { + "epoch": 14.679084380610412, + "grad_norm": 16.73897361755371, + "learning_rate": 9.480769998005187e-06, + "loss": 5.3356, + "step": 163525 + }, + { + "epoch": 14.68132854578097, + "grad_norm": 16.09259796142578, + "learning_rate": 9.48052064631957e-06, + "loss": 5.2598, + "step": 163550 + }, + { + "epoch": 14.683572710951527, + "grad_norm": 17.526357650756836, + "learning_rate": 9.480271294633953e-06, + "loss": 5.5716, + "step": 163575 + }, + { + "epoch": 14.685816876122082, + "grad_norm": 16.85218048095703, + "learning_rate": 9.480021942948336e-06, + "loss": 5.1967, + "step": 163600 + }, + { + "epoch": 14.68806104129264, + "grad_norm": 16.39693832397461, + "learning_rate": 9.479772591262718e-06, + "loss": 5.3076, + "step": 163625 + }, + { + "epoch": 14.690305206463195, + "grad_norm": 17.555713653564453, + "learning_rate": 9.4795232395771e-06, + "loss": 5.3821, + "step": 163650 + }, + { + "epoch": 14.692549371633753, + "grad_norm": 16.414880752563477, + "learning_rate": 9.479273887891483e-06, + "loss": 5.242, + "step": 163675 + }, + { + "epoch": 14.694793536804308, + "grad_norm": 15.859696388244629, + "learning_rate": 9.479024536205865e-06, + "loss": 5.2845, + "step": 163700 + }, + { + "epoch": 14.697037701974866, + "grad_norm": 17.816879272460938, + "learning_rate": 9.478775184520249e-06, + "loss": 5.4623, + "step": 163725 + }, + { + "epoch": 14.699281867145421, + "grad_norm": 19.448575973510742, + "learning_rate": 9.478525832834631e-06, + "loss": 5.3181, + "step": 163750 + }, + { + "epoch": 14.701526032315979, + "grad_norm": 14.798949241638184, + "learning_rate": 9.478276481149014e-06, + "loss": 5.4926, + "step": 163775 + }, + { + "epoch": 14.703770197486534, + "grad_norm": 14.97256088256836, + "learning_rate": 9.478027129463396e-06, + "loss": 5.34, + "step": 163800 + }, + { + "epoch": 14.706014362657092, + "grad_norm": 14.463445663452148, + "learning_rate": 9.47777777777778e-06, + "loss": 5.5667, + "step": 163825 + }, + { + "epoch": 14.708258527827649, + "grad_norm": 16.70465087890625, + "learning_rate": 9.47752842609216e-06, + "loss": 5.287, + "step": 163850 + }, + { + "epoch": 14.710502692998205, + "grad_norm": 18.470977783203125, + "learning_rate": 9.477279074406543e-06, + "loss": 5.5431, + "step": 163875 + }, + { + "epoch": 14.712746858168762, + "grad_norm": 16.141464233398438, + "learning_rate": 9.477029722720927e-06, + "loss": 5.4562, + "step": 163900 + }, + { + "epoch": 14.714991023339318, + "grad_norm": 13.031444549560547, + "learning_rate": 9.476780371035309e-06, + "loss": 5.4894, + "step": 163925 + }, + { + "epoch": 14.717235188509875, + "grad_norm": 15.99395751953125, + "learning_rate": 9.476531019349691e-06, + "loss": 5.6214, + "step": 163950 + }, + { + "epoch": 14.71947935368043, + "grad_norm": 16.240999221801758, + "learning_rate": 9.476281667664074e-06, + "loss": 5.3295, + "step": 163975 + }, + { + "epoch": 14.721723518850988, + "grad_norm": 16.558382034301758, + "learning_rate": 9.476032315978458e-06, + "loss": 5.2471, + "step": 164000 + }, + { + "epoch": 14.723967684021543, + "grad_norm": 16.14316177368164, + "learning_rate": 9.475782964292838e-06, + "loss": 5.4008, + "step": 164025 + }, + { + "epoch": 14.7262118491921, + "grad_norm": 18.192853927612305, + "learning_rate": 9.475533612607222e-06, + "loss": 5.4107, + "step": 164050 + }, + { + "epoch": 14.728456014362656, + "grad_norm": 18.386722564697266, + "learning_rate": 9.475284260921605e-06, + "loss": 5.465, + "step": 164075 + }, + { + "epoch": 14.730700179533214, + "grad_norm": 17.092609405517578, + "learning_rate": 9.475034909235987e-06, + "loss": 5.4886, + "step": 164100 + }, + { + "epoch": 14.732944344703771, + "grad_norm": 14.963961601257324, + "learning_rate": 9.47478555755037e-06, + "loss": 5.3209, + "step": 164125 + }, + { + "epoch": 14.735188509874327, + "grad_norm": 15.865313529968262, + "learning_rate": 9.474536205864753e-06, + "loss": 5.4518, + "step": 164150 + }, + { + "epoch": 14.737432675044884, + "grad_norm": 14.954947471618652, + "learning_rate": 9.474286854179136e-06, + "loss": 5.3698, + "step": 164175 + }, + { + "epoch": 14.73967684021544, + "grad_norm": 16.221996307373047, + "learning_rate": 9.474037502493518e-06, + "loss": 5.5598, + "step": 164200 + }, + { + "epoch": 14.741921005385997, + "grad_norm": 15.762965202331543, + "learning_rate": 9.4737881508079e-06, + "loss": 5.4285, + "step": 164225 + }, + { + "epoch": 14.744165170556553, + "grad_norm": 13.765225410461426, + "learning_rate": 9.473538799122283e-06, + "loss": 5.1361, + "step": 164250 + }, + { + "epoch": 14.74640933572711, + "grad_norm": 17.719499588012695, + "learning_rate": 9.473289447436665e-06, + "loss": 5.4559, + "step": 164275 + }, + { + "epoch": 14.748653500897666, + "grad_norm": 17.507644653320312, + "learning_rate": 9.473040095751049e-06, + "loss": 5.4536, + "step": 164300 + }, + { + "epoch": 14.750897666068223, + "grad_norm": 15.781828880310059, + "learning_rate": 9.472790744065431e-06, + "loss": 5.3488, + "step": 164325 + }, + { + "epoch": 14.753141831238779, + "grad_norm": 14.553129196166992, + "learning_rate": 9.472541392379814e-06, + "loss": 5.4255, + "step": 164350 + }, + { + "epoch": 14.755385996409336, + "grad_norm": 16.04561996459961, + "learning_rate": 9.472292040694196e-06, + "loss": 5.4723, + "step": 164375 + }, + { + "epoch": 14.757630161579891, + "grad_norm": 15.780309677124023, + "learning_rate": 9.472042689008578e-06, + "loss": 5.2949, + "step": 164400 + }, + { + "epoch": 14.759874326750449, + "grad_norm": 17.968730926513672, + "learning_rate": 9.47179333732296e-06, + "loss": 5.3915, + "step": 164425 + }, + { + "epoch": 14.762118491921004, + "grad_norm": 14.487476348876953, + "learning_rate": 9.471543985637345e-06, + "loss": 5.5763, + "step": 164450 + }, + { + "epoch": 14.764362657091562, + "grad_norm": 17.20711326599121, + "learning_rate": 9.471294633951727e-06, + "loss": 5.5225, + "step": 164475 + }, + { + "epoch": 14.76660682226212, + "grad_norm": 19.563966751098633, + "learning_rate": 9.471045282266109e-06, + "loss": 5.4164, + "step": 164500 + }, + { + "epoch": 14.768850987432675, + "grad_norm": 18.47873878479004, + "learning_rate": 9.470795930580491e-06, + "loss": 5.3433, + "step": 164525 + }, + { + "epoch": 14.771095152603232, + "grad_norm": 13.911552429199219, + "learning_rate": 9.470546578894874e-06, + "loss": 5.638, + "step": 164550 + }, + { + "epoch": 14.773339317773788, + "grad_norm": 15.518125534057617, + "learning_rate": 9.470297227209256e-06, + "loss": 5.4437, + "step": 164575 + }, + { + "epoch": 14.775583482944345, + "grad_norm": 15.780840873718262, + "learning_rate": 9.470047875523638e-06, + "loss": 5.2002, + "step": 164600 + }, + { + "epoch": 14.7778276481149, + "grad_norm": 14.408989906311035, + "learning_rate": 9.469798523838022e-06, + "loss": 5.347, + "step": 164625 + }, + { + "epoch": 14.780071813285458, + "grad_norm": 20.630002975463867, + "learning_rate": 9.469549172152405e-06, + "loss": 5.6, + "step": 164650 + }, + { + "epoch": 14.782315978456014, + "grad_norm": 15.885149002075195, + "learning_rate": 9.469299820466787e-06, + "loss": 5.2981, + "step": 164675 + }, + { + "epoch": 14.784560143626571, + "grad_norm": 18.802892684936523, + "learning_rate": 9.46905046878117e-06, + "loss": 5.2188, + "step": 164700 + }, + { + "epoch": 14.786804308797127, + "grad_norm": 15.83047866821289, + "learning_rate": 9.468801117095552e-06, + "loss": 5.6534, + "step": 164725 + }, + { + "epoch": 14.789048473967684, + "grad_norm": 15.68708324432373, + "learning_rate": 9.468551765409934e-06, + "loss": 5.4293, + "step": 164750 + }, + { + "epoch": 14.791292639138241, + "grad_norm": 17.8818416595459, + "learning_rate": 9.468302413724318e-06, + "loss": 5.386, + "step": 164775 + }, + { + "epoch": 14.793536804308797, + "grad_norm": 18.853822708129883, + "learning_rate": 9.4680530620387e-06, + "loss": 5.4268, + "step": 164800 + }, + { + "epoch": 14.795780969479354, + "grad_norm": 15.739946365356445, + "learning_rate": 9.467803710353083e-06, + "loss": 5.6025, + "step": 164825 + }, + { + "epoch": 14.79802513464991, + "grad_norm": 13.97964859008789, + "learning_rate": 9.467554358667465e-06, + "loss": 5.2331, + "step": 164850 + }, + { + "epoch": 14.800269299820467, + "grad_norm": 19.133481979370117, + "learning_rate": 9.467305006981849e-06, + "loss": 5.3907, + "step": 164875 + }, + { + "epoch": 14.802513464991023, + "grad_norm": 16.69769287109375, + "learning_rate": 9.46705565529623e-06, + "loss": 5.3537, + "step": 164900 + }, + { + "epoch": 14.80475763016158, + "grad_norm": 15.327433586120605, + "learning_rate": 9.466806303610614e-06, + "loss": 5.6523, + "step": 164925 + }, + { + "epoch": 14.807001795332136, + "grad_norm": 15.259063720703125, + "learning_rate": 9.466556951924996e-06, + "loss": 5.5044, + "step": 164950 + }, + { + "epoch": 14.809245960502693, + "grad_norm": 16.9269962310791, + "learning_rate": 9.466307600239378e-06, + "loss": 5.0989, + "step": 164975 + }, + { + "epoch": 14.811490125673249, + "grad_norm": 17.703886032104492, + "learning_rate": 9.46605824855376e-06, + "loss": 5.1758, + "step": 165000 + }, + { + "epoch": 14.813734290843806, + "grad_norm": 15.095766067504883, + "learning_rate": 9.465808896868145e-06, + "loss": 5.446, + "step": 165025 + }, + { + "epoch": 14.815978456014363, + "grad_norm": 15.458380699157715, + "learning_rate": 9.465559545182527e-06, + "loss": 5.3499, + "step": 165050 + }, + { + "epoch": 14.818222621184919, + "grad_norm": 16.16407585144043, + "learning_rate": 9.465310193496907e-06, + "loss": 5.4207, + "step": 165075 + }, + { + "epoch": 14.820466786355476, + "grad_norm": 16.142074584960938, + "learning_rate": 9.465060841811291e-06, + "loss": 5.3482, + "step": 165100 + }, + { + "epoch": 14.822710951526032, + "grad_norm": 16.087505340576172, + "learning_rate": 9.464811490125674e-06, + "loss": 5.35, + "step": 165125 + }, + { + "epoch": 14.82495511669659, + "grad_norm": 16.92753028869629, + "learning_rate": 9.464562138440056e-06, + "loss": 5.3618, + "step": 165150 + }, + { + "epoch": 14.827199281867145, + "grad_norm": 17.896100997924805, + "learning_rate": 9.46431278675444e-06, + "loss": 5.1935, + "step": 165175 + }, + { + "epoch": 14.829443447037702, + "grad_norm": 17.658544540405273, + "learning_rate": 9.464063435068822e-06, + "loss": 5.4697, + "step": 165200 + }, + { + "epoch": 14.831687612208258, + "grad_norm": 18.590803146362305, + "learning_rate": 9.463814083383205e-06, + "loss": 5.4352, + "step": 165225 + }, + { + "epoch": 14.833931777378815, + "grad_norm": 15.60403060913086, + "learning_rate": 9.463564731697587e-06, + "loss": 5.2685, + "step": 165250 + }, + { + "epoch": 14.83617594254937, + "grad_norm": 17.301876068115234, + "learning_rate": 9.46331538001197e-06, + "loss": 5.4861, + "step": 165275 + }, + { + "epoch": 14.838420107719928, + "grad_norm": 16.619516372680664, + "learning_rate": 9.463066028326352e-06, + "loss": 5.5662, + "step": 165300 + }, + { + "epoch": 14.840664272890486, + "grad_norm": 15.150928497314453, + "learning_rate": 9.462816676640734e-06, + "loss": 5.4477, + "step": 165325 + }, + { + "epoch": 14.842908438061041, + "grad_norm": 18.762134552001953, + "learning_rate": 9.462567324955118e-06, + "loss": 5.2039, + "step": 165350 + }, + { + "epoch": 14.845152603231599, + "grad_norm": 17.883108139038086, + "learning_rate": 9.4623179732695e-06, + "loss": 5.3156, + "step": 165375 + }, + { + "epoch": 14.847396768402154, + "grad_norm": 14.713964462280273, + "learning_rate": 9.462068621583883e-06, + "loss": 5.4879, + "step": 165400 + }, + { + "epoch": 14.849640933572712, + "grad_norm": 15.120283126831055, + "learning_rate": 9.461819269898265e-06, + "loss": 5.3824, + "step": 165425 + }, + { + "epoch": 14.851885098743267, + "grad_norm": 16.71407127380371, + "learning_rate": 9.461569918212647e-06, + "loss": 5.4679, + "step": 165450 + }, + { + "epoch": 14.854129263913824, + "grad_norm": 17.891080856323242, + "learning_rate": 9.46132056652703e-06, + "loss": 5.1922, + "step": 165475 + }, + { + "epoch": 14.85637342908438, + "grad_norm": 17.55301856994629, + "learning_rate": 9.461071214841414e-06, + "loss": 5.2608, + "step": 165500 + }, + { + "epoch": 14.858617594254937, + "grad_norm": 16.909423828125, + "learning_rate": 9.460821863155796e-06, + "loss": 5.5073, + "step": 165525 + }, + { + "epoch": 14.860861759425493, + "grad_norm": 16.048324584960938, + "learning_rate": 9.460572511470178e-06, + "loss": 5.2618, + "step": 165550 + }, + { + "epoch": 14.86310592459605, + "grad_norm": 15.94693660736084, + "learning_rate": 9.46032315978456e-06, + "loss": 5.4441, + "step": 165575 + }, + { + "epoch": 14.865350089766606, + "grad_norm": 16.43539047241211, + "learning_rate": 9.460073808098945e-06, + "loss": 5.3573, + "step": 165600 + }, + { + "epoch": 14.867594254937163, + "grad_norm": 15.349939346313477, + "learning_rate": 9.459824456413325e-06, + "loss": 5.6369, + "step": 165625 + }, + { + "epoch": 14.86983842010772, + "grad_norm": 18.79943084716797, + "learning_rate": 9.45957510472771e-06, + "loss": 5.4412, + "step": 165650 + }, + { + "epoch": 14.872082585278276, + "grad_norm": 18.065656661987305, + "learning_rate": 9.459325753042092e-06, + "loss": 5.4484, + "step": 165675 + }, + { + "epoch": 14.874326750448834, + "grad_norm": 15.445833206176758, + "learning_rate": 9.459076401356474e-06, + "loss": 5.4057, + "step": 165700 + }, + { + "epoch": 14.87657091561939, + "grad_norm": 20.10173797607422, + "learning_rate": 9.458827049670856e-06, + "loss": 5.7409, + "step": 165725 + }, + { + "epoch": 14.878815080789947, + "grad_norm": 16.009944915771484, + "learning_rate": 9.45857769798524e-06, + "loss": 5.4986, + "step": 165750 + }, + { + "epoch": 14.881059245960502, + "grad_norm": 18.09114646911621, + "learning_rate": 9.458328346299623e-06, + "loss": 5.3693, + "step": 165775 + }, + { + "epoch": 14.88330341113106, + "grad_norm": 14.715738296508789, + "learning_rate": 9.458078994614003e-06, + "loss": 5.3607, + "step": 165800 + }, + { + "epoch": 14.885547576301615, + "grad_norm": 16.91377067565918, + "learning_rate": 9.457829642928387e-06, + "loss": 5.2891, + "step": 165825 + }, + { + "epoch": 14.887791741472173, + "grad_norm": 18.913522720336914, + "learning_rate": 9.45758029124277e-06, + "loss": 5.4606, + "step": 165850 + }, + { + "epoch": 14.890035906642728, + "grad_norm": 13.072259902954102, + "learning_rate": 9.457330939557152e-06, + "loss": 5.3803, + "step": 165875 + }, + { + "epoch": 14.892280071813286, + "grad_norm": 17.244701385498047, + "learning_rate": 9.457081587871536e-06, + "loss": 5.4872, + "step": 165900 + }, + { + "epoch": 14.894524236983841, + "grad_norm": 15.836042404174805, + "learning_rate": 9.456832236185918e-06, + "loss": 5.4309, + "step": 165925 + }, + { + "epoch": 14.896768402154398, + "grad_norm": 16.94944190979004, + "learning_rate": 9.4565828845003e-06, + "loss": 5.384, + "step": 165950 + }, + { + "epoch": 14.899012567324956, + "grad_norm": 17.4246768951416, + "learning_rate": 9.456333532814683e-06, + "loss": 5.3875, + "step": 165975 + }, + { + "epoch": 14.901256732495511, + "grad_norm": 17.646411895751953, + "learning_rate": 9.456084181129065e-06, + "loss": 5.2003, + "step": 166000 + }, + { + "epoch": 14.903500897666069, + "grad_norm": 12.667421340942383, + "learning_rate": 9.455834829443447e-06, + "loss": 5.4534, + "step": 166025 + }, + { + "epoch": 14.905745062836624, + "grad_norm": 16.912342071533203, + "learning_rate": 9.45558547775783e-06, + "loss": 5.5436, + "step": 166050 + }, + { + "epoch": 14.907989228007182, + "grad_norm": 16.383756637573242, + "learning_rate": 9.455336126072214e-06, + "loss": 5.478, + "step": 166075 + }, + { + "epoch": 14.910233393177737, + "grad_norm": 18.83669090270996, + "learning_rate": 9.455086774386596e-06, + "loss": 5.3126, + "step": 166100 + }, + { + "epoch": 14.912477558348295, + "grad_norm": 16.58588218688965, + "learning_rate": 9.454837422700978e-06, + "loss": 5.4855, + "step": 166125 + }, + { + "epoch": 14.91472172351885, + "grad_norm": 16.194908142089844, + "learning_rate": 9.45458807101536e-06, + "loss": 5.4763, + "step": 166150 + }, + { + "epoch": 14.916965888689408, + "grad_norm": 15.453468322753906, + "learning_rate": 9.454338719329743e-06, + "loss": 5.5854, + "step": 166175 + }, + { + "epoch": 14.919210053859963, + "grad_norm": 14.057950019836426, + "learning_rate": 9.454089367644125e-06, + "loss": 5.5446, + "step": 166200 + }, + { + "epoch": 14.92145421903052, + "grad_norm": 15.759504318237305, + "learning_rate": 9.45384001595851e-06, + "loss": 5.3579, + "step": 166225 + }, + { + "epoch": 14.923698384201078, + "grad_norm": 17.02426528930664, + "learning_rate": 9.453590664272892e-06, + "loss": 5.3609, + "step": 166250 + }, + { + "epoch": 14.925942549371634, + "grad_norm": 14.981860160827637, + "learning_rate": 9.453341312587274e-06, + "loss": 5.4222, + "step": 166275 + }, + { + "epoch": 14.928186714542191, + "grad_norm": 18.37451171875, + "learning_rate": 9.453091960901656e-06, + "loss": 5.4594, + "step": 166300 + }, + { + "epoch": 14.930430879712747, + "grad_norm": 15.923833847045898, + "learning_rate": 9.452842609216039e-06, + "loss": 5.3469, + "step": 166325 + }, + { + "epoch": 14.932675044883304, + "grad_norm": 17.055892944335938, + "learning_rate": 9.452593257530421e-06, + "loss": 5.2455, + "step": 166350 + }, + { + "epoch": 14.93491921005386, + "grad_norm": 16.837732315063477, + "learning_rate": 9.452343905844805e-06, + "loss": 5.2286, + "step": 166375 + }, + { + "epoch": 14.937163375224417, + "grad_norm": 18.558063507080078, + "learning_rate": 9.452094554159187e-06, + "loss": 5.6213, + "step": 166400 + }, + { + "epoch": 14.939407540394972, + "grad_norm": 16.218505859375, + "learning_rate": 9.45184520247357e-06, + "loss": 5.5778, + "step": 166425 + }, + { + "epoch": 14.94165170556553, + "grad_norm": 16.960693359375, + "learning_rate": 9.451595850787952e-06, + "loss": 5.1945, + "step": 166450 + }, + { + "epoch": 14.943895870736085, + "grad_norm": 15.409252166748047, + "learning_rate": 9.451346499102336e-06, + "loss": 5.347, + "step": 166475 + }, + { + "epoch": 14.946140035906643, + "grad_norm": 17.422157287597656, + "learning_rate": 9.451097147416716e-06, + "loss": 5.5614, + "step": 166500 + }, + { + "epoch": 14.9483842010772, + "grad_norm": 17.97561264038086, + "learning_rate": 9.450847795731099e-06, + "loss": 5.3687, + "step": 166525 + }, + { + "epoch": 14.950628366247756, + "grad_norm": 15.264737129211426, + "learning_rate": 9.450598444045483e-06, + "loss": 5.5451, + "step": 166550 + }, + { + "epoch": 14.952872531418313, + "grad_norm": 14.71354866027832, + "learning_rate": 9.450349092359865e-06, + "loss": 5.47, + "step": 166575 + }, + { + "epoch": 14.955116696588869, + "grad_norm": 17.35942840576172, + "learning_rate": 9.450099740674247e-06, + "loss": 5.6578, + "step": 166600 + }, + { + "epoch": 14.957360861759426, + "grad_norm": 16.822092056274414, + "learning_rate": 9.449850388988631e-06, + "loss": 5.5594, + "step": 166625 + }, + { + "epoch": 14.959605026929982, + "grad_norm": 16.20769691467285, + "learning_rate": 9.449601037303014e-06, + "loss": 5.462, + "step": 166650 + }, + { + "epoch": 14.961849192100539, + "grad_norm": 16.066011428833008, + "learning_rate": 9.449351685617394e-06, + "loss": 5.2121, + "step": 166675 + }, + { + "epoch": 14.964093357271095, + "grad_norm": 17.21317481994629, + "learning_rate": 9.449102333931778e-06, + "loss": 5.4752, + "step": 166700 + }, + { + "epoch": 14.966337522441652, + "grad_norm": 14.390694618225098, + "learning_rate": 9.44885298224616e-06, + "loss": 5.5716, + "step": 166725 + }, + { + "epoch": 14.968581687612208, + "grad_norm": 15.036930084228516, + "learning_rate": 9.448603630560543e-06, + "loss": 5.337, + "step": 166750 + }, + { + "epoch": 14.970825852782765, + "grad_norm": 16.645061492919922, + "learning_rate": 9.448354278874925e-06, + "loss": 5.5944, + "step": 166775 + }, + { + "epoch": 14.973070017953322, + "grad_norm": 16.74567985534668, + "learning_rate": 9.44810492718931e-06, + "loss": 5.2357, + "step": 166800 + }, + { + "epoch": 14.975314183123878, + "grad_norm": 19.340774536132812, + "learning_rate": 9.447855575503692e-06, + "loss": 5.5372, + "step": 166825 + }, + { + "epoch": 14.977558348294435, + "grad_norm": 17.324565887451172, + "learning_rate": 9.447606223818074e-06, + "loss": 5.3401, + "step": 166850 + }, + { + "epoch": 14.97980251346499, + "grad_norm": 15.31684684753418, + "learning_rate": 9.447356872132456e-06, + "loss": 5.3059, + "step": 166875 + }, + { + "epoch": 14.982046678635548, + "grad_norm": 18.298248291015625, + "learning_rate": 9.447107520446839e-06, + "loss": 5.6887, + "step": 166900 + }, + { + "epoch": 14.984290843806104, + "grad_norm": 16.313617706298828, + "learning_rate": 9.446858168761221e-06, + "loss": 5.1102, + "step": 166925 + }, + { + "epoch": 14.986535008976661, + "grad_norm": 15.168675422668457, + "learning_rate": 9.446608817075605e-06, + "loss": 5.4611, + "step": 166950 + }, + { + "epoch": 14.988779174147217, + "grad_norm": 19.47012710571289, + "learning_rate": 9.446359465389987e-06, + "loss": 5.4638, + "step": 166975 + }, + { + "epoch": 14.991023339317774, + "grad_norm": 14.470712661743164, + "learning_rate": 9.44611011370437e-06, + "loss": 5.3981, + "step": 167000 + }, + { + "epoch": 14.99326750448833, + "grad_norm": 18.60788917541504, + "learning_rate": 9.445870736086176e-06, + "loss": 5.516, + "step": 167025 + }, + { + "epoch": 14.995511669658887, + "grad_norm": 17.712751388549805, + "learning_rate": 9.445621384400559e-06, + "loss": 5.4488, + "step": 167050 + }, + { + "epoch": 14.997755834829443, + "grad_norm": 17.41798210144043, + "learning_rate": 9.445372032714943e-06, + "loss": 5.4107, + "step": 167075 + }, + { + "epoch": 15.0, + "grad_norm": 15.481304168701172, + "learning_rate": 9.445122681029325e-06, + "loss": 5.7712, + "step": 167100 + }, + { + "epoch": 15.0, + "eval_accuracy": 0.0751723216160999, + "eval_f1_macro": 0.006533143801834642, + "eval_f1_micro": 0.0751723216160999, + "eval_f1_weighted": 0.04227025788308781, + "eval_loss": 6.879373550415039, + "eval_precision_macro": 0.005891597302737785, + "eval_precision_micro": 0.0751723216160999, + "eval_precision_weighted": 0.03427745262578502, + "eval_recall_macro": 0.011216060375715795, + "eval_recall_micro": 0.0751723216160999, + "eval_recall_weighted": 0.0751723216160999, + "eval_runtime": 128.6211, + "eval_samples_per_second": 407.188, + "eval_steps_per_second": 12.727, + "step": 167100 + }, + { + "epoch": 15.002244165170557, + "grad_norm": 16.905546188354492, + "learning_rate": 9.444873329343707e-06, + "loss": 4.9171, + "step": 167125 + }, + { + "epoch": 15.004488330341113, + "grad_norm": 15.371480941772461, + "learning_rate": 9.44462397765809e-06, + "loss": 4.9687, + "step": 167150 + }, + { + "epoch": 15.00673249551167, + "grad_norm": 15.597328186035156, + "learning_rate": 9.444374625972472e-06, + "loss": 4.9906, + "step": 167175 + }, + { + "epoch": 15.008976660682226, + "grad_norm": 16.423036575317383, + "learning_rate": 9.444125274286854e-06, + "loss": 5.1003, + "step": 167200 + }, + { + "epoch": 15.011220825852783, + "grad_norm": 18.787860870361328, + "learning_rate": 9.443875922601238e-06, + "loss": 5.2237, + "step": 167225 + }, + { + "epoch": 15.013464991023339, + "grad_norm": 15.360040664672852, + "learning_rate": 9.44362657091562e-06, + "loss": 5.0784, + "step": 167250 + }, + { + "epoch": 15.015709156193896, + "grad_norm": 18.442256927490234, + "learning_rate": 9.443377219230003e-06, + "loss": 5.0929, + "step": 167275 + }, + { + "epoch": 15.017953321364452, + "grad_norm": 17.165180206298828, + "learning_rate": 9.443127867544385e-06, + "loss": 5.3723, + "step": 167300 + }, + { + "epoch": 15.02019748653501, + "grad_norm": 15.534597396850586, + "learning_rate": 9.442878515858768e-06, + "loss": 5.1927, + "step": 167325 + }, + { + "epoch": 15.022441651705565, + "grad_norm": 15.077580451965332, + "learning_rate": 9.44262916417315e-06, + "loss": 5.0966, + "step": 167350 + }, + { + "epoch": 15.024685816876122, + "grad_norm": 16.908485412597656, + "learning_rate": 9.442379812487534e-06, + "loss": 5.1065, + "step": 167375 + }, + { + "epoch": 15.02692998204668, + "grad_norm": 15.234853744506836, + "learning_rate": 9.442130460801916e-06, + "loss": 5.0903, + "step": 167400 + }, + { + "epoch": 15.029174147217235, + "grad_norm": 12.258648872375488, + "learning_rate": 9.441881109116299e-06, + "loss": 5.1923, + "step": 167425 + }, + { + "epoch": 15.031418312387792, + "grad_norm": 13.52802848815918, + "learning_rate": 9.441631757430681e-06, + "loss": 5.1212, + "step": 167450 + }, + { + "epoch": 15.033662477558348, + "grad_norm": 16.808439254760742, + "learning_rate": 9.441382405745063e-06, + "loss": 5.0216, + "step": 167475 + }, + { + "epoch": 15.035906642728905, + "grad_norm": 16.119478225708008, + "learning_rate": 9.441133054059446e-06, + "loss": 4.9866, + "step": 167500 + }, + { + "epoch": 15.038150807899461, + "grad_norm": 16.148038864135742, + "learning_rate": 9.440883702373828e-06, + "loss": 5.0551, + "step": 167525 + }, + { + "epoch": 15.040394973070018, + "grad_norm": 16.123645782470703, + "learning_rate": 9.440634350688212e-06, + "loss": 4.8682, + "step": 167550 + }, + { + "epoch": 15.042639138240574, + "grad_norm": 17.872222900390625, + "learning_rate": 9.440384999002594e-06, + "loss": 4.8883, + "step": 167575 + }, + { + "epoch": 15.044883303411131, + "grad_norm": 15.317954063415527, + "learning_rate": 9.440135647316977e-06, + "loss": 5.2009, + "step": 167600 + }, + { + "epoch": 15.047127468581687, + "grad_norm": 20.104448318481445, + "learning_rate": 9.43988629563136e-06, + "loss": 4.9337, + "step": 167625 + }, + { + "epoch": 15.049371633752244, + "grad_norm": 17.14016342163086, + "learning_rate": 9.439636943945741e-06, + "loss": 5.1927, + "step": 167650 + }, + { + "epoch": 15.0516157989228, + "grad_norm": 15.730677604675293, + "learning_rate": 9.439387592260123e-06, + "loss": 4.9406, + "step": 167675 + }, + { + "epoch": 15.053859964093357, + "grad_norm": 17.030841827392578, + "learning_rate": 9.439138240574507e-06, + "loss": 5.051, + "step": 167700 + }, + { + "epoch": 15.056104129263915, + "grad_norm": 15.92912769317627, + "learning_rate": 9.43888888888889e-06, + "loss": 5.3058, + "step": 167725 + }, + { + "epoch": 15.05834829443447, + "grad_norm": 17.303773880004883, + "learning_rate": 9.438639537203272e-06, + "loss": 4.9677, + "step": 167750 + }, + { + "epoch": 15.060592459605028, + "grad_norm": 14.834202766418457, + "learning_rate": 9.438390185517654e-06, + "loss": 5.2515, + "step": 167775 + }, + { + "epoch": 15.062836624775583, + "grad_norm": 15.821149826049805, + "learning_rate": 9.438140833832038e-06, + "loss": 4.9693, + "step": 167800 + }, + { + "epoch": 15.06508078994614, + "grad_norm": 16.658967971801758, + "learning_rate": 9.437891482146419e-06, + "loss": 4.9288, + "step": 167825 + }, + { + "epoch": 15.067324955116696, + "grad_norm": 15.058110237121582, + "learning_rate": 9.437642130460803e-06, + "loss": 4.8901, + "step": 167850 + }, + { + "epoch": 15.069569120287253, + "grad_norm": 15.178117752075195, + "learning_rate": 9.437392778775185e-06, + "loss": 5.23, + "step": 167875 + }, + { + "epoch": 15.071813285457809, + "grad_norm": 15.542122840881348, + "learning_rate": 9.437143427089568e-06, + "loss": 4.9473, + "step": 167900 + }, + { + "epoch": 15.074057450628366, + "grad_norm": 15.29733657836914, + "learning_rate": 9.43689407540395e-06, + "loss": 5.0004, + "step": 167925 + }, + { + "epoch": 15.076301615798922, + "grad_norm": 14.3815336227417, + "learning_rate": 9.436644723718334e-06, + "loss": 4.869, + "step": 167950 + }, + { + "epoch": 15.07854578096948, + "grad_norm": 16.92493438720703, + "learning_rate": 9.436395372032716e-06, + "loss": 5.0318, + "step": 167975 + }, + { + "epoch": 15.080789946140037, + "grad_norm": 16.42003059387207, + "learning_rate": 9.436146020347097e-06, + "loss": 5.2873, + "step": 168000 + }, + { + "epoch": 15.083034111310592, + "grad_norm": 18.10274887084961, + "learning_rate": 9.435896668661481e-06, + "loss": 5.228, + "step": 168025 + }, + { + "epoch": 15.08527827648115, + "grad_norm": 13.912939071655273, + "learning_rate": 9.435647316975863e-06, + "loss": 5.1885, + "step": 168050 + }, + { + "epoch": 15.087522441651705, + "grad_norm": 17.11037254333496, + "learning_rate": 9.435397965290246e-06, + "loss": 5.2353, + "step": 168075 + }, + { + "epoch": 15.089766606822263, + "grad_norm": 19.477699279785156, + "learning_rate": 9.43514861360463e-06, + "loss": 5.1962, + "step": 168100 + }, + { + "epoch": 15.092010771992818, + "grad_norm": 18.264373779296875, + "learning_rate": 9.434899261919012e-06, + "loss": 5.0998, + "step": 168125 + }, + { + "epoch": 15.094254937163376, + "grad_norm": 19.33573341369629, + "learning_rate": 9.434649910233394e-06, + "loss": 5.0095, + "step": 168150 + }, + { + "epoch": 15.096499102333931, + "grad_norm": 16.583803176879883, + "learning_rate": 9.434400558547777e-06, + "loss": 5.0914, + "step": 168175 + }, + { + "epoch": 15.098743267504489, + "grad_norm": 16.870119094848633, + "learning_rate": 9.434151206862159e-06, + "loss": 4.9931, + "step": 168200 + }, + { + "epoch": 15.100987432675044, + "grad_norm": 16.50128173828125, + "learning_rate": 9.433901855176541e-06, + "loss": 5.0956, + "step": 168225 + }, + { + "epoch": 15.103231597845602, + "grad_norm": 14.714884757995605, + "learning_rate": 9.433652503490924e-06, + "loss": 5.0343, + "step": 168250 + }, + { + "epoch": 15.105475763016157, + "grad_norm": 15.86507797241211, + "learning_rate": 9.433403151805308e-06, + "loss": 5.1349, + "step": 168275 + }, + { + "epoch": 15.107719928186714, + "grad_norm": 15.421359062194824, + "learning_rate": 9.43315380011969e-06, + "loss": 5.0391, + "step": 168300 + }, + { + "epoch": 15.109964093357272, + "grad_norm": 16.797555923461914, + "learning_rate": 9.432904448434072e-06, + "loss": 5.2373, + "step": 168325 + }, + { + "epoch": 15.112208258527827, + "grad_norm": 14.105697631835938, + "learning_rate": 9.432655096748454e-06, + "loss": 5.1837, + "step": 168350 + }, + { + "epoch": 15.114452423698385, + "grad_norm": 15.916261672973633, + "learning_rate": 9.432405745062837e-06, + "loss": 5.1258, + "step": 168375 + }, + { + "epoch": 15.11669658886894, + "grad_norm": 16.758075714111328, + "learning_rate": 9.432156393377219e-06, + "loss": 5.3125, + "step": 168400 + }, + { + "epoch": 15.118940754039498, + "grad_norm": 15.86426830291748, + "learning_rate": 9.431907041691603e-06, + "loss": 5.2443, + "step": 168425 + }, + { + "epoch": 15.121184919210053, + "grad_norm": 16.162578582763672, + "learning_rate": 9.431657690005985e-06, + "loss": 5.1216, + "step": 168450 + }, + { + "epoch": 15.12342908438061, + "grad_norm": 16.49478530883789, + "learning_rate": 9.431408338320368e-06, + "loss": 4.9778, + "step": 168475 + }, + { + "epoch": 15.125673249551166, + "grad_norm": 17.32349967956543, + "learning_rate": 9.43115898663475e-06, + "loss": 5.115, + "step": 168500 + }, + { + "epoch": 15.127917414721724, + "grad_norm": 19.272920608520508, + "learning_rate": 9.430909634949132e-06, + "loss": 5.0481, + "step": 168525 + }, + { + "epoch": 15.13016157989228, + "grad_norm": 15.471632957458496, + "learning_rate": 9.430660283263515e-06, + "loss": 4.8091, + "step": 168550 + }, + { + "epoch": 15.132405745062837, + "grad_norm": 17.498008728027344, + "learning_rate": 9.430410931577899e-06, + "loss": 5.0244, + "step": 168575 + }, + { + "epoch": 15.134649910233394, + "grad_norm": 13.818135261535645, + "learning_rate": 9.430161579892281e-06, + "loss": 5.1231, + "step": 168600 + }, + { + "epoch": 15.13689407540395, + "grad_norm": 17.381771087646484, + "learning_rate": 9.429912228206663e-06, + "loss": 5.2892, + "step": 168625 + }, + { + "epoch": 15.139138240574507, + "grad_norm": 14.85590934753418, + "learning_rate": 9.429662876521046e-06, + "loss": 5.017, + "step": 168650 + }, + { + "epoch": 15.141382405745063, + "grad_norm": 15.535209655761719, + "learning_rate": 9.42941352483543e-06, + "loss": 5.0908, + "step": 168675 + }, + { + "epoch": 15.14362657091562, + "grad_norm": 15.613751411437988, + "learning_rate": 9.429164173149812e-06, + "loss": 5.3048, + "step": 168700 + }, + { + "epoch": 15.145870736086176, + "grad_norm": 16.892786026000977, + "learning_rate": 9.428914821464193e-06, + "loss": 5.2719, + "step": 168725 + }, + { + "epoch": 15.148114901256733, + "grad_norm": 17.448989868164062, + "learning_rate": 9.428665469778577e-06, + "loss": 5.2361, + "step": 168750 + }, + { + "epoch": 15.150359066427288, + "grad_norm": 15.147760391235352, + "learning_rate": 9.428416118092959e-06, + "loss": 5.3348, + "step": 168775 + }, + { + "epoch": 15.152603231597846, + "grad_norm": 15.12562370300293, + "learning_rate": 9.428166766407341e-06, + "loss": 5.1771, + "step": 168800 + }, + { + "epoch": 15.154847396768401, + "grad_norm": 18.720977783203125, + "learning_rate": 9.427917414721725e-06, + "loss": 5.3241, + "step": 168825 + }, + { + "epoch": 15.157091561938959, + "grad_norm": 16.65447235107422, + "learning_rate": 9.427668063036108e-06, + "loss": 5.0946, + "step": 168850 + }, + { + "epoch": 15.159335727109514, + "grad_norm": 16.88188934326172, + "learning_rate": 9.42741871135049e-06, + "loss": 5.1165, + "step": 168875 + }, + { + "epoch": 15.161579892280072, + "grad_norm": 18.232114791870117, + "learning_rate": 9.427169359664872e-06, + "loss": 5.0999, + "step": 168900 + }, + { + "epoch": 15.16382405745063, + "grad_norm": 15.161409378051758, + "learning_rate": 9.426920007979255e-06, + "loss": 5.0219, + "step": 168925 + }, + { + "epoch": 15.166068222621185, + "grad_norm": 17.108461380004883, + "learning_rate": 9.426670656293637e-06, + "loss": 5.0855, + "step": 168950 + }, + { + "epoch": 15.168312387791742, + "grad_norm": 14.500622749328613, + "learning_rate": 9.42642130460802e-06, + "loss": 5.1914, + "step": 168975 + }, + { + "epoch": 15.170556552962298, + "grad_norm": 17.113014221191406, + "learning_rate": 9.426171952922403e-06, + "loss": 5.0458, + "step": 169000 + }, + { + "epoch": 15.172800718132855, + "grad_norm": 18.864065170288086, + "learning_rate": 9.425922601236785e-06, + "loss": 5.1368, + "step": 169025 + }, + { + "epoch": 15.17504488330341, + "grad_norm": 16.598896026611328, + "learning_rate": 9.425673249551168e-06, + "loss": 4.964, + "step": 169050 + }, + { + "epoch": 15.177289048473968, + "grad_norm": 17.91145133972168, + "learning_rate": 9.42542389786555e-06, + "loss": 5.0421, + "step": 169075 + }, + { + "epoch": 15.179533213644524, + "grad_norm": 15.896475791931152, + "learning_rate": 9.425174546179932e-06, + "loss": 4.9206, + "step": 169100 + }, + { + "epoch": 15.181777378815081, + "grad_norm": 15.960590362548828, + "learning_rate": 9.424925194494315e-06, + "loss": 5.089, + "step": 169125 + }, + { + "epoch": 15.184021543985637, + "grad_norm": 15.960531234741211, + "learning_rate": 9.424675842808699e-06, + "loss": 5.4823, + "step": 169150 + }, + { + "epoch": 15.186265709156194, + "grad_norm": 16.027868270874023, + "learning_rate": 9.424426491123081e-06, + "loss": 5.2845, + "step": 169175 + }, + { + "epoch": 15.188509874326751, + "grad_norm": 14.157883644104004, + "learning_rate": 9.424177139437463e-06, + "loss": 5.2677, + "step": 169200 + }, + { + "epoch": 15.190754039497307, + "grad_norm": 17.112712860107422, + "learning_rate": 9.423927787751846e-06, + "loss": 5.2965, + "step": 169225 + }, + { + "epoch": 15.192998204667864, + "grad_norm": 13.806011199951172, + "learning_rate": 9.423678436066228e-06, + "loss": 5.2984, + "step": 169250 + }, + { + "epoch": 15.19524236983842, + "grad_norm": 16.180971145629883, + "learning_rate": 9.42342908438061e-06, + "loss": 5.2502, + "step": 169275 + }, + { + "epoch": 15.197486535008977, + "grad_norm": 17.82734489440918, + "learning_rate": 9.423179732694994e-06, + "loss": 5.2548, + "step": 169300 + }, + { + "epoch": 15.199730700179533, + "grad_norm": 12.83570671081543, + "learning_rate": 9.422930381009377e-06, + "loss": 5.1842, + "step": 169325 + }, + { + "epoch": 15.20197486535009, + "grad_norm": 17.569581985473633, + "learning_rate": 9.422681029323759e-06, + "loss": 5.0762, + "step": 169350 + }, + { + "epoch": 15.204219030520646, + "grad_norm": 16.94883918762207, + "learning_rate": 9.422431677638141e-06, + "loss": 5.1877, + "step": 169375 + }, + { + "epoch": 15.206463195691203, + "grad_norm": 18.61318016052246, + "learning_rate": 9.422182325952525e-06, + "loss": 5.3259, + "step": 169400 + }, + { + "epoch": 15.208707360861759, + "grad_norm": 16.40298080444336, + "learning_rate": 9.421942948334332e-06, + "loss": 5.3801, + "step": 169425 + }, + { + "epoch": 15.210951526032316, + "grad_norm": 15.95791244506836, + "learning_rate": 9.421693596648715e-06, + "loss": 5.1016, + "step": 169450 + }, + { + "epoch": 15.213195691202873, + "grad_norm": 15.490715026855469, + "learning_rate": 9.421444244963097e-06, + "loss": 5.289, + "step": 169475 + }, + { + "epoch": 15.215439856373429, + "grad_norm": 20.528657913208008, + "learning_rate": 9.42119489327748e-06, + "loss": 5.1391, + "step": 169500 + }, + { + "epoch": 15.217684021543986, + "grad_norm": 19.472156524658203, + "learning_rate": 9.420945541591862e-06, + "loss": 5.154, + "step": 169525 + }, + { + "epoch": 15.219928186714542, + "grad_norm": 16.15127182006836, + "learning_rate": 9.420696189906244e-06, + "loss": 5.1148, + "step": 169550 + }, + { + "epoch": 15.2221723518851, + "grad_norm": 15.06225872039795, + "learning_rate": 9.420446838220628e-06, + "loss": 4.9946, + "step": 169575 + }, + { + "epoch": 15.224416517055655, + "grad_norm": 14.635865211486816, + "learning_rate": 9.42019748653501e-06, + "loss": 5.5596, + "step": 169600 + }, + { + "epoch": 15.226660682226212, + "grad_norm": 17.290878295898438, + "learning_rate": 9.419948134849392e-06, + "loss": 5.2258, + "step": 169625 + }, + { + "epoch": 15.228904847396768, + "grad_norm": 15.404468536376953, + "learning_rate": 9.419698783163775e-06, + "loss": 5.1974, + "step": 169650 + }, + { + "epoch": 15.231149012567325, + "grad_norm": 13.523794174194336, + "learning_rate": 9.419449431478157e-06, + "loss": 5.1466, + "step": 169675 + }, + { + "epoch": 15.23339317773788, + "grad_norm": 17.137697219848633, + "learning_rate": 9.41920007979254e-06, + "loss": 5.0623, + "step": 169700 + }, + { + "epoch": 15.235637342908438, + "grad_norm": 16.447193145751953, + "learning_rate": 9.418950728106922e-06, + "loss": 5.2088, + "step": 169725 + }, + { + "epoch": 15.237881508078994, + "grad_norm": 17.728918075561523, + "learning_rate": 9.418701376421306e-06, + "loss": 5.19, + "step": 169750 + }, + { + "epoch": 15.240125673249551, + "grad_norm": 17.08087921142578, + "learning_rate": 9.418452024735688e-06, + "loss": 5.2947, + "step": 169775 + }, + { + "epoch": 15.242369838420109, + "grad_norm": 16.560588836669922, + "learning_rate": 9.41820267305007e-06, + "loss": 5.3315, + "step": 169800 + }, + { + "epoch": 15.244614003590664, + "grad_norm": 16.672985076904297, + "learning_rate": 9.417953321364454e-06, + "loss": 5.3127, + "step": 169825 + }, + { + "epoch": 15.246858168761221, + "grad_norm": 15.403053283691406, + "learning_rate": 9.417703969678835e-06, + "loss": 5.0197, + "step": 169850 + }, + { + "epoch": 15.249102333931777, + "grad_norm": 16.429096221923828, + "learning_rate": 9.417454617993217e-06, + "loss": 5.293, + "step": 169875 + }, + { + "epoch": 15.251346499102334, + "grad_norm": 17.09369468688965, + "learning_rate": 9.417205266307601e-06, + "loss": 5.2097, + "step": 169900 + }, + { + "epoch": 15.25359066427289, + "grad_norm": 16.262094497680664, + "learning_rate": 9.416955914621984e-06, + "loss": 5.0955, + "step": 169925 + }, + { + "epoch": 15.255834829443447, + "grad_norm": 18.300386428833008, + "learning_rate": 9.416706562936366e-06, + "loss": 5.2394, + "step": 169950 + }, + { + "epoch": 15.258078994614003, + "grad_norm": 21.972671508789062, + "learning_rate": 9.416457211250748e-06, + "loss": 5.1178, + "step": 169975 + }, + { + "epoch": 15.26032315978456, + "grad_norm": 14.338077545166016, + "learning_rate": 9.416207859565132e-06, + "loss": 5.1465, + "step": 170000 + }, + { + "epoch": 15.262567324955116, + "grad_norm": 18.842655181884766, + "learning_rate": 9.415958507879513e-06, + "loss": 5.1383, + "step": 170025 + }, + { + "epoch": 15.264811490125673, + "grad_norm": 16.833229064941406, + "learning_rate": 9.415709156193897e-06, + "loss": 5.3164, + "step": 170050 + }, + { + "epoch": 15.26705565529623, + "grad_norm": 15.796263694763184, + "learning_rate": 9.41545980450828e-06, + "loss": 5.1581, + "step": 170075 + }, + { + "epoch": 15.269299820466786, + "grad_norm": 14.87734317779541, + "learning_rate": 9.415210452822662e-06, + "loss": 4.824, + "step": 170100 + }, + { + "epoch": 15.271543985637344, + "grad_norm": 16.511234283447266, + "learning_rate": 9.414961101137044e-06, + "loss": 5.2595, + "step": 170125 + }, + { + "epoch": 15.2737881508079, + "grad_norm": 16.060808181762695, + "learning_rate": 9.414711749451428e-06, + "loss": 5.3854, + "step": 170150 + }, + { + "epoch": 15.276032315978457, + "grad_norm": 18.4323673248291, + "learning_rate": 9.41446239776581e-06, + "loss": 5.081, + "step": 170175 + }, + { + "epoch": 15.278276481149012, + "grad_norm": 15.943428993225098, + "learning_rate": 9.414213046080193e-06, + "loss": 5.1643, + "step": 170200 + }, + { + "epoch": 15.28052064631957, + "grad_norm": 18.507638931274414, + "learning_rate": 9.413963694394575e-06, + "loss": 5.2071, + "step": 170225 + }, + { + "epoch": 15.282764811490125, + "grad_norm": 17.86699867248535, + "learning_rate": 9.413714342708957e-06, + "loss": 5.1168, + "step": 170250 + }, + { + "epoch": 15.285008976660682, + "grad_norm": 17.637027740478516, + "learning_rate": 9.41346499102334e-06, + "loss": 4.9775, + "step": 170275 + }, + { + "epoch": 15.287253141831238, + "grad_norm": 17.520343780517578, + "learning_rate": 9.413215639337723e-06, + "loss": 5.2186, + "step": 170300 + }, + { + "epoch": 15.289497307001795, + "grad_norm": 14.755828857421875, + "learning_rate": 9.412966287652106e-06, + "loss": 4.9892, + "step": 170325 + }, + { + "epoch": 15.291741472172351, + "grad_norm": 18.053401947021484, + "learning_rate": 9.412716935966488e-06, + "loss": 4.8958, + "step": 170350 + }, + { + "epoch": 15.293985637342908, + "grad_norm": 14.904136657714844, + "learning_rate": 9.41246758428087e-06, + "loss": 5.2653, + "step": 170375 + }, + { + "epoch": 15.296229802513466, + "grad_norm": 19.603731155395508, + "learning_rate": 9.412218232595253e-06, + "loss": 5.2341, + "step": 170400 + }, + { + "epoch": 15.298473967684021, + "grad_norm": 13.172213554382324, + "learning_rate": 9.411968880909635e-06, + "loss": 4.9777, + "step": 170425 + }, + { + "epoch": 15.300718132854579, + "grad_norm": 18.705303192138672, + "learning_rate": 9.411719529224017e-06, + "loss": 5.0833, + "step": 170450 + }, + { + "epoch": 15.302962298025134, + "grad_norm": 18.268592834472656, + "learning_rate": 9.411470177538401e-06, + "loss": 5.0546, + "step": 170475 + }, + { + "epoch": 15.305206463195692, + "grad_norm": 21.093734741210938, + "learning_rate": 9.411220825852784e-06, + "loss": 5.0515, + "step": 170500 + }, + { + "epoch": 15.307450628366247, + "grad_norm": 17.885391235351562, + "learning_rate": 9.410971474167166e-06, + "loss": 5.1656, + "step": 170525 + }, + { + "epoch": 15.309694793536805, + "grad_norm": 15.67479133605957, + "learning_rate": 9.41072212248155e-06, + "loss": 4.9606, + "step": 170550 + }, + { + "epoch": 15.31193895870736, + "grad_norm": 13.609768867492676, + "learning_rate": 9.41047277079593e-06, + "loss": 5.124, + "step": 170575 + }, + { + "epoch": 15.314183123877918, + "grad_norm": 15.678227424621582, + "learning_rate": 9.410223419110313e-06, + "loss": 5.203, + "step": 170600 + }, + { + "epoch": 15.316427289048473, + "grad_norm": 17.66777229309082, + "learning_rate": 9.409974067424697e-06, + "loss": 5.1785, + "step": 170625 + }, + { + "epoch": 15.31867145421903, + "grad_norm": 15.34826946258545, + "learning_rate": 9.40972471573908e-06, + "loss": 5.2194, + "step": 170650 + }, + { + "epoch": 15.320915619389588, + "grad_norm": 17.29848861694336, + "learning_rate": 9.409475364053462e-06, + "loss": 5.0809, + "step": 170675 + }, + { + "epoch": 15.323159784560143, + "grad_norm": 18.486522674560547, + "learning_rate": 9.409226012367844e-06, + "loss": 5.1522, + "step": 170700 + }, + { + "epoch": 15.3254039497307, + "grad_norm": 20.1343994140625, + "learning_rate": 9.408976660682228e-06, + "loss": 5.0682, + "step": 170725 + }, + { + "epoch": 15.327648114901256, + "grad_norm": 17.015926361083984, + "learning_rate": 9.408727308996609e-06, + "loss": 4.9106, + "step": 170750 + }, + { + "epoch": 15.329892280071814, + "grad_norm": 15.635424613952637, + "learning_rate": 9.408477957310993e-06, + "loss": 5.2903, + "step": 170775 + }, + { + "epoch": 15.33213644524237, + "grad_norm": 16.33013916015625, + "learning_rate": 9.408228605625375e-06, + "loss": 5.1369, + "step": 170800 + }, + { + "epoch": 15.334380610412927, + "grad_norm": 17.996601104736328, + "learning_rate": 9.407979253939757e-06, + "loss": 5.1045, + "step": 170825 + }, + { + "epoch": 15.336624775583482, + "grad_norm": 16.869199752807617, + "learning_rate": 9.40772990225414e-06, + "loss": 4.979, + "step": 170850 + }, + { + "epoch": 15.33886894075404, + "grad_norm": 18.073444366455078, + "learning_rate": 9.407480550568524e-06, + "loss": 5.0372, + "step": 170875 + }, + { + "epoch": 15.341113105924595, + "grad_norm": 15.100030899047852, + "learning_rate": 9.407231198882906e-06, + "loss": 5.2961, + "step": 170900 + }, + { + "epoch": 15.343357271095153, + "grad_norm": 15.638938903808594, + "learning_rate": 9.406981847197288e-06, + "loss": 5.2081, + "step": 170925 + }, + { + "epoch": 15.34560143626571, + "grad_norm": 16.789554595947266, + "learning_rate": 9.40673249551167e-06, + "loss": 5.2931, + "step": 170950 + }, + { + "epoch": 15.347845601436266, + "grad_norm": 17.884092330932617, + "learning_rate": 9.406483143826053e-06, + "loss": 5.2467, + "step": 170975 + }, + { + "epoch": 15.350089766606823, + "grad_norm": 20.262786865234375, + "learning_rate": 9.406233792140435e-06, + "loss": 5.3393, + "step": 171000 + }, + { + "epoch": 15.352333931777379, + "grad_norm": 17.43756103515625, + "learning_rate": 9.405984440454819e-06, + "loss": 5.3487, + "step": 171025 + }, + { + "epoch": 15.354578096947936, + "grad_norm": 18.97504997253418, + "learning_rate": 9.405735088769201e-06, + "loss": 5.1342, + "step": 171050 + }, + { + "epoch": 15.356822262118492, + "grad_norm": 18.67794418334961, + "learning_rate": 9.405485737083584e-06, + "loss": 5.3077, + "step": 171075 + }, + { + "epoch": 15.359066427289049, + "grad_norm": 17.403602600097656, + "learning_rate": 9.405236385397966e-06, + "loss": 5.1577, + "step": 171100 + }, + { + "epoch": 15.361310592459605, + "grad_norm": 16.94464683532715, + "learning_rate": 9.404987033712348e-06, + "loss": 5.0746, + "step": 171125 + }, + { + "epoch": 15.363554757630162, + "grad_norm": 13.450267791748047, + "learning_rate": 9.40473768202673e-06, + "loss": 5.3315, + "step": 171150 + }, + { + "epoch": 15.365798922800717, + "grad_norm": 15.92452335357666, + "learning_rate": 9.404488330341113e-06, + "loss": 5.0475, + "step": 171175 + }, + { + "epoch": 15.368043087971275, + "grad_norm": 14.259045600891113, + "learning_rate": 9.404238978655497e-06, + "loss": 5.488, + "step": 171200 + }, + { + "epoch": 15.37028725314183, + "grad_norm": 18.360536575317383, + "learning_rate": 9.40398962696988e-06, + "loss": 5.2192, + "step": 171225 + }, + { + "epoch": 15.372531418312388, + "grad_norm": 16.539945602416992, + "learning_rate": 9.403740275284262e-06, + "loss": 5.3325, + "step": 171250 + }, + { + "epoch": 15.374775583482945, + "grad_norm": 15.6751070022583, + "learning_rate": 9.403490923598644e-06, + "loss": 5.3693, + "step": 171275 + }, + { + "epoch": 15.3770197486535, + "grad_norm": 15.989437103271484, + "learning_rate": 9.403241571913026e-06, + "loss": 4.9513, + "step": 171300 + }, + { + "epoch": 15.379263913824058, + "grad_norm": 18.580371856689453, + "learning_rate": 9.402992220227409e-06, + "loss": 5.1402, + "step": 171325 + }, + { + "epoch": 15.381508078994614, + "grad_norm": 16.988582611083984, + "learning_rate": 9.402742868541793e-06, + "loss": 5.2621, + "step": 171350 + }, + { + "epoch": 15.383752244165171, + "grad_norm": 16.871658325195312, + "learning_rate": 9.402493516856175e-06, + "loss": 5.0155, + "step": 171375 + }, + { + "epoch": 15.385996409335727, + "grad_norm": 19.48407554626465, + "learning_rate": 9.402244165170557e-06, + "loss": 5.1668, + "step": 171400 + }, + { + "epoch": 15.388240574506284, + "grad_norm": 14.918394088745117, + "learning_rate": 9.40199481348494e-06, + "loss": 5.1124, + "step": 171425 + }, + { + "epoch": 15.39048473967684, + "grad_norm": 17.279449462890625, + "learning_rate": 9.401745461799322e-06, + "loss": 5.2868, + "step": 171450 + }, + { + "epoch": 15.392728904847397, + "grad_norm": 18.50299072265625, + "learning_rate": 9.401496110113704e-06, + "loss": 5.3829, + "step": 171475 + }, + { + "epoch": 15.394973070017953, + "grad_norm": 15.40935230255127, + "learning_rate": 9.401246758428088e-06, + "loss": 5.1671, + "step": 171500 + }, + { + "epoch": 15.39721723518851, + "grad_norm": 16.680147171020508, + "learning_rate": 9.40099740674247e-06, + "loss": 5.3199, + "step": 171525 + }, + { + "epoch": 15.399461400359066, + "grad_norm": 16.816974639892578, + "learning_rate": 9.400748055056853e-06, + "loss": 4.9464, + "step": 171550 + }, + { + "epoch": 15.401705565529623, + "grad_norm": 16.412214279174805, + "learning_rate": 9.400498703371235e-06, + "loss": 5.1501, + "step": 171575 + }, + { + "epoch": 15.40394973070018, + "grad_norm": 18.148752212524414, + "learning_rate": 9.40024935168562e-06, + "loss": 5.1862, + "step": 171600 + }, + { + "epoch": 15.406193895870736, + "grad_norm": 16.2208251953125, + "learning_rate": 9.4e-06, + "loss": 5.0673, + "step": 171625 + }, + { + "epoch": 15.408438061041293, + "grad_norm": 18.540143966674805, + "learning_rate": 9.399760622381808e-06, + "loss": 5.0073, + "step": 171650 + }, + { + "epoch": 15.410682226211849, + "grad_norm": 16.500551223754883, + "learning_rate": 9.39951127069619e-06, + "loss": 5.3842, + "step": 171675 + }, + { + "epoch": 15.412926391382406, + "grad_norm": 16.279556274414062, + "learning_rate": 9.399261919010573e-06, + "loss": 5.0425, + "step": 171700 + }, + { + "epoch": 15.415170556552962, + "grad_norm": 16.74054718017578, + "learning_rate": 9.399012567324955e-06, + "loss": 5.0436, + "step": 171725 + }, + { + "epoch": 15.41741472172352, + "grad_norm": 21.253860473632812, + "learning_rate": 9.398763215639338e-06, + "loss": 5.0672, + "step": 171750 + }, + { + "epoch": 15.419658886894075, + "grad_norm": 17.397235870361328, + "learning_rate": 9.398513863953722e-06, + "loss": 5.2979, + "step": 171775 + }, + { + "epoch": 15.421903052064632, + "grad_norm": 26.973323822021484, + "learning_rate": 9.398264512268104e-06, + "loss": 5.0633, + "step": 171800 + }, + { + "epoch": 15.424147217235188, + "grad_norm": 17.532108306884766, + "learning_rate": 9.398015160582486e-06, + "loss": 5.2883, + "step": 171825 + }, + { + "epoch": 15.426391382405745, + "grad_norm": 14.566549301147461, + "learning_rate": 9.397765808896869e-06, + "loss": 5.3187, + "step": 171850 + }, + { + "epoch": 15.428635547576302, + "grad_norm": 17.730077743530273, + "learning_rate": 9.397516457211253e-06, + "loss": 5.1045, + "step": 171875 + }, + { + "epoch": 15.430879712746858, + "grad_norm": 15.219734191894531, + "learning_rate": 9.397267105525633e-06, + "loss": 5.1546, + "step": 171900 + }, + { + "epoch": 15.433123877917415, + "grad_norm": 18.642126083374023, + "learning_rate": 9.397017753840017e-06, + "loss": 5.2077, + "step": 171925 + }, + { + "epoch": 15.435368043087971, + "grad_norm": 19.292268753051758, + "learning_rate": 9.3967684021544e-06, + "loss": 5.1583, + "step": 171950 + }, + { + "epoch": 15.437612208258528, + "grad_norm": 16.113384246826172, + "learning_rate": 9.396519050468782e-06, + "loss": 4.9742, + "step": 171975 + }, + { + "epoch": 15.439856373429084, + "grad_norm": 16.390073776245117, + "learning_rate": 9.396269698783164e-06, + "loss": 5.2703, + "step": 172000 + }, + { + "epoch": 15.442100538599641, + "grad_norm": 14.91461181640625, + "learning_rate": 9.396020347097548e-06, + "loss": 5.3627, + "step": 172025 + }, + { + "epoch": 15.444344703770197, + "grad_norm": 14.026466369628906, + "learning_rate": 9.39577099541193e-06, + "loss": 5.1968, + "step": 172050 + }, + { + "epoch": 15.446588868940754, + "grad_norm": 19.566030502319336, + "learning_rate": 9.395521643726311e-06, + "loss": 5.27, + "step": 172075 + }, + { + "epoch": 15.44883303411131, + "grad_norm": 13.328837394714355, + "learning_rate": 9.395272292040695e-06, + "loss": 5.1104, + "step": 172100 + }, + { + "epoch": 15.451077199281867, + "grad_norm": 18.931846618652344, + "learning_rate": 9.395022940355078e-06, + "loss": 5.4299, + "step": 172125 + }, + { + "epoch": 15.453321364452425, + "grad_norm": 14.238887786865234, + "learning_rate": 9.39477358866946e-06, + "loss": 5.1644, + "step": 172150 + }, + { + "epoch": 15.45556552962298, + "grad_norm": 18.55306625366211, + "learning_rate": 9.394524236983842e-06, + "loss": 5.2678, + "step": 172175 + }, + { + "epoch": 15.457809694793538, + "grad_norm": 16.802061080932617, + "learning_rate": 9.394274885298226e-06, + "loss": 5.2051, + "step": 172200 + }, + { + "epoch": 15.460053859964093, + "grad_norm": 17.518638610839844, + "learning_rate": 9.394025533612608e-06, + "loss": 4.9515, + "step": 172225 + }, + { + "epoch": 15.46229802513465, + "grad_norm": 18.604629516601562, + "learning_rate": 9.39377618192699e-06, + "loss": 5.452, + "step": 172250 + }, + { + "epoch": 15.464542190305206, + "grad_norm": 15.286087036132812, + "learning_rate": 9.393526830241373e-06, + "loss": 5.3225, + "step": 172275 + }, + { + "epoch": 15.466786355475763, + "grad_norm": 16.684917449951172, + "learning_rate": 9.393277478555755e-06, + "loss": 5.1077, + "step": 172300 + }, + { + "epoch": 15.469030520646319, + "grad_norm": 16.5289306640625, + "learning_rate": 9.393028126870138e-06, + "loss": 5.2784, + "step": 172325 + }, + { + "epoch": 15.471274685816876, + "grad_norm": 18.313879013061523, + "learning_rate": 9.392778775184522e-06, + "loss": 4.9536, + "step": 172350 + }, + { + "epoch": 15.473518850987432, + "grad_norm": 17.55071449279785, + "learning_rate": 9.392529423498904e-06, + "loss": 5.2589, + "step": 172375 + }, + { + "epoch": 15.47576301615799, + "grad_norm": 15.504121780395508, + "learning_rate": 9.392280071813286e-06, + "loss": 5.1841, + "step": 172400 + }, + { + "epoch": 15.478007181328547, + "grad_norm": 14.144972801208496, + "learning_rate": 9.392030720127669e-06, + "loss": 5.1236, + "step": 172425 + }, + { + "epoch": 15.480251346499102, + "grad_norm": 15.576687812805176, + "learning_rate": 9.391781368442051e-06, + "loss": 5.1245, + "step": 172450 + }, + { + "epoch": 15.48249551166966, + "grad_norm": 15.224632263183594, + "learning_rate": 9.391532016756433e-06, + "loss": 5.1567, + "step": 172475 + }, + { + "epoch": 15.484739676840215, + "grad_norm": 16.06173324584961, + "learning_rate": 9.391282665070817e-06, + "loss": 5.3619, + "step": 172500 + }, + { + "epoch": 15.486983842010773, + "grad_norm": 18.460561752319336, + "learning_rate": 9.3910333133852e-06, + "loss": 5.179, + "step": 172525 + }, + { + "epoch": 15.489228007181328, + "grad_norm": 17.4730167388916, + "learning_rate": 9.390783961699582e-06, + "loss": 5.2687, + "step": 172550 + }, + { + "epoch": 15.491472172351886, + "grad_norm": 17.835922241210938, + "learning_rate": 9.390534610013964e-06, + "loss": 5.0793, + "step": 172575 + }, + { + "epoch": 15.493716337522441, + "grad_norm": 18.84181022644043, + "learning_rate": 9.390285258328347e-06, + "loss": 5.1182, + "step": 172600 + }, + { + "epoch": 15.495960502692999, + "grad_norm": 14.215325355529785, + "learning_rate": 9.390035906642729e-06, + "loss": 5.1247, + "step": 172625 + }, + { + "epoch": 15.498204667863554, + "grad_norm": 16.238956451416016, + "learning_rate": 9.389786554957113e-06, + "loss": 5.264, + "step": 172650 + }, + { + "epoch": 15.500448833034111, + "grad_norm": 22.6767635345459, + "learning_rate": 9.389537203271495e-06, + "loss": 5.3476, + "step": 172675 + }, + { + "epoch": 15.502692998204667, + "grad_norm": 16.971885681152344, + "learning_rate": 9.389287851585878e-06, + "loss": 5.2657, + "step": 172700 + }, + { + "epoch": 15.504937163375224, + "grad_norm": 16.693952560424805, + "learning_rate": 9.38903849990026e-06, + "loss": 5.1684, + "step": 172725 + }, + { + "epoch": 15.507181328545782, + "grad_norm": 14.949010848999023, + "learning_rate": 9.388789148214644e-06, + "loss": 5.1335, + "step": 172750 + }, + { + "epoch": 15.509425493716337, + "grad_norm": 16.41501235961914, + "learning_rate": 9.388539796529025e-06, + "loss": 5.3087, + "step": 172775 + }, + { + "epoch": 15.511669658886895, + "grad_norm": 16.621097564697266, + "learning_rate": 9.388290444843407e-06, + "loss": 5.463, + "step": 172800 + }, + { + "epoch": 15.51391382405745, + "grad_norm": 16.486318588256836, + "learning_rate": 9.38804109315779e-06, + "loss": 5.4035, + "step": 172825 + }, + { + "epoch": 15.516157989228008, + "grad_norm": 21.705245971679688, + "learning_rate": 9.387791741472173e-06, + "loss": 5.2101, + "step": 172850 + }, + { + "epoch": 15.518402154398563, + "grad_norm": 18.34836769104004, + "learning_rate": 9.387542389786555e-06, + "loss": 5.1257, + "step": 172875 + }, + { + "epoch": 15.52064631956912, + "grad_norm": 16.026655197143555, + "learning_rate": 9.387293038100938e-06, + "loss": 5.303, + "step": 172900 + }, + { + "epoch": 15.522890484739676, + "grad_norm": 17.17839813232422, + "learning_rate": 9.387043686415322e-06, + "loss": 5.2686, + "step": 172925 + }, + { + "epoch": 15.525134649910234, + "grad_norm": 15.149160385131836, + "learning_rate": 9.386794334729702e-06, + "loss": 5.2138, + "step": 172950 + }, + { + "epoch": 15.52737881508079, + "grad_norm": 19.199783325195312, + "learning_rate": 9.386544983044086e-06, + "loss": 5.3591, + "step": 172975 + }, + { + "epoch": 15.529622980251347, + "grad_norm": 16.496784210205078, + "learning_rate": 9.386295631358469e-06, + "loss": 5.0047, + "step": 173000 + }, + { + "epoch": 15.531867145421902, + "grad_norm": 15.022397994995117, + "learning_rate": 9.386046279672851e-06, + "loss": 5.0116, + "step": 173025 + }, + { + "epoch": 15.53411131059246, + "grad_norm": 15.96198844909668, + "learning_rate": 9.385796927987233e-06, + "loss": 5.0894, + "step": 173050 + }, + { + "epoch": 15.536355475763017, + "grad_norm": 19.81272315979004, + "learning_rate": 9.385547576301617e-06, + "loss": 5.1298, + "step": 173075 + }, + { + "epoch": 15.538599640933572, + "grad_norm": 15.260367393493652, + "learning_rate": 9.385298224616e-06, + "loss": 5.2852, + "step": 173100 + }, + { + "epoch": 15.54084380610413, + "grad_norm": 16.85382843017578, + "learning_rate": 9.385048872930382e-06, + "loss": 5.2603, + "step": 173125 + }, + { + "epoch": 15.543087971274685, + "grad_norm": 15.350150108337402, + "learning_rate": 9.384799521244764e-06, + "loss": 5.3763, + "step": 173150 + }, + { + "epoch": 15.545332136445243, + "grad_norm": 16.300289154052734, + "learning_rate": 9.384550169559147e-06, + "loss": 5.2048, + "step": 173175 + }, + { + "epoch": 15.547576301615798, + "grad_norm": 18.19504737854004, + "learning_rate": 9.384300817873529e-06, + "loss": 5.494, + "step": 173200 + }, + { + "epoch": 15.549820466786356, + "grad_norm": 18.830230712890625, + "learning_rate": 9.384051466187913e-06, + "loss": 5.1689, + "step": 173225 + }, + { + "epoch": 15.552064631956911, + "grad_norm": 20.936729431152344, + "learning_rate": 9.383802114502295e-06, + "loss": 5.2149, + "step": 173250 + }, + { + "epoch": 15.554308797127469, + "grad_norm": 15.91549301147461, + "learning_rate": 9.383552762816678e-06, + "loss": 5.3669, + "step": 173275 + }, + { + "epoch": 15.556552962298024, + "grad_norm": 15.36577033996582, + "learning_rate": 9.38330341113106e-06, + "loss": 5.1776, + "step": 173300 + }, + { + "epoch": 15.558797127468582, + "grad_norm": 17.45362663269043, + "learning_rate": 9.383054059445442e-06, + "loss": 5.2579, + "step": 173325 + }, + { + "epoch": 15.561041292639139, + "grad_norm": 18.609493255615234, + "learning_rate": 9.382804707759825e-06, + "loss": 5.0737, + "step": 173350 + }, + { + "epoch": 15.563285457809695, + "grad_norm": 16.221622467041016, + "learning_rate": 9.382555356074209e-06, + "loss": 5.0529, + "step": 173375 + }, + { + "epoch": 15.565529622980252, + "grad_norm": 16.652050018310547, + "learning_rate": 9.382306004388591e-06, + "loss": 5.2993, + "step": 173400 + }, + { + "epoch": 15.567773788150808, + "grad_norm": 18.435861587524414, + "learning_rate": 9.382056652702973e-06, + "loss": 5.3144, + "step": 173425 + }, + { + "epoch": 15.570017953321365, + "grad_norm": 17.08985137939453, + "learning_rate": 9.381807301017356e-06, + "loss": 4.9686, + "step": 173450 + }, + { + "epoch": 15.57226211849192, + "grad_norm": 15.297579765319824, + "learning_rate": 9.38155794933174e-06, + "loss": 5.1747, + "step": 173475 + }, + { + "epoch": 15.574506283662478, + "grad_norm": 19.553619384765625, + "learning_rate": 9.38130859764612e-06, + "loss": 5.2142, + "step": 173500 + }, + { + "epoch": 15.576750448833034, + "grad_norm": 20.53314208984375, + "learning_rate": 9.381059245960502e-06, + "loss": 5.2241, + "step": 173525 + }, + { + "epoch": 15.57899461400359, + "grad_norm": 14.993342399597168, + "learning_rate": 9.380809894274886e-06, + "loss": 5.2707, + "step": 173550 + }, + { + "epoch": 15.581238779174146, + "grad_norm": 16.204944610595703, + "learning_rate": 9.380560542589269e-06, + "loss": 5.0507, + "step": 173575 + }, + { + "epoch": 15.583482944344704, + "grad_norm": 19.431011199951172, + "learning_rate": 9.380311190903651e-06, + "loss": 5.0636, + "step": 173600 + }, + { + "epoch": 15.585727109515261, + "grad_norm": 15.130583763122559, + "learning_rate": 9.380061839218033e-06, + "loss": 5.019, + "step": 173625 + }, + { + "epoch": 15.587971274685817, + "grad_norm": 17.740358352661133, + "learning_rate": 9.379812487532417e-06, + "loss": 5.1524, + "step": 173650 + }, + { + "epoch": 15.590215439856374, + "grad_norm": 17.224672317504883, + "learning_rate": 9.379563135846798e-06, + "loss": 5.2859, + "step": 173675 + }, + { + "epoch": 15.59245960502693, + "grad_norm": 17.9877986907959, + "learning_rate": 9.379313784161182e-06, + "loss": 5.3404, + "step": 173700 + }, + { + "epoch": 15.594703770197487, + "grad_norm": 17.32868766784668, + "learning_rate": 9.379064432475564e-06, + "loss": 5.1865, + "step": 173725 + }, + { + "epoch": 15.596947935368043, + "grad_norm": 16.61978530883789, + "learning_rate": 9.378815080789947e-06, + "loss": 5.2615, + "step": 173750 + }, + { + "epoch": 15.5991921005386, + "grad_norm": 16.454429626464844, + "learning_rate": 9.378565729104329e-06, + "loss": 5.2512, + "step": 173775 + }, + { + "epoch": 15.601436265709156, + "grad_norm": 15.594141006469727, + "learning_rate": 9.378316377418713e-06, + "loss": 5.2143, + "step": 173800 + }, + { + "epoch": 15.603680430879713, + "grad_norm": 14.697361946105957, + "learning_rate": 9.378067025733095e-06, + "loss": 5.2667, + "step": 173825 + }, + { + "epoch": 15.605924596050269, + "grad_norm": 14.842855453491211, + "learning_rate": 9.377817674047478e-06, + "loss": 5.2059, + "step": 173850 + }, + { + "epoch": 15.608168761220826, + "grad_norm": 20.307065963745117, + "learning_rate": 9.37756832236186e-06, + "loss": 5.0581, + "step": 173875 + }, + { + "epoch": 15.610412926391383, + "grad_norm": 16.43044662475586, + "learning_rate": 9.377328944743667e-06, + "loss": 5.106, + "step": 173900 + }, + { + "epoch": 15.612657091561939, + "grad_norm": 17.36637306213379, + "learning_rate": 9.37707959305805e-06, + "loss": 5.2903, + "step": 173925 + }, + { + "epoch": 15.614901256732496, + "grad_norm": 17.274621963500977, + "learning_rate": 9.376830241372432e-06, + "loss": 5.2892, + "step": 173950 + }, + { + "epoch": 15.617145421903052, + "grad_norm": 16.78986167907715, + "learning_rate": 9.376580889686816e-06, + "loss": 4.9246, + "step": 173975 + }, + { + "epoch": 15.61938958707361, + "grad_norm": 14.700523376464844, + "learning_rate": 9.376331538001198e-06, + "loss": 5.0216, + "step": 174000 + }, + { + "epoch": 15.621633752244165, + "grad_norm": 19.425291061401367, + "learning_rate": 9.37608218631558e-06, + "loss": 5.0816, + "step": 174025 + }, + { + "epoch": 15.623877917414722, + "grad_norm": 18.06126594543457, + "learning_rate": 9.375832834629963e-06, + "loss": 4.9921, + "step": 174050 + }, + { + "epoch": 15.626122082585278, + "grad_norm": 18.626619338989258, + "learning_rate": 9.375583482944347e-06, + "loss": 5.1968, + "step": 174075 + }, + { + "epoch": 15.628366247755835, + "grad_norm": 14.45173454284668, + "learning_rate": 9.375334131258727e-06, + "loss": 5.1114, + "step": 174100 + }, + { + "epoch": 15.63061041292639, + "grad_norm": 15.691094398498535, + "learning_rate": 9.375084779573111e-06, + "loss": 5.4942, + "step": 174125 + }, + { + "epoch": 15.632854578096948, + "grad_norm": 13.676983833312988, + "learning_rate": 9.374835427887493e-06, + "loss": 5.1942, + "step": 174150 + }, + { + "epoch": 15.635098743267504, + "grad_norm": 18.026100158691406, + "learning_rate": 9.374586076201876e-06, + "loss": 5.1056, + "step": 174175 + }, + { + "epoch": 15.637342908438061, + "grad_norm": 16.220096588134766, + "learning_rate": 9.374336724516258e-06, + "loss": 5.1237, + "step": 174200 + }, + { + "epoch": 15.639587073608617, + "grad_norm": 18.399751663208008, + "learning_rate": 9.374087372830642e-06, + "loss": 5.3307, + "step": 174225 + }, + { + "epoch": 15.641831238779174, + "grad_norm": 16.64542579650879, + "learning_rate": 9.373838021145024e-06, + "loss": 5.2101, + "step": 174250 + }, + { + "epoch": 15.644075403949731, + "grad_norm": 14.620626449584961, + "learning_rate": 9.373588669459405e-06, + "loss": 5.1631, + "step": 174275 + }, + { + "epoch": 15.646319569120287, + "grad_norm": 21.833871841430664, + "learning_rate": 9.373339317773789e-06, + "loss": 5.2355, + "step": 174300 + }, + { + "epoch": 15.648563734290844, + "grad_norm": 17.01523208618164, + "learning_rate": 9.373089966088171e-06, + "loss": 5.3008, + "step": 174325 + }, + { + "epoch": 15.6508078994614, + "grad_norm": 15.838216781616211, + "learning_rate": 9.372840614402554e-06, + "loss": 5.1625, + "step": 174350 + }, + { + "epoch": 15.653052064631957, + "grad_norm": 18.36302947998047, + "learning_rate": 9.372591262716938e-06, + "loss": 5.499, + "step": 174375 + }, + { + "epoch": 15.655296229802513, + "grad_norm": 15.609925270080566, + "learning_rate": 9.37234191103132e-06, + "loss": 5.2379, + "step": 174400 + }, + { + "epoch": 15.65754039497307, + "grad_norm": 16.7713623046875, + "learning_rate": 9.372092559345702e-06, + "loss": 5.3144, + "step": 174425 + }, + { + "epoch": 15.659784560143626, + "grad_norm": 16.769582748413086, + "learning_rate": 9.371843207660085e-06, + "loss": 5.1796, + "step": 174450 + }, + { + "epoch": 15.662028725314183, + "grad_norm": 12.879998207092285, + "learning_rate": 9.371593855974467e-06, + "loss": 5.4077, + "step": 174475 + }, + { + "epoch": 15.664272890484739, + "grad_norm": 15.538269996643066, + "learning_rate": 9.37134450428885e-06, + "loss": 5.3981, + "step": 174500 + }, + { + "epoch": 15.666517055655296, + "grad_norm": 18.375835418701172, + "learning_rate": 9.371095152603232e-06, + "loss": 5.1203, + "step": 174525 + }, + { + "epoch": 15.668761220825854, + "grad_norm": 17.124183654785156, + "learning_rate": 9.370845800917616e-06, + "loss": 5.2342, + "step": 174550 + }, + { + "epoch": 15.67100538599641, + "grad_norm": 17.516929626464844, + "learning_rate": 9.370596449231998e-06, + "loss": 5.0736, + "step": 174575 + }, + { + "epoch": 15.673249551166966, + "grad_norm": 16.336793899536133, + "learning_rate": 9.37034709754638e-06, + "loss": 5.431, + "step": 174600 + }, + { + "epoch": 15.675493716337522, + "grad_norm": 17.815431594848633, + "learning_rate": 9.370097745860763e-06, + "loss": 5.3696, + "step": 174625 + }, + { + "epoch": 15.67773788150808, + "grad_norm": 18.381221771240234, + "learning_rate": 9.369848394175145e-06, + "loss": 5.2255, + "step": 174650 + }, + { + "epoch": 15.679982046678635, + "grad_norm": 20.8605899810791, + "learning_rate": 9.369599042489527e-06, + "loss": 5.1679, + "step": 174675 + }, + { + "epoch": 15.682226211849192, + "grad_norm": 15.409245491027832, + "learning_rate": 9.369349690803911e-06, + "loss": 5.2017, + "step": 174700 + }, + { + "epoch": 15.684470377019748, + "grad_norm": 22.86089324951172, + "learning_rate": 9.369100339118294e-06, + "loss": 5.4881, + "step": 174725 + }, + { + "epoch": 15.686714542190305, + "grad_norm": 15.159689903259277, + "learning_rate": 9.368850987432676e-06, + "loss": 5.1733, + "step": 174750 + }, + { + "epoch": 15.688958707360861, + "grad_norm": 17.697067260742188, + "learning_rate": 9.368601635747058e-06, + "loss": 5.1487, + "step": 174775 + }, + { + "epoch": 15.691202872531418, + "grad_norm": 18.81319808959961, + "learning_rate": 9.36835228406144e-06, + "loss": 5.0466, + "step": 174800 + }, + { + "epoch": 15.693447037701976, + "grad_norm": 16.372102737426758, + "learning_rate": 9.368102932375823e-06, + "loss": 5.199, + "step": 174825 + }, + { + "epoch": 15.695691202872531, + "grad_norm": 19.048337936401367, + "learning_rate": 9.367853580690207e-06, + "loss": 5.2407, + "step": 174850 + }, + { + "epoch": 15.697935368043089, + "grad_norm": 19.22449493408203, + "learning_rate": 9.367604229004589e-06, + "loss": 5.2132, + "step": 174875 + }, + { + "epoch": 15.700179533213644, + "grad_norm": 16.799339294433594, + "learning_rate": 9.367354877318971e-06, + "loss": 5.4816, + "step": 174900 + }, + { + "epoch": 15.702423698384202, + "grad_norm": 17.496294021606445, + "learning_rate": 9.367105525633354e-06, + "loss": 5.2441, + "step": 174925 + }, + { + "epoch": 15.704667863554757, + "grad_norm": 19.322290420532227, + "learning_rate": 9.366856173947738e-06, + "loss": 5.0488, + "step": 174950 + }, + { + "epoch": 15.706912028725315, + "grad_norm": 15.972211837768555, + "learning_rate": 9.366606822262118e-06, + "loss": 5.0214, + "step": 174975 + }, + { + "epoch": 15.70915619389587, + "grad_norm": 19.246604919433594, + "learning_rate": 9.3663574705765e-06, + "loss": 5.4283, + "step": 175000 + }, + { + "epoch": 15.711400359066428, + "grad_norm": 16.258588790893555, + "learning_rate": 9.366108118890885e-06, + "loss": 5.0899, + "step": 175025 + }, + { + "epoch": 15.713644524236983, + "grad_norm": 17.101659774780273, + "learning_rate": 9.365858767205267e-06, + "loss": 5.2807, + "step": 175050 + }, + { + "epoch": 15.71588868940754, + "grad_norm": 19.158092498779297, + "learning_rate": 9.36560941551965e-06, + "loss": 5.1884, + "step": 175075 + }, + { + "epoch": 15.718132854578098, + "grad_norm": 15.75550365447998, + "learning_rate": 9.365360063834033e-06, + "loss": 5.3207, + "step": 175100 + }, + { + "epoch": 15.720377019748653, + "grad_norm": 19.65778923034668, + "learning_rate": 9.365110712148416e-06, + "loss": 5.2035, + "step": 175125 + }, + { + "epoch": 15.72262118491921, + "grad_norm": 16.891632080078125, + "learning_rate": 9.364861360462798e-06, + "loss": 5.2902, + "step": 175150 + }, + { + "epoch": 15.724865350089766, + "grad_norm": 19.133859634399414, + "learning_rate": 9.36461200877718e-06, + "loss": 5.3815, + "step": 175175 + }, + { + "epoch": 15.727109515260324, + "grad_norm": 17.50444221496582, + "learning_rate": 9.364362657091563e-06, + "loss": 5.2186, + "step": 175200 + }, + { + "epoch": 15.72935368043088, + "grad_norm": 16.45500373840332, + "learning_rate": 9.364113305405945e-06, + "loss": 5.2219, + "step": 175225 + }, + { + "epoch": 15.731597845601437, + "grad_norm": 14.283411979675293, + "learning_rate": 9.363863953720327e-06, + "loss": 5.1877, + "step": 175250 + }, + { + "epoch": 15.733842010771992, + "grad_norm": 18.122268676757812, + "learning_rate": 9.363614602034711e-06, + "loss": 5.2403, + "step": 175275 + }, + { + "epoch": 15.73608617594255, + "grad_norm": 16.317249298095703, + "learning_rate": 9.363365250349094e-06, + "loss": 5.3915, + "step": 175300 + }, + { + "epoch": 15.738330341113105, + "grad_norm": 19.194189071655273, + "learning_rate": 9.363115898663476e-06, + "loss": 5.1494, + "step": 175325 + }, + { + "epoch": 15.740574506283663, + "grad_norm": 15.723702430725098, + "learning_rate": 9.362866546977858e-06, + "loss": 5.189, + "step": 175350 + }, + { + "epoch": 15.742818671454218, + "grad_norm": 18.125394821166992, + "learning_rate": 9.36261719529224e-06, + "loss": 5.3054, + "step": 175375 + }, + { + "epoch": 15.745062836624776, + "grad_norm": 21.46958351135254, + "learning_rate": 9.362367843606623e-06, + "loss": 5.1631, + "step": 175400 + }, + { + "epoch": 15.747307001795333, + "grad_norm": 15.511146545410156, + "learning_rate": 9.362118491921007e-06, + "loss": 5.1012, + "step": 175425 + }, + { + "epoch": 15.749551166965889, + "grad_norm": 17.170988082885742, + "learning_rate": 9.361869140235389e-06, + "loss": 5.214, + "step": 175450 + }, + { + "epoch": 15.751795332136446, + "grad_norm": 14.87458324432373, + "learning_rate": 9.361619788549771e-06, + "loss": 5.0444, + "step": 175475 + }, + { + "epoch": 15.754039497307001, + "grad_norm": 15.9212064743042, + "learning_rate": 9.361370436864154e-06, + "loss": 5.1371, + "step": 175500 + }, + { + "epoch": 15.756283662477559, + "grad_norm": 14.691150665283203, + "learning_rate": 9.361121085178536e-06, + "loss": 5.3434, + "step": 175525 + }, + { + "epoch": 15.758527827648114, + "grad_norm": 18.212364196777344, + "learning_rate": 9.360871733492918e-06, + "loss": 5.549, + "step": 175550 + }, + { + "epoch": 15.760771992818672, + "grad_norm": 15.54697322845459, + "learning_rate": 9.360622381807302e-06, + "loss": 5.5316, + "step": 175575 + }, + { + "epoch": 15.763016157989227, + "grad_norm": 16.14980125427246, + "learning_rate": 9.360373030121685e-06, + "loss": 5.2332, + "step": 175600 + }, + { + "epoch": 15.765260323159785, + "grad_norm": 15.197872161865234, + "learning_rate": 9.360123678436067e-06, + "loss": 5.1544, + "step": 175625 + }, + { + "epoch": 15.76750448833034, + "grad_norm": 13.934130668640137, + "learning_rate": 9.35987432675045e-06, + "loss": 4.9943, + "step": 175650 + }, + { + "epoch": 15.769748653500898, + "grad_norm": 15.312186241149902, + "learning_rate": 9.359624975064833e-06, + "loss": 5.2494, + "step": 175675 + }, + { + "epoch": 15.771992818671453, + "grad_norm": 15.285299301147461, + "learning_rate": 9.359375623379214e-06, + "loss": 5.3338, + "step": 175700 + }, + { + "epoch": 15.77423698384201, + "grad_norm": 16.668394088745117, + "learning_rate": 9.359126271693596e-06, + "loss": 5.0794, + "step": 175725 + }, + { + "epoch": 15.776481149012568, + "grad_norm": 16.089950561523438, + "learning_rate": 9.35887692000798e-06, + "loss": 5.2314, + "step": 175750 + }, + { + "epoch": 15.778725314183124, + "grad_norm": 17.794601440429688, + "learning_rate": 9.358627568322363e-06, + "loss": 5.3292, + "step": 175775 + }, + { + "epoch": 15.780969479353681, + "grad_norm": 18.20967674255371, + "learning_rate": 9.358378216636745e-06, + "loss": 5.1862, + "step": 175800 + }, + { + "epoch": 15.783213644524237, + "grad_norm": 16.638851165771484, + "learning_rate": 9.358128864951129e-06, + "loss": 5.1721, + "step": 175825 + }, + { + "epoch": 15.785457809694794, + "grad_norm": 15.175921440124512, + "learning_rate": 9.357879513265511e-06, + "loss": 5.3721, + "step": 175850 + }, + { + "epoch": 15.78770197486535, + "grad_norm": 18.795156478881836, + "learning_rate": 9.357630161579892e-06, + "loss": 5.3738, + "step": 175875 + }, + { + "epoch": 15.789946140035907, + "grad_norm": 15.665870666503906, + "learning_rate": 9.357380809894276e-06, + "loss": 5.4341, + "step": 175900 + }, + { + "epoch": 15.792190305206462, + "grad_norm": 16.293624877929688, + "learning_rate": 9.357131458208658e-06, + "loss": 5.357, + "step": 175925 + }, + { + "epoch": 15.79443447037702, + "grad_norm": 13.129931449890137, + "learning_rate": 9.35688210652304e-06, + "loss": 5.0348, + "step": 175950 + }, + { + "epoch": 15.796678635547575, + "grad_norm": 17.664310455322266, + "learning_rate": 9.356632754837423e-06, + "loss": 5.219, + "step": 175975 + }, + { + "epoch": 15.798922800718133, + "grad_norm": 17.3979549407959, + "learning_rate": 9.356383403151807e-06, + "loss": 5.272, + "step": 176000 + }, + { + "epoch": 15.80116696588869, + "grad_norm": 17.505441665649414, + "learning_rate": 9.35613405146619e-06, + "loss": 5.3417, + "step": 176025 + }, + { + "epoch": 15.803411131059246, + "grad_norm": 17.439727783203125, + "learning_rate": 9.355884699780572e-06, + "loss": 5.2641, + "step": 176050 + }, + { + "epoch": 15.805655296229803, + "grad_norm": 23.798316955566406, + "learning_rate": 9.355635348094954e-06, + "loss": 5.3298, + "step": 176075 + }, + { + "epoch": 15.807899461400359, + "grad_norm": 16.514846801757812, + "learning_rate": 9.355385996409336e-06, + "loss": 5.3845, + "step": 176100 + }, + { + "epoch": 15.810143626570916, + "grad_norm": 17.03875160217285, + "learning_rate": 9.355136644723718e-06, + "loss": 5.0672, + "step": 176125 + }, + { + "epoch": 15.812387791741472, + "grad_norm": 19.68347930908203, + "learning_rate": 9.354887293038102e-06, + "loss": 5.3284, + "step": 176150 + }, + { + "epoch": 15.814631956912029, + "grad_norm": 20.307411193847656, + "learning_rate": 9.354637941352485e-06, + "loss": 5.205, + "step": 176175 + }, + { + "epoch": 15.816876122082585, + "grad_norm": 17.103351593017578, + "learning_rate": 9.354388589666867e-06, + "loss": 5.3735, + "step": 176200 + }, + { + "epoch": 15.819120287253142, + "grad_norm": 16.90186309814453, + "learning_rate": 9.35413923798125e-06, + "loss": 5.2782, + "step": 176225 + }, + { + "epoch": 15.821364452423698, + "grad_norm": 16.723876953125, + "learning_rate": 9.353889886295632e-06, + "loss": 5.3154, + "step": 176250 + }, + { + "epoch": 15.823608617594255, + "grad_norm": 20.804269790649414, + "learning_rate": 9.353640534610014e-06, + "loss": 5.251, + "step": 176275 + }, + { + "epoch": 15.825852782764812, + "grad_norm": 18.52246856689453, + "learning_rate": 9.353391182924398e-06, + "loss": 5.1725, + "step": 176300 + }, + { + "epoch": 15.828096947935368, + "grad_norm": 18.723522186279297, + "learning_rate": 9.35314183123878e-06, + "loss": 5.2529, + "step": 176325 + }, + { + "epoch": 15.830341113105925, + "grad_norm": 15.742022514343262, + "learning_rate": 9.352892479553163e-06, + "loss": 5.2207, + "step": 176350 + }, + { + "epoch": 15.83258527827648, + "grad_norm": 17.51068878173828, + "learning_rate": 9.352643127867545e-06, + "loss": 5.1161, + "step": 176375 + }, + { + "epoch": 15.834829443447038, + "grad_norm": 17.468286514282227, + "learning_rate": 9.352393776181927e-06, + "loss": 5.212, + "step": 176400 + }, + { + "epoch": 15.837073608617594, + "grad_norm": 16.15768814086914, + "learning_rate": 9.35214442449631e-06, + "loss": 5.2901, + "step": 176425 + }, + { + "epoch": 15.839317773788151, + "grad_norm": 15.513739585876465, + "learning_rate": 9.351895072810692e-06, + "loss": 5.1536, + "step": 176450 + }, + { + "epoch": 15.841561938958707, + "grad_norm": 17.542436599731445, + "learning_rate": 9.351645721125076e-06, + "loss": 5.1039, + "step": 176475 + }, + { + "epoch": 15.843806104129264, + "grad_norm": 17.725706100463867, + "learning_rate": 9.351396369439458e-06, + "loss": 5.2238, + "step": 176500 + }, + { + "epoch": 15.84605026929982, + "grad_norm": 19.134113311767578, + "learning_rate": 9.351156991821265e-06, + "loss": 5.1499, + "step": 176525 + }, + { + "epoch": 15.848294434470377, + "grad_norm": 13.903157234191895, + "learning_rate": 9.350907640135648e-06, + "loss": 5.2133, + "step": 176550 + }, + { + "epoch": 15.850538599640934, + "grad_norm": 16.895523071289062, + "learning_rate": 9.350658288450032e-06, + "loss": 5.3198, + "step": 176575 + }, + { + "epoch": 15.85278276481149, + "grad_norm": 15.39908218383789, + "learning_rate": 9.350408936764414e-06, + "loss": 5.315, + "step": 176600 + }, + { + "epoch": 15.855026929982047, + "grad_norm": 15.771427154541016, + "learning_rate": 9.350159585078796e-06, + "loss": 5.5649, + "step": 176625 + }, + { + "epoch": 15.857271095152603, + "grad_norm": 19.34212875366211, + "learning_rate": 9.349910233393179e-06, + "loss": 5.2814, + "step": 176650 + }, + { + "epoch": 15.85951526032316, + "grad_norm": 18.727031707763672, + "learning_rate": 9.34966088170756e-06, + "loss": 5.2105, + "step": 176675 + }, + { + "epoch": 15.861759425493716, + "grad_norm": 24.142993927001953, + "learning_rate": 9.349411530021943e-06, + "loss": 5.3577, + "step": 176700 + }, + { + "epoch": 15.864003590664273, + "grad_norm": 15.45143985748291, + "learning_rate": 9.349162178336325e-06, + "loss": 5.1656, + "step": 176725 + }, + { + "epoch": 15.866247755834829, + "grad_norm": 18.739770889282227, + "learning_rate": 9.34891282665071e-06, + "loss": 5.1266, + "step": 176750 + }, + { + "epoch": 15.868491921005386, + "grad_norm": 18.446521759033203, + "learning_rate": 9.348663474965092e-06, + "loss": 5.1214, + "step": 176775 + }, + { + "epoch": 15.870736086175942, + "grad_norm": 18.116506576538086, + "learning_rate": 9.348414123279474e-06, + "loss": 5.0268, + "step": 176800 + }, + { + "epoch": 15.8729802513465, + "grad_norm": 17.883338928222656, + "learning_rate": 9.348164771593858e-06, + "loss": 5.1778, + "step": 176825 + }, + { + "epoch": 15.875224416517055, + "grad_norm": 16.01980972290039, + "learning_rate": 9.347915419908239e-06, + "loss": 5.2108, + "step": 176850 + }, + { + "epoch": 15.877468581687612, + "grad_norm": 17.21694564819336, + "learning_rate": 9.347666068222621e-06, + "loss": 5.2772, + "step": 176875 + }, + { + "epoch": 15.87971274685817, + "grad_norm": 16.614545822143555, + "learning_rate": 9.347416716537005e-06, + "loss": 5.4266, + "step": 176900 + }, + { + "epoch": 15.881956912028725, + "grad_norm": 19.759729385375977, + "learning_rate": 9.347167364851387e-06, + "loss": 5.1371, + "step": 176925 + }, + { + "epoch": 15.884201077199283, + "grad_norm": 18.309432983398438, + "learning_rate": 9.34691801316577e-06, + "loss": 5.0389, + "step": 176950 + }, + { + "epoch": 15.886445242369838, + "grad_norm": 17.11606788635254, + "learning_rate": 9.346668661480152e-06, + "loss": 5.0062, + "step": 176975 + }, + { + "epoch": 15.888689407540395, + "grad_norm": 16.599212646484375, + "learning_rate": 9.346419309794536e-06, + "loss": 5.4711, + "step": 177000 + }, + { + "epoch": 15.890933572710951, + "grad_norm": 17.22096061706543, + "learning_rate": 9.346169958108917e-06, + "loss": 5.3905, + "step": 177025 + }, + { + "epoch": 15.893177737881508, + "grad_norm": 13.840446472167969, + "learning_rate": 9.3459206064233e-06, + "loss": 5.1003, + "step": 177050 + }, + { + "epoch": 15.895421903052064, + "grad_norm": 16.54900550842285, + "learning_rate": 9.345671254737683e-06, + "loss": 5.2236, + "step": 177075 + }, + { + "epoch": 15.897666068222621, + "grad_norm": 19.51348876953125, + "learning_rate": 9.345421903052065e-06, + "loss": 5.1751, + "step": 177100 + }, + { + "epoch": 15.899910233393177, + "grad_norm": 16.93691635131836, + "learning_rate": 9.345172551366448e-06, + "loss": 5.1203, + "step": 177125 + }, + { + "epoch": 15.902154398563734, + "grad_norm": 20.475555419921875, + "learning_rate": 9.344923199680832e-06, + "loss": 5.1792, + "step": 177150 + }, + { + "epoch": 15.90439856373429, + "grad_norm": 16.524911880493164, + "learning_rate": 9.344673847995214e-06, + "loss": 5.4897, + "step": 177175 + }, + { + "epoch": 15.906642728904847, + "grad_norm": 15.375964164733887, + "learning_rate": 9.344424496309595e-06, + "loss": 4.9909, + "step": 177200 + }, + { + "epoch": 15.908886894075405, + "grad_norm": 16.739953994750977, + "learning_rate": 9.344175144623979e-06, + "loss": 5.3991, + "step": 177225 + }, + { + "epoch": 15.91113105924596, + "grad_norm": 18.04396629333496, + "learning_rate": 9.343925792938361e-06, + "loss": 5.1093, + "step": 177250 + }, + { + "epoch": 15.913375224416518, + "grad_norm": 18.550018310546875, + "learning_rate": 9.343676441252743e-06, + "loss": 5.2743, + "step": 177275 + }, + { + "epoch": 15.915619389587073, + "grad_norm": 15.645459175109863, + "learning_rate": 9.343427089567127e-06, + "loss": 5.2367, + "step": 177300 + }, + { + "epoch": 15.91786355475763, + "grad_norm": 15.768721580505371, + "learning_rate": 9.34317773788151e-06, + "loss": 5.3104, + "step": 177325 + }, + { + "epoch": 15.920107719928186, + "grad_norm": 17.24138832092285, + "learning_rate": 9.342928386195892e-06, + "loss": 5.1068, + "step": 177350 + }, + { + "epoch": 15.922351885098744, + "grad_norm": 18.14167022705078, + "learning_rate": 9.342679034510274e-06, + "loss": 5.3744, + "step": 177375 + }, + { + "epoch": 15.9245960502693, + "grad_norm": 18.541563034057617, + "learning_rate": 9.342429682824656e-06, + "loss": 5.2454, + "step": 177400 + }, + { + "epoch": 15.926840215439857, + "grad_norm": 18.642841339111328, + "learning_rate": 9.342180331139039e-06, + "loss": 5.5479, + "step": 177425 + }, + { + "epoch": 15.929084380610412, + "grad_norm": 15.767410278320312, + "learning_rate": 9.341930979453421e-06, + "loss": 5.4723, + "step": 177450 + }, + { + "epoch": 15.93132854578097, + "grad_norm": 13.869547843933105, + "learning_rate": 9.341681627767805e-06, + "loss": 5.5208, + "step": 177475 + }, + { + "epoch": 15.933572710951527, + "grad_norm": 17.376296997070312, + "learning_rate": 9.341432276082187e-06, + "loss": 5.2652, + "step": 177500 + }, + { + "epoch": 15.935816876122082, + "grad_norm": 18.910198211669922, + "learning_rate": 9.34118292439657e-06, + "loss": 5.3887, + "step": 177525 + }, + { + "epoch": 15.93806104129264, + "grad_norm": 16.97980499267578, + "learning_rate": 9.340933572710952e-06, + "loss": 5.2208, + "step": 177550 + }, + { + "epoch": 15.940305206463195, + "grad_norm": 17.161006927490234, + "learning_rate": 9.340684221025334e-06, + "loss": 4.935, + "step": 177575 + }, + { + "epoch": 15.942549371633753, + "grad_norm": 14.640435218811035, + "learning_rate": 9.340434869339717e-06, + "loss": 5.5865, + "step": 177600 + }, + { + "epoch": 15.944793536804308, + "grad_norm": 17.47064781188965, + "learning_rate": 9.3401855176541e-06, + "loss": 5.5574, + "step": 177625 + }, + { + "epoch": 15.947037701974866, + "grad_norm": 16.800369262695312, + "learning_rate": 9.339936165968483e-06, + "loss": 5.3977, + "step": 177650 + }, + { + "epoch": 15.949281867145421, + "grad_norm": 17.310611724853516, + "learning_rate": 9.339686814282865e-06, + "loss": 5.1561, + "step": 177675 + }, + { + "epoch": 15.951526032315979, + "grad_norm": 20.18649673461914, + "learning_rate": 9.339437462597248e-06, + "loss": 5.3009, + "step": 177700 + }, + { + "epoch": 15.953770197486534, + "grad_norm": 18.673263549804688, + "learning_rate": 9.33918811091163e-06, + "loss": 5.1476, + "step": 177725 + }, + { + "epoch": 15.956014362657092, + "grad_norm": 18.877065658569336, + "learning_rate": 9.338938759226012e-06, + "loss": 5.5549, + "step": 177750 + }, + { + "epoch": 15.958258527827649, + "grad_norm": 17.65509033203125, + "learning_rate": 9.338689407540396e-06, + "loss": 5.2885, + "step": 177775 + }, + { + "epoch": 15.960502692998205, + "grad_norm": 21.079755783081055, + "learning_rate": 9.338440055854779e-06, + "loss": 4.9068, + "step": 177800 + }, + { + "epoch": 15.962746858168762, + "grad_norm": 18.623157501220703, + "learning_rate": 9.338190704169161e-06, + "loss": 5.5779, + "step": 177825 + }, + { + "epoch": 15.964991023339318, + "grad_norm": 18.30314064025879, + "learning_rate": 9.337941352483543e-06, + "loss": 5.3495, + "step": 177850 + }, + { + "epoch": 15.967235188509875, + "grad_norm": 17.93517303466797, + "learning_rate": 9.337692000797927e-06, + "loss": 5.6191, + "step": 177875 + }, + { + "epoch": 15.96947935368043, + "grad_norm": 18.049684524536133, + "learning_rate": 9.337442649112308e-06, + "loss": 5.3191, + "step": 177900 + }, + { + "epoch": 15.971723518850988, + "grad_norm": 18.42012596130371, + "learning_rate": 9.33719329742669e-06, + "loss": 5.4176, + "step": 177925 + }, + { + "epoch": 15.973967684021543, + "grad_norm": 17.045812606811523, + "learning_rate": 9.336943945741074e-06, + "loss": 5.3408, + "step": 177950 + }, + { + "epoch": 15.9762118491921, + "grad_norm": 15.332551002502441, + "learning_rate": 9.336694594055457e-06, + "loss": 5.0652, + "step": 177975 + }, + { + "epoch": 15.978456014362656, + "grad_norm": 17.408676147460938, + "learning_rate": 9.336445242369839e-06, + "loss": 5.1784, + "step": 178000 + }, + { + "epoch": 15.980700179533214, + "grad_norm": 13.862305641174316, + "learning_rate": 9.336195890684223e-06, + "loss": 5.4535, + "step": 178025 + }, + { + "epoch": 15.982944344703771, + "grad_norm": 18.312788009643555, + "learning_rate": 9.335946538998605e-06, + "loss": 5.3354, + "step": 178050 + }, + { + "epoch": 15.985188509874327, + "grad_norm": 17.47098731994629, + "learning_rate": 9.335697187312986e-06, + "loss": 5.2016, + "step": 178075 + }, + { + "epoch": 15.987432675044884, + "grad_norm": 18.014896392822266, + "learning_rate": 9.33544783562737e-06, + "loss": 5.1512, + "step": 178100 + }, + { + "epoch": 15.98967684021544, + "grad_norm": 18.964656829833984, + "learning_rate": 9.335198483941752e-06, + "loss": 5.4234, + "step": 178125 + }, + { + "epoch": 15.991921005385997, + "grad_norm": 20.412002563476562, + "learning_rate": 9.334949132256134e-06, + "loss": 5.4703, + "step": 178150 + }, + { + "epoch": 15.994165170556553, + "grad_norm": 17.846982955932617, + "learning_rate": 9.334699780570517e-06, + "loss": 5.478, + "step": 178175 + }, + { + "epoch": 15.99640933572711, + "grad_norm": 17.41280174255371, + "learning_rate": 9.3344504288849e-06, + "loss": 5.1723, + "step": 178200 + }, + { + "epoch": 15.998653500897666, + "grad_norm": 21.4835205078125, + "learning_rate": 9.334201077199283e-06, + "loss": 5.3023, + "step": 178225 + }, + { + "epoch": 16.0, + "eval_accuracy": 0.0733393160598018, + "eval_f1_macro": 0.007574076923174782, + "eval_f1_micro": 0.0733393160598018, + "eval_f1_weighted": 0.04247863613233251, + "eval_loss": 6.856902599334717, + "eval_precision_macro": 0.0069308159456596356, + "eval_precision_micro": 0.0733393160598018, + "eval_precision_weighted": 0.03461147295155675, + "eval_recall_macro": 0.012489363732224038, + "eval_recall_micro": 0.0733393160598018, + "eval_recall_weighted": 0.0733393160598018, + "eval_runtime": 126.7417, + "eval_samples_per_second": 413.226, + "eval_steps_per_second": 12.916, + "step": 178240 + }, + { + "epoch": 16.000897666068223, + "grad_norm": 18.335294723510742, + "learning_rate": 9.333951725513665e-06, + "loss": 5.1649, + "step": 178250 + }, + { + "epoch": 16.00314183123878, + "grad_norm": 14.239676475524902, + "learning_rate": 9.333702373828048e-06, + "loss": 5.0998, + "step": 178275 + }, + { + "epoch": 16.005385996409334, + "grad_norm": 17.269060134887695, + "learning_rate": 9.33345302214243e-06, + "loss": 5.0895, + "step": 178300 + }, + { + "epoch": 16.007630161579893, + "grad_norm": 18.767812728881836, + "learning_rate": 9.333203670456812e-06, + "loss": 4.9375, + "step": 178325 + }, + { + "epoch": 16.00987432675045, + "grad_norm": 15.866778373718262, + "learning_rate": 9.332954318771196e-06, + "loss": 4.8085, + "step": 178350 + }, + { + "epoch": 16.012118491921004, + "grad_norm": 15.101512908935547, + "learning_rate": 9.332704967085579e-06, + "loss": 5.0394, + "step": 178375 + }, + { + "epoch": 16.014362657091564, + "grad_norm": 16.900270462036133, + "learning_rate": 9.332455615399961e-06, + "loss": 5.2567, + "step": 178400 + }, + { + "epoch": 16.01660682226212, + "grad_norm": 17.061969757080078, + "learning_rate": 9.332206263714343e-06, + "loss": 5.1075, + "step": 178425 + }, + { + "epoch": 16.018850987432675, + "grad_norm": 19.74644660949707, + "learning_rate": 9.331956912028726e-06, + "loss": 4.926, + "step": 178450 + }, + { + "epoch": 16.02109515260323, + "grad_norm": 16.02646827697754, + "learning_rate": 9.331707560343108e-06, + "loss": 4.9005, + "step": 178475 + }, + { + "epoch": 16.02333931777379, + "grad_norm": 17.74357795715332, + "learning_rate": 9.331458208657492e-06, + "loss": 4.9969, + "step": 178500 + }, + { + "epoch": 16.025583482944345, + "grad_norm": 15.861205101013184, + "learning_rate": 9.331208856971874e-06, + "loss": 4.8841, + "step": 178525 + }, + { + "epoch": 16.0278276481149, + "grad_norm": 18.88298797607422, + "learning_rate": 9.330959505286257e-06, + "loss": 5.1585, + "step": 178550 + }, + { + "epoch": 16.030071813285456, + "grad_norm": 17.525348663330078, + "learning_rate": 9.330710153600639e-06, + "loss": 4.7752, + "step": 178575 + }, + { + "epoch": 16.032315978456015, + "grad_norm": 15.533987998962402, + "learning_rate": 9.330470775982446e-06, + "loss": 4.8344, + "step": 178600 + }, + { + "epoch": 16.03456014362657, + "grad_norm": 16.88707160949707, + "learning_rate": 9.33022142429683e-06, + "loss": 4.7194, + "step": 178625 + }, + { + "epoch": 16.036804308797127, + "grad_norm": 17.936870574951172, + "learning_rate": 9.329972072611212e-06, + "loss": 4.7614, + "step": 178650 + }, + { + "epoch": 16.039048473967686, + "grad_norm": 19.087549209594727, + "learning_rate": 9.329722720925594e-06, + "loss": 5.202, + "step": 178675 + }, + { + "epoch": 16.04129263913824, + "grad_norm": 15.853852272033691, + "learning_rate": 9.329473369239977e-06, + "loss": 5.1962, + "step": 178700 + }, + { + "epoch": 16.043536804308797, + "grad_norm": 14.338016510009766, + "learning_rate": 9.329224017554359e-06, + "loss": 4.8925, + "step": 178725 + }, + { + "epoch": 16.045780969479353, + "grad_norm": 16.028568267822266, + "learning_rate": 9.328974665868741e-06, + "loss": 5.001, + "step": 178750 + }, + { + "epoch": 16.04802513464991, + "grad_norm": 14.440081596374512, + "learning_rate": 9.328725314183125e-06, + "loss": 5.0031, + "step": 178775 + }, + { + "epoch": 16.050269299820467, + "grad_norm": 19.033409118652344, + "learning_rate": 9.328475962497508e-06, + "loss": 4.9454, + "step": 178800 + }, + { + "epoch": 16.052513464991023, + "grad_norm": 15.548419952392578, + "learning_rate": 9.32822661081189e-06, + "loss": 4.6392, + "step": 178825 + }, + { + "epoch": 16.05475763016158, + "grad_norm": 16.28857421875, + "learning_rate": 9.327977259126272e-06, + "loss": 5.0989, + "step": 178850 + }, + { + "epoch": 16.057001795332138, + "grad_norm": 18.952388763427734, + "learning_rate": 9.327727907440655e-06, + "loss": 5.1407, + "step": 178875 + }, + { + "epoch": 16.059245960502693, + "grad_norm": 16.639766693115234, + "learning_rate": 9.327478555755037e-06, + "loss": 4.9585, + "step": 178900 + }, + { + "epoch": 16.06149012567325, + "grad_norm": 15.043784141540527, + "learning_rate": 9.32722920406942e-06, + "loss": 4.7692, + "step": 178925 + }, + { + "epoch": 16.063734290843804, + "grad_norm": 18.128536224365234, + "learning_rate": 9.326979852383803e-06, + "loss": 4.989, + "step": 178950 + }, + { + "epoch": 16.065978456014363, + "grad_norm": 16.148998260498047, + "learning_rate": 9.326730500698186e-06, + "loss": 4.9916, + "step": 178975 + }, + { + "epoch": 16.06822262118492, + "grad_norm": 17.388809204101562, + "learning_rate": 9.326481149012568e-06, + "loss": 4.9245, + "step": 179000 + }, + { + "epoch": 16.070466786355475, + "grad_norm": 15.09352970123291, + "learning_rate": 9.326231797326952e-06, + "loss": 5.1611, + "step": 179025 + }, + { + "epoch": 16.072710951526034, + "grad_norm": 16.378679275512695, + "learning_rate": 9.325982445641333e-06, + "loss": 5.0784, + "step": 179050 + }, + { + "epoch": 16.07495511669659, + "grad_norm": 19.515274047851562, + "learning_rate": 9.325733093955715e-06, + "loss": 5.0269, + "step": 179075 + }, + { + "epoch": 16.077199281867145, + "grad_norm": 17.001514434814453, + "learning_rate": 9.325483742270099e-06, + "loss": 4.9096, + "step": 179100 + }, + { + "epoch": 16.0794434470377, + "grad_norm": 15.80053997039795, + "learning_rate": 9.325234390584481e-06, + "loss": 5.0321, + "step": 179125 + }, + { + "epoch": 16.08168761220826, + "grad_norm": 17.89588737487793, + "learning_rate": 9.324985038898864e-06, + "loss": 5.1748, + "step": 179150 + }, + { + "epoch": 16.083931777378815, + "grad_norm": 15.792251586914062, + "learning_rate": 9.324735687213246e-06, + "loss": 4.7737, + "step": 179175 + }, + { + "epoch": 16.08617594254937, + "grad_norm": 13.983036994934082, + "learning_rate": 9.32448633552763e-06, + "loss": 4.8038, + "step": 179200 + }, + { + "epoch": 16.088420107719926, + "grad_norm": 16.305400848388672, + "learning_rate": 9.32423698384201e-06, + "loss": 4.976, + "step": 179225 + }, + { + "epoch": 16.090664272890486, + "grad_norm": 18.15692138671875, + "learning_rate": 9.323987632156395e-06, + "loss": 4.9032, + "step": 179250 + }, + { + "epoch": 16.09290843806104, + "grad_norm": 14.315934181213379, + "learning_rate": 9.323738280470777e-06, + "loss": 5.0297, + "step": 179275 + }, + { + "epoch": 16.095152603231597, + "grad_norm": 16.776212692260742, + "learning_rate": 9.323488928785159e-06, + "loss": 5.012, + "step": 179300 + }, + { + "epoch": 16.097396768402156, + "grad_norm": 16.358863830566406, + "learning_rate": 9.323239577099541e-06, + "loss": 4.9085, + "step": 179325 + }, + { + "epoch": 16.09964093357271, + "grad_norm": 17.677806854248047, + "learning_rate": 9.322990225413925e-06, + "loss": 5.1467, + "step": 179350 + }, + { + "epoch": 16.101885098743267, + "grad_norm": 20.788002014160156, + "learning_rate": 9.322740873728308e-06, + "loss": 4.9811, + "step": 179375 + }, + { + "epoch": 16.104129263913823, + "grad_norm": 18.270370483398438, + "learning_rate": 9.32249152204269e-06, + "loss": 5.019, + "step": 179400 + }, + { + "epoch": 16.106373429084382, + "grad_norm": 18.703516006469727, + "learning_rate": 9.322242170357072e-06, + "loss": 4.9368, + "step": 179425 + }, + { + "epoch": 16.108617594254937, + "grad_norm": 15.729881286621094, + "learning_rate": 9.321992818671455e-06, + "loss": 5.0231, + "step": 179450 + }, + { + "epoch": 16.110861759425493, + "grad_norm": 20.981088638305664, + "learning_rate": 9.321743466985837e-06, + "loss": 4.8496, + "step": 179475 + }, + { + "epoch": 16.11310592459605, + "grad_norm": 18.14629554748535, + "learning_rate": 9.321494115300221e-06, + "loss": 4.8809, + "step": 179500 + }, + { + "epoch": 16.115350089766608, + "grad_norm": 14.880410194396973, + "learning_rate": 9.321244763614603e-06, + "loss": 4.702, + "step": 179525 + }, + { + "epoch": 16.117594254937163, + "grad_norm": 16.76110076904297, + "learning_rate": 9.320995411928986e-06, + "loss": 5.0146, + "step": 179550 + }, + { + "epoch": 16.11983842010772, + "grad_norm": 18.27703094482422, + "learning_rate": 9.320746060243368e-06, + "loss": 4.7188, + "step": 179575 + }, + { + "epoch": 16.122082585278278, + "grad_norm": 18.197710037231445, + "learning_rate": 9.32049670855775e-06, + "loss": 5.0794, + "step": 179600 + }, + { + "epoch": 16.124326750448834, + "grad_norm": 17.82663917541504, + "learning_rate": 9.320247356872133e-06, + "loss": 4.9494, + "step": 179625 + }, + { + "epoch": 16.12657091561939, + "grad_norm": 17.44951820373535, + "learning_rate": 9.319998005186515e-06, + "loss": 5.096, + "step": 179650 + }, + { + "epoch": 16.128815080789945, + "grad_norm": 14.807232856750488, + "learning_rate": 9.319748653500899e-06, + "loss": 5.0899, + "step": 179675 + }, + { + "epoch": 16.131059245960504, + "grad_norm": 18.838302612304688, + "learning_rate": 9.319499301815281e-06, + "loss": 5.0344, + "step": 179700 + }, + { + "epoch": 16.13330341113106, + "grad_norm": 15.447916030883789, + "learning_rate": 9.319249950129664e-06, + "loss": 5.0035, + "step": 179725 + }, + { + "epoch": 16.135547576301615, + "grad_norm": 16.924009323120117, + "learning_rate": 9.319000598444046e-06, + "loss": 5.1316, + "step": 179750 + }, + { + "epoch": 16.13779174147217, + "grad_norm": 25.53466796875, + "learning_rate": 9.318751246758428e-06, + "loss": 5.1643, + "step": 179775 + }, + { + "epoch": 16.14003590664273, + "grad_norm": 16.304218292236328, + "learning_rate": 9.31850189507281e-06, + "loss": 4.8344, + "step": 179800 + }, + { + "epoch": 16.142280071813286, + "grad_norm": 15.310173034667969, + "learning_rate": 9.318252543387195e-06, + "loss": 5.0632, + "step": 179825 + }, + { + "epoch": 16.14452423698384, + "grad_norm": 19.64466667175293, + "learning_rate": 9.318003191701577e-06, + "loss": 5.0746, + "step": 179850 + }, + { + "epoch": 16.1467684021544, + "grad_norm": 19.84743309020996, + "learning_rate": 9.31775384001596e-06, + "loss": 4.9482, + "step": 179875 + }, + { + "epoch": 16.149012567324956, + "grad_norm": 16.908428192138672, + "learning_rate": 9.317504488330341e-06, + "loss": 4.8675, + "step": 179900 + }, + { + "epoch": 16.15125673249551, + "grad_norm": 16.498716354370117, + "learning_rate": 9.317255136644726e-06, + "loss": 5.0138, + "step": 179925 + }, + { + "epoch": 16.153500897666067, + "grad_norm": 13.019558906555176, + "learning_rate": 9.317005784959106e-06, + "loss": 4.8742, + "step": 179950 + }, + { + "epoch": 16.155745062836626, + "grad_norm": 17.043970108032227, + "learning_rate": 9.31675643327349e-06, + "loss": 5.0963, + "step": 179975 + }, + { + "epoch": 16.15798922800718, + "grad_norm": 21.49835968017578, + "learning_rate": 9.316507081587872e-06, + "loss": 5.2628, + "step": 180000 + }, + { + "epoch": 16.160233393177737, + "grad_norm": 17.271751403808594, + "learning_rate": 9.316257729902255e-06, + "loss": 4.9755, + "step": 180025 + }, + { + "epoch": 16.162477558348293, + "grad_norm": 18.522924423217773, + "learning_rate": 9.316008378216637e-06, + "loss": 5.0434, + "step": 180050 + }, + { + "epoch": 16.164721723518852, + "grad_norm": 15.808985710144043, + "learning_rate": 9.315759026531021e-06, + "loss": 4.9914, + "step": 180075 + }, + { + "epoch": 16.166965888689408, + "grad_norm": 16.619792938232422, + "learning_rate": 9.315509674845403e-06, + "loss": 5.0018, + "step": 180100 + }, + { + "epoch": 16.169210053859963, + "grad_norm": 18.163698196411133, + "learning_rate": 9.315260323159786e-06, + "loss": 4.9467, + "step": 180125 + }, + { + "epoch": 16.171454219030522, + "grad_norm": 18.071304321289062, + "learning_rate": 9.315010971474168e-06, + "loss": 5.1012, + "step": 180150 + }, + { + "epoch": 16.173698384201078, + "grad_norm": 17.563520431518555, + "learning_rate": 9.31476161978855e-06, + "loss": 4.8694, + "step": 180175 + }, + { + "epoch": 16.175942549371634, + "grad_norm": 17.71146011352539, + "learning_rate": 9.314512268102933e-06, + "loss": 5.122, + "step": 180200 + }, + { + "epoch": 16.17818671454219, + "grad_norm": 16.43619728088379, + "learning_rate": 9.314262916417317e-06, + "loss": 4.9824, + "step": 180225 + }, + { + "epoch": 16.18043087971275, + "grad_norm": 19.47097396850586, + "learning_rate": 9.314013564731699e-06, + "loss": 5.0461, + "step": 180250 + }, + { + "epoch": 16.182675044883304, + "grad_norm": 16.204051971435547, + "learning_rate": 9.313764213046081e-06, + "loss": 5.0431, + "step": 180275 + }, + { + "epoch": 16.18491921005386, + "grad_norm": 15.326215744018555, + "learning_rate": 9.313514861360464e-06, + "loss": 5.1777, + "step": 180300 + }, + { + "epoch": 16.187163375224415, + "grad_norm": 16.28999900817871, + "learning_rate": 9.313265509674846e-06, + "loss": 5.1834, + "step": 180325 + }, + { + "epoch": 16.189407540394974, + "grad_norm": 17.465408325195312, + "learning_rate": 9.313016157989228e-06, + "loss": 4.9405, + "step": 180350 + }, + { + "epoch": 16.19165170556553, + "grad_norm": 18.393342971801758, + "learning_rate": 9.31276680630361e-06, + "loss": 4.9752, + "step": 180375 + }, + { + "epoch": 16.193895870736085, + "grad_norm": 20.984148025512695, + "learning_rate": 9.312517454617995e-06, + "loss": 5.08, + "step": 180400 + }, + { + "epoch": 16.19614003590664, + "grad_norm": 15.722599983215332, + "learning_rate": 9.312268102932377e-06, + "loss": 5.0228, + "step": 180425 + }, + { + "epoch": 16.1983842010772, + "grad_norm": 15.736994743347168, + "learning_rate": 9.31201875124676e-06, + "loss": 4.9887, + "step": 180450 + }, + { + "epoch": 16.200628366247756, + "grad_norm": 14.991681098937988, + "learning_rate": 9.311769399561142e-06, + "loss": 4.8702, + "step": 180475 + }, + { + "epoch": 16.20287253141831, + "grad_norm": 18.325401306152344, + "learning_rate": 9.311520047875524e-06, + "loss": 4.8389, + "step": 180500 + }, + { + "epoch": 16.20511669658887, + "grad_norm": 18.020870208740234, + "learning_rate": 9.311270696189906e-06, + "loss": 4.9157, + "step": 180525 + }, + { + "epoch": 16.207360861759426, + "grad_norm": 20.0115966796875, + "learning_rate": 9.31102134450429e-06, + "loss": 5.0082, + "step": 180550 + }, + { + "epoch": 16.20960502692998, + "grad_norm": 14.458548545837402, + "learning_rate": 9.310771992818672e-06, + "loss": 4.8525, + "step": 180575 + }, + { + "epoch": 16.211849192100537, + "grad_norm": 15.904760360717773, + "learning_rate": 9.310522641133055e-06, + "loss": 5.0477, + "step": 180600 + }, + { + "epoch": 16.214093357271096, + "grad_norm": 18.26080322265625, + "learning_rate": 9.310273289447437e-06, + "loss": 5.1034, + "step": 180625 + }, + { + "epoch": 16.216337522441652, + "grad_norm": 14.56369686126709, + "learning_rate": 9.31002393776182e-06, + "loss": 4.9715, + "step": 180650 + }, + { + "epoch": 16.218581687612208, + "grad_norm": 19.475522994995117, + "learning_rate": 9.309774586076202e-06, + "loss": 4.9793, + "step": 180675 + }, + { + "epoch": 16.220825852782763, + "grad_norm": 17.621618270874023, + "learning_rate": 9.30953520845801e-06, + "loss": 5.2208, + "step": 180700 + }, + { + "epoch": 16.223070017953322, + "grad_norm": 14.425657272338867, + "learning_rate": 9.309285856772393e-06, + "loss": 5.1197, + "step": 180725 + }, + { + "epoch": 16.225314183123878, + "grad_norm": 19.07314109802246, + "learning_rate": 9.309036505086775e-06, + "loss": 4.8554, + "step": 180750 + }, + { + "epoch": 16.227558348294433, + "grad_norm": 17.340476989746094, + "learning_rate": 9.308787153401157e-06, + "loss": 5.042, + "step": 180775 + }, + { + "epoch": 16.229802513464993, + "grad_norm": 19.293142318725586, + "learning_rate": 9.30853780171554e-06, + "loss": 5.013, + "step": 180800 + }, + { + "epoch": 16.232046678635548, + "grad_norm": 17.889240264892578, + "learning_rate": 9.308288450029924e-06, + "loss": 5.1291, + "step": 180825 + }, + { + "epoch": 16.234290843806104, + "grad_norm": 18.529109954833984, + "learning_rate": 9.308039098344306e-06, + "loss": 5.082, + "step": 180850 + }, + { + "epoch": 16.23653500897666, + "grad_norm": 14.463598251342773, + "learning_rate": 9.307789746658688e-06, + "loss": 4.9674, + "step": 180875 + }, + { + "epoch": 16.23877917414722, + "grad_norm": 15.750506401062012, + "learning_rate": 9.30754039497307e-06, + "loss": 5.1701, + "step": 180900 + }, + { + "epoch": 16.241023339317774, + "grad_norm": 16.683229446411133, + "learning_rate": 9.307291043287453e-06, + "loss": 4.8584, + "step": 180925 + }, + { + "epoch": 16.24326750448833, + "grad_norm": 16.809499740600586, + "learning_rate": 9.307041691601835e-06, + "loss": 4.9854, + "step": 180950 + }, + { + "epoch": 16.245511669658885, + "grad_norm": 15.072883605957031, + "learning_rate": 9.30679233991622e-06, + "loss": 4.9201, + "step": 180975 + }, + { + "epoch": 16.247755834829444, + "grad_norm": 17.663654327392578, + "learning_rate": 9.306542988230602e-06, + "loss": 4.9662, + "step": 181000 + }, + { + "epoch": 16.25, + "grad_norm": 17.602901458740234, + "learning_rate": 9.306293636544984e-06, + "loss": 4.8954, + "step": 181025 + }, + { + "epoch": 16.252244165170556, + "grad_norm": 19.127670288085938, + "learning_rate": 9.306044284859366e-06, + "loss": 4.9472, + "step": 181050 + }, + { + "epoch": 16.254488330341115, + "grad_norm": 17.398910522460938, + "learning_rate": 9.305794933173749e-06, + "loss": 4.9131, + "step": 181075 + }, + { + "epoch": 16.25673249551167, + "grad_norm": 19.5817928314209, + "learning_rate": 9.305545581488131e-06, + "loss": 4.7376, + "step": 181100 + }, + { + "epoch": 16.258976660682226, + "grad_norm": 16.58822250366211, + "learning_rate": 9.305296229802513e-06, + "loss": 5.0355, + "step": 181125 + }, + { + "epoch": 16.26122082585278, + "grad_norm": 17.32785987854004, + "learning_rate": 9.305046878116897e-06, + "loss": 4.859, + "step": 181150 + }, + { + "epoch": 16.26346499102334, + "grad_norm": 18.416471481323242, + "learning_rate": 9.30479752643128e-06, + "loss": 5.2237, + "step": 181175 + }, + { + "epoch": 16.265709156193896, + "grad_norm": 20.78618621826172, + "learning_rate": 9.304548174745662e-06, + "loss": 4.9231, + "step": 181200 + }, + { + "epoch": 16.267953321364452, + "grad_norm": 15.540297508239746, + "learning_rate": 9.304298823060046e-06, + "loss": 5.1179, + "step": 181225 + }, + { + "epoch": 16.270197486535007, + "grad_norm": 19.678098678588867, + "learning_rate": 9.304049471374426e-06, + "loss": 5.1975, + "step": 181250 + }, + { + "epoch": 16.272441651705567, + "grad_norm": 16.751564025878906, + "learning_rate": 9.303800119688809e-06, + "loss": 5.3301, + "step": 181275 + }, + { + "epoch": 16.274685816876122, + "grad_norm": 15.330033302307129, + "learning_rate": 9.303550768003193e-06, + "loss": 5.1166, + "step": 181300 + }, + { + "epoch": 16.276929982046678, + "grad_norm": 17.615680694580078, + "learning_rate": 9.303301416317575e-06, + "loss": 4.99, + "step": 181325 + }, + { + "epoch": 16.279174147217237, + "grad_norm": 18.98512840270996, + "learning_rate": 9.303052064631957e-06, + "loss": 5.0056, + "step": 181350 + }, + { + "epoch": 16.281418312387792, + "grad_norm": 18.309370040893555, + "learning_rate": 9.30280271294634e-06, + "loss": 5.0907, + "step": 181375 + }, + { + "epoch": 16.283662477558348, + "grad_norm": 16.907634735107422, + "learning_rate": 9.302553361260724e-06, + "loss": 4.9946, + "step": 181400 + }, + { + "epoch": 16.285906642728904, + "grad_norm": 16.912240982055664, + "learning_rate": 9.302304009575106e-06, + "loss": 5.0481, + "step": 181425 + }, + { + "epoch": 16.288150807899463, + "grad_norm": 17.374290466308594, + "learning_rate": 9.302054657889488e-06, + "loss": 5.0118, + "step": 181450 + }, + { + "epoch": 16.29039497307002, + "grad_norm": 17.633527755737305, + "learning_rate": 9.30180530620387e-06, + "loss": 5.0982, + "step": 181475 + }, + { + "epoch": 16.292639138240574, + "grad_norm": 14.92713451385498, + "learning_rate": 9.301555954518253e-06, + "loss": 5.1476, + "step": 181500 + }, + { + "epoch": 16.29488330341113, + "grad_norm": 18.27492904663086, + "learning_rate": 9.301306602832635e-06, + "loss": 4.9227, + "step": 181525 + }, + { + "epoch": 16.29712746858169, + "grad_norm": 15.388669967651367, + "learning_rate": 9.30105725114702e-06, + "loss": 5.071, + "step": 181550 + }, + { + "epoch": 16.299371633752244, + "grad_norm": 19.367794036865234, + "learning_rate": 9.300807899461402e-06, + "loss": 5.1571, + "step": 181575 + }, + { + "epoch": 16.3016157989228, + "grad_norm": 14.3370943069458, + "learning_rate": 9.300558547775784e-06, + "loss": 5.1389, + "step": 181600 + }, + { + "epoch": 16.303859964093355, + "grad_norm": 17.14916229248047, + "learning_rate": 9.300309196090166e-06, + "loss": 4.8912, + "step": 181625 + }, + { + "epoch": 16.306104129263915, + "grad_norm": 18.4705867767334, + "learning_rate": 9.300059844404549e-06, + "loss": 4.6179, + "step": 181650 + }, + { + "epoch": 16.30834829443447, + "grad_norm": 15.599902153015137, + "learning_rate": 9.299810492718931e-06, + "loss": 5.0123, + "step": 181675 + }, + { + "epoch": 16.310592459605026, + "grad_norm": 18.95281219482422, + "learning_rate": 9.299561141033315e-06, + "loss": 4.9495, + "step": 181700 + }, + { + "epoch": 16.312836624775585, + "grad_norm": 17.944103240966797, + "learning_rate": 9.299311789347697e-06, + "loss": 5.0066, + "step": 181725 + }, + { + "epoch": 16.31508078994614, + "grad_norm": 19.007646560668945, + "learning_rate": 9.29906243766208e-06, + "loss": 4.8986, + "step": 181750 + }, + { + "epoch": 16.317324955116696, + "grad_norm": 17.98164939880371, + "learning_rate": 9.298813085976462e-06, + "loss": 4.987, + "step": 181775 + }, + { + "epoch": 16.31956912028725, + "grad_norm": 18.136396408081055, + "learning_rate": 9.298563734290844e-06, + "loss": 4.8747, + "step": 181800 + }, + { + "epoch": 16.32181328545781, + "grad_norm": 18.81248664855957, + "learning_rate": 9.298314382605226e-06, + "loss": 5.1376, + "step": 181825 + }, + { + "epoch": 16.324057450628366, + "grad_norm": 16.922822952270508, + "learning_rate": 9.29806503091961e-06, + "loss": 4.9834, + "step": 181850 + }, + { + "epoch": 16.326301615798922, + "grad_norm": 18.70096778869629, + "learning_rate": 9.297815679233993e-06, + "loss": 5.1368, + "step": 181875 + }, + { + "epoch": 16.328545780969478, + "grad_norm": 15.557156562805176, + "learning_rate": 9.297566327548375e-06, + "loss": 5.1959, + "step": 181900 + }, + { + "epoch": 16.330789946140037, + "grad_norm": 17.095319747924805, + "learning_rate": 9.297316975862757e-06, + "loss": 4.9056, + "step": 181925 + }, + { + "epoch": 16.333034111310592, + "grad_norm": 19.200960159301758, + "learning_rate": 9.297067624177141e-06, + "loss": 4.9048, + "step": 181950 + }, + { + "epoch": 16.335278276481148, + "grad_norm": 18.062870025634766, + "learning_rate": 9.296818272491522e-06, + "loss": 5.158, + "step": 181975 + }, + { + "epoch": 16.337522441651707, + "grad_norm": 23.89350128173828, + "learning_rate": 9.296568920805904e-06, + "loss": 5.3062, + "step": 182000 + }, + { + "epoch": 16.339766606822263, + "grad_norm": 16.25609016418457, + "learning_rate": 9.296319569120288e-06, + "loss": 4.7796, + "step": 182025 + }, + { + "epoch": 16.34201077199282, + "grad_norm": 17.846967697143555, + "learning_rate": 9.29607021743467e-06, + "loss": 4.9487, + "step": 182050 + }, + { + "epoch": 16.344254937163374, + "grad_norm": 17.7120304107666, + "learning_rate": 9.295820865749053e-06, + "loss": 5.0576, + "step": 182075 + }, + { + "epoch": 16.346499102333933, + "grad_norm": 16.355466842651367, + "learning_rate": 9.295571514063435e-06, + "loss": 5.0027, + "step": 182100 + }, + { + "epoch": 16.34874326750449, + "grad_norm": 17.30757713317871, + "learning_rate": 9.29532216237782e-06, + "loss": 4.9374, + "step": 182125 + }, + { + "epoch": 16.350987432675044, + "grad_norm": 17.813480377197266, + "learning_rate": 9.2950728106922e-06, + "loss": 4.9962, + "step": 182150 + }, + { + "epoch": 16.3532315978456, + "grad_norm": 17.692068099975586, + "learning_rate": 9.294823459006584e-06, + "loss": 4.8547, + "step": 182175 + }, + { + "epoch": 16.35547576301616, + "grad_norm": 19.581743240356445, + "learning_rate": 9.294574107320966e-06, + "loss": 4.9886, + "step": 182200 + }, + { + "epoch": 16.357719928186714, + "grad_norm": 16.49506378173828, + "learning_rate": 9.294324755635349e-06, + "loss": 4.9604, + "step": 182225 + }, + { + "epoch": 16.35996409335727, + "grad_norm": 18.167015075683594, + "learning_rate": 9.294075403949731e-06, + "loss": 5.1452, + "step": 182250 + }, + { + "epoch": 16.36220825852783, + "grad_norm": 18.45227813720703, + "learning_rate": 9.293826052264115e-06, + "loss": 5.1846, + "step": 182275 + }, + { + "epoch": 16.364452423698385, + "grad_norm": 15.7965087890625, + "learning_rate": 9.293576700578497e-06, + "loss": 5.2153, + "step": 182300 + }, + { + "epoch": 16.36669658886894, + "grad_norm": 16.244047164916992, + "learning_rate": 9.29332734889288e-06, + "loss": 5.0615, + "step": 182325 + }, + { + "epoch": 16.368940754039496, + "grad_norm": 18.349977493286133, + "learning_rate": 9.293077997207262e-06, + "loss": 4.9239, + "step": 182350 + }, + { + "epoch": 16.371184919210055, + "grad_norm": 20.89505958557129, + "learning_rate": 9.292828645521644e-06, + "loss": 4.9234, + "step": 182375 + }, + { + "epoch": 16.37342908438061, + "grad_norm": 22.23287010192871, + "learning_rate": 9.292579293836027e-06, + "loss": 5.2852, + "step": 182400 + }, + { + "epoch": 16.375673249551166, + "grad_norm": 15.888681411743164, + "learning_rate": 9.29232994215041e-06, + "loss": 5.0917, + "step": 182425 + }, + { + "epoch": 16.377917414721722, + "grad_norm": 17.208515167236328, + "learning_rate": 9.292080590464793e-06, + "loss": 4.9604, + "step": 182450 + }, + { + "epoch": 16.38016157989228, + "grad_norm": 16.962745666503906, + "learning_rate": 9.291831238779175e-06, + "loss": 5.0664, + "step": 182475 + }, + { + "epoch": 16.382405745062837, + "grad_norm": 15.249493598937988, + "learning_rate": 9.291581887093557e-06, + "loss": 5.032, + "step": 182500 + }, + { + "epoch": 16.384649910233392, + "grad_norm": 14.886764526367188, + "learning_rate": 9.29133253540794e-06, + "loss": 5.2007, + "step": 182525 + }, + { + "epoch": 16.38689407540395, + "grad_norm": 15.650018692016602, + "learning_rate": 9.291083183722322e-06, + "loss": 5.0872, + "step": 182550 + }, + { + "epoch": 16.389138240574507, + "grad_norm": 15.16973876953125, + "learning_rate": 9.290833832036706e-06, + "loss": 4.8776, + "step": 182575 + }, + { + "epoch": 16.391382405745063, + "grad_norm": 15.986187934875488, + "learning_rate": 9.290584480351088e-06, + "loss": 5.0081, + "step": 182600 + }, + { + "epoch": 16.393626570915618, + "grad_norm": 15.45718765258789, + "learning_rate": 9.29033512866547e-06, + "loss": 4.9666, + "step": 182625 + }, + { + "epoch": 16.395870736086177, + "grad_norm": 18.52719497680664, + "learning_rate": 9.290085776979853e-06, + "loss": 4.9909, + "step": 182650 + }, + { + "epoch": 16.398114901256733, + "grad_norm": 20.343515396118164, + "learning_rate": 9.289836425294235e-06, + "loss": 5.1008, + "step": 182675 + }, + { + "epoch": 16.40035906642729, + "grad_norm": 17.27155876159668, + "learning_rate": 9.289587073608618e-06, + "loss": 5.2387, + "step": 182700 + }, + { + "epoch": 16.402603231597844, + "grad_norm": 18.670578002929688, + "learning_rate": 9.289337721923e-06, + "loss": 5.0662, + "step": 182725 + }, + { + "epoch": 16.404847396768403, + "grad_norm": 14.208568572998047, + "learning_rate": 9.289088370237384e-06, + "loss": 5.1319, + "step": 182750 + }, + { + "epoch": 16.40709156193896, + "grad_norm": 15.76247787475586, + "learning_rate": 9.288839018551766e-06, + "loss": 4.9905, + "step": 182775 + }, + { + "epoch": 16.409335727109514, + "grad_norm": 16.17038917541504, + "learning_rate": 9.288589666866149e-06, + "loss": 5.0863, + "step": 182800 + }, + { + "epoch": 16.411579892280074, + "grad_norm": 19.460773468017578, + "learning_rate": 9.288340315180531e-06, + "loss": 4.9106, + "step": 182825 + }, + { + "epoch": 16.41382405745063, + "grad_norm": 18.23048210144043, + "learning_rate": 9.288090963494913e-06, + "loss": 5.1759, + "step": 182850 + }, + { + "epoch": 16.416068222621185, + "grad_norm": 16.673093795776367, + "learning_rate": 9.287841611809296e-06, + "loss": 4.8272, + "step": 182875 + }, + { + "epoch": 16.41831238779174, + "grad_norm": 15.081791877746582, + "learning_rate": 9.28759226012368e-06, + "loss": 4.9799, + "step": 182900 + }, + { + "epoch": 16.4205565529623, + "grad_norm": 19.286365509033203, + "learning_rate": 9.287342908438062e-06, + "loss": 4.9667, + "step": 182925 + }, + { + "epoch": 16.422800718132855, + "grad_norm": 17.368366241455078, + "learning_rate": 9.287093556752444e-06, + "loss": 5.1768, + "step": 182950 + }, + { + "epoch": 16.42504488330341, + "grad_norm": 14.769664764404297, + "learning_rate": 9.286844205066827e-06, + "loss": 4.9987, + "step": 182975 + }, + { + "epoch": 16.427289048473966, + "grad_norm": 21.255399703979492, + "learning_rate": 9.28659485338121e-06, + "loss": 4.8192, + "step": 183000 + }, + { + "epoch": 16.429533213644525, + "grad_norm": 19.412050247192383, + "learning_rate": 9.286345501695591e-06, + "loss": 5.1293, + "step": 183025 + }, + { + "epoch": 16.43177737881508, + "grad_norm": 18.696334838867188, + "learning_rate": 9.286096150009975e-06, + "loss": 5.0248, + "step": 183050 + }, + { + "epoch": 16.434021543985637, + "grad_norm": 20.0008544921875, + "learning_rate": 9.285846798324358e-06, + "loss": 5.0928, + "step": 183075 + }, + { + "epoch": 16.436265709156196, + "grad_norm": 14.163819313049316, + "learning_rate": 9.28559744663874e-06, + "loss": 4.7293, + "step": 183100 + }, + { + "epoch": 16.43850987432675, + "grad_norm": 15.715277671813965, + "learning_rate": 9.285348094953122e-06, + "loss": 4.9135, + "step": 183125 + }, + { + "epoch": 16.440754039497307, + "grad_norm": 14.303266525268555, + "learning_rate": 9.285098743267506e-06, + "loss": 5.0024, + "step": 183150 + }, + { + "epoch": 16.442998204667862, + "grad_norm": 16.56536102294922, + "learning_rate": 9.284849391581888e-06, + "loss": 5.0352, + "step": 183175 + }, + { + "epoch": 16.44524236983842, + "grad_norm": 13.56983757019043, + "learning_rate": 9.28460003989627e-06, + "loss": 4.9796, + "step": 183200 + }, + { + "epoch": 16.447486535008977, + "grad_norm": 19.775264739990234, + "learning_rate": 9.284350688210653e-06, + "loss": 4.9978, + "step": 183225 + }, + { + "epoch": 16.449730700179533, + "grad_norm": 16.875822067260742, + "learning_rate": 9.284101336525035e-06, + "loss": 5.0382, + "step": 183250 + }, + { + "epoch": 16.45197486535009, + "grad_norm": 14.853713035583496, + "learning_rate": 9.283851984839418e-06, + "loss": 5.098, + "step": 183275 + }, + { + "epoch": 16.454219030520647, + "grad_norm": 24.54802131652832, + "learning_rate": 9.283602633153802e-06, + "loss": 4.8068, + "step": 183300 + }, + { + "epoch": 16.456463195691203, + "grad_norm": 18.058549880981445, + "learning_rate": 9.283353281468184e-06, + "loss": 5.3763, + "step": 183325 + }, + { + "epoch": 16.45870736086176, + "grad_norm": 17.23570442199707, + "learning_rate": 9.283103929782566e-06, + "loss": 5.0043, + "step": 183350 + }, + { + "epoch": 16.460951526032314, + "grad_norm": 18.16130256652832, + "learning_rate": 9.282854578096949e-06, + "loss": 4.8939, + "step": 183375 + }, + { + "epoch": 16.463195691202873, + "grad_norm": 19.462507247924805, + "learning_rate": 9.282605226411331e-06, + "loss": 5.184, + "step": 183400 + }, + { + "epoch": 16.46543985637343, + "grad_norm": 17.127195358276367, + "learning_rate": 9.282355874725713e-06, + "loss": 5.1398, + "step": 183425 + }, + { + "epoch": 16.467684021543985, + "grad_norm": 15.655888557434082, + "learning_rate": 9.282106523040096e-06, + "loss": 4.9855, + "step": 183450 + }, + { + "epoch": 16.469928186714544, + "grad_norm": 19.8765926361084, + "learning_rate": 9.28185717135448e-06, + "loss": 4.9495, + "step": 183475 + }, + { + "epoch": 16.4721723518851, + "grad_norm": 17.39535903930664, + "learning_rate": 9.281607819668862e-06, + "loss": 5.1116, + "step": 183500 + }, + { + "epoch": 16.474416517055655, + "grad_norm": 13.849928855895996, + "learning_rate": 9.281358467983244e-06, + "loss": 5.0122, + "step": 183525 + }, + { + "epoch": 16.47666068222621, + "grad_norm": 18.163394927978516, + "learning_rate": 9.281109116297627e-06, + "loss": 4.8744, + "step": 183550 + }, + { + "epoch": 16.47890484739677, + "grad_norm": 17.593442916870117, + "learning_rate": 9.280859764612009e-06, + "loss": 5.0559, + "step": 183575 + }, + { + "epoch": 16.481149012567325, + "grad_norm": 16.272096633911133, + "learning_rate": 9.280610412926391e-06, + "loss": 4.8768, + "step": 183600 + }, + { + "epoch": 16.48339317773788, + "grad_norm": 19.19920539855957, + "learning_rate": 9.280361061240775e-06, + "loss": 5.1701, + "step": 183625 + }, + { + "epoch": 16.485637342908436, + "grad_norm": 19.077795028686523, + "learning_rate": 9.280111709555158e-06, + "loss": 4.9578, + "step": 183650 + }, + { + "epoch": 16.487881508078996, + "grad_norm": 15.210630416870117, + "learning_rate": 9.27986235786954e-06, + "loss": 4.8028, + "step": 183675 + }, + { + "epoch": 16.49012567324955, + "grad_norm": 18.848947525024414, + "learning_rate": 9.279613006183922e-06, + "loss": 5.0881, + "step": 183700 + }, + { + "epoch": 16.492369838420107, + "grad_norm": 15.554614067077637, + "learning_rate": 9.279363654498306e-06, + "loss": 5.0088, + "step": 183725 + }, + { + "epoch": 16.494614003590666, + "grad_norm": 15.243609428405762, + "learning_rate": 9.279114302812687e-06, + "loss": 5.1671, + "step": 183750 + }, + { + "epoch": 16.49685816876122, + "grad_norm": 17.825653076171875, + "learning_rate": 9.278864951127071e-06, + "loss": 4.9954, + "step": 183775 + }, + { + "epoch": 16.499102333931777, + "grad_norm": 14.755377769470215, + "learning_rate": 9.278615599441453e-06, + "loss": 5.2395, + "step": 183800 + }, + { + "epoch": 16.501346499102333, + "grad_norm": 14.539373397827148, + "learning_rate": 9.278366247755835e-06, + "loss": 5.1361, + "step": 183825 + }, + { + "epoch": 16.503590664272892, + "grad_norm": 16.512821197509766, + "learning_rate": 9.278116896070218e-06, + "loss": 5.1638, + "step": 183850 + }, + { + "epoch": 16.505834829443447, + "grad_norm": 14.864096641540527, + "learning_rate": 9.277867544384602e-06, + "loss": 4.9101, + "step": 183875 + }, + { + "epoch": 16.508078994614003, + "grad_norm": 16.569791793823242, + "learning_rate": 9.277618192698984e-06, + "loss": 5.1331, + "step": 183900 + }, + { + "epoch": 16.51032315978456, + "grad_norm": 17.986011505126953, + "learning_rate": 9.277368841013365e-06, + "loss": 5.2435, + "step": 183925 + }, + { + "epoch": 16.512567324955118, + "grad_norm": 17.16109275817871, + "learning_rate": 9.277119489327749e-06, + "loss": 5.227, + "step": 183950 + }, + { + "epoch": 16.514811490125673, + "grad_norm": 17.564916610717773, + "learning_rate": 9.276870137642131e-06, + "loss": 4.9777, + "step": 183975 + }, + { + "epoch": 16.51705565529623, + "grad_norm": 20.44209098815918, + "learning_rate": 9.276620785956513e-06, + "loss": 4.9369, + "step": 184000 + }, + { + "epoch": 16.519299820466788, + "grad_norm": 14.533224105834961, + "learning_rate": 9.276371434270897e-06, + "loss": 5.0101, + "step": 184025 + }, + { + "epoch": 16.521543985637344, + "grad_norm": 14.813028335571289, + "learning_rate": 9.27612208258528e-06, + "loss": 4.9107, + "step": 184050 + }, + { + "epoch": 16.5237881508079, + "grad_norm": 19.765037536621094, + "learning_rate": 9.275872730899662e-06, + "loss": 5.2743, + "step": 184075 + }, + { + "epoch": 16.526032315978455, + "grad_norm": 19.441665649414062, + "learning_rate": 9.275633353281469e-06, + "loss": 5.072, + "step": 184100 + }, + { + "epoch": 16.528276481149014, + "grad_norm": 17.4279842376709, + "learning_rate": 9.275384001595851e-06, + "loss": 5.1426, + "step": 184125 + }, + { + "epoch": 16.53052064631957, + "grad_norm": 19.50225830078125, + "learning_rate": 9.275134649910235e-06, + "loss": 5.1734, + "step": 184150 + }, + { + "epoch": 16.532764811490125, + "grad_norm": 16.222177505493164, + "learning_rate": 9.274885298224616e-06, + "loss": 4.7732, + "step": 184175 + }, + { + "epoch": 16.53500897666068, + "grad_norm": 20.71293830871582, + "learning_rate": 9.274635946538998e-06, + "loss": 4.8717, + "step": 184200 + }, + { + "epoch": 16.53725314183124, + "grad_norm": 15.280790328979492, + "learning_rate": 9.274386594853382e-06, + "loss": 5.0375, + "step": 184225 + }, + { + "epoch": 16.539497307001795, + "grad_norm": 18.590435028076172, + "learning_rate": 9.274137243167765e-06, + "loss": 5.1733, + "step": 184250 + }, + { + "epoch": 16.54174147217235, + "grad_norm": 18.750778198242188, + "learning_rate": 9.273887891482147e-06, + "loss": 5.1748, + "step": 184275 + }, + { + "epoch": 16.543985637342907, + "grad_norm": 17.18494987487793, + "learning_rate": 9.273638539796531e-06, + "loss": 5.1085, + "step": 184300 + }, + { + "epoch": 16.546229802513466, + "grad_norm": 18.08639144897461, + "learning_rate": 9.273389188110913e-06, + "loss": 5.1557, + "step": 184325 + }, + { + "epoch": 16.54847396768402, + "grad_norm": 16.184673309326172, + "learning_rate": 9.273139836425294e-06, + "loss": 5.1974, + "step": 184350 + }, + { + "epoch": 16.550718132854577, + "grad_norm": 18.78789520263672, + "learning_rate": 9.272890484739678e-06, + "loss": 4.9415, + "step": 184375 + }, + { + "epoch": 16.552962298025136, + "grad_norm": 14.452696800231934, + "learning_rate": 9.27264113305406e-06, + "loss": 4.9634, + "step": 184400 + }, + { + "epoch": 16.55520646319569, + "grad_norm": 16.685962677001953, + "learning_rate": 9.272391781368442e-06, + "loss": 5.0566, + "step": 184425 + }, + { + "epoch": 16.557450628366247, + "grad_norm": 19.92796516418457, + "learning_rate": 9.272142429682825e-06, + "loss": 4.9068, + "step": 184450 + }, + { + "epoch": 16.559694793536803, + "grad_norm": 16.74758529663086, + "learning_rate": 9.271893077997209e-06, + "loss": 5.2769, + "step": 184475 + }, + { + "epoch": 16.561938958707362, + "grad_norm": 17.581235885620117, + "learning_rate": 9.271643726311591e-06, + "loss": 5.1096, + "step": 184500 + }, + { + "epoch": 16.564183123877918, + "grad_norm": 17.453950881958008, + "learning_rate": 9.271394374625973e-06, + "loss": 4.7688, + "step": 184525 + }, + { + "epoch": 16.566427289048473, + "grad_norm": 16.753753662109375, + "learning_rate": 9.271145022940356e-06, + "loss": 5.0641, + "step": 184550 + }, + { + "epoch": 16.56867145421903, + "grad_norm": 17.903789520263672, + "learning_rate": 9.270895671254738e-06, + "loss": 4.9558, + "step": 184575 + }, + { + "epoch": 16.570915619389588, + "grad_norm": 15.47413158416748, + "learning_rate": 9.27064631956912e-06, + "loss": 4.9718, + "step": 184600 + }, + { + "epoch": 16.573159784560143, + "grad_norm": 14.959647178649902, + "learning_rate": 9.270396967883504e-06, + "loss": 5.1719, + "step": 184625 + }, + { + "epoch": 16.5754039497307, + "grad_norm": 15.959126472473145, + "learning_rate": 9.270147616197887e-06, + "loss": 5.1096, + "step": 184650 + }, + { + "epoch": 16.57764811490126, + "grad_norm": 19.3013973236084, + "learning_rate": 9.269898264512269e-06, + "loss": 5.0496, + "step": 184675 + }, + { + "epoch": 16.579892280071814, + "grad_norm": 18.893945693969727, + "learning_rate": 9.269648912826651e-06, + "loss": 4.8748, + "step": 184700 + }, + { + "epoch": 16.58213644524237, + "grad_norm": 21.019222259521484, + "learning_rate": 9.269399561141034e-06, + "loss": 4.8866, + "step": 184725 + }, + { + "epoch": 16.584380610412925, + "grad_norm": 18.332538604736328, + "learning_rate": 9.269150209455416e-06, + "loss": 4.9335, + "step": 184750 + }, + { + "epoch": 16.586624775583484, + "grad_norm": 16.162044525146484, + "learning_rate": 9.2689008577698e-06, + "loss": 5.0882, + "step": 184775 + }, + { + "epoch": 16.58886894075404, + "grad_norm": 16.263687133789062, + "learning_rate": 9.268651506084182e-06, + "loss": 4.8981, + "step": 184800 + }, + { + "epoch": 16.591113105924595, + "grad_norm": 17.948623657226562, + "learning_rate": 9.268402154398565e-06, + "loss": 5.0553, + "step": 184825 + }, + { + "epoch": 16.59335727109515, + "grad_norm": 18.4186954498291, + "learning_rate": 9.268152802712947e-06, + "loss": 4.982, + "step": 184850 + }, + { + "epoch": 16.59560143626571, + "grad_norm": 18.17711067199707, + "learning_rate": 9.267903451027331e-06, + "loss": 5.0721, + "step": 184875 + }, + { + "epoch": 16.597845601436266, + "grad_norm": 16.54976463317871, + "learning_rate": 9.267654099341712e-06, + "loss": 4.7681, + "step": 184900 + }, + { + "epoch": 16.60008976660682, + "grad_norm": 19.2630672454834, + "learning_rate": 9.267404747656094e-06, + "loss": 5.3168, + "step": 184925 + }, + { + "epoch": 16.60233393177738, + "grad_norm": 17.64307403564453, + "learning_rate": 9.267155395970478e-06, + "loss": 4.9631, + "step": 184950 + }, + { + "epoch": 16.604578096947936, + "grad_norm": 13.729689598083496, + "learning_rate": 9.26690604428486e-06, + "loss": 4.8882, + "step": 184975 + }, + { + "epoch": 16.60682226211849, + "grad_norm": 18.444141387939453, + "learning_rate": 9.266656692599243e-06, + "loss": 5.1943, + "step": 185000 + }, + { + "epoch": 16.609066427289047, + "grad_norm": 20.227041244506836, + "learning_rate": 9.266407340913627e-06, + "loss": 4.964, + "step": 185025 + }, + { + "epoch": 16.611310592459606, + "grad_norm": 22.824447631835938, + "learning_rate": 9.266157989228009e-06, + "loss": 5.218, + "step": 185050 + }, + { + "epoch": 16.613554757630162, + "grad_norm": 19.295757293701172, + "learning_rate": 9.26590863754239e-06, + "loss": 4.9921, + "step": 185075 + }, + { + "epoch": 16.615798922800717, + "grad_norm": 18.730958938598633, + "learning_rate": 9.265659285856773e-06, + "loss": 5.1595, + "step": 185100 + }, + { + "epoch": 16.618043087971273, + "grad_norm": 17.36395263671875, + "learning_rate": 9.265409934171156e-06, + "loss": 5.0768, + "step": 185125 + }, + { + "epoch": 16.620287253141832, + "grad_norm": 16.024503707885742, + "learning_rate": 9.265160582485538e-06, + "loss": 5.0759, + "step": 185150 + }, + { + "epoch": 16.622531418312388, + "grad_norm": 18.256515502929688, + "learning_rate": 9.26491123079992e-06, + "loss": 5.0466, + "step": 185175 + }, + { + "epoch": 16.624775583482943, + "grad_norm": 13.651506423950195, + "learning_rate": 9.264661879114304e-06, + "loss": 5.0901, + "step": 185200 + }, + { + "epoch": 16.627019748653503, + "grad_norm": 19.935373306274414, + "learning_rate": 9.264412527428687e-06, + "loss": 4.9395, + "step": 185225 + }, + { + "epoch": 16.629263913824058, + "grad_norm": 18.005828857421875, + "learning_rate": 9.264163175743069e-06, + "loss": 5.1209, + "step": 185250 + }, + { + "epoch": 16.631508078994614, + "grad_norm": 18.567426681518555, + "learning_rate": 9.263913824057451e-06, + "loss": 5.2686, + "step": 185275 + }, + { + "epoch": 16.63375224416517, + "grad_norm": 17.151519775390625, + "learning_rate": 9.263664472371834e-06, + "loss": 5.1867, + "step": 185300 + }, + { + "epoch": 16.63599640933573, + "grad_norm": 15.068605422973633, + "learning_rate": 9.263415120686216e-06, + "loss": 5.0056, + "step": 185325 + }, + { + "epoch": 16.638240574506284, + "grad_norm": 18.49180030822754, + "learning_rate": 9.2631657690006e-06, + "loss": 5.0184, + "step": 185350 + }, + { + "epoch": 16.64048473967684, + "grad_norm": 23.132001876831055, + "learning_rate": 9.262916417314982e-06, + "loss": 4.9782, + "step": 185375 + }, + { + "epoch": 16.642728904847395, + "grad_norm": 19.97836685180664, + "learning_rate": 9.262667065629365e-06, + "loss": 4.9322, + "step": 185400 + }, + { + "epoch": 16.644973070017954, + "grad_norm": 20.588891983032227, + "learning_rate": 9.262417713943747e-06, + "loss": 4.9602, + "step": 185425 + }, + { + "epoch": 16.64721723518851, + "grad_norm": 13.944457054138184, + "learning_rate": 9.26216836225813e-06, + "loss": 5.2152, + "step": 185450 + }, + { + "epoch": 16.649461400359066, + "grad_norm": 15.823930740356445, + "learning_rate": 9.261919010572512e-06, + "loss": 5.1775, + "step": 185475 + }, + { + "epoch": 16.651705565529625, + "grad_norm": 16.09419059753418, + "learning_rate": 9.261669658886896e-06, + "loss": 4.9123, + "step": 185500 + }, + { + "epoch": 16.65394973070018, + "grad_norm": 19.340932846069336, + "learning_rate": 9.261420307201278e-06, + "loss": 5.1726, + "step": 185525 + }, + { + "epoch": 16.656193895870736, + "grad_norm": 16.511335372924805, + "learning_rate": 9.26117095551566e-06, + "loss": 5.1966, + "step": 185550 + }, + { + "epoch": 16.65843806104129, + "grad_norm": 15.186427116394043, + "learning_rate": 9.260921603830043e-06, + "loss": 5.422, + "step": 185575 + }, + { + "epoch": 16.66068222621185, + "grad_norm": 20.80697250366211, + "learning_rate": 9.260672252144425e-06, + "loss": 4.9442, + "step": 185600 + }, + { + "epoch": 16.662926391382406, + "grad_norm": 18.390121459960938, + "learning_rate": 9.260422900458807e-06, + "loss": 4.9481, + "step": 185625 + }, + { + "epoch": 16.66517055655296, + "grad_norm": 18.218263626098633, + "learning_rate": 9.26017354877319e-06, + "loss": 5.5418, + "step": 185650 + }, + { + "epoch": 16.667414721723517, + "grad_norm": 20.814313888549805, + "learning_rate": 9.259924197087574e-06, + "loss": 4.7945, + "step": 185675 + }, + { + "epoch": 16.669658886894076, + "grad_norm": 16.45659637451172, + "learning_rate": 9.259674845401956e-06, + "loss": 4.9792, + "step": 185700 + }, + { + "epoch": 16.671903052064632, + "grad_norm": 16.65972900390625, + "learning_rate": 9.259425493716338e-06, + "loss": 4.8725, + "step": 185725 + }, + { + "epoch": 16.674147217235188, + "grad_norm": 17.334476470947266, + "learning_rate": 9.259176142030722e-06, + "loss": 4.8932, + "step": 185750 + }, + { + "epoch": 16.676391382405747, + "grad_norm": 19.429094314575195, + "learning_rate": 9.258926790345103e-06, + "loss": 5.1888, + "step": 185775 + }, + { + "epoch": 16.678635547576302, + "grad_norm": 18.754825592041016, + "learning_rate": 9.258677438659485e-06, + "loss": 5.2362, + "step": 185800 + }, + { + "epoch": 16.680879712746858, + "grad_norm": 18.178857803344727, + "learning_rate": 9.258428086973869e-06, + "loss": 5.1536, + "step": 185825 + }, + { + "epoch": 16.683123877917414, + "grad_norm": 15.358358383178711, + "learning_rate": 9.258178735288251e-06, + "loss": 5.1625, + "step": 185850 + }, + { + "epoch": 16.685368043087973, + "grad_norm": 16.998485565185547, + "learning_rate": 9.257929383602634e-06, + "loss": 4.8307, + "step": 185875 + }, + { + "epoch": 16.68761220825853, + "grad_norm": 15.266356468200684, + "learning_rate": 9.257680031917016e-06, + "loss": 4.9814, + "step": 185900 + }, + { + "epoch": 16.689856373429084, + "grad_norm": 18.60629653930664, + "learning_rate": 9.2574306802314e-06, + "loss": 5.1749, + "step": 185925 + }, + { + "epoch": 16.69210053859964, + "grad_norm": 18.46822166442871, + "learning_rate": 9.25718132854578e-06, + "loss": 5.0962, + "step": 185950 + }, + { + "epoch": 16.6943447037702, + "grad_norm": 17.059438705444336, + "learning_rate": 9.256931976860165e-06, + "loss": 5.1204, + "step": 185975 + }, + { + "epoch": 16.696588868940754, + "grad_norm": 21.41077995300293, + "learning_rate": 9.256682625174547e-06, + "loss": 5.1642, + "step": 186000 + }, + { + "epoch": 16.69883303411131, + "grad_norm": 16.039949417114258, + "learning_rate": 9.25643327348893e-06, + "loss": 5.2439, + "step": 186025 + }, + { + "epoch": 16.70107719928187, + "grad_norm": 14.861481666564941, + "learning_rate": 9.256183921803312e-06, + "loss": 5.2589, + "step": 186050 + }, + { + "epoch": 16.703321364452425, + "grad_norm": 16.795621871948242, + "learning_rate": 9.255934570117696e-06, + "loss": 5.0252, + "step": 186075 + }, + { + "epoch": 16.70556552962298, + "grad_norm": 20.33220863342285, + "learning_rate": 9.255685218432078e-06, + "loss": 4.9408, + "step": 186100 + }, + { + "epoch": 16.707809694793536, + "grad_norm": 16.955835342407227, + "learning_rate": 9.255435866746459e-06, + "loss": 5.1602, + "step": 186125 + }, + { + "epoch": 16.710053859964095, + "grad_norm": 18.947874069213867, + "learning_rate": 9.255196489128267e-06, + "loss": 4.9678, + "step": 186150 + }, + { + "epoch": 16.71229802513465, + "grad_norm": 19.46836280822754, + "learning_rate": 9.25494713744265e-06, + "loss": 5.1611, + "step": 186175 + }, + { + "epoch": 16.714542190305206, + "grad_norm": 16.57416343688965, + "learning_rate": 9.254697785757032e-06, + "loss": 5.0539, + "step": 186200 + }, + { + "epoch": 16.71678635547576, + "grad_norm": 14.870833396911621, + "learning_rate": 9.254448434071414e-06, + "loss": 5.2877, + "step": 186225 + }, + { + "epoch": 16.71903052064632, + "grad_norm": 19.816761016845703, + "learning_rate": 9.254199082385798e-06, + "loss": 5.0434, + "step": 186250 + }, + { + "epoch": 16.721274685816876, + "grad_norm": 16.72554588317871, + "learning_rate": 9.25394973070018e-06, + "loss": 5.1323, + "step": 186275 + }, + { + "epoch": 16.723518850987432, + "grad_norm": 17.378509521484375, + "learning_rate": 9.253700379014563e-06, + "loss": 5.0268, + "step": 186300 + }, + { + "epoch": 16.725763016157988, + "grad_norm": 16.252878189086914, + "learning_rate": 9.253451027328945e-06, + "loss": 5.085, + "step": 186325 + }, + { + "epoch": 16.728007181328547, + "grad_norm": 17.272974014282227, + "learning_rate": 9.25320167564333e-06, + "loss": 5.1283, + "step": 186350 + }, + { + "epoch": 16.730251346499102, + "grad_norm": 13.94472599029541, + "learning_rate": 9.252952323957711e-06, + "loss": 4.9297, + "step": 186375 + }, + { + "epoch": 16.732495511669658, + "grad_norm": 16.52024269104004, + "learning_rate": 9.252702972272092e-06, + "loss": 5.0632, + "step": 186400 + }, + { + "epoch": 16.734739676840217, + "grad_norm": 18.43017578125, + "learning_rate": 9.252453620586476e-06, + "loss": 5.1157, + "step": 186425 + }, + { + "epoch": 16.736983842010773, + "grad_norm": 18.824129104614258, + "learning_rate": 9.252204268900858e-06, + "loss": 5.1537, + "step": 186450 + }, + { + "epoch": 16.739228007181328, + "grad_norm": 17.676340103149414, + "learning_rate": 9.25195491721524e-06, + "loss": 5.034, + "step": 186475 + }, + { + "epoch": 16.741472172351884, + "grad_norm": 17.20212745666504, + "learning_rate": 9.251705565529625e-06, + "loss": 5.2089, + "step": 186500 + }, + { + "epoch": 16.743716337522443, + "grad_norm": 21.581825256347656, + "learning_rate": 9.251456213844007e-06, + "loss": 5.0897, + "step": 186525 + }, + { + "epoch": 16.745960502693, + "grad_norm": 16.77974510192871, + "learning_rate": 9.25120686215839e-06, + "loss": 4.8846, + "step": 186550 + }, + { + "epoch": 16.748204667863554, + "grad_norm": 18.480764389038086, + "learning_rate": 9.250957510472772e-06, + "loss": 5.0118, + "step": 186575 + }, + { + "epoch": 16.75044883303411, + "grad_norm": 18.126651763916016, + "learning_rate": 9.250708158787154e-06, + "loss": 5.0296, + "step": 186600 + }, + { + "epoch": 16.75269299820467, + "grad_norm": 15.859213829040527, + "learning_rate": 9.250458807101536e-06, + "loss": 4.6951, + "step": 186625 + }, + { + "epoch": 16.754937163375224, + "grad_norm": 17.168689727783203, + "learning_rate": 9.250209455415919e-06, + "loss": 5.0927, + "step": 186650 + }, + { + "epoch": 16.75718132854578, + "grad_norm": 20.71480369567871, + "learning_rate": 9.249960103730303e-06, + "loss": 5.0312, + "step": 186675 + }, + { + "epoch": 16.75942549371634, + "grad_norm": 17.45353126525879, + "learning_rate": 9.249710752044685e-06, + "loss": 5.0654, + "step": 186700 + }, + { + "epoch": 16.761669658886895, + "grad_norm": 17.89164161682129, + "learning_rate": 9.249461400359067e-06, + "loss": 5.0582, + "step": 186725 + }, + { + "epoch": 16.76391382405745, + "grad_norm": 17.950891494750977, + "learning_rate": 9.24921204867345e-06, + "loss": 4.9048, + "step": 186750 + }, + { + "epoch": 16.766157989228006, + "grad_norm": 19.8294677734375, + "learning_rate": 9.248962696987832e-06, + "loss": 4.9111, + "step": 186775 + }, + { + "epoch": 16.768402154398565, + "grad_norm": 17.217153549194336, + "learning_rate": 9.248713345302214e-06, + "loss": 5.0196, + "step": 186800 + }, + { + "epoch": 16.77064631956912, + "grad_norm": 15.752894401550293, + "learning_rate": 9.248463993616598e-06, + "loss": 4.8257, + "step": 186825 + }, + { + "epoch": 16.772890484739676, + "grad_norm": 17.72098159790039, + "learning_rate": 9.24821464193098e-06, + "loss": 5.3884, + "step": 186850 + }, + { + "epoch": 16.775134649910232, + "grad_norm": 17.607620239257812, + "learning_rate": 9.247965290245363e-06, + "loss": 5.0648, + "step": 186875 + }, + { + "epoch": 16.77737881508079, + "grad_norm": 17.216596603393555, + "learning_rate": 9.247715938559745e-06, + "loss": 5.2824, + "step": 186900 + }, + { + "epoch": 16.779622980251347, + "grad_norm": 15.034346580505371, + "learning_rate": 9.247466586874128e-06, + "loss": 5.2308, + "step": 186925 + }, + { + "epoch": 16.781867145421902, + "grad_norm": 15.611014366149902, + "learning_rate": 9.24721723518851e-06, + "loss": 4.9987, + "step": 186950 + }, + { + "epoch": 16.78411131059246, + "grad_norm": 17.845491409301758, + "learning_rate": 9.246967883502894e-06, + "loss": 5.3289, + "step": 186975 + }, + { + "epoch": 16.786355475763017, + "grad_norm": 15.442728996276855, + "learning_rate": 9.246718531817276e-06, + "loss": 5.2282, + "step": 187000 + }, + { + "epoch": 16.788599640933572, + "grad_norm": 18.807790756225586, + "learning_rate": 9.246469180131658e-06, + "loss": 5.1298, + "step": 187025 + }, + { + "epoch": 16.790843806104128, + "grad_norm": 15.83580207824707, + "learning_rate": 9.24621982844604e-06, + "loss": 5.1411, + "step": 187050 + }, + { + "epoch": 16.793087971274687, + "grad_norm": 16.22919273376465, + "learning_rate": 9.245970476760425e-06, + "loss": 5.0075, + "step": 187075 + }, + { + "epoch": 16.795332136445243, + "grad_norm": 17.98222541809082, + "learning_rate": 9.245731099142232e-06, + "loss": 5.0708, + "step": 187100 + }, + { + "epoch": 16.7975763016158, + "grad_norm": 17.74541473388672, + "learning_rate": 9.245481747456614e-06, + "loss": 5.2379, + "step": 187125 + }, + { + "epoch": 16.799820466786354, + "grad_norm": 17.13970184326172, + "learning_rate": 9.245232395770996e-06, + "loss": 4.7979, + "step": 187150 + }, + { + "epoch": 16.802064631956913, + "grad_norm": 17.190814971923828, + "learning_rate": 9.244983044085379e-06, + "loss": 5.1909, + "step": 187175 + }, + { + "epoch": 16.80430879712747, + "grad_norm": 21.132844924926758, + "learning_rate": 9.244733692399761e-06, + "loss": 5.2257, + "step": 187200 + }, + { + "epoch": 16.806552962298024, + "grad_norm": 17.7537784576416, + "learning_rate": 9.244484340714143e-06, + "loss": 5.488, + "step": 187225 + }, + { + "epoch": 16.80879712746858, + "grad_norm": 21.363183975219727, + "learning_rate": 9.244234989028527e-06, + "loss": 5.0915, + "step": 187250 + }, + { + "epoch": 16.81104129263914, + "grad_norm": 15.828943252563477, + "learning_rate": 9.24398563734291e-06, + "loss": 4.7677, + "step": 187275 + }, + { + "epoch": 16.813285457809695, + "grad_norm": 13.574233055114746, + "learning_rate": 9.243736285657292e-06, + "loss": 5.0904, + "step": 187300 + }, + { + "epoch": 16.81552962298025, + "grad_norm": 18.690717697143555, + "learning_rate": 9.243486933971674e-06, + "loss": 5.4384, + "step": 187325 + }, + { + "epoch": 16.81777378815081, + "grad_norm": 15.1786527633667, + "learning_rate": 9.243237582286057e-06, + "loss": 4.9164, + "step": 187350 + }, + { + "epoch": 16.820017953321365, + "grad_norm": 15.112521171569824, + "learning_rate": 9.242988230600439e-06, + "loss": 5.0507, + "step": 187375 + }, + { + "epoch": 16.82226211849192, + "grad_norm": 14.773063659667969, + "learning_rate": 9.242738878914821e-06, + "loss": 5.0428, + "step": 187400 + }, + { + "epoch": 16.824506283662476, + "grad_norm": 17.041297912597656, + "learning_rate": 9.242489527229205e-06, + "loss": 5.0343, + "step": 187425 + }, + { + "epoch": 16.826750448833035, + "grad_norm": 19.036083221435547, + "learning_rate": 9.242240175543588e-06, + "loss": 5.4101, + "step": 187450 + }, + { + "epoch": 16.82899461400359, + "grad_norm": 16.254552841186523, + "learning_rate": 9.24199082385797e-06, + "loss": 4.9087, + "step": 187475 + }, + { + "epoch": 16.831238779174146, + "grad_norm": 17.010534286499023, + "learning_rate": 9.241741472172354e-06, + "loss": 5.1484, + "step": 187500 + }, + { + "epoch": 16.833482944344702, + "grad_norm": 16.33647918701172, + "learning_rate": 9.241492120486735e-06, + "loss": 5.158, + "step": 187525 + }, + { + "epoch": 16.83572710951526, + "grad_norm": 17.87706756591797, + "learning_rate": 9.241242768801117e-06, + "loss": 5.0408, + "step": 187550 + }, + { + "epoch": 16.837971274685817, + "grad_norm": 16.63873291015625, + "learning_rate": 9.2409934171155e-06, + "loss": 5.1151, + "step": 187575 + }, + { + "epoch": 16.840215439856372, + "grad_norm": 17.15445899963379, + "learning_rate": 9.240744065429883e-06, + "loss": 5.0572, + "step": 187600 + }, + { + "epoch": 16.84245960502693, + "grad_norm": 18.898834228515625, + "learning_rate": 9.240494713744265e-06, + "loss": 5.1542, + "step": 187625 + }, + { + "epoch": 16.844703770197487, + "grad_norm": 19.057968139648438, + "learning_rate": 9.240245362058648e-06, + "loss": 5.0517, + "step": 187650 + }, + { + "epoch": 16.846947935368043, + "grad_norm": 20.11956024169922, + "learning_rate": 9.239996010373032e-06, + "loss": 5.0834, + "step": 187675 + }, + { + "epoch": 16.8491921005386, + "grad_norm": 17.222618103027344, + "learning_rate": 9.239746658687412e-06, + "loss": 5.1406, + "step": 187700 + }, + { + "epoch": 16.851436265709157, + "grad_norm": 15.92898941040039, + "learning_rate": 9.239497307001796e-06, + "loss": 5.1119, + "step": 187725 + }, + { + "epoch": 16.853680430879713, + "grad_norm": 19.13507843017578, + "learning_rate": 9.239247955316179e-06, + "loss": 5.2161, + "step": 187750 + }, + { + "epoch": 16.85592459605027, + "grad_norm": 15.333892822265625, + "learning_rate": 9.238998603630561e-06, + "loss": 5.0957, + "step": 187775 + }, + { + "epoch": 16.858168761220824, + "grad_norm": 19.914955139160156, + "learning_rate": 9.238749251944943e-06, + "loss": 5.1359, + "step": 187800 + }, + { + "epoch": 16.860412926391383, + "grad_norm": 15.092683792114258, + "learning_rate": 9.238499900259327e-06, + "loss": 5.1914, + "step": 187825 + }, + { + "epoch": 16.86265709156194, + "grad_norm": 19.289960861206055, + "learning_rate": 9.23825054857371e-06, + "loss": 4.927, + "step": 187850 + }, + { + "epoch": 16.864901256732495, + "grad_norm": 15.50296401977539, + "learning_rate": 9.238001196888092e-06, + "loss": 4.9403, + "step": 187875 + }, + { + "epoch": 16.867145421903054, + "grad_norm": 19.481130599975586, + "learning_rate": 9.237751845202474e-06, + "loss": 4.764, + "step": 187900 + }, + { + "epoch": 16.86938958707361, + "grad_norm": 19.887027740478516, + "learning_rate": 9.237502493516857e-06, + "loss": 5.2565, + "step": 187925 + }, + { + "epoch": 16.871633752244165, + "grad_norm": 18.958969116210938, + "learning_rate": 9.237253141831239e-06, + "loss": 5.0463, + "step": 187950 + }, + { + "epoch": 16.87387791741472, + "grad_norm": 17.12819480895996, + "learning_rate": 9.237003790145623e-06, + "loss": 5.0951, + "step": 187975 + }, + { + "epoch": 16.87612208258528, + "grad_norm": 16.463241577148438, + "learning_rate": 9.236754438460005e-06, + "loss": 5.2252, + "step": 188000 + }, + { + "epoch": 16.878366247755835, + "grad_norm": 19.211456298828125, + "learning_rate": 9.236505086774388e-06, + "loss": 5.1562, + "step": 188025 + }, + { + "epoch": 16.88061041292639, + "grad_norm": 18.457242965698242, + "learning_rate": 9.23625573508877e-06, + "loss": 4.9591, + "step": 188050 + }, + { + "epoch": 16.882854578096946, + "grad_norm": 15.864561080932617, + "learning_rate": 9.236006383403152e-06, + "loss": 5.199, + "step": 188075 + }, + { + "epoch": 16.885098743267505, + "grad_norm": 15.784059524536133, + "learning_rate": 9.235757031717535e-06, + "loss": 5.1593, + "step": 188100 + }, + { + "epoch": 16.88734290843806, + "grad_norm": 17.147871017456055, + "learning_rate": 9.235507680031917e-06, + "loss": 5.0892, + "step": 188125 + }, + { + "epoch": 16.889587073608617, + "grad_norm": 16.352645874023438, + "learning_rate": 9.235258328346301e-06, + "loss": 5.2649, + "step": 188150 + }, + { + "epoch": 16.891831238779176, + "grad_norm": 24.440582275390625, + "learning_rate": 9.235008976660683e-06, + "loss": 5.0502, + "step": 188175 + }, + { + "epoch": 16.89407540394973, + "grad_norm": 16.783506393432617, + "learning_rate": 9.234759624975066e-06, + "loss": 5.2121, + "step": 188200 + }, + { + "epoch": 16.896319569120287, + "grad_norm": 17.61588478088379, + "learning_rate": 9.23451027328945e-06, + "loss": 5.3081, + "step": 188225 + }, + { + "epoch": 16.898563734290843, + "grad_norm": 17.233016967773438, + "learning_rate": 9.23426092160383e-06, + "loss": 5.1516, + "step": 188250 + }, + { + "epoch": 16.9008078994614, + "grad_norm": 19.851842880249023, + "learning_rate": 9.234011569918212e-06, + "loss": 4.915, + "step": 188275 + }, + { + "epoch": 16.903052064631957, + "grad_norm": 15.26449966430664, + "learning_rate": 9.233762218232596e-06, + "loss": 5.1802, + "step": 188300 + }, + { + "epoch": 16.905296229802513, + "grad_norm": 19.919645309448242, + "learning_rate": 9.233512866546979e-06, + "loss": 5.2924, + "step": 188325 + }, + { + "epoch": 16.90754039497307, + "grad_norm": 19.37224006652832, + "learning_rate": 9.233263514861361e-06, + "loss": 5.0779, + "step": 188350 + }, + { + "epoch": 16.909784560143628, + "grad_norm": 19.91158103942871, + "learning_rate": 9.233014163175743e-06, + "loss": 4.9158, + "step": 188375 + }, + { + "epoch": 16.912028725314183, + "grad_norm": 17.697566986083984, + "learning_rate": 9.232764811490127e-06, + "loss": 5.3011, + "step": 188400 + }, + { + "epoch": 16.91427289048474, + "grad_norm": 17.035381317138672, + "learning_rate": 9.232515459804508e-06, + "loss": 5.1913, + "step": 188425 + }, + { + "epoch": 16.916517055655298, + "grad_norm": 21.699703216552734, + "learning_rate": 9.232266108118892e-06, + "loss": 5.2482, + "step": 188450 + }, + { + "epoch": 16.918761220825854, + "grad_norm": 20.516979217529297, + "learning_rate": 9.232016756433274e-06, + "loss": 5.4836, + "step": 188475 + }, + { + "epoch": 16.92100538599641, + "grad_norm": 17.62624168395996, + "learning_rate": 9.231767404747657e-06, + "loss": 5.2332, + "step": 188500 + }, + { + "epoch": 16.923249551166965, + "grad_norm": 15.474282264709473, + "learning_rate": 9.231518053062039e-06, + "loss": 5.2653, + "step": 188525 + }, + { + "epoch": 16.925493716337524, + "grad_norm": 20.07545280456543, + "learning_rate": 9.231268701376423e-06, + "loss": 5.1459, + "step": 188550 + }, + { + "epoch": 16.92773788150808, + "grad_norm": 18.042543411254883, + "learning_rate": 9.231019349690805e-06, + "loss": 4.9698, + "step": 188575 + }, + { + "epoch": 16.929982046678635, + "grad_norm": 18.451513290405273, + "learning_rate": 9.230769998005186e-06, + "loss": 5.1941, + "step": 188600 + }, + { + "epoch": 16.93222621184919, + "grad_norm": 16.828231811523438, + "learning_rate": 9.23052064631957e-06, + "loss": 5.1216, + "step": 188625 + }, + { + "epoch": 16.93447037701975, + "grad_norm": 17.852556228637695, + "learning_rate": 9.230271294633952e-06, + "loss": 5.1992, + "step": 188650 + }, + { + "epoch": 16.936714542190305, + "grad_norm": 17.844451904296875, + "learning_rate": 9.230021942948335e-06, + "loss": 4.9576, + "step": 188675 + }, + { + "epoch": 16.93895870736086, + "grad_norm": 17.02151870727539, + "learning_rate": 9.229772591262719e-06, + "loss": 5.0512, + "step": 188700 + }, + { + "epoch": 16.94120287253142, + "grad_norm": 17.523639678955078, + "learning_rate": 9.229523239577101e-06, + "loss": 5.2137, + "step": 188725 + }, + { + "epoch": 16.943447037701976, + "grad_norm": 15.862361907958984, + "learning_rate": 9.229273887891483e-06, + "loss": 5.1951, + "step": 188750 + }, + { + "epoch": 16.94569120287253, + "grad_norm": 15.282449722290039, + "learning_rate": 9.229024536205866e-06, + "loss": 5.1114, + "step": 188775 + }, + { + "epoch": 16.947935368043087, + "grad_norm": 17.69174575805664, + "learning_rate": 9.228775184520248e-06, + "loss": 5.4171, + "step": 188800 + }, + { + "epoch": 16.950179533213646, + "grad_norm": 20.113311767578125, + "learning_rate": 9.22852583283463e-06, + "loss": 4.6129, + "step": 188825 + }, + { + "epoch": 16.9524236983842, + "grad_norm": 17.512920379638672, + "learning_rate": 9.228276481149013e-06, + "loss": 5.2822, + "step": 188850 + }, + { + "epoch": 16.954667863554757, + "grad_norm": 22.777692794799805, + "learning_rate": 9.228027129463397e-06, + "loss": 5.476, + "step": 188875 + }, + { + "epoch": 16.956912028725313, + "grad_norm": 16.10305404663086, + "learning_rate": 9.227777777777779e-06, + "loss": 4.9357, + "step": 188900 + }, + { + "epoch": 16.959156193895872, + "grad_norm": 14.664278030395508, + "learning_rate": 9.227528426092161e-06, + "loss": 5.4188, + "step": 188925 + }, + { + "epoch": 16.961400359066428, + "grad_norm": 18.008066177368164, + "learning_rate": 9.227279074406543e-06, + "loss": 5.1835, + "step": 188950 + }, + { + "epoch": 16.963644524236983, + "grad_norm": 20.021955490112305, + "learning_rate": 9.227029722720926e-06, + "loss": 5.1021, + "step": 188975 + }, + { + "epoch": 16.96588868940754, + "grad_norm": 16.97377586364746, + "learning_rate": 9.226780371035308e-06, + "loss": 5.4233, + "step": 189000 + }, + { + "epoch": 16.968132854578098, + "grad_norm": 17.688945770263672, + "learning_rate": 9.226531019349692e-06, + "loss": 4.9528, + "step": 189025 + }, + { + "epoch": 16.970377019748653, + "grad_norm": 18.50338363647461, + "learning_rate": 9.226281667664074e-06, + "loss": 5.245, + "step": 189050 + }, + { + "epoch": 16.97262118491921, + "grad_norm": 19.898250579833984, + "learning_rate": 9.226032315978457e-06, + "loss": 5.0214, + "step": 189075 + }, + { + "epoch": 16.974865350089768, + "grad_norm": 16.66047477722168, + "learning_rate": 9.225782964292839e-06, + "loss": 5.0317, + "step": 189100 + }, + { + "epoch": 16.977109515260324, + "grad_norm": 18.51504898071289, + "learning_rate": 9.225533612607221e-06, + "loss": 5.1287, + "step": 189125 + }, + { + "epoch": 16.97935368043088, + "grad_norm": 15.041906356811523, + "learning_rate": 9.225284260921604e-06, + "loss": 5.2123, + "step": 189150 + }, + { + "epoch": 16.981597845601435, + "grad_norm": 17.85148811340332, + "learning_rate": 9.225034909235988e-06, + "loss": 5.2041, + "step": 189175 + }, + { + "epoch": 16.983842010771994, + "grad_norm": 14.649674415588379, + "learning_rate": 9.22478555755037e-06, + "loss": 5.2225, + "step": 189200 + }, + { + "epoch": 16.98608617594255, + "grad_norm": 21.719728469848633, + "learning_rate": 9.224536205864752e-06, + "loss": 5.1202, + "step": 189225 + }, + { + "epoch": 16.988330341113105, + "grad_norm": 17.310100555419922, + "learning_rate": 9.224286854179135e-06, + "loss": 5.1847, + "step": 189250 + }, + { + "epoch": 16.99057450628366, + "grad_norm": 17.82012939453125, + "learning_rate": 9.224037502493519e-06, + "loss": 5.2418, + "step": 189275 + }, + { + "epoch": 16.99281867145422, + "grad_norm": 15.109498023986816, + "learning_rate": 9.2237881508079e-06, + "loss": 5.2058, + "step": 189300 + }, + { + "epoch": 16.995062836624776, + "grad_norm": 16.215335845947266, + "learning_rate": 9.223538799122283e-06, + "loss": 5.0053, + "step": 189325 + }, + { + "epoch": 16.99730700179533, + "grad_norm": 15.18300724029541, + "learning_rate": 9.223289447436666e-06, + "loss": 5.2994, + "step": 189350 + }, + { + "epoch": 16.99955116696589, + "grad_norm": 19.246883392333984, + "learning_rate": 9.223040095751048e-06, + "loss": 5.3048, + "step": 189375 + }, + { + "epoch": 17.0, + "eval_accuracy": 0.0714872166956256, + "eval_f1_macro": 0.007813176272864342, + "eval_f1_micro": 0.0714872166956256, + "eval_f1_weighted": 0.04193524763808181, + "eval_loss": 6.8395304679870605, + "eval_precision_macro": 0.007200145356448146, + "eval_precision_micro": 0.0714872166956256, + "eval_precision_weighted": 0.03450170012401786, + "eval_recall_macro": 0.012753371215806894, + "eval_recall_micro": 0.0714872166956256, + "eval_recall_weighted": 0.0714872166956256, + "eval_runtime": 129.8925, + "eval_samples_per_second": 403.203, + "eval_steps_per_second": 12.603, + "step": 189380 + }, + { + "epoch": 17.001795332136446, + "grad_norm": 15.594199180603027, + "learning_rate": 9.22279074406543e-06, + "loss": 4.9243, + "step": 189400 + }, + { + "epoch": 17.004039497307, + "grad_norm": 16.00056266784668, + "learning_rate": 9.222541392379814e-06, + "loss": 5.0426, + "step": 189425 + }, + { + "epoch": 17.006283662477557, + "grad_norm": 18.560861587524414, + "learning_rate": 9.222292040694197e-06, + "loss": 4.7416, + "step": 189450 + }, + { + "epoch": 17.008527827648116, + "grad_norm": 17.932411193847656, + "learning_rate": 9.222042689008577e-06, + "loss": 4.6929, + "step": 189475 + }, + { + "epoch": 17.010771992818672, + "grad_norm": 14.250992774963379, + "learning_rate": 9.221793337322961e-06, + "loss": 5.007, + "step": 189500 + }, + { + "epoch": 17.013016157989227, + "grad_norm": 18.423404693603516, + "learning_rate": 9.221543985637344e-06, + "loss": 4.6372, + "step": 189525 + }, + { + "epoch": 17.015260323159783, + "grad_norm": 18.490188598632812, + "learning_rate": 9.221294633951726e-06, + "loss": 5.0123, + "step": 189550 + }, + { + "epoch": 17.017504488330342, + "grad_norm": 18.280664443969727, + "learning_rate": 9.221045282266108e-06, + "loss": 4.9091, + "step": 189575 + }, + { + "epoch": 17.019748653500898, + "grad_norm": 16.205141067504883, + "learning_rate": 9.220795930580492e-06, + "loss": 4.5177, + "step": 189600 + }, + { + "epoch": 17.021992818671453, + "grad_norm": 16.59116554260254, + "learning_rate": 9.220546578894874e-06, + "loss": 4.6113, + "step": 189625 + }, + { + "epoch": 17.024236983842012, + "grad_norm": 16.786771774291992, + "learning_rate": 9.220297227209257e-06, + "loss": 4.899, + "step": 189650 + }, + { + "epoch": 17.026481149012568, + "grad_norm": 18.203813552856445, + "learning_rate": 9.220047875523639e-06, + "loss": 4.8147, + "step": 189675 + }, + { + "epoch": 17.028725314183124, + "grad_norm": 19.50302505493164, + "learning_rate": 9.219798523838021e-06, + "loss": 4.7023, + "step": 189700 + }, + { + "epoch": 17.03096947935368, + "grad_norm": 20.49473762512207, + "learning_rate": 9.219549172152404e-06, + "loss": 4.8231, + "step": 189725 + }, + { + "epoch": 17.03321364452424, + "grad_norm": 16.653005599975586, + "learning_rate": 9.219299820466788e-06, + "loss": 4.769, + "step": 189750 + }, + { + "epoch": 17.035457809694794, + "grad_norm": 15.220914840698242, + "learning_rate": 9.21905046878117e-06, + "loss": 4.7877, + "step": 189775 + }, + { + "epoch": 17.03770197486535, + "grad_norm": 15.631165504455566, + "learning_rate": 9.218801117095552e-06, + "loss": 4.9379, + "step": 189800 + }, + { + "epoch": 17.039946140035905, + "grad_norm": 15.26508617401123, + "learning_rate": 9.218551765409935e-06, + "loss": 4.8318, + "step": 189825 + }, + { + "epoch": 17.042190305206464, + "grad_norm": 19.450489044189453, + "learning_rate": 9.218302413724317e-06, + "loss": 4.7984, + "step": 189850 + }, + { + "epoch": 17.04443447037702, + "grad_norm": 17.00530242919922, + "learning_rate": 9.2180530620387e-06, + "loss": 4.986, + "step": 189875 + }, + { + "epoch": 17.046678635547575, + "grad_norm": 15.4158935546875, + "learning_rate": 9.217803710353083e-06, + "loss": 4.7238, + "step": 189900 + }, + { + "epoch": 17.048922800718135, + "grad_norm": 17.719655990600586, + "learning_rate": 9.217554358667466e-06, + "loss": 4.7503, + "step": 189925 + }, + { + "epoch": 17.05116696588869, + "grad_norm": 17.802705764770508, + "learning_rate": 9.217305006981848e-06, + "loss": 4.7709, + "step": 189950 + }, + { + "epoch": 17.053411131059246, + "grad_norm": 16.782121658325195, + "learning_rate": 9.21705565529623e-06, + "loss": 4.8106, + "step": 189975 + }, + { + "epoch": 17.0556552962298, + "grad_norm": 18.89069366455078, + "learning_rate": 9.216806303610614e-06, + "loss": 4.7796, + "step": 190000 + }, + { + "epoch": 17.05789946140036, + "grad_norm": 15.27401351928711, + "learning_rate": 9.216556951924995e-06, + "loss": 4.7499, + "step": 190025 + }, + { + "epoch": 17.060143626570916, + "grad_norm": 18.986766815185547, + "learning_rate": 9.216307600239379e-06, + "loss": 5.0244, + "step": 190050 + }, + { + "epoch": 17.06238779174147, + "grad_norm": 18.00425148010254, + "learning_rate": 9.216058248553761e-06, + "loss": 4.7241, + "step": 190075 + }, + { + "epoch": 17.064631956912027, + "grad_norm": 16.220455169677734, + "learning_rate": 9.215808896868144e-06, + "loss": 4.7029, + "step": 190100 + }, + { + "epoch": 17.066876122082586, + "grad_norm": 16.12566375732422, + "learning_rate": 9.215559545182526e-06, + "loss": 4.9257, + "step": 190125 + }, + { + "epoch": 17.069120287253142, + "grad_norm": 15.175679206848145, + "learning_rate": 9.21531019349691e-06, + "loss": 5.0299, + "step": 190150 + }, + { + "epoch": 17.071364452423698, + "grad_norm": 19.154159545898438, + "learning_rate": 9.215060841811292e-06, + "loss": 4.9132, + "step": 190175 + }, + { + "epoch": 17.073608617594253, + "grad_norm": 15.835864067077637, + "learning_rate": 9.214811490125673e-06, + "loss": 4.9734, + "step": 190200 + }, + { + "epoch": 17.075852782764812, + "grad_norm": 16.78561019897461, + "learning_rate": 9.214562138440057e-06, + "loss": 4.9449, + "step": 190225 + }, + { + "epoch": 17.078096947935368, + "grad_norm": 16.00258445739746, + "learning_rate": 9.214312786754439e-06, + "loss": 4.9754, + "step": 190250 + }, + { + "epoch": 17.080341113105924, + "grad_norm": 19.01149559020996, + "learning_rate": 9.214063435068821e-06, + "loss": 4.7183, + "step": 190275 + }, + { + "epoch": 17.082585278276483, + "grad_norm": 14.2882661819458, + "learning_rate": 9.213814083383204e-06, + "loss": 4.9009, + "step": 190300 + }, + { + "epoch": 17.08482944344704, + "grad_norm": 17.289819717407227, + "learning_rate": 9.213564731697588e-06, + "loss": 4.8904, + "step": 190325 + }, + { + "epoch": 17.087073608617594, + "grad_norm": 17.86229705810547, + "learning_rate": 9.21331538001197e-06, + "loss": 5.0236, + "step": 190350 + }, + { + "epoch": 17.08931777378815, + "grad_norm": 20.22635269165039, + "learning_rate": 9.213066028326352e-06, + "loss": 4.7449, + "step": 190375 + }, + { + "epoch": 17.09156193895871, + "grad_norm": 21.345354080200195, + "learning_rate": 9.212816676640735e-06, + "loss": 4.9215, + "step": 190400 + }, + { + "epoch": 17.093806104129264, + "grad_norm": 18.88791847229004, + "learning_rate": 9.212567324955117e-06, + "loss": 4.8338, + "step": 190425 + }, + { + "epoch": 17.09605026929982, + "grad_norm": 22.29041290283203, + "learning_rate": 9.2123179732695e-06, + "loss": 4.8169, + "step": 190450 + }, + { + "epoch": 17.098294434470375, + "grad_norm": 17.445985794067383, + "learning_rate": 9.212068621583883e-06, + "loss": 4.7709, + "step": 190475 + }, + { + "epoch": 17.100538599640934, + "grad_norm": 16.5963077545166, + "learning_rate": 9.211819269898266e-06, + "loss": 4.8182, + "step": 190500 + }, + { + "epoch": 17.10278276481149, + "grad_norm": 16.248886108398438, + "learning_rate": 9.211569918212648e-06, + "loss": 4.7467, + "step": 190525 + }, + { + "epoch": 17.105026929982046, + "grad_norm": 19.56587028503418, + "learning_rate": 9.21132056652703e-06, + "loss": 5.0478, + "step": 190550 + }, + { + "epoch": 17.107271095152605, + "grad_norm": 19.6122989654541, + "learning_rate": 9.211071214841413e-06, + "loss": 4.7696, + "step": 190575 + }, + { + "epoch": 17.10951526032316, + "grad_norm": 16.249711990356445, + "learning_rate": 9.210821863155795e-06, + "loss": 4.5909, + "step": 190600 + }, + { + "epoch": 17.111759425493716, + "grad_norm": 17.319202423095703, + "learning_rate": 9.210572511470179e-06, + "loss": 4.6758, + "step": 190625 + }, + { + "epoch": 17.11400359066427, + "grad_norm": 15.526357650756836, + "learning_rate": 9.210323159784561e-06, + "loss": 4.5381, + "step": 190650 + }, + { + "epoch": 17.11624775583483, + "grad_norm": 15.531072616577148, + "learning_rate": 9.210073808098944e-06, + "loss": 4.7178, + "step": 190675 + }, + { + "epoch": 17.118491921005386, + "grad_norm": 21.72906494140625, + "learning_rate": 9.209824456413326e-06, + "loss": 4.6712, + "step": 190700 + }, + { + "epoch": 17.120736086175942, + "grad_norm": 16.38371467590332, + "learning_rate": 9.209575104727708e-06, + "loss": 4.7726, + "step": 190725 + }, + { + "epoch": 17.122980251346497, + "grad_norm": 17.708194732666016, + "learning_rate": 9.20932575304209e-06, + "loss": 5.0582, + "step": 190750 + }, + { + "epoch": 17.125224416517057, + "grad_norm": 17.538366317749023, + "learning_rate": 9.209076401356475e-06, + "loss": 4.9276, + "step": 190775 + }, + { + "epoch": 17.127468581687612, + "grad_norm": 16.307838439941406, + "learning_rate": 9.208827049670857e-06, + "loss": 5.1069, + "step": 190800 + }, + { + "epoch": 17.129712746858168, + "grad_norm": 15.964040756225586, + "learning_rate": 9.20857769798524e-06, + "loss": 4.7281, + "step": 190825 + }, + { + "epoch": 17.131956912028727, + "grad_norm": 17.289833068847656, + "learning_rate": 9.208328346299622e-06, + "loss": 4.7473, + "step": 190850 + }, + { + "epoch": 17.134201077199283, + "grad_norm": 19.641050338745117, + "learning_rate": 9.208078994614006e-06, + "loss": 4.8097, + "step": 190875 + }, + { + "epoch": 17.136445242369838, + "grad_norm": 15.366628646850586, + "learning_rate": 9.207829642928386e-06, + "loss": 4.77, + "step": 190900 + }, + { + "epoch": 17.138689407540394, + "grad_norm": 17.558231353759766, + "learning_rate": 9.207580291242768e-06, + "loss": 4.8935, + "step": 190925 + }, + { + "epoch": 17.140933572710953, + "grad_norm": 14.589859008789062, + "learning_rate": 9.207330939557152e-06, + "loss": 4.662, + "step": 190950 + }, + { + "epoch": 17.14317773788151, + "grad_norm": 17.595947265625, + "learning_rate": 9.207081587871535e-06, + "loss": 4.9348, + "step": 190975 + }, + { + "epoch": 17.145421903052064, + "grad_norm": 17.325368881225586, + "learning_rate": 9.206832236185917e-06, + "loss": 4.6819, + "step": 191000 + }, + { + "epoch": 17.14766606822262, + "grad_norm": 19.183671951293945, + "learning_rate": 9.2065828845003e-06, + "loss": 5.0186, + "step": 191025 + }, + { + "epoch": 17.14991023339318, + "grad_norm": 14.733057022094727, + "learning_rate": 9.206333532814683e-06, + "loss": 4.8791, + "step": 191050 + }, + { + "epoch": 17.152154398563734, + "grad_norm": 18.677356719970703, + "learning_rate": 9.206084181129064e-06, + "loss": 4.6798, + "step": 191075 + }, + { + "epoch": 17.15439856373429, + "grad_norm": 18.427263259887695, + "learning_rate": 9.205834829443448e-06, + "loss": 4.9065, + "step": 191100 + }, + { + "epoch": 17.15664272890485, + "grad_norm": 16.379112243652344, + "learning_rate": 9.20558547775783e-06, + "loss": 4.9453, + "step": 191125 + }, + { + "epoch": 17.158886894075405, + "grad_norm": 16.381031036376953, + "learning_rate": 9.205336126072213e-06, + "loss": 4.8535, + "step": 191150 + }, + { + "epoch": 17.16113105924596, + "grad_norm": 15.063063621520996, + "learning_rate": 9.205086774386595e-06, + "loss": 4.947, + "step": 191175 + }, + { + "epoch": 17.163375224416516, + "grad_norm": 15.341886520385742, + "learning_rate": 9.204837422700979e-06, + "loss": 4.9425, + "step": 191200 + }, + { + "epoch": 17.165619389587075, + "grad_norm": 17.887407302856445, + "learning_rate": 9.204588071015361e-06, + "loss": 4.9177, + "step": 191225 + }, + { + "epoch": 17.16786355475763, + "grad_norm": 19.25507164001465, + "learning_rate": 9.204338719329744e-06, + "loss": 4.8256, + "step": 191250 + }, + { + "epoch": 17.170107719928186, + "grad_norm": 16.987144470214844, + "learning_rate": 9.204089367644126e-06, + "loss": 4.8658, + "step": 191275 + }, + { + "epoch": 17.17235188509874, + "grad_norm": 20.770769119262695, + "learning_rate": 9.203840015958508e-06, + "loss": 4.7044, + "step": 191300 + }, + { + "epoch": 17.1745960502693, + "grad_norm": 19.79143524169922, + "learning_rate": 9.20359066427289e-06, + "loss": 4.7483, + "step": 191325 + }, + { + "epoch": 17.176840215439857, + "grad_norm": 18.31952667236328, + "learning_rate": 9.203341312587275e-06, + "loss": 4.8269, + "step": 191350 + }, + { + "epoch": 17.179084380610412, + "grad_norm": 18.936426162719727, + "learning_rate": 9.203091960901657e-06, + "loss": 4.7629, + "step": 191375 + }, + { + "epoch": 17.18132854578097, + "grad_norm": 19.521333694458008, + "learning_rate": 9.20284260921604e-06, + "loss": 4.7772, + "step": 191400 + }, + { + "epoch": 17.183572710951527, + "grad_norm": 19.508211135864258, + "learning_rate": 9.202603231597846e-06, + "loss": 4.8069, + "step": 191425 + }, + { + "epoch": 17.185816876122082, + "grad_norm": 20.614887237548828, + "learning_rate": 9.202353879912229e-06, + "loss": 4.8147, + "step": 191450 + }, + { + "epoch": 17.188061041292638, + "grad_norm": 16.912532806396484, + "learning_rate": 9.202104528226613e-06, + "loss": 4.8928, + "step": 191475 + }, + { + "epoch": 17.190305206463197, + "grad_norm": 17.102506637573242, + "learning_rate": 9.201855176540995e-06, + "loss": 4.8872, + "step": 191500 + }, + { + "epoch": 17.192549371633753, + "grad_norm": 23.739212036132812, + "learning_rate": 9.201605824855377e-06, + "loss": 4.8017, + "step": 191525 + }, + { + "epoch": 17.19479353680431, + "grad_norm": 16.984588623046875, + "learning_rate": 9.20135647316976e-06, + "loss": 4.7818, + "step": 191550 + }, + { + "epoch": 17.197037701974864, + "grad_norm": 17.89945411682129, + "learning_rate": 9.201107121484142e-06, + "loss": 4.6442, + "step": 191575 + }, + { + "epoch": 17.199281867145423, + "grad_norm": 19.015361785888672, + "learning_rate": 9.200857769798524e-06, + "loss": 4.8076, + "step": 191600 + }, + { + "epoch": 17.20152603231598, + "grad_norm": 15.554059982299805, + "learning_rate": 9.200608418112908e-06, + "loss": 4.7703, + "step": 191625 + }, + { + "epoch": 17.203770197486534, + "grad_norm": 17.93268394470215, + "learning_rate": 9.20035906642729e-06, + "loss": 4.7768, + "step": 191650 + }, + { + "epoch": 17.20601436265709, + "grad_norm": 14.753843307495117, + "learning_rate": 9.200109714741673e-06, + "loss": 4.7761, + "step": 191675 + }, + { + "epoch": 17.20825852782765, + "grad_norm": 17.55898666381836, + "learning_rate": 9.199860363056055e-06, + "loss": 5.0117, + "step": 191700 + }, + { + "epoch": 17.210502692998205, + "grad_norm": 18.07535743713379, + "learning_rate": 9.199611011370437e-06, + "loss": 4.872, + "step": 191725 + }, + { + "epoch": 17.21274685816876, + "grad_norm": 21.915599822998047, + "learning_rate": 9.19936165968482e-06, + "loss": 4.7793, + "step": 191750 + }, + { + "epoch": 17.21499102333932, + "grad_norm": 15.762125015258789, + "learning_rate": 9.199112307999204e-06, + "loss": 4.8293, + "step": 191775 + }, + { + "epoch": 17.217235188509875, + "grad_norm": 19.021554946899414, + "learning_rate": 9.198862956313586e-06, + "loss": 4.8305, + "step": 191800 + }, + { + "epoch": 17.21947935368043, + "grad_norm": 17.527692794799805, + "learning_rate": 9.198613604627968e-06, + "loss": 4.8055, + "step": 191825 + }, + { + "epoch": 17.221723518850986, + "grad_norm": 20.00971794128418, + "learning_rate": 9.19836425294235e-06, + "loss": 4.7896, + "step": 191850 + }, + { + "epoch": 17.223967684021545, + "grad_norm": 18.001840591430664, + "learning_rate": 9.198114901256733e-06, + "loss": 4.9875, + "step": 191875 + }, + { + "epoch": 17.2262118491921, + "grad_norm": 15.054513931274414, + "learning_rate": 9.197865549571115e-06, + "loss": 5.0668, + "step": 191900 + }, + { + "epoch": 17.228456014362656, + "grad_norm": 16.862945556640625, + "learning_rate": 9.197616197885498e-06, + "loss": 4.9439, + "step": 191925 + }, + { + "epoch": 17.230700179533212, + "grad_norm": 16.821561813354492, + "learning_rate": 9.197366846199882e-06, + "loss": 4.9917, + "step": 191950 + }, + { + "epoch": 17.23294434470377, + "grad_norm": 15.640066146850586, + "learning_rate": 9.197117494514264e-06, + "loss": 4.9875, + "step": 191975 + }, + { + "epoch": 17.235188509874327, + "grad_norm": 18.778783798217773, + "learning_rate": 9.196868142828646e-06, + "loss": 4.7155, + "step": 192000 + }, + { + "epoch": 17.237432675044882, + "grad_norm": 15.925152778625488, + "learning_rate": 9.196618791143029e-06, + "loss": 4.9388, + "step": 192025 + }, + { + "epoch": 17.23967684021544, + "grad_norm": 18.722885131835938, + "learning_rate": 9.196369439457411e-06, + "loss": 4.8715, + "step": 192050 + }, + { + "epoch": 17.241921005385997, + "grad_norm": 16.92053985595703, + "learning_rate": 9.196120087771793e-06, + "loss": 4.6953, + "step": 192075 + }, + { + "epoch": 17.244165170556553, + "grad_norm": 19.00566864013672, + "learning_rate": 9.195870736086177e-06, + "loss": 4.9414, + "step": 192100 + }, + { + "epoch": 17.246409335727108, + "grad_norm": 16.92559051513672, + "learning_rate": 9.19562138440056e-06, + "loss": 5.0038, + "step": 192125 + }, + { + "epoch": 17.248653500897667, + "grad_norm": 17.27791404724121, + "learning_rate": 9.195372032714942e-06, + "loss": 4.973, + "step": 192150 + }, + { + "epoch": 17.250897666068223, + "grad_norm": 18.990999221801758, + "learning_rate": 9.195122681029324e-06, + "loss": 5.0578, + "step": 192175 + }, + { + "epoch": 17.25314183123878, + "grad_norm": 16.037967681884766, + "learning_rate": 9.194873329343708e-06, + "loss": 4.6445, + "step": 192200 + }, + { + "epoch": 17.255385996409334, + "grad_norm": 17.30224609375, + "learning_rate": 9.194623977658089e-06, + "loss": 4.7652, + "step": 192225 + }, + { + "epoch": 17.257630161579893, + "grad_norm": 17.805334091186523, + "learning_rate": 9.194374625972473e-06, + "loss": 5.0654, + "step": 192250 + }, + { + "epoch": 17.25987432675045, + "grad_norm": 16.26702308654785, + "learning_rate": 9.194125274286855e-06, + "loss": 4.5719, + "step": 192275 + }, + { + "epoch": 17.262118491921004, + "grad_norm": 20.040292739868164, + "learning_rate": 9.193875922601237e-06, + "loss": 4.7254, + "step": 192300 + }, + { + "epoch": 17.264362657091564, + "grad_norm": 15.516215324401855, + "learning_rate": 9.19362657091562e-06, + "loss": 4.6747, + "step": 192325 + }, + { + "epoch": 17.26660682226212, + "grad_norm": 16.343547821044922, + "learning_rate": 9.193377219230004e-06, + "loss": 4.9192, + "step": 192350 + }, + { + "epoch": 17.268850987432675, + "grad_norm": 17.34676742553711, + "learning_rate": 9.193127867544386e-06, + "loss": 5.0862, + "step": 192375 + }, + { + "epoch": 17.27109515260323, + "grad_norm": 18.53835105895996, + "learning_rate": 9.192878515858767e-06, + "loss": 5.2306, + "step": 192400 + }, + { + "epoch": 17.27333931777379, + "grad_norm": 16.445148468017578, + "learning_rate": 9.19262916417315e-06, + "loss": 5.0292, + "step": 192425 + }, + { + "epoch": 17.275583482944345, + "grad_norm": 19.804349899291992, + "learning_rate": 9.192379812487533e-06, + "loss": 4.9202, + "step": 192450 + }, + { + "epoch": 17.2778276481149, + "grad_norm": 17.0264835357666, + "learning_rate": 9.192130460801915e-06, + "loss": 4.9957, + "step": 192475 + }, + { + "epoch": 17.280071813285456, + "grad_norm": 21.053409576416016, + "learning_rate": 9.1918811091163e-06, + "loss": 5.1064, + "step": 192500 + }, + { + "epoch": 17.282315978456015, + "grad_norm": 20.684659957885742, + "learning_rate": 9.191631757430682e-06, + "loss": 4.947, + "step": 192525 + }, + { + "epoch": 17.28456014362657, + "grad_norm": 17.16590118408203, + "learning_rate": 9.191382405745064e-06, + "loss": 5.0034, + "step": 192550 + }, + { + "epoch": 17.286804308797127, + "grad_norm": 17.270355224609375, + "learning_rate": 9.191133054059446e-06, + "loss": 5.0641, + "step": 192575 + }, + { + "epoch": 17.289048473967686, + "grad_norm": 19.048227310180664, + "learning_rate": 9.190883702373829e-06, + "loss": 4.9771, + "step": 192600 + }, + { + "epoch": 17.29129263913824, + "grad_norm": 20.137113571166992, + "learning_rate": 9.190644324755637e-06, + "loss": 4.5731, + "step": 192625 + }, + { + "epoch": 17.293536804308797, + "grad_norm": 18.019804000854492, + "learning_rate": 9.190394973070018e-06, + "loss": 4.8909, + "step": 192650 + }, + { + "epoch": 17.295780969479353, + "grad_norm": 19.53738784790039, + "learning_rate": 9.1901456213844e-06, + "loss": 4.8648, + "step": 192675 + }, + { + "epoch": 17.29802513464991, + "grad_norm": 18.83368682861328, + "learning_rate": 9.189896269698784e-06, + "loss": 4.8969, + "step": 192700 + }, + { + "epoch": 17.300269299820467, + "grad_norm": 15.399246215820312, + "learning_rate": 9.189646918013167e-06, + "loss": 4.9236, + "step": 192725 + }, + { + "epoch": 17.302513464991023, + "grad_norm": 18.893722534179688, + "learning_rate": 9.189397566327549e-06, + "loss": 5.1228, + "step": 192750 + }, + { + "epoch": 17.30475763016158, + "grad_norm": 17.380088806152344, + "learning_rate": 9.189148214641931e-06, + "loss": 4.6943, + "step": 192775 + }, + { + "epoch": 17.307001795332138, + "grad_norm": 18.718765258789062, + "learning_rate": 9.188898862956315e-06, + "loss": 4.647, + "step": 192800 + }, + { + "epoch": 17.309245960502693, + "grad_norm": 19.808013916015625, + "learning_rate": 9.188649511270697e-06, + "loss": 4.6771, + "step": 192825 + }, + { + "epoch": 17.31149012567325, + "grad_norm": 16.554004669189453, + "learning_rate": 9.18840015958508e-06, + "loss": 5.1222, + "step": 192850 + }, + { + "epoch": 17.313734290843804, + "grad_norm": 19.904287338256836, + "learning_rate": 9.188150807899462e-06, + "loss": 4.7482, + "step": 192875 + }, + { + "epoch": 17.315978456014363, + "grad_norm": 18.64961814880371, + "learning_rate": 9.187901456213844e-06, + "loss": 4.8438, + "step": 192900 + }, + { + "epoch": 17.31822262118492, + "grad_norm": 19.670461654663086, + "learning_rate": 9.187652104528227e-06, + "loss": 4.9258, + "step": 192925 + }, + { + "epoch": 17.320466786355475, + "grad_norm": 17.695072174072266, + "learning_rate": 9.18740275284261e-06, + "loss": 5.2406, + "step": 192950 + }, + { + "epoch": 17.322710951526034, + "grad_norm": 17.367782592773438, + "learning_rate": 9.187153401156993e-06, + "loss": 4.8966, + "step": 192975 + }, + { + "epoch": 17.32495511669659, + "grad_norm": 20.06094741821289, + "learning_rate": 9.186904049471375e-06, + "loss": 4.8752, + "step": 193000 + }, + { + "epoch": 17.327199281867145, + "grad_norm": 17.913663864135742, + "learning_rate": 9.186654697785758e-06, + "loss": 5.0375, + "step": 193025 + }, + { + "epoch": 17.3294434470377, + "grad_norm": 19.25859832763672, + "learning_rate": 9.18640534610014e-06, + "loss": 4.8296, + "step": 193050 + }, + { + "epoch": 17.33168761220826, + "grad_norm": 17.80164909362793, + "learning_rate": 9.186155994414522e-06, + "loss": 4.572, + "step": 193075 + }, + { + "epoch": 17.333931777378815, + "grad_norm": 16.28984832763672, + "learning_rate": 9.185906642728906e-06, + "loss": 4.7347, + "step": 193100 + }, + { + "epoch": 17.33617594254937, + "grad_norm": 17.758607864379883, + "learning_rate": 9.185657291043289e-06, + "loss": 4.6949, + "step": 193125 + }, + { + "epoch": 17.338420107719926, + "grad_norm": 17.925430297851562, + "learning_rate": 9.185407939357671e-06, + "loss": 4.9525, + "step": 193150 + }, + { + "epoch": 17.340664272890486, + "grad_norm": 16.595178604125977, + "learning_rate": 9.185158587672053e-06, + "loss": 4.8324, + "step": 193175 + }, + { + "epoch": 17.34290843806104, + "grad_norm": 19.1475887298584, + "learning_rate": 9.184909235986436e-06, + "loss": 4.9407, + "step": 193200 + }, + { + "epoch": 17.345152603231597, + "grad_norm": 23.452472686767578, + "learning_rate": 9.184659884300818e-06, + "loss": 4.9735, + "step": 193225 + }, + { + "epoch": 17.347396768402156, + "grad_norm": 19.79327964782715, + "learning_rate": 9.184410532615202e-06, + "loss": 4.8823, + "step": 193250 + }, + { + "epoch": 17.34964093357271, + "grad_norm": 19.845630645751953, + "learning_rate": 9.184161180929584e-06, + "loss": 4.7852, + "step": 193275 + }, + { + "epoch": 17.351885098743267, + "grad_norm": 20.230485916137695, + "learning_rate": 9.183911829243967e-06, + "loss": 4.9939, + "step": 193300 + }, + { + "epoch": 17.354129263913823, + "grad_norm": 16.033414840698242, + "learning_rate": 9.183662477558349e-06, + "loss": 5.0021, + "step": 193325 + }, + { + "epoch": 17.356373429084382, + "grad_norm": 16.228504180908203, + "learning_rate": 9.183413125872733e-06, + "loss": 5.176, + "step": 193350 + }, + { + "epoch": 17.358617594254937, + "grad_norm": 17.15692710876465, + "learning_rate": 9.183163774187113e-06, + "loss": 4.9904, + "step": 193375 + }, + { + "epoch": 17.360861759425493, + "grad_norm": 18.866043090820312, + "learning_rate": 9.182914422501496e-06, + "loss": 4.7489, + "step": 193400 + }, + { + "epoch": 17.36310592459605, + "grad_norm": 18.728151321411133, + "learning_rate": 9.18266507081588e-06, + "loss": 5.1372, + "step": 193425 + }, + { + "epoch": 17.365350089766608, + "grad_norm": 17.408000946044922, + "learning_rate": 9.182415719130262e-06, + "loss": 4.9644, + "step": 193450 + }, + { + "epoch": 17.367594254937163, + "grad_norm": 16.494022369384766, + "learning_rate": 9.182166367444644e-06, + "loss": 4.9147, + "step": 193475 + }, + { + "epoch": 17.36983842010772, + "grad_norm": 18.798601150512695, + "learning_rate": 9.181917015759027e-06, + "loss": 4.7305, + "step": 193500 + }, + { + "epoch": 17.372082585278278, + "grad_norm": 17.50949478149414, + "learning_rate": 9.18166766407341e-06, + "loss": 4.8223, + "step": 193525 + }, + { + "epoch": 17.374326750448834, + "grad_norm": 19.427509307861328, + "learning_rate": 9.181418312387791e-06, + "loss": 5.0026, + "step": 193550 + }, + { + "epoch": 17.37657091561939, + "grad_norm": 14.705662727355957, + "learning_rate": 9.181168960702175e-06, + "loss": 4.7331, + "step": 193575 + }, + { + "epoch": 17.378815080789945, + "grad_norm": 18.58798599243164, + "learning_rate": 9.180919609016558e-06, + "loss": 4.8392, + "step": 193600 + }, + { + "epoch": 17.381059245960504, + "grad_norm": 19.198505401611328, + "learning_rate": 9.18067025733094e-06, + "loss": 4.8797, + "step": 193625 + }, + { + "epoch": 17.38330341113106, + "grad_norm": 16.716814041137695, + "learning_rate": 9.180420905645322e-06, + "loss": 4.6174, + "step": 193650 + }, + { + "epoch": 17.385547576301615, + "grad_norm": 18.448705673217773, + "learning_rate": 9.180171553959706e-06, + "loss": 5.0103, + "step": 193675 + }, + { + "epoch": 17.38779174147217, + "grad_norm": 23.316308975219727, + "learning_rate": 9.179922202274089e-06, + "loss": 5.0181, + "step": 193700 + }, + { + "epoch": 17.39003590664273, + "grad_norm": 19.149879455566406, + "learning_rate": 9.179672850588471e-06, + "loss": 5.0153, + "step": 193725 + }, + { + "epoch": 17.392280071813286, + "grad_norm": 18.1882381439209, + "learning_rate": 9.179423498902853e-06, + "loss": 4.8645, + "step": 193750 + }, + { + "epoch": 17.39452423698384, + "grad_norm": 16.486413955688477, + "learning_rate": 9.179174147217236e-06, + "loss": 4.908, + "step": 193775 + }, + { + "epoch": 17.3967684021544, + "grad_norm": 13.753571510314941, + "learning_rate": 9.178924795531618e-06, + "loss": 4.8408, + "step": 193800 + }, + { + "epoch": 17.399012567324956, + "grad_norm": 16.52808380126953, + "learning_rate": 9.178675443846002e-06, + "loss": 4.9402, + "step": 193825 + }, + { + "epoch": 17.40125673249551, + "grad_norm": 16.055557250976562, + "learning_rate": 9.178426092160384e-06, + "loss": 4.7505, + "step": 193850 + }, + { + "epoch": 17.403500897666067, + "grad_norm": 18.331209182739258, + "learning_rate": 9.178176740474767e-06, + "loss": 4.7912, + "step": 193875 + }, + { + "epoch": 17.405745062836626, + "grad_norm": 16.50755500793457, + "learning_rate": 9.177927388789149e-06, + "loss": 4.9017, + "step": 193900 + }, + { + "epoch": 17.40798922800718, + "grad_norm": 15.959660530090332, + "learning_rate": 9.177678037103531e-06, + "loss": 4.7305, + "step": 193925 + }, + { + "epoch": 17.410233393177737, + "grad_norm": 21.11240577697754, + "learning_rate": 9.177428685417914e-06, + "loss": 4.6619, + "step": 193950 + }, + { + "epoch": 17.412477558348293, + "grad_norm": 19.09105110168457, + "learning_rate": 9.177179333732298e-06, + "loss": 4.9156, + "step": 193975 + }, + { + "epoch": 17.414721723518852, + "grad_norm": 18.860925674438477, + "learning_rate": 9.17692998204668e-06, + "loss": 4.6742, + "step": 194000 + }, + { + "epoch": 17.416965888689408, + "grad_norm": 20.879051208496094, + "learning_rate": 9.176680630361062e-06, + "loss": 5.031, + "step": 194025 + }, + { + "epoch": 17.419210053859963, + "grad_norm": 19.465797424316406, + "learning_rate": 9.176431278675444e-06, + "loss": 5.0181, + "step": 194050 + }, + { + "epoch": 17.421454219030522, + "grad_norm": 18.136308670043945, + "learning_rate": 9.176181926989827e-06, + "loss": 5.005, + "step": 194075 + }, + { + "epoch": 17.423698384201078, + "grad_norm": 15.103575706481934, + "learning_rate": 9.175932575304209e-06, + "loss": 4.7799, + "step": 194100 + }, + { + "epoch": 17.425942549371634, + "grad_norm": 15.39364242553711, + "learning_rate": 9.175683223618591e-06, + "loss": 4.9047, + "step": 194125 + }, + { + "epoch": 17.42818671454219, + "grad_norm": 16.867124557495117, + "learning_rate": 9.175433871932975e-06, + "loss": 4.9247, + "step": 194150 + }, + { + "epoch": 17.43043087971275, + "grad_norm": 18.170936584472656, + "learning_rate": 9.175184520247358e-06, + "loss": 5.0493, + "step": 194175 + }, + { + "epoch": 17.432675044883304, + "grad_norm": 16.253877639770508, + "learning_rate": 9.17493516856174e-06, + "loss": 5.0276, + "step": 194200 + }, + { + "epoch": 17.43491921005386, + "grad_norm": 17.30860137939453, + "learning_rate": 9.174685816876122e-06, + "loss": 4.7972, + "step": 194225 + }, + { + "epoch": 17.437163375224415, + "grad_norm": 21.527006149291992, + "learning_rate": 9.174436465190505e-06, + "loss": 4.9401, + "step": 194250 + }, + { + "epoch": 17.439407540394974, + "grad_norm": 16.614625930786133, + "learning_rate": 9.174187113504887e-06, + "loss": 4.8817, + "step": 194275 + }, + { + "epoch": 17.44165170556553, + "grad_norm": 14.245438575744629, + "learning_rate": 9.173937761819271e-06, + "loss": 4.6534, + "step": 194300 + }, + { + "epoch": 17.443895870736085, + "grad_norm": 16.610492706298828, + "learning_rate": 9.173688410133653e-06, + "loss": 4.7886, + "step": 194325 + }, + { + "epoch": 17.446140035906645, + "grad_norm": 17.46538543701172, + "learning_rate": 9.173439058448036e-06, + "loss": 5.0411, + "step": 194350 + }, + { + "epoch": 17.4483842010772, + "grad_norm": 17.602519989013672, + "learning_rate": 9.173189706762418e-06, + "loss": 4.949, + "step": 194375 + }, + { + "epoch": 17.450628366247756, + "grad_norm": 15.287520408630371, + "learning_rate": 9.172940355076802e-06, + "loss": 4.7122, + "step": 194400 + }, + { + "epoch": 17.45287253141831, + "grad_norm": 18.610700607299805, + "learning_rate": 9.172691003391183e-06, + "loss": 4.9719, + "step": 194425 + }, + { + "epoch": 17.45511669658887, + "grad_norm": 18.389429092407227, + "learning_rate": 9.172441651705567e-06, + "loss": 4.8106, + "step": 194450 + }, + { + "epoch": 17.457360861759426, + "grad_norm": 17.550188064575195, + "learning_rate": 9.172192300019949e-06, + "loss": 5.156, + "step": 194475 + }, + { + "epoch": 17.45960502692998, + "grad_norm": 17.236108779907227, + "learning_rate": 9.171942948334331e-06, + "loss": 4.8132, + "step": 194500 + }, + { + "epoch": 17.461849192100537, + "grad_norm": 17.4379825592041, + "learning_rate": 9.171693596648714e-06, + "loss": 5.0627, + "step": 194525 + }, + { + "epoch": 17.464093357271096, + "grad_norm": 16.68580436706543, + "learning_rate": 9.171444244963098e-06, + "loss": 4.9617, + "step": 194550 + }, + { + "epoch": 17.466337522441652, + "grad_norm": 17.803646087646484, + "learning_rate": 9.17119489327748e-06, + "loss": 5.0999, + "step": 194575 + }, + { + "epoch": 17.468581687612208, + "grad_norm": 15.396665573120117, + "learning_rate": 9.170945541591862e-06, + "loss": 4.7702, + "step": 194600 + }, + { + "epoch": 17.470825852782763, + "grad_norm": 17.545574188232422, + "learning_rate": 9.170696189906245e-06, + "loss": 5.1807, + "step": 194625 + }, + { + "epoch": 17.473070017953322, + "grad_norm": 19.23912239074707, + "learning_rate": 9.170446838220627e-06, + "loss": 4.7657, + "step": 194650 + }, + { + "epoch": 17.475314183123878, + "grad_norm": 20.42339324951172, + "learning_rate": 9.17019748653501e-06, + "loss": 4.6625, + "step": 194675 + }, + { + "epoch": 17.477558348294433, + "grad_norm": 19.99170684814453, + "learning_rate": 9.169948134849393e-06, + "loss": 5.1752, + "step": 194700 + }, + { + "epoch": 17.479802513464993, + "grad_norm": 21.35711097717285, + "learning_rate": 9.169698783163776e-06, + "loss": 5.0653, + "step": 194725 + }, + { + "epoch": 17.482046678635548, + "grad_norm": 18.142112731933594, + "learning_rate": 9.169449431478158e-06, + "loss": 5.1233, + "step": 194750 + }, + { + "epoch": 17.484290843806104, + "grad_norm": 15.700729370117188, + "learning_rate": 9.16920007979254e-06, + "loss": 5.0534, + "step": 194775 + }, + { + "epoch": 17.48653500897666, + "grad_norm": 14.977165222167969, + "learning_rate": 9.168950728106922e-06, + "loss": 5.0929, + "step": 194800 + }, + { + "epoch": 17.48877917414722, + "grad_norm": 20.139387130737305, + "learning_rate": 9.168701376421305e-06, + "loss": 4.8989, + "step": 194825 + }, + { + "epoch": 17.491023339317774, + "grad_norm": 16.301353454589844, + "learning_rate": 9.168452024735687e-06, + "loss": 5.1651, + "step": 194850 + }, + { + "epoch": 17.49326750448833, + "grad_norm": 15.62158489227295, + "learning_rate": 9.168202673050071e-06, + "loss": 4.7648, + "step": 194875 + }, + { + "epoch": 17.495511669658885, + "grad_norm": 15.633852005004883, + "learning_rate": 9.167953321364453e-06, + "loss": 4.6927, + "step": 194900 + }, + { + "epoch": 17.497755834829444, + "grad_norm": 16.524324417114258, + "learning_rate": 9.167703969678836e-06, + "loss": 4.9691, + "step": 194925 + }, + { + "epoch": 17.5, + "grad_norm": 15.066422462463379, + "learning_rate": 9.16745461799322e-06, + "loss": 5.15, + "step": 194950 + }, + { + "epoch": 17.502244165170556, + "grad_norm": 16.695158004760742, + "learning_rate": 9.1672052663076e-06, + "loss": 5.1241, + "step": 194975 + }, + { + "epoch": 17.504488330341115, + "grad_norm": 16.321216583251953, + "learning_rate": 9.166955914621983e-06, + "loss": 4.9898, + "step": 195000 + }, + { + "epoch": 17.50673249551167, + "grad_norm": 18.11136817932129, + "learning_rate": 9.166706562936367e-06, + "loss": 4.8696, + "step": 195025 + }, + { + "epoch": 17.508976660682226, + "grad_norm": 17.18471908569336, + "learning_rate": 9.166457211250749e-06, + "loss": 4.7467, + "step": 195050 + }, + { + "epoch": 17.51122082585278, + "grad_norm": 17.630586624145508, + "learning_rate": 9.166207859565131e-06, + "loss": 4.7766, + "step": 195075 + }, + { + "epoch": 17.51346499102334, + "grad_norm": 17.359079360961914, + "learning_rate": 9.165958507879514e-06, + "loss": 4.9176, + "step": 195100 + }, + { + "epoch": 17.515709156193896, + "grad_norm": 16.655202865600586, + "learning_rate": 9.165709156193898e-06, + "loss": 4.9684, + "step": 195125 + }, + { + "epoch": 17.517953321364452, + "grad_norm": 14.971992492675781, + "learning_rate": 9.165459804508278e-06, + "loss": 5.0679, + "step": 195150 + }, + { + "epoch": 17.520197486535007, + "grad_norm": 18.56954002380371, + "learning_rate": 9.165210452822662e-06, + "loss": 4.9233, + "step": 195175 + }, + { + "epoch": 17.522441651705567, + "grad_norm": 16.780916213989258, + "learning_rate": 9.164961101137045e-06, + "loss": 4.9508, + "step": 195200 + }, + { + "epoch": 17.524685816876122, + "grad_norm": 18.95134925842285, + "learning_rate": 9.164711749451427e-06, + "loss": 4.5865, + "step": 195225 + }, + { + "epoch": 17.526929982046678, + "grad_norm": 17.698352813720703, + "learning_rate": 9.16446239776581e-06, + "loss": 4.921, + "step": 195250 + }, + { + "epoch": 17.529174147217237, + "grad_norm": 15.84782886505127, + "learning_rate": 9.164213046080193e-06, + "loss": 5.1271, + "step": 195275 + }, + { + "epoch": 17.531418312387792, + "grad_norm": 19.855375289916992, + "learning_rate": 9.163963694394576e-06, + "loss": 4.9741, + "step": 195300 + }, + { + "epoch": 17.533662477558348, + "grad_norm": 16.378297805786133, + "learning_rate": 9.163714342708956e-06, + "loss": 4.9313, + "step": 195325 + }, + { + "epoch": 17.535906642728904, + "grad_norm": 20.135963439941406, + "learning_rate": 9.16346499102334e-06, + "loss": 4.848, + "step": 195350 + }, + { + "epoch": 17.538150807899463, + "grad_norm": 17.76005744934082, + "learning_rate": 9.163215639337722e-06, + "loss": 4.8034, + "step": 195375 + }, + { + "epoch": 17.54039497307002, + "grad_norm": 16.406795501708984, + "learning_rate": 9.162966287652105e-06, + "loss": 4.9444, + "step": 195400 + }, + { + "epoch": 17.542639138240574, + "grad_norm": 19.058454513549805, + "learning_rate": 9.162716935966489e-06, + "loss": 5.0391, + "step": 195425 + }, + { + "epoch": 17.54488330341113, + "grad_norm": 20.55812644958496, + "learning_rate": 9.162467584280871e-06, + "loss": 5.1454, + "step": 195450 + }, + { + "epoch": 17.54712746858169, + "grad_norm": 20.306604385375977, + "learning_rate": 9.162218232595253e-06, + "loss": 4.8766, + "step": 195475 + }, + { + "epoch": 17.549371633752244, + "grad_norm": 15.20395278930664, + "learning_rate": 9.161968880909636e-06, + "loss": 4.8784, + "step": 195500 + }, + { + "epoch": 17.5516157989228, + "grad_norm": 20.80891227722168, + "learning_rate": 9.161719529224018e-06, + "loss": 4.9341, + "step": 195525 + }, + { + "epoch": 17.553859964093355, + "grad_norm": 22.083311080932617, + "learning_rate": 9.1614701775384e-06, + "loss": 4.9578, + "step": 195550 + }, + { + "epoch": 17.556104129263915, + "grad_norm": 19.252643585205078, + "learning_rate": 9.161220825852783e-06, + "loss": 5.016, + "step": 195575 + }, + { + "epoch": 17.55834829443447, + "grad_norm": 15.750102043151855, + "learning_rate": 9.160971474167167e-06, + "loss": 4.9837, + "step": 195600 + }, + { + "epoch": 17.560592459605026, + "grad_norm": 16.788461685180664, + "learning_rate": 9.160722122481549e-06, + "loss": 4.8148, + "step": 195625 + }, + { + "epoch": 17.562836624775585, + "grad_norm": 15.171133995056152, + "learning_rate": 9.160472770795931e-06, + "loss": 4.8007, + "step": 195650 + }, + { + "epoch": 17.56508078994614, + "grad_norm": 18.00978660583496, + "learning_rate": 9.160223419110314e-06, + "loss": 4.7684, + "step": 195675 + }, + { + "epoch": 17.567324955116696, + "grad_norm": 20.820985794067383, + "learning_rate": 9.159974067424696e-06, + "loss": 5.0466, + "step": 195700 + }, + { + "epoch": 17.56956912028725, + "grad_norm": 17.39849281311035, + "learning_rate": 9.159724715739078e-06, + "loss": 4.8476, + "step": 195725 + }, + { + "epoch": 17.57181328545781, + "grad_norm": 15.890702247619629, + "learning_rate": 9.159475364053462e-06, + "loss": 4.8335, + "step": 195750 + }, + { + "epoch": 17.574057450628366, + "grad_norm": 20.21729850769043, + "learning_rate": 9.159226012367845e-06, + "loss": 4.7907, + "step": 195775 + }, + { + "epoch": 17.576301615798922, + "grad_norm": 19.993783950805664, + "learning_rate": 9.158976660682227e-06, + "loss": 4.9136, + "step": 195800 + }, + { + "epoch": 17.578545780969478, + "grad_norm": 17.65111541748047, + "learning_rate": 9.15872730899661e-06, + "loss": 5.0319, + "step": 195825 + }, + { + "epoch": 17.580789946140037, + "grad_norm": 18.84161949157715, + "learning_rate": 9.158477957310992e-06, + "loss": 4.7184, + "step": 195850 + }, + { + "epoch": 17.583034111310592, + "grad_norm": 17.563692092895508, + "learning_rate": 9.158228605625374e-06, + "loss": 4.7043, + "step": 195875 + }, + { + "epoch": 17.585278276481148, + "grad_norm": 22.05570411682129, + "learning_rate": 9.157979253939758e-06, + "loss": 4.9721, + "step": 195900 + }, + { + "epoch": 17.587522441651707, + "grad_norm": 20.70229721069336, + "learning_rate": 9.15772990225414e-06, + "loss": 5.0787, + "step": 195925 + }, + { + "epoch": 17.589766606822263, + "grad_norm": 21.62171745300293, + "learning_rate": 9.157480550568523e-06, + "loss": 4.9078, + "step": 195950 + }, + { + "epoch": 17.59201077199282, + "grad_norm": 19.86174774169922, + "learning_rate": 9.157231198882905e-06, + "loss": 5.0858, + "step": 195975 + }, + { + "epoch": 17.594254937163374, + "grad_norm": 17.29832649230957, + "learning_rate": 9.156981847197289e-06, + "loss": 5.0333, + "step": 196000 + }, + { + "epoch": 17.596499102333933, + "grad_norm": 20.729232788085938, + "learning_rate": 9.15673249551167e-06, + "loss": 4.5831, + "step": 196025 + }, + { + "epoch": 17.59874326750449, + "grad_norm": 20.451236724853516, + "learning_rate": 9.156483143826052e-06, + "loss": 5.1108, + "step": 196050 + }, + { + "epoch": 17.600987432675044, + "grad_norm": 20.06171226501465, + "learning_rate": 9.156233792140436e-06, + "loss": 5.0133, + "step": 196075 + }, + { + "epoch": 17.6032315978456, + "grad_norm": 21.093948364257812, + "learning_rate": 9.155984440454818e-06, + "loss": 5.1351, + "step": 196100 + }, + { + "epoch": 17.60547576301616, + "grad_norm": 18.024208068847656, + "learning_rate": 9.1557350887692e-06, + "loss": 4.8055, + "step": 196125 + }, + { + "epoch": 17.607719928186714, + "grad_norm": 16.28180503845215, + "learning_rate": 9.155485737083584e-06, + "loss": 4.6501, + "step": 196150 + }, + { + "epoch": 17.60996409335727, + "grad_norm": 20.23795509338379, + "learning_rate": 9.155236385397967e-06, + "loss": 4.6661, + "step": 196175 + }, + { + "epoch": 17.61220825852783, + "grad_norm": 20.361942291259766, + "learning_rate": 9.154987033712347e-06, + "loss": 5.106, + "step": 196200 + }, + { + "epoch": 17.614452423698385, + "grad_norm": 17.6870059967041, + "learning_rate": 9.154737682026731e-06, + "loss": 4.7025, + "step": 196225 + }, + { + "epoch": 17.61669658886894, + "grad_norm": 16.74889373779297, + "learning_rate": 9.154488330341114e-06, + "loss": 4.8729, + "step": 196250 + }, + { + "epoch": 17.618940754039496, + "grad_norm": 17.64471435546875, + "learning_rate": 9.154238978655496e-06, + "loss": 4.8548, + "step": 196275 + }, + { + "epoch": 17.621184919210055, + "grad_norm": 18.92573356628418, + "learning_rate": 9.153989626969878e-06, + "loss": 5.2381, + "step": 196300 + }, + { + "epoch": 17.62342908438061, + "grad_norm": 24.610992431640625, + "learning_rate": 9.153740275284262e-06, + "loss": 4.7162, + "step": 196325 + }, + { + "epoch": 17.625673249551166, + "grad_norm": 14.817203521728516, + "learning_rate": 9.153490923598645e-06, + "loss": 4.6229, + "step": 196350 + }, + { + "epoch": 17.627917414721722, + "grad_norm": 16.83196449279785, + "learning_rate": 9.153241571913027e-06, + "loss": 4.9321, + "step": 196375 + }, + { + "epoch": 17.63016157989228, + "grad_norm": 17.999980926513672, + "learning_rate": 9.15299222022741e-06, + "loss": 5.0981, + "step": 196400 + }, + { + "epoch": 17.632405745062837, + "grad_norm": 15.689520835876465, + "learning_rate": 9.152742868541792e-06, + "loss": 4.7882, + "step": 196425 + }, + { + "epoch": 17.634649910233392, + "grad_norm": 16.381160736083984, + "learning_rate": 9.152493516856174e-06, + "loss": 4.9338, + "step": 196450 + }, + { + "epoch": 17.63689407540395, + "grad_norm": 19.714815139770508, + "learning_rate": 9.152244165170558e-06, + "loss": 4.8139, + "step": 196475 + }, + { + "epoch": 17.639138240574507, + "grad_norm": 20.828258514404297, + "learning_rate": 9.15199481348494e-06, + "loss": 4.9031, + "step": 196500 + }, + { + "epoch": 17.641382405745063, + "grad_norm": 18.718822479248047, + "learning_rate": 9.151745461799323e-06, + "loss": 5.2161, + "step": 196525 + }, + { + "epoch": 17.643626570915618, + "grad_norm": 19.62946128845215, + "learning_rate": 9.151496110113705e-06, + "loss": 4.8092, + "step": 196550 + }, + { + "epoch": 17.645870736086177, + "grad_norm": 19.809635162353516, + "learning_rate": 9.151246758428087e-06, + "loss": 4.9708, + "step": 196575 + }, + { + "epoch": 17.648114901256733, + "grad_norm": 19.20097541809082, + "learning_rate": 9.15099740674247e-06, + "loss": 5.0058, + "step": 196600 + }, + { + "epoch": 17.65035906642729, + "grad_norm": 19.23854637145996, + "learning_rate": 9.150748055056854e-06, + "loss": 4.702, + "step": 196625 + }, + { + "epoch": 17.652603231597844, + "grad_norm": 22.895288467407227, + "learning_rate": 9.150498703371236e-06, + "loss": 4.9686, + "step": 196650 + }, + { + "epoch": 17.654847396768403, + "grad_norm": 15.60940170288086, + "learning_rate": 9.150249351685618e-06, + "loss": 4.6774, + "step": 196675 + }, + { + "epoch": 17.65709156193896, + "grad_norm": 15.580758094787598, + "learning_rate": 9.15e-06, + "loss": 5.1085, + "step": 196700 + }, + { + "epoch": 17.659335727109514, + "grad_norm": 20.79538917541504, + "learning_rate": 9.149750648314384e-06, + "loss": 4.8912, + "step": 196725 + }, + { + "epoch": 17.661579892280074, + "grad_norm": 19.493671417236328, + "learning_rate": 9.149501296628765e-06, + "loss": 4.872, + "step": 196750 + }, + { + "epoch": 17.66382405745063, + "grad_norm": 16.27743911743164, + "learning_rate": 9.149261919010574e-06, + "loss": 4.7718, + "step": 196775 + }, + { + "epoch": 17.666068222621185, + "grad_norm": 19.589496612548828, + "learning_rate": 9.149012567324956e-06, + "loss": 4.8836, + "step": 196800 + }, + { + "epoch": 17.66831238779174, + "grad_norm": 17.268726348876953, + "learning_rate": 9.148763215639338e-06, + "loss": 5.1125, + "step": 196825 + }, + { + "epoch": 17.6705565529623, + "grad_norm": 24.156484603881836, + "learning_rate": 9.14851386395372e-06, + "loss": 5.1527, + "step": 196850 + }, + { + "epoch": 17.672800718132855, + "grad_norm": 17.418399810791016, + "learning_rate": 9.148264512268103e-06, + "loss": 5.0805, + "step": 196875 + }, + { + "epoch": 17.67504488330341, + "grad_norm": 17.344987869262695, + "learning_rate": 9.148015160582487e-06, + "loss": 4.9747, + "step": 196900 + }, + { + "epoch": 17.677289048473966, + "grad_norm": 16.81334114074707, + "learning_rate": 9.14776580889687e-06, + "loss": 4.8178, + "step": 196925 + }, + { + "epoch": 17.679533213644525, + "grad_norm": 17.149824142456055, + "learning_rate": 9.147516457211252e-06, + "loss": 5.0707, + "step": 196950 + }, + { + "epoch": 17.68177737881508, + "grad_norm": 17.664596557617188, + "learning_rate": 9.147267105525634e-06, + "loss": 4.918, + "step": 196975 + }, + { + "epoch": 17.684021543985637, + "grad_norm": 18.26732635498047, + "learning_rate": 9.147017753840016e-06, + "loss": 5.0513, + "step": 197000 + }, + { + "epoch": 17.686265709156196, + "grad_norm": 20.97850227355957, + "learning_rate": 9.146768402154399e-06, + "loss": 4.9436, + "step": 197025 + }, + { + "epoch": 17.68850987432675, + "grad_norm": 18.22222137451172, + "learning_rate": 9.146519050468781e-06, + "loss": 5.0131, + "step": 197050 + }, + { + "epoch": 17.690754039497307, + "grad_norm": 18.235769271850586, + "learning_rate": 9.146269698783165e-06, + "loss": 4.9424, + "step": 197075 + }, + { + "epoch": 17.692998204667862, + "grad_norm": 12.930278778076172, + "learning_rate": 9.146020347097547e-06, + "loss": 4.8751, + "step": 197100 + }, + { + "epoch": 17.69524236983842, + "grad_norm": 18.80584716796875, + "learning_rate": 9.14577099541193e-06, + "loss": 4.9225, + "step": 197125 + }, + { + "epoch": 17.697486535008977, + "grad_norm": 19.355207443237305, + "learning_rate": 9.145521643726314e-06, + "loss": 4.7906, + "step": 197150 + }, + { + "epoch": 17.699730700179533, + "grad_norm": 20.468027114868164, + "learning_rate": 9.145272292040694e-06, + "loss": 4.8574, + "step": 197175 + }, + { + "epoch": 17.70197486535009, + "grad_norm": 20.840436935424805, + "learning_rate": 9.145022940355077e-06, + "loss": 5.0455, + "step": 197200 + }, + { + "epoch": 17.704219030520647, + "grad_norm": 20.917898178100586, + "learning_rate": 9.14477358866946e-06, + "loss": 4.9126, + "step": 197225 + }, + { + "epoch": 17.706463195691203, + "grad_norm": 17.79859733581543, + "learning_rate": 9.144524236983843e-06, + "loss": 5.0243, + "step": 197250 + }, + { + "epoch": 17.70870736086176, + "grad_norm": 16.9219913482666, + "learning_rate": 9.144274885298225e-06, + "loss": 4.8484, + "step": 197275 + }, + { + "epoch": 17.710951526032314, + "grad_norm": 21.47732925415039, + "learning_rate": 9.144025533612607e-06, + "loss": 5.0302, + "step": 197300 + }, + { + "epoch": 17.713195691202873, + "grad_norm": 13.90006160736084, + "learning_rate": 9.143776181926991e-06, + "loss": 5.0103, + "step": 197325 + }, + { + "epoch": 17.71543985637343, + "grad_norm": 19.98970603942871, + "learning_rate": 9.143526830241372e-06, + "loss": 4.8546, + "step": 197350 + }, + { + "epoch": 17.717684021543985, + "grad_norm": 21.04457664489746, + "learning_rate": 9.143277478555756e-06, + "loss": 5.0028, + "step": 197375 + }, + { + "epoch": 17.719928186714544, + "grad_norm": 16.908336639404297, + "learning_rate": 9.143028126870138e-06, + "loss": 5.0583, + "step": 197400 + }, + { + "epoch": 17.7221723518851, + "grad_norm": 17.513364791870117, + "learning_rate": 9.14277877518452e-06, + "loss": 4.9236, + "step": 197425 + }, + { + "epoch": 17.724416517055655, + "grad_norm": 18.650217056274414, + "learning_rate": 9.142529423498903e-06, + "loss": 4.7732, + "step": 197450 + }, + { + "epoch": 17.72666068222621, + "grad_norm": 17.86042022705078, + "learning_rate": 9.142280071813287e-06, + "loss": 4.9984, + "step": 197475 + }, + { + "epoch": 17.72890484739677, + "grad_norm": 17.34099006652832, + "learning_rate": 9.14203072012767e-06, + "loss": 4.9474, + "step": 197500 + }, + { + "epoch": 17.731149012567325, + "grad_norm": 17.02675437927246, + "learning_rate": 9.141781368442052e-06, + "loss": 4.8268, + "step": 197525 + }, + { + "epoch": 17.73339317773788, + "grad_norm": 18.973703384399414, + "learning_rate": 9.141532016756434e-06, + "loss": 4.87, + "step": 197550 + }, + { + "epoch": 17.735637342908436, + "grad_norm": 19.546737670898438, + "learning_rate": 9.141282665070816e-06, + "loss": 4.9235, + "step": 197575 + }, + { + "epoch": 17.737881508078996, + "grad_norm": 18.785184860229492, + "learning_rate": 9.141033313385199e-06, + "loss": 4.9229, + "step": 197600 + }, + { + "epoch": 17.74012567324955, + "grad_norm": 20.496131896972656, + "learning_rate": 9.140783961699583e-06, + "loss": 4.9735, + "step": 197625 + }, + { + "epoch": 17.742369838420107, + "grad_norm": 16.347551345825195, + "learning_rate": 9.140534610013965e-06, + "loss": 5.1171, + "step": 197650 + }, + { + "epoch": 17.744614003590666, + "grad_norm": 17.38701057434082, + "learning_rate": 9.140285258328347e-06, + "loss": 4.9143, + "step": 197675 + }, + { + "epoch": 17.74685816876122, + "grad_norm": 16.594186782836914, + "learning_rate": 9.14003590664273e-06, + "loss": 5.0782, + "step": 197700 + }, + { + "epoch": 17.749102333931777, + "grad_norm": 19.308706283569336, + "learning_rate": 9.139786554957112e-06, + "loss": 5.1426, + "step": 197725 + }, + { + "epoch": 17.751346499102333, + "grad_norm": 14.822589874267578, + "learning_rate": 9.139537203271494e-06, + "loss": 4.9797, + "step": 197750 + }, + { + "epoch": 17.753590664272892, + "grad_norm": 15.088176727294922, + "learning_rate": 9.139287851585877e-06, + "loss": 5.0627, + "step": 197775 + }, + { + "epoch": 17.755834829443447, + "grad_norm": 17.182863235473633, + "learning_rate": 9.13903849990026e-06, + "loss": 4.9348, + "step": 197800 + }, + { + "epoch": 17.758078994614003, + "grad_norm": 16.278778076171875, + "learning_rate": 9.138789148214643e-06, + "loss": 4.8449, + "step": 197825 + }, + { + "epoch": 17.76032315978456, + "grad_norm": 18.11627960205078, + "learning_rate": 9.138539796529025e-06, + "loss": 4.9511, + "step": 197850 + }, + { + "epoch": 17.762567324955118, + "grad_norm": 17.695999145507812, + "learning_rate": 9.13829044484341e-06, + "loss": 4.9786, + "step": 197875 + }, + { + "epoch": 17.764811490125673, + "grad_norm": 15.079052925109863, + "learning_rate": 9.13804109315779e-06, + "loss": 5.0889, + "step": 197900 + }, + { + "epoch": 17.76705565529623, + "grad_norm": 19.580331802368164, + "learning_rate": 9.137791741472172e-06, + "loss": 5.0588, + "step": 197925 + }, + { + "epoch": 17.769299820466788, + "grad_norm": 17.092809677124023, + "learning_rate": 9.137542389786556e-06, + "loss": 5.1136, + "step": 197950 + }, + { + "epoch": 17.771543985637344, + "grad_norm": 14.684435844421387, + "learning_rate": 9.137293038100938e-06, + "loss": 5.1252, + "step": 197975 + }, + { + "epoch": 17.7737881508079, + "grad_norm": 18.424190521240234, + "learning_rate": 9.13704368641532e-06, + "loss": 5.1872, + "step": 198000 + }, + { + "epoch": 17.776032315978455, + "grad_norm": 14.008508682250977, + "learning_rate": 9.136794334729703e-06, + "loss": 5.0789, + "step": 198025 + }, + { + "epoch": 17.778276481149014, + "grad_norm": 17.85378074645996, + "learning_rate": 9.136544983044087e-06, + "loss": 5.0436, + "step": 198050 + }, + { + "epoch": 17.78052064631957, + "grad_norm": 17.065696716308594, + "learning_rate": 9.136295631358468e-06, + "loss": 5.0894, + "step": 198075 + }, + { + "epoch": 17.782764811490125, + "grad_norm": 21.047061920166016, + "learning_rate": 9.136046279672852e-06, + "loss": 4.8914, + "step": 198100 + }, + { + "epoch": 17.78500897666068, + "grad_norm": 17.689470291137695, + "learning_rate": 9.135796927987234e-06, + "loss": 5.1706, + "step": 198125 + }, + { + "epoch": 17.78725314183124, + "grad_norm": 22.240434646606445, + "learning_rate": 9.135547576301616e-06, + "loss": 5.232, + "step": 198150 + }, + { + "epoch": 17.789497307001795, + "grad_norm": 17.079946517944336, + "learning_rate": 9.135298224615999e-06, + "loss": 4.919, + "step": 198175 + }, + { + "epoch": 17.79174147217235, + "grad_norm": 16.52745246887207, + "learning_rate": 9.135048872930383e-06, + "loss": 4.9754, + "step": 198200 + }, + { + "epoch": 17.793985637342907, + "grad_norm": 19.563385009765625, + "learning_rate": 9.134799521244765e-06, + "loss": 5.0279, + "step": 198225 + }, + { + "epoch": 17.796229802513466, + "grad_norm": 17.73080825805664, + "learning_rate": 9.134550169559147e-06, + "loss": 5.2121, + "step": 198250 + }, + { + "epoch": 17.79847396768402, + "grad_norm": 19.882978439331055, + "learning_rate": 9.13430081787353e-06, + "loss": 5.0802, + "step": 198275 + }, + { + "epoch": 17.800718132854577, + "grad_norm": 14.59212589263916, + "learning_rate": 9.134051466187912e-06, + "loss": 5.0741, + "step": 198300 + }, + { + "epoch": 17.802962298025136, + "grad_norm": 17.657794952392578, + "learning_rate": 9.133802114502294e-06, + "loss": 4.6886, + "step": 198325 + }, + { + "epoch": 17.80520646319569, + "grad_norm": 16.61407470703125, + "learning_rate": 9.133552762816678e-06, + "loss": 4.997, + "step": 198350 + }, + { + "epoch": 17.807450628366247, + "grad_norm": 21.272138595581055, + "learning_rate": 9.13330341113106e-06, + "loss": 5.1924, + "step": 198375 + }, + { + "epoch": 17.809694793536803, + "grad_norm": 19.30321502685547, + "learning_rate": 9.133054059445443e-06, + "loss": 4.752, + "step": 198400 + }, + { + "epoch": 17.811938958707362, + "grad_norm": 19.864042282104492, + "learning_rate": 9.132804707759825e-06, + "loss": 4.9488, + "step": 198425 + }, + { + "epoch": 17.814183123877918, + "grad_norm": 18.392047882080078, + "learning_rate": 9.132555356074208e-06, + "loss": 4.9924, + "step": 198450 + }, + { + "epoch": 17.816427289048473, + "grad_norm": 17.842235565185547, + "learning_rate": 9.13230600438859e-06, + "loss": 5.1065, + "step": 198475 + }, + { + "epoch": 17.81867145421903, + "grad_norm": 16.219453811645508, + "learning_rate": 9.132056652702972e-06, + "loss": 4.7968, + "step": 198500 + }, + { + "epoch": 17.820915619389588, + "grad_norm": 20.05539894104004, + "learning_rate": 9.131807301017356e-06, + "loss": 5.0302, + "step": 198525 + }, + { + "epoch": 17.823159784560143, + "grad_norm": 21.742216110229492, + "learning_rate": 9.131557949331739e-06, + "loss": 4.8977, + "step": 198550 + }, + { + "epoch": 17.8254039497307, + "grad_norm": 18.64249038696289, + "learning_rate": 9.13130859764612e-06, + "loss": 5.107, + "step": 198575 + }, + { + "epoch": 17.82764811490126, + "grad_norm": 17.928836822509766, + "learning_rate": 9.131059245960503e-06, + "loss": 4.9451, + "step": 198600 + }, + { + "epoch": 17.829892280071814, + "grad_norm": 18.797096252441406, + "learning_rate": 9.130809894274885e-06, + "loss": 5.1387, + "step": 198625 + }, + { + "epoch": 17.83213644524237, + "grad_norm": 18.821725845336914, + "learning_rate": 9.130560542589268e-06, + "loss": 4.6773, + "step": 198650 + }, + { + "epoch": 17.834380610412925, + "grad_norm": 22.11922836303711, + "learning_rate": 9.130311190903652e-06, + "loss": 4.6834, + "step": 198675 + }, + { + "epoch": 17.836624775583484, + "grad_norm": 18.911184310913086, + "learning_rate": 9.130061839218034e-06, + "loss": 4.9333, + "step": 198700 + }, + { + "epoch": 17.83886894075404, + "grad_norm": 15.755707740783691, + "learning_rate": 9.129812487532416e-06, + "loss": 4.7733, + "step": 198725 + }, + { + "epoch": 17.841113105924595, + "grad_norm": 15.687397956848145, + "learning_rate": 9.129563135846799e-06, + "loss": 5.0364, + "step": 198750 + }, + { + "epoch": 17.84335727109515, + "grad_norm": 17.19807243347168, + "learning_rate": 9.129313784161181e-06, + "loss": 4.9689, + "step": 198775 + }, + { + "epoch": 17.84560143626571, + "grad_norm": 17.519559860229492, + "learning_rate": 9.129064432475563e-06, + "loss": 5.1131, + "step": 198800 + }, + { + "epoch": 17.847845601436266, + "grad_norm": 20.51544952392578, + "learning_rate": 9.128815080789947e-06, + "loss": 4.9532, + "step": 198825 + }, + { + "epoch": 17.85008976660682, + "grad_norm": 19.055072784423828, + "learning_rate": 9.12856572910433e-06, + "loss": 5.0541, + "step": 198850 + }, + { + "epoch": 17.85233393177738, + "grad_norm": 19.849943161010742, + "learning_rate": 9.128316377418712e-06, + "loss": 4.9917, + "step": 198875 + }, + { + "epoch": 17.854578096947936, + "grad_norm": 18.815160751342773, + "learning_rate": 9.128067025733094e-06, + "loss": 4.9291, + "step": 198900 + }, + { + "epoch": 17.85682226211849, + "grad_norm": 17.641151428222656, + "learning_rate": 9.127817674047478e-06, + "loss": 5.1056, + "step": 198925 + }, + { + "epoch": 17.859066427289047, + "grad_norm": 14.16756534576416, + "learning_rate": 9.127568322361859e-06, + "loss": 5.0775, + "step": 198950 + }, + { + "epoch": 17.861310592459606, + "grad_norm": 19.10993766784668, + "learning_rate": 9.127318970676243e-06, + "loss": 5.1206, + "step": 198975 + }, + { + "epoch": 17.863554757630162, + "grad_norm": 16.375951766967773, + "learning_rate": 9.127069618990625e-06, + "loss": 5.2039, + "step": 199000 + }, + { + "epoch": 17.865798922800717, + "grad_norm": 19.87569808959961, + "learning_rate": 9.126820267305008e-06, + "loss": 4.9172, + "step": 199025 + }, + { + "epoch": 17.868043087971273, + "grad_norm": 17.278522491455078, + "learning_rate": 9.12657091561939e-06, + "loss": 5.2169, + "step": 199050 + }, + { + "epoch": 17.870287253141832, + "grad_norm": 16.71474266052246, + "learning_rate": 9.126321563933774e-06, + "loss": 5.1019, + "step": 199075 + }, + { + "epoch": 17.872531418312388, + "grad_norm": 20.161996841430664, + "learning_rate": 9.126072212248156e-06, + "loss": 5.0619, + "step": 199100 + }, + { + "epoch": 17.874775583482943, + "grad_norm": 17.31075668334961, + "learning_rate": 9.125822860562537e-06, + "loss": 5.042, + "step": 199125 + }, + { + "epoch": 17.877019748653503, + "grad_norm": 22.231616973876953, + "learning_rate": 9.125573508876921e-06, + "loss": 4.9945, + "step": 199150 + }, + { + "epoch": 17.879263913824058, + "grad_norm": 16.923402786254883, + "learning_rate": 9.125324157191303e-06, + "loss": 4.8714, + "step": 199175 + }, + { + "epoch": 17.881508078994614, + "grad_norm": 17.626445770263672, + "learning_rate": 9.125074805505686e-06, + "loss": 4.8743, + "step": 199200 + }, + { + "epoch": 17.88375224416517, + "grad_norm": 18.75847625732422, + "learning_rate": 9.124825453820068e-06, + "loss": 4.9051, + "step": 199225 + }, + { + "epoch": 17.88599640933573, + "grad_norm": 16.6588077545166, + "learning_rate": 9.124576102134452e-06, + "loss": 4.9132, + "step": 199250 + }, + { + "epoch": 17.888240574506284, + "grad_norm": 15.110862731933594, + "learning_rate": 9.124326750448834e-06, + "loss": 4.9891, + "step": 199275 + }, + { + "epoch": 17.89048473967684, + "grad_norm": 18.441797256469727, + "learning_rate": 9.124077398763216e-06, + "loss": 5.3115, + "step": 199300 + }, + { + "epoch": 17.892728904847395, + "grad_norm": 17.937541961669922, + "learning_rate": 9.123828047077599e-06, + "loss": 4.676, + "step": 199325 + }, + { + "epoch": 17.894973070017954, + "grad_norm": 17.17743682861328, + "learning_rate": 9.123578695391981e-06, + "loss": 4.939, + "step": 199350 + }, + { + "epoch": 17.89721723518851, + "grad_norm": 15.777093887329102, + "learning_rate": 9.123329343706363e-06, + "loss": 5.2209, + "step": 199375 + }, + { + "epoch": 17.899461400359066, + "grad_norm": 21.828153610229492, + "learning_rate": 9.123079992020747e-06, + "loss": 4.9837, + "step": 199400 + }, + { + "epoch": 17.901705565529625, + "grad_norm": 17.51154899597168, + "learning_rate": 9.12283064033513e-06, + "loss": 5.0317, + "step": 199425 + }, + { + "epoch": 17.90394973070018, + "grad_norm": 16.993946075439453, + "learning_rate": 9.122581288649512e-06, + "loss": 4.9864, + "step": 199450 + }, + { + "epoch": 17.906193895870736, + "grad_norm": 14.200679779052734, + "learning_rate": 9.122331936963894e-06, + "loss": 5.0076, + "step": 199475 + }, + { + "epoch": 17.90843806104129, + "grad_norm": 18.38384437561035, + "learning_rate": 9.122082585278277e-06, + "loss": 5.0517, + "step": 199500 + }, + { + "epoch": 17.91068222621185, + "grad_norm": 16.872682571411133, + "learning_rate": 9.121833233592659e-06, + "loss": 5.0349, + "step": 199525 + }, + { + "epoch": 17.912926391382406, + "grad_norm": 16.503582000732422, + "learning_rate": 9.121583881907043e-06, + "loss": 4.9235, + "step": 199550 + }, + { + "epoch": 17.91517055655296, + "grad_norm": 24.417339324951172, + "learning_rate": 9.12134450428885e-06, + "loss": 4.9061, + "step": 199575 + }, + { + "epoch": 17.917414721723517, + "grad_norm": 19.56569480895996, + "learning_rate": 9.121095152603232e-06, + "loss": 5.0994, + "step": 199600 + }, + { + "epoch": 17.919658886894076, + "grad_norm": 18.474172592163086, + "learning_rate": 9.120845800917615e-06, + "loss": 5.0357, + "step": 199625 + }, + { + "epoch": 17.921903052064632, + "grad_norm": 16.91902732849121, + "learning_rate": 9.120596449231997e-06, + "loss": 5.0534, + "step": 199650 + }, + { + "epoch": 17.924147217235188, + "grad_norm": 17.93754005432129, + "learning_rate": 9.120347097546381e-06, + "loss": 4.7667, + "step": 199675 + }, + { + "epoch": 17.926391382405747, + "grad_norm": 18.537538528442383, + "learning_rate": 9.120097745860763e-06, + "loss": 4.8429, + "step": 199700 + }, + { + "epoch": 17.928635547576302, + "grad_norm": 19.459247589111328, + "learning_rate": 9.119848394175146e-06, + "loss": 4.8312, + "step": 199725 + }, + { + "epoch": 17.930879712746858, + "grad_norm": 19.68691062927246, + "learning_rate": 9.119599042489528e-06, + "loss": 5.0472, + "step": 199750 + }, + { + "epoch": 17.933123877917414, + "grad_norm": 17.965288162231445, + "learning_rate": 9.11934969080391e-06, + "loss": 4.9384, + "step": 199775 + }, + { + "epoch": 17.935368043087973, + "grad_norm": 18.101621627807617, + "learning_rate": 9.119100339118293e-06, + "loss": 4.9688, + "step": 199800 + }, + { + "epoch": 17.93761220825853, + "grad_norm": 18.426542282104492, + "learning_rate": 9.118850987432677e-06, + "loss": 4.9424, + "step": 199825 + }, + { + "epoch": 17.939856373429084, + "grad_norm": 17.450807571411133, + "learning_rate": 9.118601635747059e-06, + "loss": 4.7997, + "step": 199850 + }, + { + "epoch": 17.94210053859964, + "grad_norm": 16.313562393188477, + "learning_rate": 9.118352284061441e-06, + "loss": 5.0435, + "step": 199875 + }, + { + "epoch": 17.9443447037702, + "grad_norm": 19.9228515625, + "learning_rate": 9.118102932375823e-06, + "loss": 5.0239, + "step": 199900 + }, + { + "epoch": 17.946588868940754, + "grad_norm": 15.420713424682617, + "learning_rate": 9.117853580690206e-06, + "loss": 4.9444, + "step": 199925 + }, + { + "epoch": 17.94883303411131, + "grad_norm": 20.38234519958496, + "learning_rate": 9.117604229004588e-06, + "loss": 5.2116, + "step": 199950 + }, + { + "epoch": 17.95107719928187, + "grad_norm": 16.097206115722656, + "learning_rate": 9.117354877318972e-06, + "loss": 5.002, + "step": 199975 + }, + { + "epoch": 17.953321364452425, + "grad_norm": 17.657381057739258, + "learning_rate": 9.117105525633354e-06, + "loss": 4.9875, + "step": 200000 + }, + { + "epoch": 17.95556552962298, + "grad_norm": 19.31199073791504, + "learning_rate": 9.116856173947737e-06, + "loss": 4.8078, + "step": 200025 + }, + { + "epoch": 17.957809694793536, + "grad_norm": 21.69796371459961, + "learning_rate": 9.116606822262119e-06, + "loss": 5.1799, + "step": 200050 + }, + { + "epoch": 17.960053859964095, + "grad_norm": 18.114734649658203, + "learning_rate": 9.116357470576503e-06, + "loss": 4.9339, + "step": 200075 + }, + { + "epoch": 17.96229802513465, + "grad_norm": 18.495580673217773, + "learning_rate": 9.116108118890884e-06, + "loss": 5.0347, + "step": 200100 + }, + { + "epoch": 17.964542190305206, + "grad_norm": 18.45655059814453, + "learning_rate": 9.115858767205266e-06, + "loss": 4.9884, + "step": 200125 + }, + { + "epoch": 17.96678635547576, + "grad_norm": 18.43984603881836, + "learning_rate": 9.11560941551965e-06, + "loss": 5.2309, + "step": 200150 + }, + { + "epoch": 17.96903052064632, + "grad_norm": 15.62181568145752, + "learning_rate": 9.115360063834032e-06, + "loss": 4.9767, + "step": 200175 + }, + { + "epoch": 17.971274685816876, + "grad_norm": 21.432703018188477, + "learning_rate": 9.115110712148415e-06, + "loss": 5.1689, + "step": 200200 + }, + { + "epoch": 17.973518850987432, + "grad_norm": 16.556610107421875, + "learning_rate": 9.114861360462797e-06, + "loss": 5.0756, + "step": 200225 + }, + { + "epoch": 17.975763016157988, + "grad_norm": 21.61834716796875, + "learning_rate": 9.114612008777181e-06, + "loss": 5.0606, + "step": 200250 + }, + { + "epoch": 17.978007181328547, + "grad_norm": 16.94976806640625, + "learning_rate": 9.114362657091562e-06, + "loss": 4.84, + "step": 200275 + }, + { + "epoch": 17.980251346499102, + "grad_norm": 18.2319393157959, + "learning_rate": 9.114113305405946e-06, + "loss": 5.2466, + "step": 200300 + }, + { + "epoch": 17.982495511669658, + "grad_norm": 15.81246280670166, + "learning_rate": 9.113863953720328e-06, + "loss": 5.095, + "step": 200325 + }, + { + "epoch": 17.984739676840217, + "grad_norm": 19.82427215576172, + "learning_rate": 9.11361460203471e-06, + "loss": 4.9411, + "step": 200350 + }, + { + "epoch": 17.986983842010773, + "grad_norm": 16.812856674194336, + "learning_rate": 9.113365250349093e-06, + "loss": 5.0213, + "step": 200375 + }, + { + "epoch": 17.989228007181328, + "grad_norm": 18.423614501953125, + "learning_rate": 9.113115898663477e-06, + "loss": 5.0485, + "step": 200400 + }, + { + "epoch": 17.991472172351884, + "grad_norm": 21.349367141723633, + "learning_rate": 9.112866546977859e-06, + "loss": 5.3443, + "step": 200425 + }, + { + "epoch": 17.993716337522443, + "grad_norm": 19.26146125793457, + "learning_rate": 9.112617195292241e-06, + "loss": 5.1333, + "step": 200450 + }, + { + "epoch": 17.995960502693, + "grad_norm": 16.399869918823242, + "learning_rate": 9.112367843606624e-06, + "loss": 4.78, + "step": 200475 + }, + { + "epoch": 17.998204667863554, + "grad_norm": 16.821533203125, + "learning_rate": 9.112118491921006e-06, + "loss": 4.8837, + "step": 200500 + }, + { + "epoch": 18.0, + "eval_accuracy": 0.07003608729688962, + "eval_f1_macro": 0.008303999596153532, + "eval_f1_micro": 0.07003608729688962, + "eval_f1_weighted": 0.04185247253776263, + "eval_loss": 6.835184097290039, + "eval_precision_macro": 0.007327382537785927, + "eval_precision_micro": 0.07003608729688962, + "eval_precision_weighted": 0.034013391053478684, + "eval_recall_macro": 0.013681554018417582, + "eval_recall_micro": 0.07003608729688962, + "eval_recall_weighted": 0.07003608729688962, + "eval_runtime": 129.2599, + "eval_samples_per_second": 405.176, + "eval_steps_per_second": 12.664, + "step": 200520 + }, + { + "epoch": 18.00044883303411, + "grad_norm": 14.964735984802246, + "learning_rate": 9.111869140235388e-06, + "loss": 4.927, + "step": 200525 + }, + { + "epoch": 18.00269299820467, + "grad_norm": 17.215465545654297, + "learning_rate": 9.111619788549772e-06, + "loss": 4.8509, + "step": 200550 + }, + { + "epoch": 18.004937163375224, + "grad_norm": 16.070619583129883, + "learning_rate": 9.111370436864154e-06, + "loss": 4.4219, + "step": 200575 + }, + { + "epoch": 18.00718132854578, + "grad_norm": 17.979637145996094, + "learning_rate": 9.111121085178537e-06, + "loss": 4.4865, + "step": 200600 + }, + { + "epoch": 18.00942549371634, + "grad_norm": 18.432693481445312, + "learning_rate": 9.110871733492919e-06, + "loss": 4.4873, + "step": 200625 + }, + { + "epoch": 18.011669658886895, + "grad_norm": 15.131572723388672, + "learning_rate": 9.110622381807301e-06, + "loss": 4.7714, + "step": 200650 + }, + { + "epoch": 18.01391382405745, + "grad_norm": 15.525897026062012, + "learning_rate": 9.110373030121684e-06, + "loss": 4.8225, + "step": 200675 + }, + { + "epoch": 18.016157989228006, + "grad_norm": 18.433813095092773, + "learning_rate": 9.110123678436068e-06, + "loss": 4.6286, + "step": 200700 + }, + { + "epoch": 18.018402154398565, + "grad_norm": 18.683622360229492, + "learning_rate": 9.10987432675045e-06, + "loss": 4.628, + "step": 200725 + }, + { + "epoch": 18.02064631956912, + "grad_norm": 19.753507614135742, + "learning_rate": 9.109624975064832e-06, + "loss": 4.8038, + "step": 200750 + }, + { + "epoch": 18.022890484739676, + "grad_norm": 17.986879348754883, + "learning_rate": 9.109375623379215e-06, + "loss": 4.7125, + "step": 200775 + }, + { + "epoch": 18.025134649910232, + "grad_norm": 17.20915985107422, + "learning_rate": 9.109126271693597e-06, + "loss": 4.4964, + "step": 200800 + }, + { + "epoch": 18.02737881508079, + "grad_norm": 19.204862594604492, + "learning_rate": 9.10887692000798e-06, + "loss": 4.5834, + "step": 200825 + }, + { + "epoch": 18.029622980251347, + "grad_norm": 14.314895629882812, + "learning_rate": 9.108627568322362e-06, + "loss": 4.6474, + "step": 200850 + }, + { + "epoch": 18.031867145421902, + "grad_norm": 16.21242904663086, + "learning_rate": 9.108378216636746e-06, + "loss": 4.9428, + "step": 200875 + }, + { + "epoch": 18.03411131059246, + "grad_norm": 16.015859603881836, + "learning_rate": 9.108128864951128e-06, + "loss": 4.5075, + "step": 200900 + }, + { + "epoch": 18.036355475763017, + "grad_norm": 20.236831665039062, + "learning_rate": 9.10787951326551e-06, + "loss": 4.7245, + "step": 200925 + }, + { + "epoch": 18.038599640933572, + "grad_norm": 18.07596778869629, + "learning_rate": 9.107630161579893e-06, + "loss": 4.494, + "step": 200950 + }, + { + "epoch": 18.040843806104128, + "grad_norm": 19.49717140197754, + "learning_rate": 9.107380809894275e-06, + "loss": 4.652, + "step": 200975 + }, + { + "epoch": 18.043087971274687, + "grad_norm": 17.95393943786621, + "learning_rate": 9.107131458208657e-06, + "loss": 4.734, + "step": 201000 + }, + { + "epoch": 18.045332136445243, + "grad_norm": 17.971572875976562, + "learning_rate": 9.106882106523041e-06, + "loss": 4.5598, + "step": 201025 + }, + { + "epoch": 18.0475763016158, + "grad_norm": 16.11806869506836, + "learning_rate": 9.106632754837424e-06, + "loss": 4.6864, + "step": 201050 + }, + { + "epoch": 18.049820466786354, + "grad_norm": 18.05973243713379, + "learning_rate": 9.106383403151806e-06, + "loss": 4.7378, + "step": 201075 + }, + { + "epoch": 18.052064631956913, + "grad_norm": 15.541145324707031, + "learning_rate": 9.106134051466188e-06, + "loss": 4.6563, + "step": 201100 + }, + { + "epoch": 18.05430879712747, + "grad_norm": 19.02956771850586, + "learning_rate": 9.105884699780572e-06, + "loss": 4.7441, + "step": 201125 + }, + { + "epoch": 18.056552962298024, + "grad_norm": 15.93120002746582, + "learning_rate": 9.105635348094955e-06, + "loss": 4.8682, + "step": 201150 + }, + { + "epoch": 18.058797127468583, + "grad_norm": 16.870929718017578, + "learning_rate": 9.105385996409337e-06, + "loss": 4.8807, + "step": 201175 + }, + { + "epoch": 18.06104129263914, + "grad_norm": 18.863340377807617, + "learning_rate": 9.10513664472372e-06, + "loss": 4.6178, + "step": 201200 + }, + { + "epoch": 18.063285457809695, + "grad_norm": 18.765058517456055, + "learning_rate": 9.104887293038101e-06, + "loss": 4.6756, + "step": 201225 + }, + { + "epoch": 18.06552962298025, + "grad_norm": 17.91250991821289, + "learning_rate": 9.104637941352484e-06, + "loss": 4.6816, + "step": 201250 + }, + { + "epoch": 18.06777378815081, + "grad_norm": 13.582074165344238, + "learning_rate": 9.104388589666868e-06, + "loss": 4.7021, + "step": 201275 + }, + { + "epoch": 18.070017953321365, + "grad_norm": 19.457576751708984, + "learning_rate": 9.10413923798125e-06, + "loss": 4.8226, + "step": 201300 + }, + { + "epoch": 18.07226211849192, + "grad_norm": 21.68860626220703, + "learning_rate": 9.103889886295632e-06, + "loss": 4.7219, + "step": 201325 + }, + { + "epoch": 18.074506283662476, + "grad_norm": 17.987573623657227, + "learning_rate": 9.103640534610015e-06, + "loss": 4.5966, + "step": 201350 + }, + { + "epoch": 18.076750448833035, + "grad_norm": 16.455795288085938, + "learning_rate": 9.103391182924397e-06, + "loss": 5.1197, + "step": 201375 + }, + { + "epoch": 18.07899461400359, + "grad_norm": 18.072683334350586, + "learning_rate": 9.10314183123878e-06, + "loss": 4.6997, + "step": 201400 + }, + { + "epoch": 18.081238779174146, + "grad_norm": 20.118865966796875, + "learning_rate": 9.102892479553163e-06, + "loss": 4.6632, + "step": 201425 + }, + { + "epoch": 18.083482944344702, + "grad_norm": 18.342506408691406, + "learning_rate": 9.102643127867546e-06, + "loss": 4.8123, + "step": 201450 + }, + { + "epoch": 18.08572710951526, + "grad_norm": 18.760290145874023, + "learning_rate": 9.102393776181928e-06, + "loss": 4.6682, + "step": 201475 + }, + { + "epoch": 18.087971274685817, + "grad_norm": 16.5366268157959, + "learning_rate": 9.10214442449631e-06, + "loss": 4.6939, + "step": 201500 + }, + { + "epoch": 18.090215439856372, + "grad_norm": 23.347000122070312, + "learning_rate": 9.101895072810693e-06, + "loss": 4.6007, + "step": 201525 + }, + { + "epoch": 18.09245960502693, + "grad_norm": 16.244232177734375, + "learning_rate": 9.101645721125075e-06, + "loss": 4.8155, + "step": 201550 + }, + { + "epoch": 18.094703770197487, + "grad_norm": 18.330610275268555, + "learning_rate": 9.101396369439457e-06, + "loss": 4.8866, + "step": 201575 + }, + { + "epoch": 18.096947935368043, + "grad_norm": 20.934803009033203, + "learning_rate": 9.101147017753841e-06, + "loss": 4.7284, + "step": 201600 + }, + { + "epoch": 18.0991921005386, + "grad_norm": 19.60129737854004, + "learning_rate": 9.100897666068224e-06, + "loss": 4.6793, + "step": 201625 + }, + { + "epoch": 18.101436265709157, + "grad_norm": 23.512731552124023, + "learning_rate": 9.100648314382606e-06, + "loss": 4.5769, + "step": 201650 + }, + { + "epoch": 18.103680430879713, + "grad_norm": 14.707357406616211, + "learning_rate": 9.100398962696988e-06, + "loss": 4.7603, + "step": 201675 + }, + { + "epoch": 18.10592459605027, + "grad_norm": 15.347556114196777, + "learning_rate": 9.10014961101137e-06, + "loss": 4.5602, + "step": 201700 + }, + { + "epoch": 18.108168761220824, + "grad_norm": 15.262346267700195, + "learning_rate": 9.099900259325753e-06, + "loss": 4.6983, + "step": 201725 + }, + { + "epoch": 18.110412926391383, + "grad_norm": 17.32182502746582, + "learning_rate": 9.099660881707562e-06, + "loss": 4.6945, + "step": 201750 + }, + { + "epoch": 18.11265709156194, + "grad_norm": 15.633901596069336, + "learning_rate": 9.099411530021944e-06, + "loss": 4.8001, + "step": 201775 + }, + { + "epoch": 18.114901256732495, + "grad_norm": 19.830698013305664, + "learning_rate": 9.099162178336326e-06, + "loss": 4.7888, + "step": 201800 + }, + { + "epoch": 18.117145421903054, + "grad_norm": 18.013654708862305, + "learning_rate": 9.098912826650708e-06, + "loss": 4.6414, + "step": 201825 + }, + { + "epoch": 18.11938958707361, + "grad_norm": 20.63473892211914, + "learning_rate": 9.09866347496509e-06, + "loss": 4.6523, + "step": 201850 + }, + { + "epoch": 18.121633752244165, + "grad_norm": 15.954748153686523, + "learning_rate": 9.098414123279475e-06, + "loss": 4.7114, + "step": 201875 + }, + { + "epoch": 18.12387791741472, + "grad_norm": 20.6785888671875, + "learning_rate": 9.098164771593857e-06, + "loss": 4.7659, + "step": 201900 + }, + { + "epoch": 18.12612208258528, + "grad_norm": 15.292112350463867, + "learning_rate": 9.09791541990824e-06, + "loss": 4.7853, + "step": 201925 + }, + { + "epoch": 18.128366247755835, + "grad_norm": 17.239688873291016, + "learning_rate": 9.097666068222622e-06, + "loss": 4.846, + "step": 201950 + }, + { + "epoch": 18.13061041292639, + "grad_norm": 16.52985382080078, + "learning_rate": 9.097416716537004e-06, + "loss": 4.5902, + "step": 201975 + }, + { + "epoch": 18.132854578096946, + "grad_norm": 17.71623420715332, + "learning_rate": 9.097167364851386e-06, + "loss": 4.6805, + "step": 202000 + }, + { + "epoch": 18.135098743267505, + "grad_norm": 17.898103713989258, + "learning_rate": 9.09691801316577e-06, + "loss": 4.7574, + "step": 202025 + }, + { + "epoch": 18.13734290843806, + "grad_norm": 16.503374099731445, + "learning_rate": 9.096668661480153e-06, + "loss": 4.729, + "step": 202050 + }, + { + "epoch": 18.139587073608617, + "grad_norm": 18.647035598754883, + "learning_rate": 9.096419309794535e-06, + "loss": 4.6157, + "step": 202075 + }, + { + "epoch": 18.141831238779176, + "grad_norm": 18.20027732849121, + "learning_rate": 9.096169958108917e-06, + "loss": 4.7778, + "step": 202100 + }, + { + "epoch": 18.14407540394973, + "grad_norm": 18.927244186401367, + "learning_rate": 9.0959206064233e-06, + "loss": 4.8755, + "step": 202125 + }, + { + "epoch": 18.146319569120287, + "grad_norm": 15.75395393371582, + "learning_rate": 9.095671254737682e-06, + "loss": 4.6777, + "step": 202150 + }, + { + "epoch": 18.148563734290843, + "grad_norm": 19.421098709106445, + "learning_rate": 9.095421903052066e-06, + "loss": 4.8019, + "step": 202175 + }, + { + "epoch": 18.1508078994614, + "grad_norm": 17.916030883789062, + "learning_rate": 9.095172551366448e-06, + "loss": 4.5779, + "step": 202200 + }, + { + "epoch": 18.153052064631957, + "grad_norm": 15.48299503326416, + "learning_rate": 9.09492319968083e-06, + "loss": 4.5017, + "step": 202225 + }, + { + "epoch": 18.155296229802513, + "grad_norm": 19.70330238342285, + "learning_rate": 9.094673847995213e-06, + "loss": 4.7194, + "step": 202250 + }, + { + "epoch": 18.15754039497307, + "grad_norm": 16.76500129699707, + "learning_rate": 9.094424496309597e-06, + "loss": 4.6122, + "step": 202275 + }, + { + "epoch": 18.159784560143628, + "grad_norm": 17.118898391723633, + "learning_rate": 9.094175144623978e-06, + "loss": 4.5822, + "step": 202300 + }, + { + "epoch": 18.162028725314183, + "grad_norm": 18.612770080566406, + "learning_rate": 9.09392579293836e-06, + "loss": 4.6144, + "step": 202325 + }, + { + "epoch": 18.16427289048474, + "grad_norm": 16.923694610595703, + "learning_rate": 9.093676441252744e-06, + "loss": 4.7221, + "step": 202350 + }, + { + "epoch": 18.166517055655298, + "grad_norm": 18.086639404296875, + "learning_rate": 9.093427089567126e-06, + "loss": 4.9141, + "step": 202375 + }, + { + "epoch": 18.168761220825854, + "grad_norm": 16.396757125854492, + "learning_rate": 9.093177737881509e-06, + "loss": 4.7289, + "step": 202400 + }, + { + "epoch": 18.17100538599641, + "grad_norm": 17.958662033081055, + "learning_rate": 9.092928386195893e-06, + "loss": 4.4858, + "step": 202425 + }, + { + "epoch": 18.173249551166965, + "grad_norm": 16.896921157836914, + "learning_rate": 9.092679034510275e-06, + "loss": 4.8126, + "step": 202450 + }, + { + "epoch": 18.175493716337524, + "grad_norm": 18.97826385498047, + "learning_rate": 9.092429682824655e-06, + "loss": 4.4611, + "step": 202475 + }, + { + "epoch": 18.17773788150808, + "grad_norm": 21.56619644165039, + "learning_rate": 9.09218033113904e-06, + "loss": 4.5892, + "step": 202500 + }, + { + "epoch": 18.179982046678635, + "grad_norm": 17.15458869934082, + "learning_rate": 9.091930979453422e-06, + "loss": 4.7267, + "step": 202525 + }, + { + "epoch": 18.18222621184919, + "grad_norm": 22.34095573425293, + "learning_rate": 9.091681627767804e-06, + "loss": 4.7172, + "step": 202550 + }, + { + "epoch": 18.18447037701975, + "grad_norm": 16.061315536499023, + "learning_rate": 9.091432276082186e-06, + "loss": 4.7161, + "step": 202575 + }, + { + "epoch": 18.186714542190305, + "grad_norm": 19.404850006103516, + "learning_rate": 9.09118292439657e-06, + "loss": 4.541, + "step": 202600 + }, + { + "epoch": 18.18895870736086, + "grad_norm": 15.562599182128906, + "learning_rate": 9.090933572710953e-06, + "loss": 4.6271, + "step": 202625 + }, + { + "epoch": 18.19120287253142, + "grad_norm": 20.368093490600586, + "learning_rate": 9.090684221025335e-06, + "loss": 4.5247, + "step": 202650 + }, + { + "epoch": 18.193447037701976, + "grad_norm": 18.738262176513672, + "learning_rate": 9.090434869339717e-06, + "loss": 4.4454, + "step": 202675 + }, + { + "epoch": 18.19569120287253, + "grad_norm": 14.279268264770508, + "learning_rate": 9.0901855176541e-06, + "loss": 4.6712, + "step": 202700 + }, + { + "epoch": 18.197935368043087, + "grad_norm": 19.944650650024414, + "learning_rate": 9.089936165968482e-06, + "loss": 4.7874, + "step": 202725 + }, + { + "epoch": 18.200179533213646, + "grad_norm": 17.569387435913086, + "learning_rate": 9.089686814282866e-06, + "loss": 4.4474, + "step": 202750 + }, + { + "epoch": 18.2024236983842, + "grad_norm": 17.20966911315918, + "learning_rate": 9.089437462597248e-06, + "loss": 4.6101, + "step": 202775 + }, + { + "epoch": 18.204667863554757, + "grad_norm": 17.028858184814453, + "learning_rate": 9.08918811091163e-06, + "loss": 4.6503, + "step": 202800 + }, + { + "epoch": 18.206912028725313, + "grad_norm": 16.168407440185547, + "learning_rate": 9.088938759226013e-06, + "loss": 4.8639, + "step": 202825 + }, + { + "epoch": 18.209156193895872, + "grad_norm": 18.24694061279297, + "learning_rate": 9.088689407540395e-06, + "loss": 4.6114, + "step": 202850 + }, + { + "epoch": 18.211400359066428, + "grad_norm": 19.951337814331055, + "learning_rate": 9.088440055854778e-06, + "loss": 4.6646, + "step": 202875 + }, + { + "epoch": 18.213644524236983, + "grad_norm": 19.737688064575195, + "learning_rate": 9.088190704169162e-06, + "loss": 4.652, + "step": 202900 + }, + { + "epoch": 18.21588868940754, + "grad_norm": 18.685216903686523, + "learning_rate": 9.087941352483544e-06, + "loss": 4.6314, + "step": 202925 + }, + { + "epoch": 18.218132854578098, + "grad_norm": 14.120654106140137, + "learning_rate": 9.087692000797926e-06, + "loss": 4.4602, + "step": 202950 + }, + { + "epoch": 18.220377019748653, + "grad_norm": 18.2043399810791, + "learning_rate": 9.087442649112309e-06, + "loss": 4.5304, + "step": 202975 + }, + { + "epoch": 18.22262118491921, + "grad_norm": 17.518972396850586, + "learning_rate": 9.087193297426693e-06, + "loss": 4.8911, + "step": 203000 + }, + { + "epoch": 18.224865350089768, + "grad_norm": 16.92984962463379, + "learning_rate": 9.086943945741073e-06, + "loss": 4.6175, + "step": 203025 + }, + { + "epoch": 18.227109515260324, + "grad_norm": 20.072303771972656, + "learning_rate": 9.086694594055456e-06, + "loss": 4.4755, + "step": 203050 + }, + { + "epoch": 18.22935368043088, + "grad_norm": 18.866836547851562, + "learning_rate": 9.08644524236984e-06, + "loss": 4.8434, + "step": 203075 + }, + { + "epoch": 18.231597845601435, + "grad_norm": 19.499290466308594, + "learning_rate": 9.086195890684222e-06, + "loss": 4.4723, + "step": 203100 + }, + { + "epoch": 18.233842010771994, + "grad_norm": 19.218366622924805, + "learning_rate": 9.085946538998604e-06, + "loss": 4.8523, + "step": 203125 + }, + { + "epoch": 18.23608617594255, + "grad_norm": 17.635753631591797, + "learning_rate": 9.085697187312988e-06, + "loss": 4.768, + "step": 203150 + }, + { + "epoch": 18.238330341113105, + "grad_norm": 18.93582534790039, + "learning_rate": 9.08544783562737e-06, + "loss": 4.7369, + "step": 203175 + }, + { + "epoch": 18.24057450628366, + "grad_norm": 17.56230926513672, + "learning_rate": 9.085198483941751e-06, + "loss": 4.7942, + "step": 203200 + }, + { + "epoch": 18.24281867145422, + "grad_norm": 15.842763900756836, + "learning_rate": 9.084949132256135e-06, + "loss": 4.6404, + "step": 203225 + }, + { + "epoch": 18.245062836624776, + "grad_norm": 16.22980499267578, + "learning_rate": 9.084699780570517e-06, + "loss": 4.5893, + "step": 203250 + }, + { + "epoch": 18.24730700179533, + "grad_norm": 19.221195220947266, + "learning_rate": 9.0844504288849e-06, + "loss": 4.8693, + "step": 203275 + }, + { + "epoch": 18.24955116696589, + "grad_norm": 20.175485610961914, + "learning_rate": 9.084201077199282e-06, + "loss": 4.9654, + "step": 203300 + }, + { + "epoch": 18.251795332136446, + "grad_norm": 17.396493911743164, + "learning_rate": 9.083951725513666e-06, + "loss": 4.8806, + "step": 203325 + }, + { + "epoch": 18.254039497307, + "grad_norm": 17.706811904907227, + "learning_rate": 9.083702373828048e-06, + "loss": 4.5057, + "step": 203350 + }, + { + "epoch": 18.256283662477557, + "grad_norm": 14.400145530700684, + "learning_rate": 9.08345302214243e-06, + "loss": 4.8997, + "step": 203375 + }, + { + "epoch": 18.258527827648116, + "grad_norm": 17.37712287902832, + "learning_rate": 9.083203670456813e-06, + "loss": 4.6295, + "step": 203400 + }, + { + "epoch": 18.260771992818672, + "grad_norm": 18.105762481689453, + "learning_rate": 9.082954318771195e-06, + "loss": 4.6043, + "step": 203425 + }, + { + "epoch": 18.263016157989227, + "grad_norm": 17.413049697875977, + "learning_rate": 9.082704967085578e-06, + "loss": 4.9009, + "step": 203450 + }, + { + "epoch": 18.265260323159783, + "grad_norm": 19.558128356933594, + "learning_rate": 9.082455615399962e-06, + "loss": 4.7194, + "step": 203475 + }, + { + "epoch": 18.267504488330342, + "grad_norm": 16.029253005981445, + "learning_rate": 9.082206263714344e-06, + "loss": 4.7256, + "step": 203500 + }, + { + "epoch": 18.269748653500898, + "grad_norm": 19.855052947998047, + "learning_rate": 9.081956912028726e-06, + "loss": 4.7434, + "step": 203525 + }, + { + "epoch": 18.271992818671453, + "grad_norm": 17.666013717651367, + "learning_rate": 9.081707560343109e-06, + "loss": 4.8729, + "step": 203550 + }, + { + "epoch": 18.274236983842012, + "grad_norm": 18.98858642578125, + "learning_rate": 9.081458208657491e-06, + "loss": 4.7442, + "step": 203575 + }, + { + "epoch": 18.276481149012568, + "grad_norm": 16.367977142333984, + "learning_rate": 9.081208856971873e-06, + "loss": 4.5748, + "step": 203600 + }, + { + "epoch": 18.278725314183124, + "grad_norm": 20.214988708496094, + "learning_rate": 9.080959505286257e-06, + "loss": 4.7551, + "step": 203625 + }, + { + "epoch": 18.28096947935368, + "grad_norm": 16.880413055419922, + "learning_rate": 9.08071015360064e-06, + "loss": 4.7736, + "step": 203650 + }, + { + "epoch": 18.28321364452424, + "grad_norm": 18.980756759643555, + "learning_rate": 9.080460801915022e-06, + "loss": 4.8474, + "step": 203675 + }, + { + "epoch": 18.285457809694794, + "grad_norm": 21.641115188598633, + "learning_rate": 9.080211450229404e-06, + "loss": 4.6694, + "step": 203700 + }, + { + "epoch": 18.28770197486535, + "grad_norm": 19.767837524414062, + "learning_rate": 9.079962098543787e-06, + "loss": 4.9742, + "step": 203725 + }, + { + "epoch": 18.289946140035905, + "grad_norm": 20.494400024414062, + "learning_rate": 9.079712746858169e-06, + "loss": 4.7064, + "step": 203750 + }, + { + "epoch": 18.292190305206464, + "grad_norm": 19.5096435546875, + "learning_rate": 9.079463395172551e-06, + "loss": 4.9138, + "step": 203775 + }, + { + "epoch": 18.29443447037702, + "grad_norm": 19.244998931884766, + "learning_rate": 9.079214043486935e-06, + "loss": 4.734, + "step": 203800 + }, + { + "epoch": 18.296678635547575, + "grad_norm": 14.813807487487793, + "learning_rate": 9.078964691801317e-06, + "loss": 4.8593, + "step": 203825 + }, + { + "epoch": 18.298922800718135, + "grad_norm": 19.74814796447754, + "learning_rate": 9.0787153401157e-06, + "loss": 4.4937, + "step": 203850 + }, + { + "epoch": 18.30116696588869, + "grad_norm": 14.909433364868164, + "learning_rate": 9.078465988430084e-06, + "loss": 4.6845, + "step": 203875 + }, + { + "epoch": 18.303411131059246, + "grad_norm": 20.037639617919922, + "learning_rate": 9.078216636744464e-06, + "loss": 4.6565, + "step": 203900 + }, + { + "epoch": 18.3056552962298, + "grad_norm": 20.426742553710938, + "learning_rate": 9.077967285058847e-06, + "loss": 4.6214, + "step": 203925 + }, + { + "epoch": 18.30789946140036, + "grad_norm": 21.021265029907227, + "learning_rate": 9.07771793337323e-06, + "loss": 4.9456, + "step": 203950 + }, + { + "epoch": 18.310143626570916, + "grad_norm": 17.702756881713867, + "learning_rate": 9.077468581687613e-06, + "loss": 4.8192, + "step": 203975 + }, + { + "epoch": 18.31238779174147, + "grad_norm": 17.405803680419922, + "learning_rate": 9.077219230001995e-06, + "loss": 4.7736, + "step": 204000 + }, + { + "epoch": 18.314631956912027, + "grad_norm": 17.950763702392578, + "learning_rate": 9.076969878316378e-06, + "loss": 4.8702, + "step": 204025 + }, + { + "epoch": 18.316876122082586, + "grad_norm": 17.291589736938477, + "learning_rate": 9.076720526630762e-06, + "loss": 4.7076, + "step": 204050 + }, + { + "epoch": 18.319120287253142, + "grad_norm": 19.978450775146484, + "learning_rate": 9.076471174945142e-06, + "loss": 4.9408, + "step": 204075 + }, + { + "epoch": 18.321364452423698, + "grad_norm": 19.06855010986328, + "learning_rate": 9.076221823259526e-06, + "loss": 4.9235, + "step": 204100 + }, + { + "epoch": 18.323608617594253, + "grad_norm": 17.363615036010742, + "learning_rate": 9.075972471573909e-06, + "loss": 4.7809, + "step": 204125 + }, + { + "epoch": 18.325852782764812, + "grad_norm": 15.894251823425293, + "learning_rate": 9.075723119888291e-06, + "loss": 4.5168, + "step": 204150 + }, + { + "epoch": 18.328096947935368, + "grad_norm": 19.099224090576172, + "learning_rate": 9.075473768202673e-06, + "loss": 4.7173, + "step": 204175 + }, + { + "epoch": 18.330341113105924, + "grad_norm": 17.021085739135742, + "learning_rate": 9.075224416517057e-06, + "loss": 4.8303, + "step": 204200 + }, + { + "epoch": 18.332585278276483, + "grad_norm": 15.948204040527344, + "learning_rate": 9.07497506483144e-06, + "loss": 4.8006, + "step": 204225 + }, + { + "epoch": 18.33482944344704, + "grad_norm": 14.314899444580078, + "learning_rate": 9.07472571314582e-06, + "loss": 4.734, + "step": 204250 + }, + { + "epoch": 18.337073608617594, + "grad_norm": 18.804018020629883, + "learning_rate": 9.074476361460204e-06, + "loss": 4.8259, + "step": 204275 + }, + { + "epoch": 18.33931777378815, + "grad_norm": 19.04957389831543, + "learning_rate": 9.074227009774587e-06, + "loss": 5.106, + "step": 204300 + }, + { + "epoch": 18.34156193895871, + "grad_norm": 17.208606719970703, + "learning_rate": 9.073977658088969e-06, + "loss": 4.591, + "step": 204325 + }, + { + "epoch": 18.343806104129264, + "grad_norm": 17.43446159362793, + "learning_rate": 9.073728306403353e-06, + "loss": 4.7259, + "step": 204350 + }, + { + "epoch": 18.34605026929982, + "grad_norm": 18.22671127319336, + "learning_rate": 9.073478954717735e-06, + "loss": 4.7745, + "step": 204375 + }, + { + "epoch": 18.348294434470375, + "grad_norm": 23.09705352783203, + "learning_rate": 9.073229603032118e-06, + "loss": 4.7704, + "step": 204400 + }, + { + "epoch": 18.350538599640934, + "grad_norm": 21.454195022583008, + "learning_rate": 9.0729802513465e-06, + "loss": 4.8133, + "step": 204425 + }, + { + "epoch": 18.35278276481149, + "grad_norm": 17.46868133544922, + "learning_rate": 9.072730899660882e-06, + "loss": 4.7167, + "step": 204450 + }, + { + "epoch": 18.355026929982046, + "grad_norm": 18.241050720214844, + "learning_rate": 9.072481547975264e-06, + "loss": 4.9637, + "step": 204475 + }, + { + "epoch": 18.357271095152605, + "grad_norm": 16.23886489868164, + "learning_rate": 9.072232196289647e-06, + "loss": 4.7789, + "step": 204500 + }, + { + "epoch": 18.35951526032316, + "grad_norm": 17.813541412353516, + "learning_rate": 9.07198284460403e-06, + "loss": 4.8183, + "step": 204525 + }, + { + "epoch": 18.361759425493716, + "grad_norm": 16.344453811645508, + "learning_rate": 9.071733492918413e-06, + "loss": 4.7773, + "step": 204550 + }, + { + "epoch": 18.36400359066427, + "grad_norm": 16.738523483276367, + "learning_rate": 9.071484141232795e-06, + "loss": 4.7769, + "step": 204575 + }, + { + "epoch": 18.36624775583483, + "grad_norm": 15.209779739379883, + "learning_rate": 9.07123478954718e-06, + "loss": 4.5475, + "step": 204600 + }, + { + "epoch": 18.368491921005386, + "grad_norm": 18.378019332885742, + "learning_rate": 9.07098543786156e-06, + "loss": 4.7171, + "step": 204625 + }, + { + "epoch": 18.370736086175942, + "grad_norm": 19.693098068237305, + "learning_rate": 9.070736086175942e-06, + "loss": 4.84, + "step": 204650 + }, + { + "epoch": 18.372980251346497, + "grad_norm": 20.22697639465332, + "learning_rate": 9.070486734490326e-06, + "loss": 5.0333, + "step": 204675 + }, + { + "epoch": 18.375224416517057, + "grad_norm": 20.72254180908203, + "learning_rate": 9.070237382804709e-06, + "loss": 4.6662, + "step": 204700 + }, + { + "epoch": 18.377468581687612, + "grad_norm": 17.79045295715332, + "learning_rate": 9.069988031119091e-06, + "loss": 4.6493, + "step": 204725 + }, + { + "epoch": 18.379712746858168, + "grad_norm": 19.030763626098633, + "learning_rate": 9.069738679433473e-06, + "loss": 4.8754, + "step": 204750 + }, + { + "epoch": 18.381956912028727, + "grad_norm": 13.148117065429688, + "learning_rate": 9.069489327747857e-06, + "loss": 4.8137, + "step": 204775 + }, + { + "epoch": 18.384201077199283, + "grad_norm": 17.719301223754883, + "learning_rate": 9.069239976062238e-06, + "loss": 4.6405, + "step": 204800 + }, + { + "epoch": 18.386445242369838, + "grad_norm": 17.70392417907715, + "learning_rate": 9.068990624376622e-06, + "loss": 4.6494, + "step": 204825 + }, + { + "epoch": 18.388689407540394, + "grad_norm": 20.08864402770996, + "learning_rate": 9.068741272691004e-06, + "loss": 4.8032, + "step": 204850 + }, + { + "epoch": 18.390933572710953, + "grad_norm": 17.898550033569336, + "learning_rate": 9.068491921005387e-06, + "loss": 4.7797, + "step": 204875 + }, + { + "epoch": 18.39317773788151, + "grad_norm": 15.507244110107422, + "learning_rate": 9.068242569319769e-06, + "loss": 4.6786, + "step": 204900 + }, + { + "epoch": 18.395421903052064, + "grad_norm": 19.145919799804688, + "learning_rate": 9.067993217634153e-06, + "loss": 4.6533, + "step": 204925 + }, + { + "epoch": 18.39766606822262, + "grad_norm": 17.42308235168457, + "learning_rate": 9.067743865948535e-06, + "loss": 4.8379, + "step": 204950 + }, + { + "epoch": 18.39991023339318, + "grad_norm": 17.876127243041992, + "learning_rate": 9.067494514262918e-06, + "loss": 4.5518, + "step": 204975 + }, + { + "epoch": 18.402154398563734, + "grad_norm": 16.39630889892578, + "learning_rate": 9.0672451625773e-06, + "loss": 5.0145, + "step": 205000 + }, + { + "epoch": 18.40439856373429, + "grad_norm": 18.926485061645508, + "learning_rate": 9.066995810891682e-06, + "loss": 4.6447, + "step": 205025 + }, + { + "epoch": 18.40664272890485, + "grad_norm": 19.782115936279297, + "learning_rate": 9.066746459206065e-06, + "loss": 4.7836, + "step": 205050 + }, + { + "epoch": 18.408886894075405, + "grad_norm": 20.216176986694336, + "learning_rate": 9.066497107520449e-06, + "loss": 4.7924, + "step": 205075 + }, + { + "epoch": 18.41113105924596, + "grad_norm": 18.336395263671875, + "learning_rate": 9.06624775583483e-06, + "loss": 4.594, + "step": 205100 + }, + { + "epoch": 18.413375224416516, + "grad_norm": 17.617557525634766, + "learning_rate": 9.065998404149213e-06, + "loss": 4.631, + "step": 205125 + }, + { + "epoch": 18.415619389587075, + "grad_norm": 19.809160232543945, + "learning_rate": 9.065749052463595e-06, + "loss": 4.9792, + "step": 205150 + }, + { + "epoch": 18.41786355475763, + "grad_norm": 20.527618408203125, + "learning_rate": 9.065499700777978e-06, + "loss": 4.9589, + "step": 205175 + }, + { + "epoch": 18.420107719928186, + "grad_norm": 18.48562240600586, + "learning_rate": 9.06525034909236e-06, + "loss": 5.0005, + "step": 205200 + }, + { + "epoch": 18.42235188509874, + "grad_norm": 21.37814712524414, + "learning_rate": 9.065000997406742e-06, + "loss": 4.6923, + "step": 205225 + }, + { + "epoch": 18.4245960502693, + "grad_norm": 18.49932289123535, + "learning_rate": 9.064751645721126e-06, + "loss": 4.8528, + "step": 205250 + }, + { + "epoch": 18.426840215439857, + "grad_norm": 22.21766471862793, + "learning_rate": 9.064502294035509e-06, + "loss": 4.9091, + "step": 205275 + }, + { + "epoch": 18.429084380610412, + "grad_norm": 17.457502365112305, + "learning_rate": 9.064252942349891e-06, + "loss": 4.9207, + "step": 205300 + }, + { + "epoch": 18.43132854578097, + "grad_norm": 19.5006103515625, + "learning_rate": 9.064003590664273e-06, + "loss": 4.7275, + "step": 205325 + }, + { + "epoch": 18.433572710951527, + "grad_norm": 14.346498489379883, + "learning_rate": 9.063754238978656e-06, + "loss": 4.8297, + "step": 205350 + }, + { + "epoch": 18.435816876122082, + "grad_norm": 16.001253128051758, + "learning_rate": 9.063504887293038e-06, + "loss": 4.5414, + "step": 205375 + }, + { + "epoch": 18.438061041292638, + "grad_norm": 19.71884536743164, + "learning_rate": 9.063255535607422e-06, + "loss": 4.8542, + "step": 205400 + }, + { + "epoch": 18.440305206463197, + "grad_norm": 20.761938095092773, + "learning_rate": 9.063016157989229e-06, + "loss": 4.8354, + "step": 205425 + }, + { + "epoch": 18.442549371633753, + "grad_norm": 18.04549217224121, + "learning_rate": 9.062766806303611e-06, + "loss": 4.8328, + "step": 205450 + }, + { + "epoch": 18.44479353680431, + "grad_norm": 19.67920684814453, + "learning_rate": 9.062517454617994e-06, + "loss": 4.8143, + "step": 205475 + }, + { + "epoch": 18.447037701974864, + "grad_norm": 18.920909881591797, + "learning_rate": 9.062268102932376e-06, + "loss": 4.8815, + "step": 205500 + }, + { + "epoch": 18.449281867145423, + "grad_norm": 16.199003219604492, + "learning_rate": 9.06201875124676e-06, + "loss": 4.669, + "step": 205525 + }, + { + "epoch": 18.45152603231598, + "grad_norm": 18.77996063232422, + "learning_rate": 9.061769399561142e-06, + "loss": 4.7662, + "step": 205550 + }, + { + "epoch": 18.453770197486534, + "grad_norm": 17.835098266601562, + "learning_rate": 9.061520047875525e-06, + "loss": 4.4293, + "step": 205575 + }, + { + "epoch": 18.45601436265709, + "grad_norm": 19.884294509887695, + "learning_rate": 9.061270696189907e-06, + "loss": 4.8408, + "step": 205600 + }, + { + "epoch": 18.45825852782765, + "grad_norm": 20.057767868041992, + "learning_rate": 9.06102134450429e-06, + "loss": 4.8311, + "step": 205625 + }, + { + "epoch": 18.460502692998205, + "grad_norm": 16.40423011779785, + "learning_rate": 9.060771992818672e-06, + "loss": 4.9491, + "step": 205650 + }, + { + "epoch": 18.46274685816876, + "grad_norm": 17.693340301513672, + "learning_rate": 9.060522641133056e-06, + "loss": 4.8159, + "step": 205675 + }, + { + "epoch": 18.46499102333932, + "grad_norm": 19.834087371826172, + "learning_rate": 9.060273289447438e-06, + "loss": 4.6667, + "step": 205700 + }, + { + "epoch": 18.467235188509875, + "grad_norm": 16.75141716003418, + "learning_rate": 9.06002393776182e-06, + "loss": 4.9186, + "step": 205725 + }, + { + "epoch": 18.46947935368043, + "grad_norm": 19.27888298034668, + "learning_rate": 9.059774586076202e-06, + "loss": 4.8066, + "step": 205750 + }, + { + "epoch": 18.471723518850986, + "grad_norm": 18.444900512695312, + "learning_rate": 9.059525234390585e-06, + "loss": 4.9586, + "step": 205775 + }, + { + "epoch": 18.473967684021545, + "grad_norm": 15.91179084777832, + "learning_rate": 9.059275882704967e-06, + "loss": 4.6353, + "step": 205800 + }, + { + "epoch": 18.4762118491921, + "grad_norm": 17.832469940185547, + "learning_rate": 9.059026531019351e-06, + "loss": 4.77, + "step": 205825 + }, + { + "epoch": 18.478456014362656, + "grad_norm": 17.146535873413086, + "learning_rate": 9.058777179333733e-06, + "loss": 4.9426, + "step": 205850 + }, + { + "epoch": 18.480700179533212, + "grad_norm": 19.280622482299805, + "learning_rate": 9.058527827648116e-06, + "loss": 4.6645, + "step": 205875 + }, + { + "epoch": 18.48294434470377, + "grad_norm": 19.391359329223633, + "learning_rate": 9.058278475962498e-06, + "loss": 4.8003, + "step": 205900 + }, + { + "epoch": 18.485188509874327, + "grad_norm": 17.96445655822754, + "learning_rate": 9.05802912427688e-06, + "loss": 5.008, + "step": 205925 + }, + { + "epoch": 18.487432675044882, + "grad_norm": 21.49856185913086, + "learning_rate": 9.057779772591263e-06, + "loss": 4.8555, + "step": 205950 + }, + { + "epoch": 18.48967684021544, + "grad_norm": 19.812604904174805, + "learning_rate": 9.057530420905645e-06, + "loss": 4.8899, + "step": 205975 + }, + { + "epoch": 18.491921005385997, + "grad_norm": 19.412919998168945, + "learning_rate": 9.057281069220029e-06, + "loss": 4.7423, + "step": 206000 + }, + { + "epoch": 18.494165170556553, + "grad_norm": 14.78014850616455, + "learning_rate": 9.057031717534411e-06, + "loss": 4.6858, + "step": 206025 + }, + { + "epoch": 18.496409335727108, + "grad_norm": 19.56511878967285, + "learning_rate": 9.056782365848794e-06, + "loss": 4.8564, + "step": 206050 + }, + { + "epoch": 18.498653500897667, + "grad_norm": 20.715011596679688, + "learning_rate": 9.056533014163178e-06, + "loss": 4.8881, + "step": 206075 + }, + { + "epoch": 18.500897666068223, + "grad_norm": 19.943531036376953, + "learning_rate": 9.05628366247756e-06, + "loss": 4.869, + "step": 206100 + }, + { + "epoch": 18.50314183123878, + "grad_norm": 17.795635223388672, + "learning_rate": 9.05603431079194e-06, + "loss": 4.8091, + "step": 206125 + }, + { + "epoch": 18.505385996409334, + "grad_norm": 21.334531784057617, + "learning_rate": 9.055784959106325e-06, + "loss": 4.9879, + "step": 206150 + }, + { + "epoch": 18.507630161579893, + "grad_norm": 19.96022605895996, + "learning_rate": 9.055535607420707e-06, + "loss": 4.9958, + "step": 206175 + }, + { + "epoch": 18.50987432675045, + "grad_norm": 23.893898010253906, + "learning_rate": 9.05528625573509e-06, + "loss": 4.931, + "step": 206200 + }, + { + "epoch": 18.512118491921004, + "grad_norm": 16.79300308227539, + "learning_rate": 9.055036904049472e-06, + "loss": 4.8027, + "step": 206225 + }, + { + "epoch": 18.514362657091564, + "grad_norm": 19.60470962524414, + "learning_rate": 9.054787552363856e-06, + "loss": 4.7936, + "step": 206250 + }, + { + "epoch": 18.51660682226212, + "grad_norm": 19.213668823242188, + "learning_rate": 9.054538200678238e-06, + "loss": 4.8803, + "step": 206275 + }, + { + "epoch": 18.518850987432675, + "grad_norm": 20.730154037475586, + "learning_rate": 9.05428884899262e-06, + "loss": 4.9824, + "step": 206300 + }, + { + "epoch": 18.52109515260323, + "grad_norm": 14.855822563171387, + "learning_rate": 9.054039497307003e-06, + "loss": 4.973, + "step": 206325 + }, + { + "epoch": 18.52333931777379, + "grad_norm": 19.998811721801758, + "learning_rate": 9.053790145621385e-06, + "loss": 4.8066, + "step": 206350 + }, + { + "epoch": 18.525583482944345, + "grad_norm": 17.742265701293945, + "learning_rate": 9.053540793935767e-06, + "loss": 4.7962, + "step": 206375 + }, + { + "epoch": 18.5278276481149, + "grad_norm": 17.775970458984375, + "learning_rate": 9.053291442250151e-06, + "loss": 4.808, + "step": 206400 + }, + { + "epoch": 18.530071813285456, + "grad_norm": 18.843555450439453, + "learning_rate": 9.053042090564533e-06, + "loss": 4.9984, + "step": 206425 + }, + { + "epoch": 18.532315978456015, + "grad_norm": 17.422718048095703, + "learning_rate": 9.052792738878916e-06, + "loss": 5.0527, + "step": 206450 + }, + { + "epoch": 18.53456014362657, + "grad_norm": 19.630664825439453, + "learning_rate": 9.052543387193298e-06, + "loss": 4.9151, + "step": 206475 + }, + { + "epoch": 18.536804308797127, + "grad_norm": 18.677143096923828, + "learning_rate": 9.05229403550768e-06, + "loss": 4.9871, + "step": 206500 + }, + { + "epoch": 18.539048473967686, + "grad_norm": 16.8705997467041, + "learning_rate": 9.052044683822063e-06, + "loss": 4.8528, + "step": 206525 + }, + { + "epoch": 18.54129263913824, + "grad_norm": 17.9193172454834, + "learning_rate": 9.051795332136447e-06, + "loss": 4.8503, + "step": 206550 + }, + { + "epoch": 18.543536804308797, + "grad_norm": 18.17444610595703, + "learning_rate": 9.051545980450829e-06, + "loss": 4.6576, + "step": 206575 + }, + { + "epoch": 18.545780969479353, + "grad_norm": 17.651288986206055, + "learning_rate": 9.051296628765211e-06, + "loss": 4.7461, + "step": 206600 + }, + { + "epoch": 18.54802513464991, + "grad_norm": 16.03110694885254, + "learning_rate": 9.051047277079594e-06, + "loss": 4.7542, + "step": 206625 + }, + { + "epoch": 18.550269299820467, + "grad_norm": 18.466150283813477, + "learning_rate": 9.050797925393976e-06, + "loss": 4.7209, + "step": 206650 + }, + { + "epoch": 18.552513464991023, + "grad_norm": 18.145793914794922, + "learning_rate": 9.050548573708358e-06, + "loss": 4.6583, + "step": 206675 + }, + { + "epoch": 18.55475763016158, + "grad_norm": 16.2386474609375, + "learning_rate": 9.05029922202274e-06, + "loss": 4.8805, + "step": 206700 + }, + { + "epoch": 18.557001795332138, + "grad_norm": 20.762187957763672, + "learning_rate": 9.050049870337125e-06, + "loss": 4.9479, + "step": 206725 + }, + { + "epoch": 18.559245960502693, + "grad_norm": 17.52326011657715, + "learning_rate": 9.049800518651507e-06, + "loss": 4.6605, + "step": 206750 + }, + { + "epoch": 18.56149012567325, + "grad_norm": 18.4422607421875, + "learning_rate": 9.04955116696589e-06, + "loss": 4.7402, + "step": 206775 + }, + { + "epoch": 18.563734290843804, + "grad_norm": 16.857789993286133, + "learning_rate": 9.049301815280273e-06, + "loss": 4.7498, + "step": 206800 + }, + { + "epoch": 18.565978456014363, + "grad_norm": 19.69745445251465, + "learning_rate": 9.049052463594654e-06, + "loss": 4.8287, + "step": 206825 + }, + { + "epoch": 18.56822262118492, + "grad_norm": 18.813213348388672, + "learning_rate": 9.048803111909036e-06, + "loss": 4.6745, + "step": 206850 + }, + { + "epoch": 18.570466786355475, + "grad_norm": 19.47381591796875, + "learning_rate": 9.04855376022342e-06, + "loss": 4.9382, + "step": 206875 + }, + { + "epoch": 18.572710951526034, + "grad_norm": 19.64352798461914, + "learning_rate": 9.048304408537803e-06, + "loss": 4.9909, + "step": 206900 + }, + { + "epoch": 18.57495511669659, + "grad_norm": 19.473106384277344, + "learning_rate": 9.048055056852185e-06, + "loss": 4.7838, + "step": 206925 + }, + { + "epoch": 18.577199281867145, + "grad_norm": 19.132356643676758, + "learning_rate": 9.047805705166567e-06, + "loss": 4.8085, + "step": 206950 + }, + { + "epoch": 18.5794434470377, + "grad_norm": 20.663503646850586, + "learning_rate": 9.047556353480951e-06, + "loss": 4.853, + "step": 206975 + }, + { + "epoch": 18.58168761220826, + "grad_norm": 17.665870666503906, + "learning_rate": 9.047307001795332e-06, + "loss": 4.9972, + "step": 207000 + }, + { + "epoch": 18.583931777378815, + "grad_norm": 18.698328018188477, + "learning_rate": 9.047057650109716e-06, + "loss": 4.8795, + "step": 207025 + }, + { + "epoch": 18.58617594254937, + "grad_norm": 17.769968032836914, + "learning_rate": 9.046808298424098e-06, + "loss": 4.9608, + "step": 207050 + }, + { + "epoch": 18.588420107719926, + "grad_norm": 20.394540786743164, + "learning_rate": 9.04655894673848e-06, + "loss": 4.9876, + "step": 207075 + }, + { + "epoch": 18.590664272890486, + "grad_norm": 17.318586349487305, + "learning_rate": 9.046309595052863e-06, + "loss": 4.6723, + "step": 207100 + }, + { + "epoch": 18.59290843806104, + "grad_norm": 18.711679458618164, + "learning_rate": 9.046060243367247e-06, + "loss": 4.6944, + "step": 207125 + }, + { + "epoch": 18.595152603231597, + "grad_norm": 16.94316291809082, + "learning_rate": 9.045810891681629e-06, + "loss": 4.5107, + "step": 207150 + }, + { + "epoch": 18.597396768402156, + "grad_norm": 15.978387832641602, + "learning_rate": 9.045561539996011e-06, + "loss": 4.7353, + "step": 207175 + }, + { + "epoch": 18.59964093357271, + "grad_norm": 17.144895553588867, + "learning_rate": 9.045312188310394e-06, + "loss": 5.0077, + "step": 207200 + }, + { + "epoch": 18.601885098743267, + "grad_norm": 19.308807373046875, + "learning_rate": 9.045062836624776e-06, + "loss": 4.6419, + "step": 207225 + }, + { + "epoch": 18.604129263913823, + "grad_norm": 19.211069107055664, + "learning_rate": 9.044813484939158e-06, + "loss": 4.8354, + "step": 207250 + }, + { + "epoch": 18.606373429084382, + "grad_norm": 18.457143783569336, + "learning_rate": 9.044564133253542e-06, + "loss": 4.7033, + "step": 207275 + }, + { + "epoch": 18.608617594254937, + "grad_norm": 16.481170654296875, + "learning_rate": 9.044314781567925e-06, + "loss": 4.8603, + "step": 207300 + }, + { + "epoch": 18.610861759425493, + "grad_norm": 17.901391983032227, + "learning_rate": 9.044065429882307e-06, + "loss": 4.8014, + "step": 207325 + }, + { + "epoch": 18.61310592459605, + "grad_norm": 23.592554092407227, + "learning_rate": 9.04381607819669e-06, + "loss": 4.7194, + "step": 207350 + }, + { + "epoch": 18.615350089766608, + "grad_norm": 19.52501678466797, + "learning_rate": 9.043566726511072e-06, + "loss": 4.7281, + "step": 207375 + }, + { + "epoch": 18.617594254937163, + "grad_norm": 20.603107452392578, + "learning_rate": 9.043317374825454e-06, + "loss": 4.9136, + "step": 207400 + }, + { + "epoch": 18.61983842010772, + "grad_norm": 16.3880672454834, + "learning_rate": 9.043077997207261e-06, + "loss": 4.7333, + "step": 207425 + }, + { + "epoch": 18.622082585278278, + "grad_norm": 18.337617874145508, + "learning_rate": 9.042828645521645e-06, + "loss": 4.9599, + "step": 207450 + }, + { + "epoch": 18.624326750448834, + "grad_norm": 17.43996238708496, + "learning_rate": 9.042579293836027e-06, + "loss": 4.8816, + "step": 207475 + }, + { + "epoch": 18.62657091561939, + "grad_norm": 22.427391052246094, + "learning_rate": 9.04232994215041e-06, + "loss": 4.9396, + "step": 207500 + }, + { + "epoch": 18.628815080789945, + "grad_norm": 20.920372009277344, + "learning_rate": 9.042080590464792e-06, + "loss": 4.9227, + "step": 207525 + }, + { + "epoch": 18.631059245960504, + "grad_norm": 17.3348388671875, + "learning_rate": 9.041831238779176e-06, + "loss": 4.7163, + "step": 207550 + }, + { + "epoch": 18.63330341113106, + "grad_norm": 18.844676971435547, + "learning_rate": 9.041581887093558e-06, + "loss": 4.8226, + "step": 207575 + }, + { + "epoch": 18.635547576301615, + "grad_norm": 16.43255043029785, + "learning_rate": 9.04133253540794e-06, + "loss": 4.739, + "step": 207600 + }, + { + "epoch": 18.63779174147217, + "grad_norm": 18.253509521484375, + "learning_rate": 9.041083183722323e-06, + "loss": 4.5472, + "step": 207625 + }, + { + "epoch": 18.64003590664273, + "grad_norm": 17.407276153564453, + "learning_rate": 9.040833832036705e-06, + "loss": 4.5937, + "step": 207650 + }, + { + "epoch": 18.642280071813286, + "grad_norm": 18.58664321899414, + "learning_rate": 9.040584480351087e-06, + "loss": 4.7506, + "step": 207675 + }, + { + "epoch": 18.64452423698384, + "grad_norm": 18.116355895996094, + "learning_rate": 9.04033512866547e-06, + "loss": 4.7355, + "step": 207700 + }, + { + "epoch": 18.6467684021544, + "grad_norm": 20.318769454956055, + "learning_rate": 9.040085776979854e-06, + "loss": 4.9216, + "step": 207725 + }, + { + "epoch": 18.649012567324956, + "grad_norm": 18.650936126708984, + "learning_rate": 9.039836425294236e-06, + "loss": 4.815, + "step": 207750 + }, + { + "epoch": 18.65125673249551, + "grad_norm": 19.84956932067871, + "learning_rate": 9.039587073608618e-06, + "loss": 4.7588, + "step": 207775 + }, + { + "epoch": 18.653500897666067, + "grad_norm": 21.437602996826172, + "learning_rate": 9.039337721923e-06, + "loss": 5.1061, + "step": 207800 + }, + { + "epoch": 18.655745062836626, + "grad_norm": 19.545507431030273, + "learning_rate": 9.039088370237383e-06, + "loss": 5.0129, + "step": 207825 + }, + { + "epoch": 18.65798922800718, + "grad_norm": 16.58881187438965, + "learning_rate": 9.038839018551765e-06, + "loss": 4.8696, + "step": 207850 + }, + { + "epoch": 18.660233393177737, + "grad_norm": 17.009227752685547, + "learning_rate": 9.03858966686615e-06, + "loss": 4.8814, + "step": 207875 + }, + { + "epoch": 18.662477558348293, + "grad_norm": 21.56296157836914, + "learning_rate": 9.038340315180532e-06, + "loss": 4.4581, + "step": 207900 + }, + { + "epoch": 18.664721723518852, + "grad_norm": 17.627431869506836, + "learning_rate": 9.038090963494914e-06, + "loss": 4.7037, + "step": 207925 + }, + { + "epoch": 18.666965888689408, + "grad_norm": 17.643606185913086, + "learning_rate": 9.037841611809296e-06, + "loss": 4.7784, + "step": 207950 + }, + { + "epoch": 18.669210053859963, + "grad_norm": 16.768638610839844, + "learning_rate": 9.037592260123679e-06, + "loss": 4.8904, + "step": 207975 + }, + { + "epoch": 18.671454219030522, + "grad_norm": 20.987363815307617, + "learning_rate": 9.037342908438061e-06, + "loss": 4.6135, + "step": 208000 + }, + { + "epoch": 18.673698384201078, + "grad_norm": 22.349294662475586, + "learning_rate": 9.037093556752445e-06, + "loss": 4.8791, + "step": 208025 + }, + { + "epoch": 18.675942549371634, + "grad_norm": 16.74230194091797, + "learning_rate": 9.036844205066827e-06, + "loss": 4.991, + "step": 208050 + }, + { + "epoch": 18.67818671454219, + "grad_norm": 19.93352508544922, + "learning_rate": 9.03659485338121e-06, + "loss": 4.9725, + "step": 208075 + }, + { + "epoch": 18.68043087971275, + "grad_norm": 19.82401466369629, + "learning_rate": 9.036345501695592e-06, + "loss": 5.0212, + "step": 208100 + }, + { + "epoch": 18.682675044883304, + "grad_norm": 18.1345157623291, + "learning_rate": 9.036096150009976e-06, + "loss": 4.688, + "step": 208125 + }, + { + "epoch": 18.68491921005386, + "grad_norm": 19.641708374023438, + "learning_rate": 9.035846798324357e-06, + "loss": 4.6972, + "step": 208150 + }, + { + "epoch": 18.687163375224415, + "grad_norm": 15.935436248779297, + "learning_rate": 9.03559744663874e-06, + "loss": 4.9494, + "step": 208175 + }, + { + "epoch": 18.689407540394974, + "grad_norm": 18.41060447692871, + "learning_rate": 9.035348094953123e-06, + "loss": 4.8097, + "step": 208200 + }, + { + "epoch": 18.69165170556553, + "grad_norm": 18.12906837463379, + "learning_rate": 9.035098743267505e-06, + "loss": 4.712, + "step": 208225 + }, + { + "epoch": 18.693895870736085, + "grad_norm": 18.720308303833008, + "learning_rate": 9.034849391581887e-06, + "loss": 4.9106, + "step": 208250 + }, + { + "epoch": 18.696140035906645, + "grad_norm": 17.2650089263916, + "learning_rate": 9.034600039896272e-06, + "loss": 4.6825, + "step": 208275 + }, + { + "epoch": 18.6983842010772, + "grad_norm": 17.692169189453125, + "learning_rate": 9.034350688210654e-06, + "loss": 4.8517, + "step": 208300 + }, + { + "epoch": 18.700628366247756, + "grad_norm": 17.972042083740234, + "learning_rate": 9.034101336525034e-06, + "loss": 4.8224, + "step": 208325 + }, + { + "epoch": 18.70287253141831, + "grad_norm": 17.935401916503906, + "learning_rate": 9.033851984839418e-06, + "loss": 4.6293, + "step": 208350 + }, + { + "epoch": 18.70511669658887, + "grad_norm": 18.141010284423828, + "learning_rate": 9.0336026331538e-06, + "loss": 4.9456, + "step": 208375 + }, + { + "epoch": 18.707360861759426, + "grad_norm": 18.661527633666992, + "learning_rate": 9.033353281468183e-06, + "loss": 4.8626, + "step": 208400 + }, + { + "epoch": 18.70960502692998, + "grad_norm": 15.868023872375488, + "learning_rate": 9.033103929782565e-06, + "loss": 4.9662, + "step": 208425 + }, + { + "epoch": 18.711849192100537, + "grad_norm": 19.989805221557617, + "learning_rate": 9.03285457809695e-06, + "loss": 4.9454, + "step": 208450 + }, + { + "epoch": 18.714093357271096, + "grad_norm": 17.48849868774414, + "learning_rate": 9.032605226411332e-06, + "loss": 4.8369, + "step": 208475 + }, + { + "epoch": 18.716337522441652, + "grad_norm": 16.290250778198242, + "learning_rate": 9.032355874725714e-06, + "loss": 4.8512, + "step": 208500 + }, + { + "epoch": 18.718581687612208, + "grad_norm": 19.029251098632812, + "learning_rate": 9.032106523040096e-06, + "loss": 5.0051, + "step": 208525 + }, + { + "epoch": 18.720825852782763, + "grad_norm": 17.381696701049805, + "learning_rate": 9.031857171354479e-06, + "loss": 5.0478, + "step": 208550 + }, + { + "epoch": 18.723070017953322, + "grad_norm": 19.225706100463867, + "learning_rate": 9.031607819668861e-06, + "loss": 4.7133, + "step": 208575 + }, + { + "epoch": 18.725314183123878, + "grad_norm": 22.029685974121094, + "learning_rate": 9.031358467983245e-06, + "loss": 4.5103, + "step": 208600 + }, + { + "epoch": 18.727558348294433, + "grad_norm": 18.778350830078125, + "learning_rate": 9.031109116297627e-06, + "loss": 4.8227, + "step": 208625 + }, + { + "epoch": 18.729802513464993, + "grad_norm": 17.135751724243164, + "learning_rate": 9.03085976461201e-06, + "loss": 4.9494, + "step": 208650 + }, + { + "epoch": 18.732046678635548, + "grad_norm": 22.011430740356445, + "learning_rate": 9.030610412926392e-06, + "loss": 4.7758, + "step": 208675 + }, + { + "epoch": 18.734290843806104, + "grad_norm": 17.426687240600586, + "learning_rate": 9.030361061240774e-06, + "loss": 4.8237, + "step": 208700 + }, + { + "epoch": 18.73653500897666, + "grad_norm": 15.020225524902344, + "learning_rate": 9.030111709555157e-06, + "loss": 4.7501, + "step": 208725 + }, + { + "epoch": 18.73877917414722, + "grad_norm": 17.543161392211914, + "learning_rate": 9.02986235786954e-06, + "loss": 4.9536, + "step": 208750 + }, + { + "epoch": 18.741023339317774, + "grad_norm": 19.83104133605957, + "learning_rate": 9.029613006183923e-06, + "loss": 4.8531, + "step": 208775 + }, + { + "epoch": 18.74326750448833, + "grad_norm": 15.320123672485352, + "learning_rate": 9.029363654498305e-06, + "loss": 4.8739, + "step": 208800 + }, + { + "epoch": 18.745511669658885, + "grad_norm": 22.372230529785156, + "learning_rate": 9.029114302812688e-06, + "loss": 4.6297, + "step": 208825 + }, + { + "epoch": 18.747755834829444, + "grad_norm": 18.111570358276367, + "learning_rate": 9.02886495112707e-06, + "loss": 4.8634, + "step": 208850 + }, + { + "epoch": 18.75, + "grad_norm": 18.736610412597656, + "learning_rate": 9.028615599441452e-06, + "loss": 4.8832, + "step": 208875 + }, + { + "epoch": 18.752244165170556, + "grad_norm": 17.191448211669922, + "learning_rate": 9.028366247755836e-06, + "loss": 4.6756, + "step": 208900 + }, + { + "epoch": 18.754488330341115, + "grad_norm": 18.637535095214844, + "learning_rate": 9.028116896070218e-06, + "loss": 4.7986, + "step": 208925 + }, + { + "epoch": 18.75673249551167, + "grad_norm": 16.871719360351562, + "learning_rate": 9.0278675443846e-06, + "loss": 4.9962, + "step": 208950 + }, + { + "epoch": 18.758976660682226, + "grad_norm": 20.614336013793945, + "learning_rate": 9.027618192698983e-06, + "loss": 5.0611, + "step": 208975 + }, + { + "epoch": 18.76122082585278, + "grad_norm": 21.231061935424805, + "learning_rate": 9.027368841013367e-06, + "loss": 5.0077, + "step": 209000 + }, + { + "epoch": 18.76346499102334, + "grad_norm": 15.787453651428223, + "learning_rate": 9.027119489327748e-06, + "loss": 4.7836, + "step": 209025 + }, + { + "epoch": 18.765709156193896, + "grad_norm": 17.92066764831543, + "learning_rate": 9.02687013764213e-06, + "loss": 4.8398, + "step": 209050 + }, + { + "epoch": 18.767953321364452, + "grad_norm": 19.7448787689209, + "learning_rate": 9.026620785956514e-06, + "loss": 5.0134, + "step": 209075 + }, + { + "epoch": 18.770197486535007, + "grad_norm": 16.262794494628906, + "learning_rate": 9.026371434270896e-06, + "loss": 4.6983, + "step": 209100 + }, + { + "epoch": 18.772441651705567, + "grad_norm": 17.50380516052246, + "learning_rate": 9.026122082585279e-06, + "loss": 4.8547, + "step": 209125 + }, + { + "epoch": 18.774685816876122, + "grad_norm": 17.411880493164062, + "learning_rate": 9.025872730899661e-06, + "loss": 4.995, + "step": 209150 + }, + { + "epoch": 18.776929982046678, + "grad_norm": 17.70341682434082, + "learning_rate": 9.025623379214045e-06, + "loss": 5.1221, + "step": 209175 + }, + { + "epoch": 18.779174147217237, + "grad_norm": 16.913705825805664, + "learning_rate": 9.025374027528426e-06, + "loss": 4.5001, + "step": 209200 + }, + { + "epoch": 18.781418312387792, + "grad_norm": 15.563840866088867, + "learning_rate": 9.02512467584281e-06, + "loss": 4.8748, + "step": 209225 + }, + { + "epoch": 18.783662477558348, + "grad_norm": 20.913814544677734, + "learning_rate": 9.024875324157192e-06, + "loss": 4.9891, + "step": 209250 + }, + { + "epoch": 18.785906642728904, + "grad_norm": 19.2315616607666, + "learning_rate": 9.024625972471574e-06, + "loss": 4.8142, + "step": 209275 + }, + { + "epoch": 18.788150807899463, + "grad_norm": 21.14048194885254, + "learning_rate": 9.024376620785957e-06, + "loss": 4.7284, + "step": 209300 + }, + { + "epoch": 18.79039497307002, + "grad_norm": 16.58169937133789, + "learning_rate": 9.02412726910034e-06, + "loss": 4.6749, + "step": 209325 + }, + { + "epoch": 18.792639138240574, + "grad_norm": 16.214866638183594, + "learning_rate": 9.023877917414723e-06, + "loss": 4.8035, + "step": 209350 + }, + { + "epoch": 18.79488330341113, + "grad_norm": 22.12415313720703, + "learning_rate": 9.023628565729105e-06, + "loss": 4.6459, + "step": 209375 + }, + { + "epoch": 18.79712746858169, + "grad_norm": 19.782255172729492, + "learning_rate": 9.023379214043488e-06, + "loss": 4.6761, + "step": 209400 + }, + { + "epoch": 18.799371633752244, + "grad_norm": 20.18705940246582, + "learning_rate": 9.02312986235787e-06, + "loss": 4.8378, + "step": 209425 + }, + { + "epoch": 18.8016157989228, + "grad_norm": 17.47294044494629, + "learning_rate": 9.022880510672252e-06, + "loss": 4.9547, + "step": 209450 + }, + { + "epoch": 18.803859964093355, + "grad_norm": 15.571213722229004, + "learning_rate": 9.022631158986636e-06, + "loss": 5.0089, + "step": 209475 + }, + { + "epoch": 18.806104129263915, + "grad_norm": 18.86174964904785, + "learning_rate": 9.022381807301019e-06, + "loss": 4.7684, + "step": 209500 + }, + { + "epoch": 18.80834829443447, + "grad_norm": 17.438461303710938, + "learning_rate": 9.022132455615401e-06, + "loss": 4.8564, + "step": 209525 + }, + { + "epoch": 18.810592459605026, + "grad_norm": 19.799121856689453, + "learning_rate": 9.021883103929783e-06, + "loss": 4.9569, + "step": 209550 + }, + { + "epoch": 18.812836624775585, + "grad_norm": 18.56831169128418, + "learning_rate": 9.021633752244165e-06, + "loss": 4.752, + "step": 209575 + }, + { + "epoch": 18.81508078994614, + "grad_norm": 16.52011489868164, + "learning_rate": 9.021384400558548e-06, + "loss": 4.8023, + "step": 209600 + }, + { + "epoch": 18.817324955116696, + "grad_norm": 20.171138763427734, + "learning_rate": 9.021135048872932e-06, + "loss": 4.9383, + "step": 209625 + }, + { + "epoch": 18.81956912028725, + "grad_norm": 22.462562561035156, + "learning_rate": 9.020885697187314e-06, + "loss": 5.047, + "step": 209650 + }, + { + "epoch": 18.82181328545781, + "grad_norm": 16.878267288208008, + "learning_rate": 9.020636345501696e-06, + "loss": 5.0213, + "step": 209675 + }, + { + "epoch": 18.824057450628366, + "grad_norm": 17.658065795898438, + "learning_rate": 9.020386993816079e-06, + "loss": 4.7582, + "step": 209700 + }, + { + "epoch": 18.826301615798922, + "grad_norm": 19.651504516601562, + "learning_rate": 9.020137642130463e-06, + "loss": 4.8456, + "step": 209725 + }, + { + "epoch": 18.828545780969478, + "grad_norm": 19.13094711303711, + "learning_rate": 9.019888290444843e-06, + "loss": 4.9575, + "step": 209750 + }, + { + "epoch": 18.830789946140037, + "grad_norm": 17.347028732299805, + "learning_rate": 9.019648912826652e-06, + "loss": 4.9274, + "step": 209775 + }, + { + "epoch": 18.833034111310592, + "grad_norm": 18.818992614746094, + "learning_rate": 9.019399561141034e-06, + "loss": 4.6724, + "step": 209800 + }, + { + "epoch": 18.835278276481148, + "grad_norm": 17.345500946044922, + "learning_rate": 9.019150209455417e-06, + "loss": 5.0912, + "step": 209825 + }, + { + "epoch": 18.837522441651707, + "grad_norm": 20.185808181762695, + "learning_rate": 9.018900857769799e-06, + "loss": 4.8374, + "step": 209850 + }, + { + "epoch": 18.839766606822263, + "grad_norm": 17.946331024169922, + "learning_rate": 9.018651506084181e-06, + "loss": 4.7557, + "step": 209875 + }, + { + "epoch": 18.84201077199282, + "grad_norm": 16.712366104125977, + "learning_rate": 9.018402154398565e-06, + "loss": 4.8512, + "step": 209900 + }, + { + "epoch": 18.844254937163374, + "grad_norm": 18.795747756958008, + "learning_rate": 9.018152802712948e-06, + "loss": 4.9343, + "step": 209925 + }, + { + "epoch": 18.846499102333933, + "grad_norm": 17.397838592529297, + "learning_rate": 9.01790345102733e-06, + "loss": 4.9221, + "step": 209950 + }, + { + "epoch": 18.84874326750449, + "grad_norm": 17.477266311645508, + "learning_rate": 9.017654099341712e-06, + "loss": 4.7189, + "step": 209975 + }, + { + "epoch": 18.850987432675044, + "grad_norm": 16.128711700439453, + "learning_rate": 9.017404747656095e-06, + "loss": 4.8595, + "step": 210000 + }, + { + "epoch": 18.8532315978456, + "grad_norm": 17.130386352539062, + "learning_rate": 9.017155395970477e-06, + "loss": 5.0358, + "step": 210025 + }, + { + "epoch": 18.85547576301616, + "grad_norm": 18.087326049804688, + "learning_rate": 9.01690604428486e-06, + "loss": 4.9719, + "step": 210050 + }, + { + "epoch": 18.857719928186714, + "grad_norm": 16.54625701904297, + "learning_rate": 9.016656692599243e-06, + "loss": 4.9574, + "step": 210075 + }, + { + "epoch": 18.85996409335727, + "grad_norm": 19.52678680419922, + "learning_rate": 9.016407340913626e-06, + "loss": 4.7436, + "step": 210100 + }, + { + "epoch": 18.86220825852783, + "grad_norm": 22.687625885009766, + "learning_rate": 9.016157989228008e-06, + "loss": 4.7162, + "step": 210125 + }, + { + "epoch": 18.864452423698385, + "grad_norm": 16.617570877075195, + "learning_rate": 9.01590863754239e-06, + "loss": 4.933, + "step": 210150 + }, + { + "epoch": 18.86669658886894, + "grad_norm": 16.453298568725586, + "learning_rate": 9.015659285856772e-06, + "loss": 4.8863, + "step": 210175 + }, + { + "epoch": 18.868940754039496, + "grad_norm": 15.36475658416748, + "learning_rate": 9.015409934171155e-06, + "loss": 4.7844, + "step": 210200 + }, + { + "epoch": 18.871184919210055, + "grad_norm": 17.49483299255371, + "learning_rate": 9.015160582485539e-06, + "loss": 4.7866, + "step": 210225 + }, + { + "epoch": 18.87342908438061, + "grad_norm": 18.5, + "learning_rate": 9.014911230799921e-06, + "loss": 4.8826, + "step": 210250 + }, + { + "epoch": 18.875673249551166, + "grad_norm": 18.23296356201172, + "learning_rate": 9.014661879114303e-06, + "loss": 4.9203, + "step": 210275 + }, + { + "epoch": 18.877917414721722, + "grad_norm": 17.16512680053711, + "learning_rate": 9.014412527428686e-06, + "loss": 4.6977, + "step": 210300 + }, + { + "epoch": 18.88016157989228, + "grad_norm": 18.413827896118164, + "learning_rate": 9.01416317574307e-06, + "loss": 4.7231, + "step": 210325 + }, + { + "epoch": 18.882405745062837, + "grad_norm": 18.604816436767578, + "learning_rate": 9.01391382405745e-06, + "loss": 4.6591, + "step": 210350 + }, + { + "epoch": 18.884649910233392, + "grad_norm": 19.45214080810547, + "learning_rate": 9.013664472371834e-06, + "loss": 4.9921, + "step": 210375 + }, + { + "epoch": 18.88689407540395, + "grad_norm": 18.336889266967773, + "learning_rate": 9.013415120686217e-06, + "loss": 4.9483, + "step": 210400 + }, + { + "epoch": 18.889138240574507, + "grad_norm": 18.49163246154785, + "learning_rate": 9.013165769000599e-06, + "loss": 4.7868, + "step": 210425 + }, + { + "epoch": 18.891382405745063, + "grad_norm": 20.661239624023438, + "learning_rate": 9.012916417314981e-06, + "loss": 4.6545, + "step": 210450 + }, + { + "epoch": 18.893626570915618, + "grad_norm": 18.487424850463867, + "learning_rate": 9.012667065629365e-06, + "loss": 5.0675, + "step": 210475 + }, + { + "epoch": 18.895870736086177, + "grad_norm": 16.22946548461914, + "learning_rate": 9.012417713943748e-06, + "loss": 4.9471, + "step": 210500 + }, + { + "epoch": 18.898114901256733, + "grad_norm": 15.682883262634277, + "learning_rate": 9.012168362258128e-06, + "loss": 4.8026, + "step": 210525 + }, + { + "epoch": 18.90035906642729, + "grad_norm": 18.196687698364258, + "learning_rate": 9.011919010572512e-06, + "loss": 4.9014, + "step": 210550 + }, + { + "epoch": 18.902603231597844, + "grad_norm": 19.59432029724121, + "learning_rate": 9.011669658886895e-06, + "loss": 4.9604, + "step": 210575 + }, + { + "epoch": 18.904847396768403, + "grad_norm": 17.875186920166016, + "learning_rate": 9.011420307201277e-06, + "loss": 5.0667, + "step": 210600 + }, + { + "epoch": 18.90709156193896, + "grad_norm": 20.665184020996094, + "learning_rate": 9.011170955515661e-06, + "loss": 5.0176, + "step": 210625 + }, + { + "epoch": 18.909335727109514, + "grad_norm": 17.01182746887207, + "learning_rate": 9.010921603830043e-06, + "loss": 4.6684, + "step": 210650 + }, + { + "epoch": 18.911579892280074, + "grad_norm": 18.553346633911133, + "learning_rate": 9.010672252144426e-06, + "loss": 4.9947, + "step": 210675 + }, + { + "epoch": 18.91382405745063, + "grad_norm": 23.125591278076172, + "learning_rate": 9.010422900458808e-06, + "loss": 4.8712, + "step": 210700 + }, + { + "epoch": 18.916068222621185, + "grad_norm": 21.05198097229004, + "learning_rate": 9.01017354877319e-06, + "loss": 4.9762, + "step": 210725 + }, + { + "epoch": 18.91831238779174, + "grad_norm": 18.704137802124023, + "learning_rate": 9.009924197087573e-06, + "loss": 4.7061, + "step": 210750 + }, + { + "epoch": 18.9205565529623, + "grad_norm": 20.087100982666016, + "learning_rate": 9.009674845401955e-06, + "loss": 5.0236, + "step": 210775 + }, + { + "epoch": 18.922800718132855, + "grad_norm": 17.654653549194336, + "learning_rate": 9.009425493716339e-06, + "loss": 4.7575, + "step": 210800 + }, + { + "epoch": 18.92504488330341, + "grad_norm": 18.275426864624023, + "learning_rate": 9.009176142030721e-06, + "loss": 4.8588, + "step": 210825 + }, + { + "epoch": 18.927289048473966, + "grad_norm": 18.828908920288086, + "learning_rate": 9.008926790345103e-06, + "loss": 5.0067, + "step": 210850 + }, + { + "epoch": 18.929533213644525, + "grad_norm": 17.376840591430664, + "learning_rate": 9.008677438659486e-06, + "loss": 4.7539, + "step": 210875 + }, + { + "epoch": 18.93177737881508, + "grad_norm": 19.74369239807129, + "learning_rate": 9.008428086973868e-06, + "loss": 5.0776, + "step": 210900 + }, + { + "epoch": 18.934021543985637, + "grad_norm": 18.35953140258789, + "learning_rate": 9.00817873528825e-06, + "loss": 5.3687, + "step": 210925 + }, + { + "epoch": 18.936265709156196, + "grad_norm": 18.257972717285156, + "learning_rate": 9.007929383602634e-06, + "loss": 5.08, + "step": 210950 + }, + { + "epoch": 18.93850987432675, + "grad_norm": 19.01415252685547, + "learning_rate": 9.007680031917017e-06, + "loss": 4.9954, + "step": 210975 + }, + { + "epoch": 18.940754039497307, + "grad_norm": 20.48657989501953, + "learning_rate": 9.007430680231399e-06, + "loss": 4.9725, + "step": 211000 + }, + { + "epoch": 18.942998204667862, + "grad_norm": 19.765342712402344, + "learning_rate": 9.007181328545781e-06, + "loss": 4.7382, + "step": 211025 + }, + { + "epoch": 18.94524236983842, + "grad_norm": 17.091259002685547, + "learning_rate": 9.006931976860165e-06, + "loss": 4.9903, + "step": 211050 + }, + { + "epoch": 18.947486535008977, + "grad_norm": 20.654773712158203, + "learning_rate": 9.006682625174546e-06, + "loss": 4.9766, + "step": 211075 + }, + { + "epoch": 18.949730700179533, + "grad_norm": 18.226375579833984, + "learning_rate": 9.00643327348893e-06, + "loss": 4.9415, + "step": 211100 + }, + { + "epoch": 18.95197486535009, + "grad_norm": 21.476404190063477, + "learning_rate": 9.006183921803312e-06, + "loss": 4.9438, + "step": 211125 + }, + { + "epoch": 18.954219030520647, + "grad_norm": 18.469064712524414, + "learning_rate": 9.005934570117695e-06, + "loss": 4.9898, + "step": 211150 + }, + { + "epoch": 18.956463195691203, + "grad_norm": 16.29199981689453, + "learning_rate": 9.005685218432077e-06, + "loss": 5.0509, + "step": 211175 + }, + { + "epoch": 18.95870736086176, + "grad_norm": 17.650131225585938, + "learning_rate": 9.005435866746461e-06, + "loss": 4.6853, + "step": 211200 + }, + { + "epoch": 18.960951526032314, + "grad_norm": 17.41545295715332, + "learning_rate": 9.005186515060843e-06, + "loss": 4.7295, + "step": 211225 + }, + { + "epoch": 18.963195691202873, + "grad_norm": 16.94508934020996, + "learning_rate": 9.004937163375224e-06, + "loss": 5.0346, + "step": 211250 + }, + { + "epoch": 18.96543985637343, + "grad_norm": 17.562448501586914, + "learning_rate": 9.004687811689608e-06, + "loss": 5.0166, + "step": 211275 + }, + { + "epoch": 18.967684021543985, + "grad_norm": 12.920533180236816, + "learning_rate": 9.00443846000399e-06, + "loss": 4.7014, + "step": 211300 + }, + { + "epoch": 18.969928186714544, + "grad_norm": 16.487316131591797, + "learning_rate": 9.004189108318373e-06, + "loss": 4.9, + "step": 211325 + }, + { + "epoch": 18.9721723518851, + "grad_norm": 17.25110626220703, + "learning_rate": 9.003939756632757e-06, + "loss": 4.7952, + "step": 211350 + }, + { + "epoch": 18.974416517055655, + "grad_norm": 22.78798484802246, + "learning_rate": 9.003690404947139e-06, + "loss": 4.8908, + "step": 211375 + }, + { + "epoch": 18.97666068222621, + "grad_norm": 17.937881469726562, + "learning_rate": 9.003441053261521e-06, + "loss": 4.8, + "step": 211400 + }, + { + "epoch": 18.97890484739677, + "grad_norm": 18.64436912536621, + "learning_rate": 9.003191701575904e-06, + "loss": 4.9642, + "step": 211425 + }, + { + "epoch": 18.981149012567325, + "grad_norm": 20.764793395996094, + "learning_rate": 9.002942349890286e-06, + "loss": 4.8497, + "step": 211450 + }, + { + "epoch": 18.98339317773788, + "grad_norm": 18.126754760742188, + "learning_rate": 9.002692998204668e-06, + "loss": 4.8419, + "step": 211475 + }, + { + "epoch": 18.985637342908436, + "grad_norm": 17.151634216308594, + "learning_rate": 9.00244364651905e-06, + "loss": 4.8532, + "step": 211500 + }, + { + "epoch": 18.987881508078996, + "grad_norm": 18.170671463012695, + "learning_rate": 9.002194294833434e-06, + "loss": 4.773, + "step": 211525 + }, + { + "epoch": 18.99012567324955, + "grad_norm": 18.967622756958008, + "learning_rate": 9.001944943147817e-06, + "loss": 4.9132, + "step": 211550 + }, + { + "epoch": 18.992369838420107, + "grad_norm": 17.40595054626465, + "learning_rate": 9.001695591462199e-06, + "loss": 4.8995, + "step": 211575 + }, + { + "epoch": 18.994614003590666, + "grad_norm": 21.126060485839844, + "learning_rate": 9.001446239776581e-06, + "loss": 4.9952, + "step": 211600 + }, + { + "epoch": 18.99685816876122, + "grad_norm": 16.788206100463867, + "learning_rate": 9.001196888090964e-06, + "loss": 5.1847, + "step": 211625 + }, + { + "epoch": 18.999102333931777, + "grad_norm": 18.045379638671875, + "learning_rate": 9.000947536405346e-06, + "loss": 4.652, + "step": 211650 + }, + { + "epoch": 19.0, + "eval_accuracy": 0.06971149256296183, + "eval_f1_macro": 0.008976309569737639, + "eval_f1_micro": 0.06971149256296183, + "eval_f1_weighted": 0.04265287126408958, + "eval_loss": 6.845921993255615, + "eval_precision_macro": 0.008011624593211973, + "eval_precision_micro": 0.06971149256296183, + "eval_precision_weighted": 0.03509303375478957, + "eval_recall_macro": 0.014397050425524064, + "eval_recall_micro": 0.06971149256296183, + "eval_recall_weighted": 0.06971149256296183, + "eval_runtime": 127.7085, + "eval_samples_per_second": 410.098, + "eval_steps_per_second": 12.818, + "step": 211660 + }, + { + "epoch": 19.001346499102333, + "grad_norm": 17.329744338989258, + "learning_rate": 9.00069818471973e-06, + "loss": 4.6439, + "step": 211675 + }, + { + "epoch": 19.003590664272892, + "grad_norm": 17.555002212524414, + "learning_rate": 9.000448833034112e-06, + "loss": 4.4641, + "step": 211700 + }, + { + "epoch": 19.005834829443447, + "grad_norm": 22.036609649658203, + "learning_rate": 9.000199481348495e-06, + "loss": 4.5341, + "step": 211725 + }, + { + "epoch": 19.008078994614003, + "grad_norm": 13.284202575683594, + "learning_rate": 8.999950129662877e-06, + "loss": 4.6887, + "step": 211750 + }, + { + "epoch": 19.01032315978456, + "grad_norm": 19.607553482055664, + "learning_rate": 8.99970077797726e-06, + "loss": 4.3729, + "step": 211775 + }, + { + "epoch": 19.012567324955118, + "grad_norm": 19.293973922729492, + "learning_rate": 8.999451426291642e-06, + "loss": 4.5806, + "step": 211800 + }, + { + "epoch": 19.014811490125673, + "grad_norm": 16.589468002319336, + "learning_rate": 8.999202074606026e-06, + "loss": 4.3803, + "step": 211825 + }, + { + "epoch": 19.01705565529623, + "grad_norm": 16.910146713256836, + "learning_rate": 8.998952722920408e-06, + "loss": 4.5257, + "step": 211850 + }, + { + "epoch": 19.019299820466788, + "grad_norm": 18.572338104248047, + "learning_rate": 8.99870337123479e-06, + "loss": 4.3903, + "step": 211875 + }, + { + "epoch": 19.021543985637344, + "grad_norm": 14.630165100097656, + "learning_rate": 8.998454019549173e-06, + "loss": 4.4186, + "step": 211900 + }, + { + "epoch": 19.0237881508079, + "grad_norm": 15.054580688476562, + "learning_rate": 8.998204667863557e-06, + "loss": 4.7761, + "step": 211925 + }, + { + "epoch": 19.026032315978455, + "grad_norm": 18.015579223632812, + "learning_rate": 8.997955316177937e-06, + "loss": 4.4068, + "step": 211950 + }, + { + "epoch": 19.028276481149014, + "grad_norm": 18.305166244506836, + "learning_rate": 8.99770596449232e-06, + "loss": 4.5913, + "step": 211975 + }, + { + "epoch": 19.03052064631957, + "grad_norm": 18.908464431762695, + "learning_rate": 8.997456612806704e-06, + "loss": 4.5264, + "step": 212000 + }, + { + "epoch": 19.032764811490125, + "grad_norm": 16.495290756225586, + "learning_rate": 8.997207261121086e-06, + "loss": 4.6274, + "step": 212025 + }, + { + "epoch": 19.03500897666068, + "grad_norm": 17.377395629882812, + "learning_rate": 8.996957909435468e-06, + "loss": 4.6557, + "step": 212050 + }, + { + "epoch": 19.03725314183124, + "grad_norm": 20.684083938598633, + "learning_rate": 8.996708557749852e-06, + "loss": 4.4593, + "step": 212075 + }, + { + "epoch": 19.039497307001795, + "grad_norm": 16.69588851928711, + "learning_rate": 8.996459206064235e-06, + "loss": 4.656, + "step": 212100 + }, + { + "epoch": 19.04174147217235, + "grad_norm": 18.407020568847656, + "learning_rate": 8.996209854378615e-06, + "loss": 4.7064, + "step": 212125 + }, + { + "epoch": 19.04398563734291, + "grad_norm": 20.039287567138672, + "learning_rate": 8.995960502693e-06, + "loss": 4.3065, + "step": 212150 + }, + { + "epoch": 19.046229802513466, + "grad_norm": 20.236509323120117, + "learning_rate": 8.995711151007381e-06, + "loss": 4.5801, + "step": 212175 + }, + { + "epoch": 19.04847396768402, + "grad_norm": 18.915624618530273, + "learning_rate": 8.995461799321764e-06, + "loss": 4.6611, + "step": 212200 + }, + { + "epoch": 19.050718132854577, + "grad_norm": 20.41779899597168, + "learning_rate": 8.995212447636146e-06, + "loss": 4.3249, + "step": 212225 + }, + { + "epoch": 19.052962298025136, + "grad_norm": 20.333744049072266, + "learning_rate": 8.99496309595053e-06, + "loss": 4.8159, + "step": 212250 + }, + { + "epoch": 19.05520646319569, + "grad_norm": 18.626636505126953, + "learning_rate": 8.994713744264912e-06, + "loss": 4.8236, + "step": 212275 + }, + { + "epoch": 19.057450628366247, + "grad_norm": 18.849910736083984, + "learning_rate": 8.994464392579295e-06, + "loss": 4.7215, + "step": 212300 + }, + { + "epoch": 19.059694793536803, + "grad_norm": 17.722320556640625, + "learning_rate": 8.994215040893677e-06, + "loss": 4.7731, + "step": 212325 + }, + { + "epoch": 19.061938958707362, + "grad_norm": 18.939119338989258, + "learning_rate": 8.99396568920806e-06, + "loss": 4.5542, + "step": 212350 + }, + { + "epoch": 19.064183123877918, + "grad_norm": 17.4371280670166, + "learning_rate": 8.993716337522442e-06, + "loss": 4.4729, + "step": 212375 + }, + { + "epoch": 19.066427289048473, + "grad_norm": 19.72248649597168, + "learning_rate": 8.993466985836826e-06, + "loss": 4.4417, + "step": 212400 + }, + { + "epoch": 19.06867145421903, + "grad_norm": 18.844623565673828, + "learning_rate": 8.993217634151208e-06, + "loss": 4.732, + "step": 212425 + }, + { + "epoch": 19.070915619389588, + "grad_norm": 18.328805923461914, + "learning_rate": 8.99296828246559e-06, + "loss": 4.3883, + "step": 212450 + }, + { + "epoch": 19.073159784560143, + "grad_norm": 19.4107723236084, + "learning_rate": 8.992718930779973e-06, + "loss": 4.419, + "step": 212475 + }, + { + "epoch": 19.0754039497307, + "grad_norm": 16.118186950683594, + "learning_rate": 8.992469579094355e-06, + "loss": 4.8816, + "step": 212500 + }, + { + "epoch": 19.07764811490126, + "grad_norm": 18.35953140258789, + "learning_rate": 8.992220227408737e-06, + "loss": 4.7098, + "step": 212525 + }, + { + "epoch": 19.079892280071814, + "grad_norm": 19.14643096923828, + "learning_rate": 8.991970875723121e-06, + "loss": 4.5577, + "step": 212550 + }, + { + "epoch": 19.08213644524237, + "grad_norm": 18.63652992248535, + "learning_rate": 8.991721524037504e-06, + "loss": 4.4247, + "step": 212575 + }, + { + "epoch": 19.084380610412925, + "grad_norm": 17.956981658935547, + "learning_rate": 8.991472172351886e-06, + "loss": 4.4868, + "step": 212600 + }, + { + "epoch": 19.086624775583484, + "grad_norm": 21.967571258544922, + "learning_rate": 8.991222820666268e-06, + "loss": 4.5718, + "step": 212625 + }, + { + "epoch": 19.08886894075404, + "grad_norm": 17.047035217285156, + "learning_rate": 8.990973468980652e-06, + "loss": 4.6486, + "step": 212650 + }, + { + "epoch": 19.091113105924595, + "grad_norm": 20.772964477539062, + "learning_rate": 8.990724117295033e-06, + "loss": 4.4326, + "step": 212675 + }, + { + "epoch": 19.09335727109515, + "grad_norm": 17.45540428161621, + "learning_rate": 8.990474765609415e-06, + "loss": 4.5675, + "step": 212700 + }, + { + "epoch": 19.09560143626571, + "grad_norm": 18.723888397216797, + "learning_rate": 8.9902254139238e-06, + "loss": 4.4441, + "step": 212725 + }, + { + "epoch": 19.097845601436266, + "grad_norm": 18.50554847717285, + "learning_rate": 8.989976062238182e-06, + "loss": 4.5835, + "step": 212750 + }, + { + "epoch": 19.10008976660682, + "grad_norm": 22.194629669189453, + "learning_rate": 8.989726710552564e-06, + "loss": 4.7768, + "step": 212775 + }, + { + "epoch": 19.10233393177738, + "grad_norm": 17.889511108398438, + "learning_rate": 8.989477358866948e-06, + "loss": 4.6757, + "step": 212800 + }, + { + "epoch": 19.104578096947936, + "grad_norm": 19.91042137145996, + "learning_rate": 8.98922800718133e-06, + "loss": 4.4457, + "step": 212825 + }, + { + "epoch": 19.10682226211849, + "grad_norm": 18.404123306274414, + "learning_rate": 8.98897865549571e-06, + "loss": 4.5609, + "step": 212850 + }, + { + "epoch": 19.109066427289047, + "grad_norm": 16.635189056396484, + "learning_rate": 8.988729303810095e-06, + "loss": 4.4772, + "step": 212875 + }, + { + "epoch": 19.111310592459606, + "grad_norm": 16.55670166015625, + "learning_rate": 8.988479952124477e-06, + "loss": 4.5277, + "step": 212900 + }, + { + "epoch": 19.113554757630162, + "grad_norm": 18.609928131103516, + "learning_rate": 8.98823060043886e-06, + "loss": 4.5464, + "step": 212925 + }, + { + "epoch": 19.115798922800717, + "grad_norm": 18.439739227294922, + "learning_rate": 8.987981248753242e-06, + "loss": 4.513, + "step": 212950 + }, + { + "epoch": 19.118043087971273, + "grad_norm": 17.92945671081543, + "learning_rate": 8.987731897067626e-06, + "loss": 4.3402, + "step": 212975 + }, + { + "epoch": 19.120287253141832, + "grad_norm": 19.84002113342285, + "learning_rate": 8.987482545382008e-06, + "loss": 4.6287, + "step": 213000 + }, + { + "epoch": 19.122531418312388, + "grad_norm": 18.291330337524414, + "learning_rate": 8.98723319369639e-06, + "loss": 4.4837, + "step": 213025 + }, + { + "epoch": 19.124775583482943, + "grad_norm": 17.550617218017578, + "learning_rate": 8.986983842010773e-06, + "loss": 4.6897, + "step": 213050 + }, + { + "epoch": 19.127019748653503, + "grad_norm": 18.817184448242188, + "learning_rate": 8.986734490325155e-06, + "loss": 4.5908, + "step": 213075 + }, + { + "epoch": 19.129263913824058, + "grad_norm": 19.60686683654785, + "learning_rate": 8.986485138639537e-06, + "loss": 4.6463, + "step": 213100 + }, + { + "epoch": 19.131508078994614, + "grad_norm": 19.773611068725586, + "learning_rate": 8.986235786953921e-06, + "loss": 4.8031, + "step": 213125 + }, + { + "epoch": 19.13375224416517, + "grad_norm": 17.40412139892578, + "learning_rate": 8.985986435268304e-06, + "loss": 4.764, + "step": 213150 + }, + { + "epoch": 19.13599640933573, + "grad_norm": 16.06819725036621, + "learning_rate": 8.985737083582686e-06, + "loss": 4.7638, + "step": 213175 + }, + { + "epoch": 19.138240574506284, + "grad_norm": 16.69786262512207, + "learning_rate": 8.985487731897068e-06, + "loss": 4.6242, + "step": 213200 + }, + { + "epoch": 19.14048473967684, + "grad_norm": 15.20014476776123, + "learning_rate": 8.98523838021145e-06, + "loss": 4.6319, + "step": 213225 + }, + { + "epoch": 19.142728904847395, + "grad_norm": 18.465137481689453, + "learning_rate": 8.984989028525833e-06, + "loss": 4.6742, + "step": 213250 + }, + { + "epoch": 19.144973070017954, + "grad_norm": 16.846025466918945, + "learning_rate": 8.984739676840217e-06, + "loss": 4.8643, + "step": 213275 + }, + { + "epoch": 19.14721723518851, + "grad_norm": 20.080202102661133, + "learning_rate": 8.9844903251546e-06, + "loss": 4.5002, + "step": 213300 + }, + { + "epoch": 19.149461400359066, + "grad_norm": 19.484914779663086, + "learning_rate": 8.984240973468982e-06, + "loss": 4.6929, + "step": 213325 + }, + { + "epoch": 19.151705565529625, + "grad_norm": 17.170318603515625, + "learning_rate": 8.983991621783364e-06, + "loss": 4.5374, + "step": 213350 + }, + { + "epoch": 19.15394973070018, + "grad_norm": 20.23688316345215, + "learning_rate": 8.983742270097746e-06, + "loss": 4.6598, + "step": 213375 + }, + { + "epoch": 19.156193895870736, + "grad_norm": 16.454689025878906, + "learning_rate": 8.983492918412129e-06, + "loss": 4.5693, + "step": 213400 + }, + { + "epoch": 19.15843806104129, + "grad_norm": 17.652164459228516, + "learning_rate": 8.98324356672651e-06, + "loss": 4.4188, + "step": 213425 + }, + { + "epoch": 19.16068222621185, + "grad_norm": 18.371545791625977, + "learning_rate": 8.982994215040895e-06, + "loss": 4.6202, + "step": 213450 + }, + { + "epoch": 19.162926391382406, + "grad_norm": 17.093502044677734, + "learning_rate": 8.982744863355277e-06, + "loss": 4.5082, + "step": 213475 + }, + { + "epoch": 19.16517055655296, + "grad_norm": 17.082883834838867, + "learning_rate": 8.98249551166966e-06, + "loss": 4.5824, + "step": 213500 + }, + { + "epoch": 19.167414721723517, + "grad_norm": 17.715206146240234, + "learning_rate": 8.982246159984043e-06, + "loss": 4.704, + "step": 213525 + }, + { + "epoch": 19.169658886894076, + "grad_norm": 22.760080337524414, + "learning_rate": 8.981996808298424e-06, + "loss": 4.5536, + "step": 213550 + }, + { + "epoch": 19.171903052064632, + "grad_norm": 17.761268615722656, + "learning_rate": 8.981747456612806e-06, + "loss": 4.6016, + "step": 213575 + }, + { + "epoch": 19.174147217235188, + "grad_norm": 19.19811248779297, + "learning_rate": 8.98149810492719e-06, + "loss": 4.4999, + "step": 213600 + }, + { + "epoch": 19.176391382405747, + "grad_norm": 19.47228240966797, + "learning_rate": 8.981248753241573e-06, + "loss": 4.5542, + "step": 213625 + }, + { + "epoch": 19.178635547576302, + "grad_norm": 17.581140518188477, + "learning_rate": 8.980999401555955e-06, + "loss": 4.6761, + "step": 213650 + }, + { + "epoch": 19.180879712746858, + "grad_norm": 18.091054916381836, + "learning_rate": 8.980750049870337e-06, + "loss": 4.3947, + "step": 213675 + }, + { + "epoch": 19.183123877917414, + "grad_norm": 15.649775505065918, + "learning_rate": 8.980500698184721e-06, + "loss": 4.3496, + "step": 213700 + }, + { + "epoch": 19.185368043087973, + "grad_norm": 17.744401931762695, + "learning_rate": 8.980251346499102e-06, + "loss": 4.7817, + "step": 213725 + }, + { + "epoch": 19.18761220825853, + "grad_norm": 16.240224838256836, + "learning_rate": 8.980001994813486e-06, + "loss": 4.6878, + "step": 213750 + }, + { + "epoch": 19.189856373429084, + "grad_norm": 21.20197868347168, + "learning_rate": 8.979762617195293e-06, + "loss": 4.5589, + "step": 213775 + }, + { + "epoch": 19.19210053859964, + "grad_norm": 17.904890060424805, + "learning_rate": 8.979513265509675e-06, + "loss": 4.8324, + "step": 213800 + }, + { + "epoch": 19.1943447037702, + "grad_norm": 16.63420867919922, + "learning_rate": 8.979263913824058e-06, + "loss": 4.6764, + "step": 213825 + }, + { + "epoch": 19.196588868940754, + "grad_norm": 16.39704132080078, + "learning_rate": 8.97901456213844e-06, + "loss": 4.5873, + "step": 213850 + }, + { + "epoch": 19.19883303411131, + "grad_norm": 18.16140365600586, + "learning_rate": 8.978765210452824e-06, + "loss": 4.7409, + "step": 213875 + }, + { + "epoch": 19.201077199281865, + "grad_norm": 15.422993659973145, + "learning_rate": 8.978515858767206e-06, + "loss": 4.5818, + "step": 213900 + }, + { + "epoch": 19.203321364452425, + "grad_norm": 17.20267105102539, + "learning_rate": 8.978266507081589e-06, + "loss": 4.7048, + "step": 213925 + }, + { + "epoch": 19.20556552962298, + "grad_norm": 20.80391502380371, + "learning_rate": 8.978017155395971e-06, + "loss": 4.6433, + "step": 213950 + }, + { + "epoch": 19.207809694793536, + "grad_norm": 18.909528732299805, + "learning_rate": 8.977767803710353e-06, + "loss": 4.6585, + "step": 213975 + }, + { + "epoch": 19.210053859964095, + "grad_norm": 18.93376922607422, + "learning_rate": 8.977518452024736e-06, + "loss": 4.4888, + "step": 214000 + }, + { + "epoch": 19.21229802513465, + "grad_norm": 17.587099075317383, + "learning_rate": 8.97726910033912e-06, + "loss": 4.3127, + "step": 214025 + }, + { + "epoch": 19.214542190305206, + "grad_norm": 17.852092742919922, + "learning_rate": 8.977019748653502e-06, + "loss": 4.5193, + "step": 214050 + }, + { + "epoch": 19.21678635547576, + "grad_norm": 15.02549934387207, + "learning_rate": 8.976770396967884e-06, + "loss": 4.5939, + "step": 214075 + }, + { + "epoch": 19.21903052064632, + "grad_norm": 17.164793014526367, + "learning_rate": 8.976521045282266e-06, + "loss": 4.7214, + "step": 214100 + }, + { + "epoch": 19.221274685816876, + "grad_norm": 20.4193172454834, + "learning_rate": 8.97627169359665e-06, + "loss": 4.8087, + "step": 214125 + }, + { + "epoch": 19.223518850987432, + "grad_norm": 22.35128402709961, + "learning_rate": 8.976022341911031e-06, + "loss": 4.7036, + "step": 214150 + }, + { + "epoch": 19.225763016157988, + "grad_norm": 17.363765716552734, + "learning_rate": 8.975772990225413e-06, + "loss": 4.8262, + "step": 214175 + }, + { + "epoch": 19.228007181328547, + "grad_norm": 19.499876022338867, + "learning_rate": 8.975523638539797e-06, + "loss": 4.6545, + "step": 214200 + }, + { + "epoch": 19.230251346499102, + "grad_norm": 19.0291690826416, + "learning_rate": 8.97527428685418e-06, + "loss": 4.4964, + "step": 214225 + }, + { + "epoch": 19.232495511669658, + "grad_norm": 18.30763053894043, + "learning_rate": 8.975024935168562e-06, + "loss": 4.5014, + "step": 214250 + }, + { + "epoch": 19.234739676840217, + "grad_norm": 19.314071655273438, + "learning_rate": 8.974775583482946e-06, + "loss": 4.7347, + "step": 214275 + }, + { + "epoch": 19.236983842010773, + "grad_norm": 15.098991394042969, + "learning_rate": 8.974526231797328e-06, + "loss": 4.5888, + "step": 214300 + }, + { + "epoch": 19.239228007181328, + "grad_norm": 18.934053421020508, + "learning_rate": 8.97427688011171e-06, + "loss": 4.271, + "step": 214325 + }, + { + "epoch": 19.241472172351884, + "grad_norm": Infinity, + "learning_rate": 8.974037502493518e-06, + "loss": 4.5395, + "step": 214350 + }, + { + "epoch": 19.243716337522443, + "grad_norm": 20.271942138671875, + "learning_rate": 8.9737881508079e-06, + "loss": 4.8003, + "step": 214375 + }, + { + "epoch": 19.245960502693, + "grad_norm": 16.597036361694336, + "learning_rate": 8.973538799122284e-06, + "loss": 4.4763, + "step": 214400 + }, + { + "epoch": 19.248204667863554, + "grad_norm": 22.464773178100586, + "learning_rate": 8.973289447436665e-06, + "loss": 4.5384, + "step": 214425 + }, + { + "epoch": 19.25044883303411, + "grad_norm": 18.792428970336914, + "learning_rate": 8.973040095751047e-06, + "loss": 4.7633, + "step": 214450 + }, + { + "epoch": 19.25269299820467, + "grad_norm": 16.991626739501953, + "learning_rate": 8.972790744065431e-06, + "loss": 4.562, + "step": 214475 + }, + { + "epoch": 19.254937163375224, + "grad_norm": 15.998200416564941, + "learning_rate": 8.972541392379813e-06, + "loss": 4.5557, + "step": 214500 + }, + { + "epoch": 19.25718132854578, + "grad_norm": 16.775251388549805, + "learning_rate": 8.972292040694196e-06, + "loss": 4.8539, + "step": 214525 + }, + { + "epoch": 19.25942549371634, + "grad_norm": 24.23675537109375, + "learning_rate": 8.97204268900858e-06, + "loss": 4.6396, + "step": 214550 + }, + { + "epoch": 19.261669658886895, + "grad_norm": 21.08257293701172, + "learning_rate": 8.971793337322962e-06, + "loss": 4.4747, + "step": 214575 + }, + { + "epoch": 19.26391382405745, + "grad_norm": 18.106231689453125, + "learning_rate": 8.971543985637343e-06, + "loss": 4.5893, + "step": 214600 + }, + { + "epoch": 19.266157989228006, + "grad_norm": 17.35015106201172, + "learning_rate": 8.971294633951727e-06, + "loss": 4.6444, + "step": 214625 + }, + { + "epoch": 19.268402154398565, + "grad_norm": 19.160564422607422, + "learning_rate": 8.971045282266109e-06, + "loss": 4.6256, + "step": 214650 + }, + { + "epoch": 19.27064631956912, + "grad_norm": 23.177873611450195, + "learning_rate": 8.970795930580491e-06, + "loss": 4.901, + "step": 214675 + }, + { + "epoch": 19.272890484739676, + "grad_norm": 18.761384963989258, + "learning_rate": 8.970546578894873e-06, + "loss": 4.7271, + "step": 214700 + }, + { + "epoch": 19.275134649910232, + "grad_norm": 17.675142288208008, + "learning_rate": 8.970297227209257e-06, + "loss": 4.6135, + "step": 214725 + }, + { + "epoch": 19.27737881508079, + "grad_norm": 20.141464233398438, + "learning_rate": 8.97004787552364e-06, + "loss": 4.4391, + "step": 214750 + }, + { + "epoch": 19.279622980251347, + "grad_norm": 17.06259536743164, + "learning_rate": 8.969798523838022e-06, + "loss": 4.7306, + "step": 214775 + }, + { + "epoch": 19.281867145421902, + "grad_norm": 16.99037742614746, + "learning_rate": 8.969549172152404e-06, + "loss": 4.66, + "step": 214800 + }, + { + "epoch": 19.28411131059246, + "grad_norm": 17.319040298461914, + "learning_rate": 8.969299820466787e-06, + "loss": 4.6408, + "step": 214825 + }, + { + "epoch": 19.286355475763017, + "grad_norm": 17.55424690246582, + "learning_rate": 8.969050468781169e-06, + "loss": 4.6693, + "step": 214850 + }, + { + "epoch": 19.288599640933572, + "grad_norm": 18.111045837402344, + "learning_rate": 8.968801117095553e-06, + "loss": 4.4353, + "step": 214875 + }, + { + "epoch": 19.290843806104128, + "grad_norm": 18.318679809570312, + "learning_rate": 8.968551765409935e-06, + "loss": 4.4976, + "step": 214900 + }, + { + "epoch": 19.293087971274687, + "grad_norm": 16.768190383911133, + "learning_rate": 8.968302413724318e-06, + "loss": 4.604, + "step": 214925 + }, + { + "epoch": 19.295332136445243, + "grad_norm": 17.130741119384766, + "learning_rate": 8.9680530620387e-06, + "loss": 4.5994, + "step": 214950 + }, + { + "epoch": 19.2975763016158, + "grad_norm": 16.03672218322754, + "learning_rate": 8.967803710353082e-06, + "loss": 4.7365, + "step": 214975 + }, + { + "epoch": 19.299820466786354, + "grad_norm": 19.214096069335938, + "learning_rate": 8.967554358667465e-06, + "loss": 4.5951, + "step": 215000 + }, + { + "epoch": 19.302064631956913, + "grad_norm": 19.334190368652344, + "learning_rate": 8.967305006981849e-06, + "loss": 4.6402, + "step": 215025 + }, + { + "epoch": 19.30430879712747, + "grad_norm": 16.329233169555664, + "learning_rate": 8.967055655296231e-06, + "loss": 4.5271, + "step": 215050 + }, + { + "epoch": 19.306552962298024, + "grad_norm": 22.63060760498047, + "learning_rate": 8.966806303610613e-06, + "loss": 4.7794, + "step": 215075 + }, + { + "epoch": 19.30879712746858, + "grad_norm": 20.06792640686035, + "learning_rate": 8.966556951924996e-06, + "loss": 4.6361, + "step": 215100 + }, + { + "epoch": 19.31104129263914, + "grad_norm": 19.234355926513672, + "learning_rate": 8.966307600239378e-06, + "loss": 4.6286, + "step": 215125 + }, + { + "epoch": 19.313285457809695, + "grad_norm": 17.697765350341797, + "learning_rate": 8.96605824855376e-06, + "loss": 4.5692, + "step": 215150 + }, + { + "epoch": 19.31552962298025, + "grad_norm": 18.04571533203125, + "learning_rate": 8.965808896868143e-06, + "loss": 4.6027, + "step": 215175 + }, + { + "epoch": 19.31777378815081, + "grad_norm": 18.767162322998047, + "learning_rate": 8.965559545182527e-06, + "loss": 4.5228, + "step": 215200 + }, + { + "epoch": 19.320017953321365, + "grad_norm": 19.064958572387695, + "learning_rate": 8.965310193496909e-06, + "loss": 4.5513, + "step": 215225 + }, + { + "epoch": 19.32226211849192, + "grad_norm": 23.289573669433594, + "learning_rate": 8.965060841811291e-06, + "loss": 4.2659, + "step": 215250 + }, + { + "epoch": 19.324506283662476, + "grad_norm": 20.15873908996582, + "learning_rate": 8.964811490125675e-06, + "loss": 4.7343, + "step": 215275 + }, + { + "epoch": 19.326750448833035, + "grad_norm": 18.302465438842773, + "learning_rate": 8.964562138440056e-06, + "loss": 4.6158, + "step": 215300 + }, + { + "epoch": 19.32899461400359, + "grad_norm": 19.59309959411621, + "learning_rate": 8.964312786754438e-06, + "loss": 4.7183, + "step": 215325 + }, + { + "epoch": 19.331238779174146, + "grad_norm": 19.411174774169922, + "learning_rate": 8.964063435068822e-06, + "loss": 4.4367, + "step": 215350 + }, + { + "epoch": 19.333482944344702, + "grad_norm": 14.104721069335938, + "learning_rate": 8.963814083383204e-06, + "loss": 4.5686, + "step": 215375 + }, + { + "epoch": 19.33572710951526, + "grad_norm": 19.574575424194336, + "learning_rate": 8.963564731697587e-06, + "loss": 4.6338, + "step": 215400 + }, + { + "epoch": 19.337971274685817, + "grad_norm": 22.42226791381836, + "learning_rate": 8.963315380011969e-06, + "loss": 4.4143, + "step": 215425 + }, + { + "epoch": 19.340215439856372, + "grad_norm": 18.210187911987305, + "learning_rate": 8.963066028326353e-06, + "loss": 4.7232, + "step": 215450 + }, + { + "epoch": 19.34245960502693, + "grad_norm": 17.07159996032715, + "learning_rate": 8.962816676640734e-06, + "loss": 4.7032, + "step": 215475 + }, + { + "epoch": 19.344703770197487, + "grad_norm": 17.83979606628418, + "learning_rate": 8.962567324955118e-06, + "loss": 4.7567, + "step": 215500 + }, + { + "epoch": 19.346947935368043, + "grad_norm": 17.931032180786133, + "learning_rate": 8.9623179732695e-06, + "loss": 4.7337, + "step": 215525 + }, + { + "epoch": 19.3491921005386, + "grad_norm": 18.050315856933594, + "learning_rate": 8.962068621583882e-06, + "loss": 4.5084, + "step": 215550 + }, + { + "epoch": 19.351436265709157, + "grad_norm": 20.887678146362305, + "learning_rate": 8.961819269898265e-06, + "loss": 4.2857, + "step": 215575 + }, + { + "epoch": 19.353680430879713, + "grad_norm": 15.839831352233887, + "learning_rate": 8.961569918212649e-06, + "loss": 4.6428, + "step": 215600 + }, + { + "epoch": 19.35592459605027, + "grad_norm": 18.420204162597656, + "learning_rate": 8.961320566527031e-06, + "loss": 4.6516, + "step": 215625 + }, + { + "epoch": 19.358168761220824, + "grad_norm": 20.576793670654297, + "learning_rate": 8.961071214841413e-06, + "loss": 4.6533, + "step": 215650 + }, + { + "epoch": 19.360412926391383, + "grad_norm": 17.466644287109375, + "learning_rate": 8.960821863155796e-06, + "loss": 4.8233, + "step": 215675 + }, + { + "epoch": 19.36265709156194, + "grad_norm": 21.700679779052734, + "learning_rate": 8.960572511470178e-06, + "loss": 4.3401, + "step": 215700 + }, + { + "epoch": 19.364901256732495, + "grad_norm": 17.660140991210938, + "learning_rate": 8.96032315978456e-06, + "loss": 4.4592, + "step": 215725 + }, + { + "epoch": 19.367145421903054, + "grad_norm": 19.678253173828125, + "learning_rate": 8.960073808098944e-06, + "loss": 4.7699, + "step": 215750 + }, + { + "epoch": 19.36938958707361, + "grad_norm": 20.552522659301758, + "learning_rate": 8.959824456413327e-06, + "loss": 4.7343, + "step": 215775 + }, + { + "epoch": 19.371633752244165, + "grad_norm": 21.330005645751953, + "learning_rate": 8.959575104727709e-06, + "loss": 4.7906, + "step": 215800 + }, + { + "epoch": 19.37387791741472, + "grad_norm": 17.230140686035156, + "learning_rate": 8.959325753042091e-06, + "loss": 4.3839, + "step": 215825 + }, + { + "epoch": 19.37612208258528, + "grad_norm": 18.758460998535156, + "learning_rate": 8.959076401356474e-06, + "loss": 4.6038, + "step": 215850 + }, + { + "epoch": 19.378366247755835, + "grad_norm": 16.912336349487305, + "learning_rate": 8.958827049670856e-06, + "loss": 4.5738, + "step": 215875 + }, + { + "epoch": 19.38061041292639, + "grad_norm": 15.142563819885254, + "learning_rate": 8.958577697985238e-06, + "loss": 4.5605, + "step": 215900 + }, + { + "epoch": 19.382854578096946, + "grad_norm": 21.979169845581055, + "learning_rate": 8.958328346299622e-06, + "loss": 4.9009, + "step": 215925 + }, + { + "epoch": 19.385098743267505, + "grad_norm": 16.96617889404297, + "learning_rate": 8.958078994614005e-06, + "loss": 4.7755, + "step": 215950 + }, + { + "epoch": 19.38734290843806, + "grad_norm": 18.826486587524414, + "learning_rate": 8.957829642928387e-06, + "loss": 4.5913, + "step": 215975 + }, + { + "epoch": 19.389587073608617, + "grad_norm": 17.592716217041016, + "learning_rate": 8.957580291242771e-06, + "loss": 4.5045, + "step": 216000 + }, + { + "epoch": 19.391831238779176, + "grad_norm": 20.370044708251953, + "learning_rate": 8.957330939557151e-06, + "loss": 4.584, + "step": 216025 + }, + { + "epoch": 19.39407540394973, + "grad_norm": 20.1439151763916, + "learning_rate": 8.957081587871534e-06, + "loss": 4.6468, + "step": 216050 + }, + { + "epoch": 19.396319569120287, + "grad_norm": 20.612688064575195, + "learning_rate": 8.956832236185918e-06, + "loss": 4.6255, + "step": 216075 + }, + { + "epoch": 19.398563734290843, + "grad_norm": 20.503812789916992, + "learning_rate": 8.9565828845003e-06, + "loss": 4.5725, + "step": 216100 + }, + { + "epoch": 19.4008078994614, + "grad_norm": 18.256656646728516, + "learning_rate": 8.956333532814682e-06, + "loss": 4.5562, + "step": 216125 + }, + { + "epoch": 19.403052064631957, + "grad_norm": 18.25686264038086, + "learning_rate": 8.956084181129065e-06, + "loss": 4.7041, + "step": 216150 + }, + { + "epoch": 19.405296229802513, + "grad_norm": 18.600500106811523, + "learning_rate": 8.955834829443449e-06, + "loss": 4.6235, + "step": 216175 + }, + { + "epoch": 19.40754039497307, + "grad_norm": 19.528806686401367, + "learning_rate": 8.95558547775783e-06, + "loss": 4.6801, + "step": 216200 + }, + { + "epoch": 19.409784560143628, + "grad_norm": 23.139827728271484, + "learning_rate": 8.955336126072213e-06, + "loss": 4.7233, + "step": 216225 + }, + { + "epoch": 19.412028725314183, + "grad_norm": 18.409427642822266, + "learning_rate": 8.955086774386596e-06, + "loss": 4.8775, + "step": 216250 + }, + { + "epoch": 19.41427289048474, + "grad_norm": 17.188560485839844, + "learning_rate": 8.954837422700978e-06, + "loss": 4.4738, + "step": 216275 + }, + { + "epoch": 19.416517055655298, + "grad_norm": 21.995113372802734, + "learning_rate": 8.95458807101536e-06, + "loss": 4.6326, + "step": 216300 + }, + { + "epoch": 19.418761220825854, + "grad_norm": 19.819461822509766, + "learning_rate": 8.954338719329744e-06, + "loss": 4.7787, + "step": 216325 + }, + { + "epoch": 19.42100538599641, + "grad_norm": 20.704980850219727, + "learning_rate": 8.954089367644127e-06, + "loss": 4.6148, + "step": 216350 + }, + { + "epoch": 19.423249551166965, + "grad_norm": 20.155698776245117, + "learning_rate": 8.953840015958509e-06, + "loss": 4.6451, + "step": 216375 + }, + { + "epoch": 19.425493716337524, + "grad_norm": 18.610292434692383, + "learning_rate": 8.953590664272891e-06, + "loss": 4.6238, + "step": 216400 + }, + { + "epoch": 19.42773788150808, + "grad_norm": 19.743181228637695, + "learning_rate": 8.953341312587274e-06, + "loss": 4.768, + "step": 216425 + }, + { + "epoch": 19.429982046678635, + "grad_norm": 17.43521499633789, + "learning_rate": 8.953091960901656e-06, + "loss": 4.4612, + "step": 216450 + }, + { + "epoch": 19.43222621184919, + "grad_norm": 18.88511085510254, + "learning_rate": 8.95284260921604e-06, + "loss": 4.8158, + "step": 216475 + }, + { + "epoch": 19.43447037701975, + "grad_norm": 17.858781814575195, + "learning_rate": 8.952593257530422e-06, + "loss": 4.6655, + "step": 216500 + }, + { + "epoch": 19.436714542190305, + "grad_norm": 20.278568267822266, + "learning_rate": 8.952343905844805e-06, + "loss": 4.6298, + "step": 216525 + }, + { + "epoch": 19.43895870736086, + "grad_norm": 18.593053817749023, + "learning_rate": 8.952094554159187e-06, + "loss": 4.7087, + "step": 216550 + }, + { + "epoch": 19.44120287253142, + "grad_norm": 19.211767196655273, + "learning_rate": 8.95184520247357e-06, + "loss": 4.4456, + "step": 216575 + }, + { + "epoch": 19.443447037701976, + "grad_norm": 19.007705688476562, + "learning_rate": 8.951595850787952e-06, + "loss": 4.5294, + "step": 216600 + }, + { + "epoch": 19.44569120287253, + "grad_norm": 15.433367729187012, + "learning_rate": 8.951346499102334e-06, + "loss": 4.6914, + "step": 216625 + }, + { + "epoch": 19.447935368043087, + "grad_norm": 23.172903060913086, + "learning_rate": 8.951097147416718e-06, + "loss": 4.6983, + "step": 216650 + }, + { + "epoch": 19.450179533213646, + "grad_norm": 14.997526168823242, + "learning_rate": 8.9508477957311e-06, + "loss": 4.7023, + "step": 216675 + }, + { + "epoch": 19.4524236983842, + "grad_norm": 17.85498046875, + "learning_rate": 8.950608418112907e-06, + "loss": 4.6903, + "step": 216700 + }, + { + "epoch": 19.454667863554757, + "grad_norm": 20.181713104248047, + "learning_rate": 8.95035906642729e-06, + "loss": 4.8679, + "step": 216725 + }, + { + "epoch": 19.456912028725313, + "grad_norm": 19.526947021484375, + "learning_rate": 8.950109714741673e-06, + "loss": 4.6071, + "step": 216750 + }, + { + "epoch": 19.459156193895872, + "grad_norm": 18.596193313598633, + "learning_rate": 8.949860363056056e-06, + "loss": 4.6181, + "step": 216775 + }, + { + "epoch": 19.461400359066428, + "grad_norm": 18.292875289916992, + "learning_rate": 8.949611011370436e-06, + "loss": 4.6605, + "step": 216800 + }, + { + "epoch": 19.463644524236983, + "grad_norm": 21.761348724365234, + "learning_rate": 8.94936165968482e-06, + "loss": 4.6242, + "step": 216825 + }, + { + "epoch": 19.46588868940754, + "grad_norm": 17.303752899169922, + "learning_rate": 8.949112307999203e-06, + "loss": 4.5841, + "step": 216850 + }, + { + "epoch": 19.468132854578098, + "grad_norm": 19.696659088134766, + "learning_rate": 8.948862956313585e-06, + "loss": 4.7123, + "step": 216875 + }, + { + "epoch": 19.470377019748653, + "grad_norm": 19.976577758789062, + "learning_rate": 8.948613604627967e-06, + "loss": 4.754, + "step": 216900 + }, + { + "epoch": 19.47262118491921, + "grad_norm": 20.58864402770996, + "learning_rate": 8.948364252942351e-06, + "loss": 4.6695, + "step": 216925 + }, + { + "epoch": 19.474865350089768, + "grad_norm": 23.43510627746582, + "learning_rate": 8.948114901256734e-06, + "loss": 4.7154, + "step": 216950 + }, + { + "epoch": 19.477109515260324, + "grad_norm": 19.553939819335938, + "learning_rate": 8.947865549571116e-06, + "loss": 4.6809, + "step": 216975 + }, + { + "epoch": 19.47935368043088, + "grad_norm": 20.87081527709961, + "learning_rate": 8.947616197885498e-06, + "loss": 4.3944, + "step": 217000 + }, + { + "epoch": 19.481597845601435, + "grad_norm": 19.616613388061523, + "learning_rate": 8.94736684619988e-06, + "loss": 4.6243, + "step": 217025 + }, + { + "epoch": 19.483842010771994, + "grad_norm": 20.93281364440918, + "learning_rate": 8.947117494514263e-06, + "loss": 4.6729, + "step": 217050 + }, + { + "epoch": 19.48608617594255, + "grad_norm": 19.14906883239746, + "learning_rate": 8.946868142828647e-06, + "loss": 5.0278, + "step": 217075 + }, + { + "epoch": 19.488330341113105, + "grad_norm": 18.17879295349121, + "learning_rate": 8.94661879114303e-06, + "loss": 4.5702, + "step": 217100 + }, + { + "epoch": 19.49057450628366, + "grad_norm": 17.229137420654297, + "learning_rate": 8.946369439457412e-06, + "loss": 4.8889, + "step": 217125 + }, + { + "epoch": 19.49281867145422, + "grad_norm": 20.480865478515625, + "learning_rate": 8.946120087771794e-06, + "loss": 4.6956, + "step": 217150 + }, + { + "epoch": 19.495062836624776, + "grad_norm": 15.397958755493164, + "learning_rate": 8.945870736086176e-06, + "loss": 4.9194, + "step": 217175 + }, + { + "epoch": 19.49730700179533, + "grad_norm": 18.666719436645508, + "learning_rate": 8.945621384400559e-06, + "loss": 4.6157, + "step": 217200 + }, + { + "epoch": 19.49955116696589, + "grad_norm": 16.527347564697266, + "learning_rate": 8.945372032714943e-06, + "loss": 4.7341, + "step": 217225 + }, + { + "epoch": 19.501795332136446, + "grad_norm": 19.69521141052246, + "learning_rate": 8.945122681029325e-06, + "loss": 4.4655, + "step": 217250 + }, + { + "epoch": 19.504039497307, + "grad_norm": 18.668922424316406, + "learning_rate": 8.944873329343707e-06, + "loss": 4.6193, + "step": 217275 + }, + { + "epoch": 19.506283662477557, + "grad_norm": 19.354957580566406, + "learning_rate": 8.94462397765809e-06, + "loss": 4.5618, + "step": 217300 + }, + { + "epoch": 19.508527827648116, + "grad_norm": 16.607789993286133, + "learning_rate": 8.944374625972473e-06, + "loss": 4.9388, + "step": 217325 + }, + { + "epoch": 19.510771992818672, + "grad_norm": 17.64926528930664, + "learning_rate": 8.944125274286854e-06, + "loss": 4.8724, + "step": 217350 + }, + { + "epoch": 19.513016157989227, + "grad_norm": 20.154762268066406, + "learning_rate": 8.943875922601238e-06, + "loss": 4.8326, + "step": 217375 + }, + { + "epoch": 19.515260323159783, + "grad_norm": 18.814842224121094, + "learning_rate": 8.94362657091562e-06, + "loss": 5.0133, + "step": 217400 + }, + { + "epoch": 19.517504488330342, + "grad_norm": 16.776405334472656, + "learning_rate": 8.943377219230003e-06, + "loss": 4.8876, + "step": 217425 + }, + { + "epoch": 19.519748653500898, + "grad_norm": 18.786497116088867, + "learning_rate": 8.943127867544385e-06, + "loss": 4.8117, + "step": 217450 + }, + { + "epoch": 19.521992818671453, + "grad_norm": 15.511019706726074, + "learning_rate": 8.942878515858769e-06, + "loss": 4.4898, + "step": 217475 + }, + { + "epoch": 19.524236983842012, + "grad_norm": 15.480743408203125, + "learning_rate": 8.942629164173151e-06, + "loss": 4.5531, + "step": 217500 + }, + { + "epoch": 19.526481149012568, + "grad_norm": 20.626296997070312, + "learning_rate": 8.942379812487532e-06, + "loss": 4.7409, + "step": 217525 + }, + { + "epoch": 19.528725314183124, + "grad_norm": 18.85663414001465, + "learning_rate": 8.942130460801916e-06, + "loss": 4.4789, + "step": 217550 + }, + { + "epoch": 19.53096947935368, + "grad_norm": 17.70938491821289, + "learning_rate": 8.941881109116298e-06, + "loss": 5.1394, + "step": 217575 + }, + { + "epoch": 19.53321364452424, + "grad_norm": 21.0555419921875, + "learning_rate": 8.94163175743068e-06, + "loss": 4.5693, + "step": 217600 + }, + { + "epoch": 19.535457809694794, + "grad_norm": 18.69247817993164, + "learning_rate": 8.941382405745063e-06, + "loss": 4.6331, + "step": 217625 + }, + { + "epoch": 19.53770197486535, + "grad_norm": 18.96192169189453, + "learning_rate": 8.941133054059447e-06, + "loss": 4.6379, + "step": 217650 + }, + { + "epoch": 19.539946140035905, + "grad_norm": 17.095081329345703, + "learning_rate": 8.94088370237383e-06, + "loss": 4.9036, + "step": 217675 + }, + { + "epoch": 19.542190305206464, + "grad_norm": 19.006816864013672, + "learning_rate": 8.940634350688212e-06, + "loss": 4.4334, + "step": 217700 + }, + { + "epoch": 19.54443447037702, + "grad_norm": 17.512039184570312, + "learning_rate": 8.940384999002594e-06, + "loss": 4.5635, + "step": 217725 + }, + { + "epoch": 19.546678635547575, + "grad_norm": 24.41291618347168, + "learning_rate": 8.940135647316976e-06, + "loss": 4.2818, + "step": 217750 + }, + { + "epoch": 19.54892280071813, + "grad_norm": 15.362828254699707, + "learning_rate": 8.939886295631359e-06, + "loss": 4.6057, + "step": 217775 + }, + { + "epoch": 19.55116696588869, + "grad_norm": 17.37406349182129, + "learning_rate": 8.939636943945743e-06, + "loss": 4.9021, + "step": 217800 + }, + { + "epoch": 19.553411131059246, + "grad_norm": 18.891902923583984, + "learning_rate": 8.939387592260125e-06, + "loss": 4.7753, + "step": 217825 + }, + { + "epoch": 19.5556552962298, + "grad_norm": 16.750810623168945, + "learning_rate": 8.939138240574507e-06, + "loss": 4.4847, + "step": 217850 + }, + { + "epoch": 19.55789946140036, + "grad_norm": 15.103419303894043, + "learning_rate": 8.93888888888889e-06, + "loss": 4.785, + "step": 217875 + }, + { + "epoch": 19.560143626570916, + "grad_norm": 19.368379592895508, + "learning_rate": 8.938639537203272e-06, + "loss": 4.6398, + "step": 217900 + }, + { + "epoch": 19.56238779174147, + "grad_norm": 20.184003829956055, + "learning_rate": 8.938390185517654e-06, + "loss": 4.6564, + "step": 217925 + }, + { + "epoch": 19.564631956912027, + "grad_norm": 18.794292449951172, + "learning_rate": 8.938140833832038e-06, + "loss": 4.5981, + "step": 217950 + }, + { + "epoch": 19.566876122082586, + "grad_norm": 17.248823165893555, + "learning_rate": 8.93789148214642e-06, + "loss": 4.7487, + "step": 217975 + }, + { + "epoch": 19.569120287253142, + "grad_norm": 18.581552505493164, + "learning_rate": 8.937642130460803e-06, + "loss": 5.062, + "step": 218000 + }, + { + "epoch": 19.571364452423698, + "grad_norm": 19.467880249023438, + "learning_rate": 8.937392778775185e-06, + "loss": 4.8387, + "step": 218025 + }, + { + "epoch": 19.573608617594253, + "grad_norm": 19.351600646972656, + "learning_rate": 8.937143427089567e-06, + "loss": 4.8795, + "step": 218050 + }, + { + "epoch": 19.575852782764812, + "grad_norm": 19.641746520996094, + "learning_rate": 8.93689407540395e-06, + "loss": 4.8086, + "step": 218075 + }, + { + "epoch": 19.578096947935368, + "grad_norm": 22.4274845123291, + "learning_rate": 8.936644723718334e-06, + "loss": 4.636, + "step": 218100 + }, + { + "epoch": 19.580341113105924, + "grad_norm": 21.427978515625, + "learning_rate": 8.936395372032716e-06, + "loss": 4.766, + "step": 218125 + }, + { + "epoch": 19.582585278276483, + "grad_norm": 17.650312423706055, + "learning_rate": 8.936146020347098e-06, + "loss": 4.6446, + "step": 218150 + }, + { + "epoch": 19.58482944344704, + "grad_norm": 19.471935272216797, + "learning_rate": 8.93589666866148e-06, + "loss": 4.677, + "step": 218175 + }, + { + "epoch": 19.587073608617594, + "grad_norm": 20.086761474609375, + "learning_rate": 8.935647316975865e-06, + "loss": 4.7913, + "step": 218200 + }, + { + "epoch": 19.58931777378815, + "grad_norm": 17.58665657043457, + "learning_rate": 8.935397965290245e-06, + "loss": 4.4659, + "step": 218225 + }, + { + "epoch": 19.59156193895871, + "grad_norm": 18.36222267150879, + "learning_rate": 8.935148613604628e-06, + "loss": 4.8504, + "step": 218250 + }, + { + "epoch": 19.593806104129264, + "grad_norm": 14.824063301086426, + "learning_rate": 8.934899261919012e-06, + "loss": 4.8266, + "step": 218275 + }, + { + "epoch": 19.59605026929982, + "grad_norm": 14.648045539855957, + "learning_rate": 8.934649910233394e-06, + "loss": 4.745, + "step": 218300 + }, + { + "epoch": 19.598294434470375, + "grad_norm": 17.404632568359375, + "learning_rate": 8.934400558547776e-06, + "loss": 4.6235, + "step": 218325 + }, + { + "epoch": 19.600538599640934, + "grad_norm": 17.145021438598633, + "learning_rate": 8.934151206862159e-06, + "loss": 4.6584, + "step": 218350 + }, + { + "epoch": 19.60278276481149, + "grad_norm": 18.1328182220459, + "learning_rate": 8.933901855176543e-06, + "loss": 4.5959, + "step": 218375 + }, + { + "epoch": 19.605026929982046, + "grad_norm": 17.62437629699707, + "learning_rate": 8.933652503490923e-06, + "loss": 4.7357, + "step": 218400 + }, + { + "epoch": 19.607271095152605, + "grad_norm": 16.76304817199707, + "learning_rate": 8.933403151805307e-06, + "loss": 4.7977, + "step": 218425 + }, + { + "epoch": 19.60951526032316, + "grad_norm": 17.242984771728516, + "learning_rate": 8.93315380011969e-06, + "loss": 4.7724, + "step": 218450 + }, + { + "epoch": 19.611759425493716, + "grad_norm": 20.656963348388672, + "learning_rate": 8.932904448434072e-06, + "loss": 4.5519, + "step": 218475 + }, + { + "epoch": 19.61400359066427, + "grad_norm": 17.186098098754883, + "learning_rate": 8.932655096748454e-06, + "loss": 4.7501, + "step": 218500 + }, + { + "epoch": 19.61624775583483, + "grad_norm": 18.6395206451416, + "learning_rate": 8.932405745062838e-06, + "loss": 4.748, + "step": 218525 + }, + { + "epoch": 19.618491921005386, + "grad_norm": 14.205994606018066, + "learning_rate": 8.93215639337722e-06, + "loss": 4.5672, + "step": 218550 + }, + { + "epoch": 19.620736086175942, + "grad_norm": 22.400100708007812, + "learning_rate": 8.931907041691603e-06, + "loss": 4.6876, + "step": 218575 + }, + { + "epoch": 19.622980251346497, + "grad_norm": 18.755136489868164, + "learning_rate": 8.931657690005985e-06, + "loss": 4.4383, + "step": 218600 + }, + { + "epoch": 19.625224416517057, + "grad_norm": 19.301267623901367, + "learning_rate": 8.931408338320367e-06, + "loss": 4.7052, + "step": 218625 + }, + { + "epoch": 19.627468581687612, + "grad_norm": 20.891000747680664, + "learning_rate": 8.93115898663475e-06, + "loss": 4.8536, + "step": 218650 + }, + { + "epoch": 19.629712746858168, + "grad_norm": 18.89409065246582, + "learning_rate": 8.930909634949134e-06, + "loss": 4.6179, + "step": 218675 + }, + { + "epoch": 19.631956912028727, + "grad_norm": 15.583870887756348, + "learning_rate": 8.930660283263516e-06, + "loss": 4.8323, + "step": 218700 + }, + { + "epoch": 19.634201077199283, + "grad_norm": 20.769052505493164, + "learning_rate": 8.930410931577898e-06, + "loss": 4.6895, + "step": 218725 + }, + { + "epoch": 19.636445242369838, + "grad_norm": 21.002573013305664, + "learning_rate": 8.93016157989228e-06, + "loss": 4.6182, + "step": 218750 + }, + { + "epoch": 19.638689407540394, + "grad_norm": 19.76067352294922, + "learning_rate": 8.929912228206663e-06, + "loss": 4.675, + "step": 218775 + }, + { + "epoch": 19.640933572710953, + "grad_norm": 20.477157592773438, + "learning_rate": 8.929662876521045e-06, + "loss": 4.7467, + "step": 218800 + }, + { + "epoch": 19.64317773788151, + "grad_norm": 15.498779296875, + "learning_rate": 8.92941352483543e-06, + "loss": 4.6165, + "step": 218825 + }, + { + "epoch": 19.645421903052064, + "grad_norm": 19.249549865722656, + "learning_rate": 8.929164173149812e-06, + "loss": 4.8345, + "step": 218850 + }, + { + "epoch": 19.64766606822262, + "grad_norm": 18.81645965576172, + "learning_rate": 8.928914821464194e-06, + "loss": 4.6453, + "step": 218875 + }, + { + "epoch": 19.64991023339318, + "grad_norm": 19.909257888793945, + "learning_rate": 8.928665469778576e-06, + "loss": 4.6334, + "step": 218900 + }, + { + "epoch": 19.652154398563734, + "grad_norm": 19.924827575683594, + "learning_rate": 8.928416118092959e-06, + "loss": 4.5176, + "step": 218925 + }, + { + "epoch": 19.65439856373429, + "grad_norm": 21.115455627441406, + "learning_rate": 8.928166766407341e-06, + "loss": 4.7829, + "step": 218950 + }, + { + "epoch": 19.65664272890485, + "grad_norm": 20.982709884643555, + "learning_rate": 8.927917414721723e-06, + "loss": 4.5749, + "step": 218975 + }, + { + "epoch": 19.658886894075405, + "grad_norm": 18.59268569946289, + "learning_rate": 8.927668063036107e-06, + "loss": 4.7417, + "step": 219000 + }, + { + "epoch": 19.66113105924596, + "grad_norm": 20.485919952392578, + "learning_rate": 8.92741871135049e-06, + "loss": 4.6318, + "step": 219025 + }, + { + "epoch": 19.663375224416516, + "grad_norm": 19.198486328125, + "learning_rate": 8.927169359664872e-06, + "loss": 4.76, + "step": 219050 + }, + { + "epoch": 19.665619389587075, + "grad_norm": 17.840789794921875, + "learning_rate": 8.926920007979254e-06, + "loss": 4.671, + "step": 219075 + }, + { + "epoch": 19.66786355475763, + "grad_norm": 18.21192169189453, + "learning_rate": 8.926670656293638e-06, + "loss": 5.0347, + "step": 219100 + }, + { + "epoch": 19.670107719928186, + "grad_norm": 17.450105667114258, + "learning_rate": 8.926421304608019e-06, + "loss": 4.7497, + "step": 219125 + }, + { + "epoch": 19.67235188509874, + "grad_norm": 16.96944808959961, + "learning_rate": 8.926181926989828e-06, + "loss": 4.7843, + "step": 219150 + }, + { + "epoch": 19.6745960502693, + "grad_norm": 21.180362701416016, + "learning_rate": 8.92593257530421e-06, + "loss": 4.405, + "step": 219175 + }, + { + "epoch": 19.676840215439857, + "grad_norm": 16.905773162841797, + "learning_rate": 8.925683223618592e-06, + "loss": 4.4702, + "step": 219200 + }, + { + "epoch": 19.679084380610412, + "grad_norm": 18.757179260253906, + "learning_rate": 8.925433871932974e-06, + "loss": 4.4976, + "step": 219225 + }, + { + "epoch": 19.68132854578097, + "grad_norm": 18.967304229736328, + "learning_rate": 8.925184520247357e-06, + "loss": 4.5363, + "step": 219250 + }, + { + "epoch": 19.683572710951527, + "grad_norm": 19.33631706237793, + "learning_rate": 8.92493516856174e-06, + "loss": 4.6533, + "step": 219275 + }, + { + "epoch": 19.685816876122082, + "grad_norm": 20.920480728149414, + "learning_rate": 8.924685816876123e-06, + "loss": 4.6247, + "step": 219300 + }, + { + "epoch": 19.688061041292638, + "grad_norm": 18.317630767822266, + "learning_rate": 8.924436465190505e-06, + "loss": 4.5319, + "step": 219325 + }, + { + "epoch": 19.690305206463197, + "grad_norm": 20.143457412719727, + "learning_rate": 8.924187113504888e-06, + "loss": 4.5804, + "step": 219350 + }, + { + "epoch": 19.692549371633753, + "grad_norm": 16.153898239135742, + "learning_rate": 8.92393776181927e-06, + "loss": 4.729, + "step": 219375 + }, + { + "epoch": 19.69479353680431, + "grad_norm": 16.1629581451416, + "learning_rate": 8.923688410133652e-06, + "loss": 4.8146, + "step": 219400 + }, + { + "epoch": 19.697037701974864, + "grad_norm": 17.31422233581543, + "learning_rate": 8.923439058448036e-06, + "loss": 4.5732, + "step": 219425 + }, + { + "epoch": 19.699281867145423, + "grad_norm": 21.712289810180664, + "learning_rate": 8.923189706762419e-06, + "loss": 4.7674, + "step": 219450 + }, + { + "epoch": 19.70152603231598, + "grad_norm": 18.538524627685547, + "learning_rate": 8.922940355076801e-06, + "loss": 4.794, + "step": 219475 + }, + { + "epoch": 19.703770197486534, + "grad_norm": 18.343324661254883, + "learning_rate": 8.922691003391183e-06, + "loss": 4.8487, + "step": 219500 + }, + { + "epoch": 19.706014362657093, + "grad_norm": 17.178247451782227, + "learning_rate": 8.922441651705567e-06, + "loss": 4.727, + "step": 219525 + }, + { + "epoch": 19.70825852782765, + "grad_norm": 19.66762924194336, + "learning_rate": 8.922192300019948e-06, + "loss": 4.8369, + "step": 219550 + }, + { + "epoch": 19.710502692998205, + "grad_norm": 18.99695587158203, + "learning_rate": 8.921942948334332e-06, + "loss": 4.5668, + "step": 219575 + }, + { + "epoch": 19.71274685816876, + "grad_norm": 20.51740264892578, + "learning_rate": 8.921693596648714e-06, + "loss": 4.8809, + "step": 219600 + }, + { + "epoch": 19.71499102333932, + "grad_norm": 21.894775390625, + "learning_rate": 8.921444244963097e-06, + "loss": 4.8931, + "step": 219625 + }, + { + "epoch": 19.717235188509875, + "grad_norm": 21.083070755004883, + "learning_rate": 8.921194893277479e-06, + "loss": 4.9171, + "step": 219650 + }, + { + "epoch": 19.71947935368043, + "grad_norm": 18.68585777282715, + "learning_rate": 8.920945541591863e-06, + "loss": 4.8211, + "step": 219675 + }, + { + "epoch": 19.721723518850986, + "grad_norm": 16.70577621459961, + "learning_rate": 8.920696189906245e-06, + "loss": 4.5252, + "step": 219700 + }, + { + "epoch": 19.723967684021545, + "grad_norm": 18.443580627441406, + "learning_rate": 8.920446838220626e-06, + "loss": 4.8038, + "step": 219725 + }, + { + "epoch": 19.7262118491921, + "grad_norm": 18.203744888305664, + "learning_rate": 8.92019748653501e-06, + "loss": 4.6544, + "step": 219750 + }, + { + "epoch": 19.728456014362656, + "grad_norm": 16.33669662475586, + "learning_rate": 8.919948134849392e-06, + "loss": 4.5265, + "step": 219775 + }, + { + "epoch": 19.730700179533212, + "grad_norm": 17.603788375854492, + "learning_rate": 8.919698783163775e-06, + "loss": 4.8026, + "step": 219800 + }, + { + "epoch": 19.73294434470377, + "grad_norm": 18.24757957458496, + "learning_rate": 8.919449431478159e-06, + "loss": 4.8628, + "step": 219825 + }, + { + "epoch": 19.735188509874327, + "grad_norm": 22.964271545410156, + "learning_rate": 8.91920007979254e-06, + "loss": 4.7144, + "step": 219850 + }, + { + "epoch": 19.737432675044882, + "grad_norm": 18.72841453552246, + "learning_rate": 8.918950728106923e-06, + "loss": 4.7631, + "step": 219875 + }, + { + "epoch": 19.73967684021544, + "grad_norm": 17.291976928710938, + "learning_rate": 8.918701376421305e-06, + "loss": 4.7739, + "step": 219900 + }, + { + "epoch": 19.741921005385997, + "grad_norm": 14.620047569274902, + "learning_rate": 8.918452024735688e-06, + "loss": 4.842, + "step": 219925 + }, + { + "epoch": 19.744165170556553, + "grad_norm": 19.630996704101562, + "learning_rate": 8.91820267305007e-06, + "loss": 4.778, + "step": 219950 + }, + { + "epoch": 19.746409335727108, + "grad_norm": 18.459856033325195, + "learning_rate": 8.917953321364452e-06, + "loss": 4.6658, + "step": 219975 + }, + { + "epoch": 19.748653500897667, + "grad_norm": 17.69123077392578, + "learning_rate": 8.917703969678836e-06, + "loss": 4.6669, + "step": 220000 + }, + { + "epoch": 19.750897666068223, + "grad_norm": 18.79239273071289, + "learning_rate": 8.917454617993219e-06, + "loss": 4.87, + "step": 220025 + }, + { + "epoch": 19.75314183123878, + "grad_norm": 18.798967361450195, + "learning_rate": 8.917205266307601e-06, + "loss": 4.3824, + "step": 220050 + }, + { + "epoch": 19.755385996409334, + "grad_norm": 16.634990692138672, + "learning_rate": 8.916955914621983e-06, + "loss": 4.735, + "step": 220075 + }, + { + "epoch": 19.757630161579893, + "grad_norm": 17.053510665893555, + "learning_rate": 8.916706562936366e-06, + "loss": 4.5938, + "step": 220100 + }, + { + "epoch": 19.75987432675045, + "grad_norm": 18.59354591369629, + "learning_rate": 8.916457211250748e-06, + "loss": 4.8505, + "step": 220125 + }, + { + "epoch": 19.762118491921004, + "grad_norm": 17.27494239807129, + "learning_rate": 8.916207859565132e-06, + "loss": 5.0153, + "step": 220150 + }, + { + "epoch": 19.764362657091564, + "grad_norm": 16.149660110473633, + "learning_rate": 8.915958507879514e-06, + "loss": 4.5692, + "step": 220175 + }, + { + "epoch": 19.76660682226212, + "grad_norm": 18.9099063873291, + "learning_rate": 8.915709156193897e-06, + "loss": 4.9203, + "step": 220200 + }, + { + "epoch": 19.768850987432675, + "grad_norm": 18.27613639831543, + "learning_rate": 8.915459804508279e-06, + "loss": 4.5348, + "step": 220225 + }, + { + "epoch": 19.77109515260323, + "grad_norm": 19.308958053588867, + "learning_rate": 8.915210452822661e-06, + "loss": 4.9068, + "step": 220250 + }, + { + "epoch": 19.77333931777379, + "grad_norm": 20.049060821533203, + "learning_rate": 8.914961101137044e-06, + "loss": 4.6533, + "step": 220275 + }, + { + "epoch": 19.775583482944345, + "grad_norm": 15.756067276000977, + "learning_rate": 8.914711749451428e-06, + "loss": 4.5303, + "step": 220300 + }, + { + "epoch": 19.7778276481149, + "grad_norm": 18.413150787353516, + "learning_rate": 8.91446239776581e-06, + "loss": 4.7395, + "step": 220325 + }, + { + "epoch": 19.780071813285456, + "grad_norm": 17.462190628051758, + "learning_rate": 8.914213046080192e-06, + "loss": 4.6681, + "step": 220350 + }, + { + "epoch": 19.782315978456015, + "grad_norm": 18.951189041137695, + "learning_rate": 8.913963694394575e-06, + "loss": 4.7632, + "step": 220375 + }, + { + "epoch": 19.78456014362657, + "grad_norm": 22.38157081604004, + "learning_rate": 8.913714342708959e-06, + "loss": 4.6253, + "step": 220400 + }, + { + "epoch": 19.786804308797127, + "grad_norm": 17.37641143798828, + "learning_rate": 8.91346499102334e-06, + "loss": 4.8708, + "step": 220425 + }, + { + "epoch": 19.789048473967686, + "grad_norm": 20.711523056030273, + "learning_rate": 8.913215639337721e-06, + "loss": 4.8042, + "step": 220450 + }, + { + "epoch": 19.79129263913824, + "grad_norm": 17.608213424682617, + "learning_rate": 8.912966287652106e-06, + "loss": 5.0272, + "step": 220475 + }, + { + "epoch": 19.793536804308797, + "grad_norm": 17.35223960876465, + "learning_rate": 8.912716935966488e-06, + "loss": 4.6328, + "step": 220500 + }, + { + "epoch": 19.795780969479353, + "grad_norm": 17.06209945678711, + "learning_rate": 8.91246758428087e-06, + "loss": 4.8855, + "step": 220525 + }, + { + "epoch": 19.79802513464991, + "grad_norm": 21.57456398010254, + "learning_rate": 8.912218232595254e-06, + "loss": 4.8412, + "step": 220550 + }, + { + "epoch": 19.800269299820467, + "grad_norm": 18.16551971435547, + "learning_rate": 8.911968880909636e-06, + "loss": 4.8683, + "step": 220575 + }, + { + "epoch": 19.802513464991023, + "grad_norm": 15.874252319335938, + "learning_rate": 8.911719529224017e-06, + "loss": 4.5291, + "step": 220600 + }, + { + "epoch": 19.80475763016158, + "grad_norm": 19.84747886657715, + "learning_rate": 8.911470177538401e-06, + "loss": 4.668, + "step": 220625 + }, + { + "epoch": 19.807001795332138, + "grad_norm": 17.161678314208984, + "learning_rate": 8.911220825852783e-06, + "loss": 4.7369, + "step": 220650 + }, + { + "epoch": 19.809245960502693, + "grad_norm": 18.742929458618164, + "learning_rate": 8.910971474167166e-06, + "loss": 4.7368, + "step": 220675 + }, + { + "epoch": 19.81149012567325, + "grad_norm": 20.07526397705078, + "learning_rate": 8.910722122481548e-06, + "loss": 4.794, + "step": 220700 + }, + { + "epoch": 19.813734290843804, + "grad_norm": 19.84645652770996, + "learning_rate": 8.910472770795932e-06, + "loss": 4.6588, + "step": 220725 + }, + { + "epoch": 19.815978456014363, + "grad_norm": 19.90781593322754, + "learning_rate": 8.910223419110314e-06, + "loss": 4.6714, + "step": 220750 + }, + { + "epoch": 19.81822262118492, + "grad_norm": 19.170869827270508, + "learning_rate": 8.909974067424697e-06, + "loss": 4.8072, + "step": 220775 + }, + { + "epoch": 19.820466786355475, + "grad_norm": 16.912765502929688, + "learning_rate": 8.909724715739079e-06, + "loss": 4.8449, + "step": 220800 + }, + { + "epoch": 19.822710951526034, + "grad_norm": 16.084680557250977, + "learning_rate": 8.909475364053461e-06, + "loss": 4.8269, + "step": 220825 + }, + { + "epoch": 19.82495511669659, + "grad_norm": 18.610679626464844, + "learning_rate": 8.909226012367844e-06, + "loss": 4.7887, + "step": 220850 + }, + { + "epoch": 19.827199281867145, + "grad_norm": 23.99001693725586, + "learning_rate": 8.908976660682228e-06, + "loss": 4.6071, + "step": 220875 + }, + { + "epoch": 19.8294434470377, + "grad_norm": 19.1395320892334, + "learning_rate": 8.90872730899661e-06, + "loss": 4.787, + "step": 220900 + }, + { + "epoch": 19.83168761220826, + "grad_norm": 22.283977508544922, + "learning_rate": 8.908477957310992e-06, + "loss": 4.9387, + "step": 220925 + }, + { + "epoch": 19.833931777378815, + "grad_norm": 22.084354400634766, + "learning_rate": 8.908228605625375e-06, + "loss": 5.0833, + "step": 220950 + }, + { + "epoch": 19.83617594254937, + "grad_norm": 16.40814781188965, + "learning_rate": 8.907979253939757e-06, + "loss": 4.8566, + "step": 220975 + }, + { + "epoch": 19.838420107719926, + "grad_norm": 17.110551834106445, + "learning_rate": 8.90772990225414e-06, + "loss": 4.8422, + "step": 221000 + }, + { + "epoch": 19.840664272890486, + "grad_norm": 17.37224769592285, + "learning_rate": 8.907480550568523e-06, + "loss": 4.9328, + "step": 221025 + }, + { + "epoch": 19.84290843806104, + "grad_norm": 16.970722198486328, + "learning_rate": 8.907231198882906e-06, + "loss": 4.6897, + "step": 221050 + }, + { + "epoch": 19.845152603231597, + "grad_norm": 18.948013305664062, + "learning_rate": 8.906981847197288e-06, + "loss": 4.5965, + "step": 221075 + }, + { + "epoch": 19.847396768402156, + "grad_norm": 17.796049118041992, + "learning_rate": 8.90673249551167e-06, + "loss": 4.8296, + "step": 221100 + }, + { + "epoch": 19.84964093357271, + "grad_norm": 16.41073989868164, + "learning_rate": 8.906483143826054e-06, + "loss": 4.7499, + "step": 221125 + }, + { + "epoch": 19.851885098743267, + "grad_norm": 22.707326889038086, + "learning_rate": 8.906233792140435e-06, + "loss": 4.5972, + "step": 221150 + }, + { + "epoch": 19.854129263913823, + "grad_norm": 17.541528701782227, + "learning_rate": 8.905984440454817e-06, + "loss": 4.9276, + "step": 221175 + }, + { + "epoch": 19.856373429084382, + "grad_norm": 17.122610092163086, + "learning_rate": 8.905735088769201e-06, + "loss": 4.6401, + "step": 221200 + }, + { + "epoch": 19.858617594254937, + "grad_norm": 19.345745086669922, + "learning_rate": 8.905485737083583e-06, + "loss": 4.7954, + "step": 221225 + }, + { + "epoch": 19.860861759425493, + "grad_norm": 20.47861099243164, + "learning_rate": 8.905236385397966e-06, + "loss": 4.7374, + "step": 221250 + }, + { + "epoch": 19.86310592459605, + "grad_norm": 14.49191665649414, + "learning_rate": 8.90498703371235e-06, + "loss": 4.9384, + "step": 221275 + }, + { + "epoch": 19.865350089766608, + "grad_norm": 18.938461303710938, + "learning_rate": 8.904737682026732e-06, + "loss": 4.5893, + "step": 221300 + }, + { + "epoch": 19.867594254937163, + "grad_norm": 21.3725643157959, + "learning_rate": 8.904488330341113e-06, + "loss": 4.5458, + "step": 221325 + }, + { + "epoch": 19.86983842010772, + "grad_norm": 17.196096420288086, + "learning_rate": 8.904238978655497e-06, + "loss": 4.675, + "step": 221350 + }, + { + "epoch": 19.872082585278278, + "grad_norm": 18.098731994628906, + "learning_rate": 8.903989626969879e-06, + "loss": 5.0589, + "step": 221375 + }, + { + "epoch": 19.874326750448834, + "grad_norm": 16.52849006652832, + "learning_rate": 8.903740275284261e-06, + "loss": 4.9963, + "step": 221400 + }, + { + "epoch": 19.87657091561939, + "grad_norm": 21.03007698059082, + "learning_rate": 8.903490923598644e-06, + "loss": 4.6895, + "step": 221425 + }, + { + "epoch": 19.878815080789945, + "grad_norm": 19.855966567993164, + "learning_rate": 8.903241571913028e-06, + "loss": 4.9111, + "step": 221450 + }, + { + "epoch": 19.881059245960504, + "grad_norm": 17.57882308959961, + "learning_rate": 8.90299222022741e-06, + "loss": 4.6419, + "step": 221475 + }, + { + "epoch": 19.88330341113106, + "grad_norm": 18.77579116821289, + "learning_rate": 8.902742868541792e-06, + "loss": 4.7682, + "step": 221500 + }, + { + "epoch": 19.885547576301615, + "grad_norm": 18.351638793945312, + "learning_rate": 8.902493516856175e-06, + "loss": 5.0267, + "step": 221525 + }, + { + "epoch": 19.88779174147217, + "grad_norm": 18.849933624267578, + "learning_rate": 8.902244165170557e-06, + "loss": 4.8777, + "step": 221550 + }, + { + "epoch": 19.89003590664273, + "grad_norm": 18.203811645507812, + "learning_rate": 8.90199481348494e-06, + "loss": 5.1574, + "step": 221575 + }, + { + "epoch": 19.892280071813286, + "grad_norm": 19.762344360351562, + "learning_rate": 8.901745461799323e-06, + "loss": 4.8112, + "step": 221600 + }, + { + "epoch": 19.89452423698384, + "grad_norm": 19.792070388793945, + "learning_rate": 8.90150608418113e-06, + "loss": 4.6191, + "step": 221625 + }, + { + "epoch": 19.8967684021544, + "grad_norm": 17.669445037841797, + "learning_rate": 8.901256732495513e-06, + "loss": 4.5553, + "step": 221650 + }, + { + "epoch": 19.899012567324956, + "grad_norm": 16.85118865966797, + "learning_rate": 8.901007380809895e-06, + "loss": 4.7369, + "step": 221675 + }, + { + "epoch": 19.90125673249551, + "grad_norm": 19.79808807373047, + "learning_rate": 8.900758029124277e-06, + "loss": 4.5556, + "step": 221700 + }, + { + "epoch": 19.903500897666067, + "grad_norm": 16.6353816986084, + "learning_rate": 8.900508677438661e-06, + "loss": 4.8439, + "step": 221725 + }, + { + "epoch": 19.905745062836626, + "grad_norm": 16.73040771484375, + "learning_rate": 8.900259325753042e-06, + "loss": 4.6309, + "step": 221750 + }, + { + "epoch": 19.90798922800718, + "grad_norm": 18.293148040771484, + "learning_rate": 8.900009974067426e-06, + "loss": 4.6445, + "step": 221775 + }, + { + "epoch": 19.910233393177737, + "grad_norm": 15.553557395935059, + "learning_rate": 8.899760622381808e-06, + "loss": 4.7388, + "step": 221800 + }, + { + "epoch": 19.912477558348293, + "grad_norm": 21.111473083496094, + "learning_rate": 8.89951127069619e-06, + "loss": 4.8762, + "step": 221825 + }, + { + "epoch": 19.914721723518852, + "grad_norm": 18.132797241210938, + "learning_rate": 8.899261919010573e-06, + "loss": 4.6658, + "step": 221850 + }, + { + "epoch": 19.916965888689408, + "grad_norm": 17.797956466674805, + "learning_rate": 8.899012567324957e-06, + "loss": 4.8188, + "step": 221875 + }, + { + "epoch": 19.919210053859963, + "grad_norm": 17.90738868713379, + "learning_rate": 8.898763215639339e-06, + "loss": 5.1195, + "step": 221900 + }, + { + "epoch": 19.921454219030522, + "grad_norm": 17.808626174926758, + "learning_rate": 8.89851386395372e-06, + "loss": 4.7434, + "step": 221925 + }, + { + "epoch": 19.923698384201078, + "grad_norm": 21.444557189941406, + "learning_rate": 8.898264512268104e-06, + "loss": 4.7621, + "step": 221950 + }, + { + "epoch": 19.925942549371634, + "grad_norm": 17.46640396118164, + "learning_rate": 8.898015160582486e-06, + "loss": 4.4255, + "step": 221975 + }, + { + "epoch": 19.92818671454219, + "grad_norm": 17.577531814575195, + "learning_rate": 8.897765808896868e-06, + "loss": 4.8546, + "step": 222000 + }, + { + "epoch": 19.93043087971275, + "grad_norm": 16.6295108795166, + "learning_rate": 8.897516457211252e-06, + "loss": 4.7923, + "step": 222025 + }, + { + "epoch": 19.932675044883304, + "grad_norm": 20.748638153076172, + "learning_rate": 8.897267105525635e-06, + "loss": 4.6018, + "step": 222050 + }, + { + "epoch": 19.93491921005386, + "grad_norm": 24.688554763793945, + "learning_rate": 8.897017753840017e-06, + "loss": 4.9279, + "step": 222075 + }, + { + "epoch": 19.937163375224415, + "grad_norm": 19.249919891357422, + "learning_rate": 8.8967684021544e-06, + "loss": 4.6753, + "step": 222100 + }, + { + "epoch": 19.939407540394974, + "grad_norm": 19.451641082763672, + "learning_rate": 8.896519050468782e-06, + "loss": 4.8876, + "step": 222125 + }, + { + "epoch": 19.94165170556553, + "grad_norm": 15.517289161682129, + "learning_rate": 8.896269698783164e-06, + "loss": 4.9066, + "step": 222150 + }, + { + "epoch": 19.943895870736085, + "grad_norm": 21.104045867919922, + "learning_rate": 8.896020347097546e-06, + "loss": 4.8238, + "step": 222175 + }, + { + "epoch": 19.946140035906645, + "grad_norm": 15.65658950805664, + "learning_rate": 8.89577099541193e-06, + "loss": 4.7219, + "step": 222200 + }, + { + "epoch": 19.9483842010772, + "grad_norm": 18.738174438476562, + "learning_rate": 8.895521643726313e-06, + "loss": 4.8532, + "step": 222225 + }, + { + "epoch": 19.950628366247756, + "grad_norm": 17.349151611328125, + "learning_rate": 8.895272292040695e-06, + "loss": 4.8573, + "step": 222250 + }, + { + "epoch": 19.95287253141831, + "grad_norm": 20.9176025390625, + "learning_rate": 8.895022940355077e-06, + "loss": 4.6283, + "step": 222275 + }, + { + "epoch": 19.95511669658887, + "grad_norm": 18.297977447509766, + "learning_rate": 8.89477358866946e-06, + "loss": 4.8084, + "step": 222300 + }, + { + "epoch": 19.957360861759426, + "grad_norm": 17.565690994262695, + "learning_rate": 8.894524236983842e-06, + "loss": 4.9309, + "step": 222325 + }, + { + "epoch": 19.95960502692998, + "grad_norm": 17.951194763183594, + "learning_rate": 8.894274885298226e-06, + "loss": 4.6601, + "step": 222350 + }, + { + "epoch": 19.961849192100537, + "grad_norm": 17.11066436767578, + "learning_rate": 8.894025533612608e-06, + "loss": 4.9363, + "step": 222375 + }, + { + "epoch": 19.964093357271096, + "grad_norm": 19.568674087524414, + "learning_rate": 8.89377618192699e-06, + "loss": 4.6351, + "step": 222400 + }, + { + "epoch": 19.966337522441652, + "grad_norm": 17.55156707763672, + "learning_rate": 8.893526830241373e-06, + "loss": 4.7658, + "step": 222425 + }, + { + "epoch": 19.968581687612208, + "grad_norm": 18.980308532714844, + "learning_rate": 8.893277478555757e-06, + "loss": 4.8652, + "step": 222450 + }, + { + "epoch": 19.970825852782763, + "grad_norm": 17.03390121459961, + "learning_rate": 8.893028126870137e-06, + "loss": 4.8822, + "step": 222475 + }, + { + "epoch": 19.973070017953322, + "grad_norm": 18.51448631286621, + "learning_rate": 8.892778775184521e-06, + "loss": 4.5605, + "step": 222500 + }, + { + "epoch": 19.975314183123878, + "grad_norm": 18.306289672851562, + "learning_rate": 8.892529423498904e-06, + "loss": 4.5639, + "step": 222525 + }, + { + "epoch": 19.977558348294433, + "grad_norm": 16.97557830810547, + "learning_rate": 8.892280071813286e-06, + "loss": 4.9584, + "step": 222550 + }, + { + "epoch": 19.979802513464993, + "grad_norm": 16.82259750366211, + "learning_rate": 8.892030720127668e-06, + "loss": 4.7587, + "step": 222575 + }, + { + "epoch": 19.982046678635548, + "grad_norm": 16.6539306640625, + "learning_rate": 8.891781368442052e-06, + "loss": 4.7685, + "step": 222600 + }, + { + "epoch": 19.984290843806104, + "grad_norm": 17.94995880126953, + "learning_rate": 8.891532016756435e-06, + "loss": 4.8515, + "step": 222625 + }, + { + "epoch": 19.98653500897666, + "grad_norm": 22.79832649230957, + "learning_rate": 8.891282665070815e-06, + "loss": 4.8666, + "step": 222650 + }, + { + "epoch": 19.98877917414722, + "grad_norm": 18.787574768066406, + "learning_rate": 8.8910333133852e-06, + "loss": 4.7883, + "step": 222675 + }, + { + "epoch": 19.991023339317774, + "grad_norm": 17.22242546081543, + "learning_rate": 8.890783961699582e-06, + "loss": 4.8921, + "step": 222700 + }, + { + "epoch": 19.99326750448833, + "grad_norm": 17.14498519897461, + "learning_rate": 8.890534610013964e-06, + "loss": 4.7181, + "step": 222725 + }, + { + "epoch": 19.995511669658885, + "grad_norm": 17.172866821289062, + "learning_rate": 8.890285258328348e-06, + "loss": 4.5028, + "step": 222750 + }, + { + "epoch": 19.997755834829444, + "grad_norm": 18.016925811767578, + "learning_rate": 8.89003590664273e-06, + "loss": 4.7143, + "step": 222775 + }, + { + "epoch": 20.0, + "grad_norm": 22.986465454101562, + "learning_rate": 8.889786554957113e-06, + "loss": 4.712, + "step": 222800 + }, + { + "epoch": 20.0, + "eval_accuracy": 0.06703835946002712, + "eval_f1_macro": 0.008964057838724499, + "eval_f1_micro": 0.06703835946002712, + "eval_f1_weighted": 0.0411660182150295, + "eval_loss": 6.828232765197754, + "eval_precision_macro": 0.007902388604657976, + "eval_precision_micro": 0.06703835946002712, + "eval_precision_weighted": 0.03346290538969182, + "eval_recall_macro": 0.014261684497546695, + "eval_recall_micro": 0.06703835946002712, + "eval_recall_weighted": 0.06703835946002712, + "eval_runtime": 129.2153, + "eval_samples_per_second": 405.316, + "eval_steps_per_second": 12.669, + "step": 222800 + }, + { + "epoch": 20.002244165170556, + "grad_norm": 16.097557067871094, + "learning_rate": 8.889537203271495e-06, + "loss": 4.3309, + "step": 222825 + }, + { + "epoch": 20.004488330341115, + "grad_norm": 20.40545654296875, + "learning_rate": 8.889287851585877e-06, + "loss": 4.5137, + "step": 222850 + }, + { + "epoch": 20.00673249551167, + "grad_norm": 18.80745506286621, + "learning_rate": 8.88903849990026e-06, + "loss": 4.3738, + "step": 222875 + }, + { + "epoch": 20.008976660682226, + "grad_norm": 21.115392684936523, + "learning_rate": 8.888789148214642e-06, + "loss": 4.2961, + "step": 222900 + }, + { + "epoch": 20.01122082585278, + "grad_norm": 15.9141845703125, + "learning_rate": 8.888539796529026e-06, + "loss": 4.3895, + "step": 222925 + }, + { + "epoch": 20.01346499102334, + "grad_norm": 19.353347778320312, + "learning_rate": 8.888290444843408e-06, + "loss": 4.1316, + "step": 222950 + }, + { + "epoch": 20.015709156193896, + "grad_norm": 15.410467147827148, + "learning_rate": 8.88804109315779e-06, + "loss": 4.4656, + "step": 222975 + }, + { + "epoch": 20.017953321364452, + "grad_norm": 19.559574127197266, + "learning_rate": 8.887791741472173e-06, + "loss": 4.3872, + "step": 223000 + }, + { + "epoch": 20.020197486535007, + "grad_norm": 16.891706466674805, + "learning_rate": 8.887542389786555e-06, + "loss": 4.4524, + "step": 223025 + }, + { + "epoch": 20.022441651705567, + "grad_norm": 16.633573532104492, + "learning_rate": 8.887293038100937e-06, + "loss": 4.4431, + "step": 223050 + }, + { + "epoch": 20.024685816876122, + "grad_norm": 18.666940689086914, + "learning_rate": 8.887043686415322e-06, + "loss": 4.4582, + "step": 223075 + }, + { + "epoch": 20.026929982046678, + "grad_norm": 19.094669342041016, + "learning_rate": 8.886794334729704e-06, + "loss": 4.3238, + "step": 223100 + }, + { + "epoch": 20.029174147217237, + "grad_norm": 17.125925064086914, + "learning_rate": 8.886544983044086e-06, + "loss": 4.3503, + "step": 223125 + }, + { + "epoch": 20.031418312387792, + "grad_norm": 17.562562942504883, + "learning_rate": 8.886295631358468e-06, + "loss": 4.423, + "step": 223150 + }, + { + "epoch": 20.033662477558348, + "grad_norm": 18.73354148864746, + "learning_rate": 8.88604627967285e-06, + "loss": 4.5077, + "step": 223175 + }, + { + "epoch": 20.035906642728904, + "grad_norm": 19.349376678466797, + "learning_rate": 8.885796927987233e-06, + "loss": 4.5224, + "step": 223200 + }, + { + "epoch": 20.038150807899463, + "grad_norm": 16.747772216796875, + "learning_rate": 8.885547576301617e-06, + "loss": 4.4789, + "step": 223225 + }, + { + "epoch": 20.04039497307002, + "grad_norm": 19.136262893676758, + "learning_rate": 8.885298224616e-06, + "loss": 4.3911, + "step": 223250 + }, + { + "epoch": 20.042639138240574, + "grad_norm": 15.183823585510254, + "learning_rate": 8.885048872930382e-06, + "loss": 4.2319, + "step": 223275 + }, + { + "epoch": 20.04488330341113, + "grad_norm": 17.277324676513672, + "learning_rate": 8.884799521244764e-06, + "loss": 4.3766, + "step": 223300 + }, + { + "epoch": 20.04712746858169, + "grad_norm": 19.41237449645996, + "learning_rate": 8.884550169559148e-06, + "loss": 4.4969, + "step": 223325 + }, + { + "epoch": 20.049371633752244, + "grad_norm": 19.366601943969727, + "learning_rate": 8.884300817873529e-06, + "loss": 4.5857, + "step": 223350 + }, + { + "epoch": 20.0516157989228, + "grad_norm": 17.097023010253906, + "learning_rate": 8.884051466187911e-06, + "loss": 4.3718, + "step": 223375 + }, + { + "epoch": 20.05385996409336, + "grad_norm": 18.918489456176758, + "learning_rate": 8.883802114502295e-06, + "loss": 4.4481, + "step": 223400 + }, + { + "epoch": 20.056104129263915, + "grad_norm": 17.193553924560547, + "learning_rate": 8.883552762816677e-06, + "loss": 4.277, + "step": 223425 + }, + { + "epoch": 20.05834829443447, + "grad_norm": 16.65245819091797, + "learning_rate": 8.88330341113106e-06, + "loss": 4.5491, + "step": 223450 + }, + { + "epoch": 20.060592459605026, + "grad_norm": 17.36224937438965, + "learning_rate": 8.883054059445444e-06, + "loss": 4.4388, + "step": 223475 + }, + { + "epoch": 20.062836624775585, + "grad_norm": 19.71054458618164, + "learning_rate": 8.882804707759826e-06, + "loss": 4.4304, + "step": 223500 + }, + { + "epoch": 20.06508078994614, + "grad_norm": 21.74962043762207, + "learning_rate": 8.882555356074207e-06, + "loss": 4.4706, + "step": 223525 + }, + { + "epoch": 20.067324955116696, + "grad_norm": 18.534954071044922, + "learning_rate": 8.88230600438859e-06, + "loss": 4.3121, + "step": 223550 + }, + { + "epoch": 20.06956912028725, + "grad_norm": 16.694303512573242, + "learning_rate": 8.882056652702973e-06, + "loss": 4.6054, + "step": 223575 + }, + { + "epoch": 20.07181328545781, + "grad_norm": 17.27090072631836, + "learning_rate": 8.881807301017355e-06, + "loss": 4.437, + "step": 223600 + }, + { + "epoch": 20.074057450628366, + "grad_norm": 18.12067985534668, + "learning_rate": 8.881557949331738e-06, + "loss": 4.6007, + "step": 223625 + }, + { + "epoch": 20.076301615798922, + "grad_norm": 19.158052444458008, + "learning_rate": 8.881308597646122e-06, + "loss": 4.3957, + "step": 223650 + }, + { + "epoch": 20.078545780969478, + "grad_norm": 16.761825561523438, + "learning_rate": 8.881059245960504e-06, + "loss": 4.3607, + "step": 223675 + }, + { + "epoch": 20.080789946140037, + "grad_norm": 20.01456642150879, + "learning_rate": 8.880809894274886e-06, + "loss": 4.6135, + "step": 223700 + }, + { + "epoch": 20.083034111310592, + "grad_norm": 20.490055084228516, + "learning_rate": 8.880560542589268e-06, + "loss": 4.7445, + "step": 223725 + }, + { + "epoch": 20.085278276481148, + "grad_norm": 18.573429107666016, + "learning_rate": 8.88031119090365e-06, + "loss": 4.4802, + "step": 223750 + }, + { + "epoch": 20.087522441651707, + "grad_norm": 20.196311950683594, + "learning_rate": 8.880061839218033e-06, + "loss": 4.5788, + "step": 223775 + }, + { + "epoch": 20.089766606822263, + "grad_norm": 21.606264114379883, + "learning_rate": 8.879812487532417e-06, + "loss": 4.4752, + "step": 223800 + }, + { + "epoch": 20.09201077199282, + "grad_norm": 20.208663940429688, + "learning_rate": 8.8795631358468e-06, + "loss": 4.293, + "step": 223825 + }, + { + "epoch": 20.094254937163374, + "grad_norm": 20.36569595336914, + "learning_rate": 8.879313784161182e-06, + "loss": 4.4031, + "step": 223850 + }, + { + "epoch": 20.096499102333933, + "grad_norm": 18.036848068237305, + "learning_rate": 8.879064432475564e-06, + "loss": 4.3664, + "step": 223875 + }, + { + "epoch": 20.09874326750449, + "grad_norm": 19.15073585510254, + "learning_rate": 8.878825054857371e-06, + "loss": 4.4201, + "step": 223900 + }, + { + "epoch": 20.100987432675044, + "grad_norm": 19.95070457458496, + "learning_rate": 8.878575703171755e-06, + "loss": 4.5869, + "step": 223925 + }, + { + "epoch": 20.1032315978456, + "grad_norm": 16.547292709350586, + "learning_rate": 8.878326351486137e-06, + "loss": 4.7485, + "step": 223950 + }, + { + "epoch": 20.10547576301616, + "grad_norm": 17.893693923950195, + "learning_rate": 8.87807699980052e-06, + "loss": 4.3678, + "step": 223975 + }, + { + "epoch": 20.107719928186714, + "grad_norm": 21.426795959472656, + "learning_rate": 8.877827648114902e-06, + "loss": 4.3696, + "step": 224000 + }, + { + "epoch": 20.10996409335727, + "grad_norm": 15.459090232849121, + "learning_rate": 8.877578296429284e-06, + "loss": 4.2679, + "step": 224025 + }, + { + "epoch": 20.11220825852783, + "grad_norm": 17.03648567199707, + "learning_rate": 8.877328944743667e-06, + "loss": 4.6301, + "step": 224050 + }, + { + "epoch": 20.114452423698385, + "grad_norm": 17.199880599975586, + "learning_rate": 8.87707959305805e-06, + "loss": 4.3153, + "step": 224075 + }, + { + "epoch": 20.11669658886894, + "grad_norm": 21.95600700378418, + "learning_rate": 8.876830241372433e-06, + "loss": 4.5587, + "step": 224100 + }, + { + "epoch": 20.118940754039496, + "grad_norm": 19.074033737182617, + "learning_rate": 8.876580889686815e-06, + "loss": 4.5202, + "step": 224125 + }, + { + "epoch": 20.121184919210055, + "grad_norm": 21.41193962097168, + "learning_rate": 8.876331538001198e-06, + "loss": 4.7287, + "step": 224150 + }, + { + "epoch": 20.12342908438061, + "grad_norm": 19.04717254638672, + "learning_rate": 8.87608218631558e-06, + "loss": 4.3994, + "step": 224175 + }, + { + "epoch": 20.125673249551166, + "grad_norm": 22.842178344726562, + "learning_rate": 8.875832834629962e-06, + "loss": 4.5058, + "step": 224200 + }, + { + "epoch": 20.127917414721722, + "grad_norm": 15.706796646118164, + "learning_rate": 8.875583482944346e-06, + "loss": 4.4288, + "step": 224225 + }, + { + "epoch": 20.13016157989228, + "grad_norm": 17.970136642456055, + "learning_rate": 8.875334131258729e-06, + "loss": 4.4101, + "step": 224250 + }, + { + "epoch": 20.132405745062837, + "grad_norm": 16.476099014282227, + "learning_rate": 8.875084779573111e-06, + "loss": 4.4191, + "step": 224275 + }, + { + "epoch": 20.134649910233392, + "grad_norm": 18.968448638916016, + "learning_rate": 8.874835427887493e-06, + "loss": 4.5951, + "step": 224300 + }, + { + "epoch": 20.13689407540395, + "grad_norm": 19.536670684814453, + "learning_rate": 8.874586076201875e-06, + "loss": 4.5444, + "step": 224325 + }, + { + "epoch": 20.139138240574507, + "grad_norm": 16.226715087890625, + "learning_rate": 8.874336724516258e-06, + "loss": 4.3525, + "step": 224350 + }, + { + "epoch": 20.141382405745063, + "grad_norm": 15.805954933166504, + "learning_rate": 8.87408737283064e-06, + "loss": 4.3164, + "step": 224375 + }, + { + "epoch": 20.143626570915618, + "grad_norm": 19.99773406982422, + "learning_rate": 8.873838021145024e-06, + "loss": 4.6216, + "step": 224400 + }, + { + "epoch": 20.145870736086177, + "grad_norm": 17.339237213134766, + "learning_rate": 8.873588669459406e-06, + "loss": 4.5302, + "step": 224425 + }, + { + "epoch": 20.148114901256733, + "grad_norm": 16.569894790649414, + "learning_rate": 8.873339317773789e-06, + "loss": 4.6976, + "step": 224450 + }, + { + "epoch": 20.15035906642729, + "grad_norm": 16.954238891601562, + "learning_rate": 8.873089966088173e-06, + "loss": 4.5352, + "step": 224475 + }, + { + "epoch": 20.152603231597844, + "grad_norm": 18.32302474975586, + "learning_rate": 8.872840614402553e-06, + "loss": 4.497, + "step": 224500 + }, + { + "epoch": 20.154847396768403, + "grad_norm": 19.275583267211914, + "learning_rate": 8.872591262716936e-06, + "loss": 4.4798, + "step": 224525 + }, + { + "epoch": 20.15709156193896, + "grad_norm": 16.445701599121094, + "learning_rate": 8.87234191103132e-06, + "loss": 4.3357, + "step": 224550 + }, + { + "epoch": 20.159335727109514, + "grad_norm": 17.4966983795166, + "learning_rate": 8.872092559345702e-06, + "loss": 4.3279, + "step": 224575 + }, + { + "epoch": 20.161579892280074, + "grad_norm": 19.796085357666016, + "learning_rate": 8.871843207660084e-06, + "loss": 4.3927, + "step": 224600 + }, + { + "epoch": 20.16382405745063, + "grad_norm": 18.511621475219727, + "learning_rate": 8.871593855974467e-06, + "loss": 4.3284, + "step": 224625 + }, + { + "epoch": 20.166068222621185, + "grad_norm": 18.124982833862305, + "learning_rate": 8.87134450428885e-06, + "loss": 4.9396, + "step": 224650 + }, + { + "epoch": 20.16831238779174, + "grad_norm": 20.432308197021484, + "learning_rate": 8.871095152603231e-06, + "loss": 4.0364, + "step": 224675 + }, + { + "epoch": 20.1705565529623, + "grad_norm": 21.78136444091797, + "learning_rate": 8.870845800917615e-06, + "loss": 4.4396, + "step": 224700 + }, + { + "epoch": 20.172800718132855, + "grad_norm": 20.08563804626465, + "learning_rate": 8.870596449231998e-06, + "loss": 4.3934, + "step": 224725 + }, + { + "epoch": 20.17504488330341, + "grad_norm": 20.717426300048828, + "learning_rate": 8.87034709754638e-06, + "loss": 4.6746, + "step": 224750 + }, + { + "epoch": 20.177289048473966, + "grad_norm": 18.351314544677734, + "learning_rate": 8.870097745860762e-06, + "loss": 4.3497, + "step": 224775 + }, + { + "epoch": 20.179533213644525, + "grad_norm": 17.0410099029541, + "learning_rate": 8.869848394175146e-06, + "loss": 4.347, + "step": 224800 + }, + { + "epoch": 20.18177737881508, + "grad_norm": 17.601728439331055, + "learning_rate": 8.869599042489529e-06, + "loss": 4.5383, + "step": 224825 + }, + { + "epoch": 20.184021543985637, + "grad_norm": 19.541854858398438, + "learning_rate": 8.869349690803911e-06, + "loss": 4.6826, + "step": 224850 + }, + { + "epoch": 20.186265709156196, + "grad_norm": 22.4276065826416, + "learning_rate": 8.869100339118293e-06, + "loss": 4.504, + "step": 224875 + }, + { + "epoch": 20.18850987432675, + "grad_norm": 20.441293716430664, + "learning_rate": 8.868850987432676e-06, + "loss": 4.6625, + "step": 224900 + }, + { + "epoch": 20.190754039497307, + "grad_norm": 16.60894203186035, + "learning_rate": 8.868601635747058e-06, + "loss": 4.4642, + "step": 224925 + }, + { + "epoch": 20.192998204667862, + "grad_norm": 18.952587127685547, + "learning_rate": 8.868352284061442e-06, + "loss": 4.5757, + "step": 224950 + }, + { + "epoch": 20.19524236983842, + "grad_norm": 18.11183738708496, + "learning_rate": 8.868102932375824e-06, + "loss": 4.5673, + "step": 224975 + }, + { + "epoch": 20.197486535008977, + "grad_norm": 17.727825164794922, + "learning_rate": 8.867853580690206e-06, + "loss": 4.5441, + "step": 225000 + }, + { + "epoch": 20.199730700179533, + "grad_norm": 22.48601722717285, + "learning_rate": 8.867604229004589e-06, + "loss": 4.3318, + "step": 225025 + }, + { + "epoch": 20.20197486535009, + "grad_norm": 14.814921379089355, + "learning_rate": 8.867354877318971e-06, + "loss": 4.4137, + "step": 225050 + }, + { + "epoch": 20.204219030520647, + "grad_norm": 19.89827537536621, + "learning_rate": 8.867105525633353e-06, + "loss": 4.4915, + "step": 225075 + }, + { + "epoch": 20.206463195691203, + "grad_norm": 20.18576431274414, + "learning_rate": 8.866856173947736e-06, + "loss": 4.3969, + "step": 225100 + }, + { + "epoch": 20.20870736086176, + "grad_norm": 20.962493896484375, + "learning_rate": 8.86660682226212e-06, + "loss": 4.3716, + "step": 225125 + }, + { + "epoch": 20.210951526032314, + "grad_norm": 18.661836624145508, + "learning_rate": 8.866357470576502e-06, + "loss": 4.4605, + "step": 225150 + }, + { + "epoch": 20.213195691202873, + "grad_norm": 18.031932830810547, + "learning_rate": 8.866108118890884e-06, + "loss": 4.6481, + "step": 225175 + }, + { + "epoch": 20.21543985637343, + "grad_norm": 19.416399002075195, + "learning_rate": 8.865858767205267e-06, + "loss": 4.2948, + "step": 225200 + }, + { + "epoch": 20.217684021543985, + "grad_norm": 19.71856117248535, + "learning_rate": 8.865609415519649e-06, + "loss": 4.3828, + "step": 225225 + }, + { + "epoch": 20.219928186714544, + "grad_norm": 21.150760650634766, + "learning_rate": 8.865360063834031e-06, + "loss": 4.4234, + "step": 225250 + }, + { + "epoch": 20.2221723518851, + "grad_norm": 19.79009437561035, + "learning_rate": 8.865110712148415e-06, + "loss": 4.4737, + "step": 225275 + }, + { + "epoch": 20.224416517055655, + "grad_norm": 16.910236358642578, + "learning_rate": 8.864861360462798e-06, + "loss": 4.5086, + "step": 225300 + }, + { + "epoch": 20.22666068222621, + "grad_norm": 19.1484432220459, + "learning_rate": 8.86461200877718e-06, + "loss": 4.4458, + "step": 225325 + }, + { + "epoch": 20.22890484739677, + "grad_norm": 15.443249702453613, + "learning_rate": 8.864362657091562e-06, + "loss": 4.639, + "step": 225350 + }, + { + "epoch": 20.231149012567325, + "grad_norm": 17.750675201416016, + "learning_rate": 8.864113305405945e-06, + "loss": 4.64, + "step": 225375 + }, + { + "epoch": 20.23339317773788, + "grad_norm": 18.08816909790039, + "learning_rate": 8.863863953720327e-06, + "loss": 4.5844, + "step": 225400 + }, + { + "epoch": 20.235637342908436, + "grad_norm": 21.891063690185547, + "learning_rate": 8.863614602034711e-06, + "loss": 4.5231, + "step": 225425 + }, + { + "epoch": 20.237881508078996, + "grad_norm": 18.79275131225586, + "learning_rate": 8.863365250349093e-06, + "loss": 4.5495, + "step": 225450 + }, + { + "epoch": 20.24012567324955, + "grad_norm": 19.420669555664062, + "learning_rate": 8.863115898663476e-06, + "loss": 4.5621, + "step": 225475 + }, + { + "epoch": 20.242369838420107, + "grad_norm": 24.05970001220703, + "learning_rate": 8.862866546977858e-06, + "loss": 4.4106, + "step": 225500 + }, + { + "epoch": 20.244614003590666, + "grad_norm": 17.13429832458496, + "learning_rate": 8.862617195292242e-06, + "loss": 4.5704, + "step": 225525 + }, + { + "epoch": 20.24685816876122, + "grad_norm": 17.35452651977539, + "learning_rate": 8.862367843606624e-06, + "loss": 4.396, + "step": 225550 + }, + { + "epoch": 20.249102333931777, + "grad_norm": 18.582231521606445, + "learning_rate": 8.862118491921007e-06, + "loss": 4.3612, + "step": 225575 + }, + { + "epoch": 20.251346499102333, + "grad_norm": 17.639089584350586, + "learning_rate": 8.861869140235389e-06, + "loss": 4.3382, + "step": 225600 + }, + { + "epoch": 20.253590664272892, + "grad_norm": 16.664979934692383, + "learning_rate": 8.861619788549771e-06, + "loss": 4.6298, + "step": 225625 + }, + { + "epoch": 20.255834829443447, + "grad_norm": 16.469242095947266, + "learning_rate": 8.861370436864153e-06, + "loss": 4.7404, + "step": 225650 + }, + { + "epoch": 20.258078994614003, + "grad_norm": 20.159029006958008, + "learning_rate": 8.861121085178537e-06, + "loss": 4.6506, + "step": 225675 + }, + { + "epoch": 20.26032315978456, + "grad_norm": 17.529451370239258, + "learning_rate": 8.86087173349292e-06, + "loss": 4.4373, + "step": 225700 + }, + { + "epoch": 20.262567324955118, + "grad_norm": 19.043882369995117, + "learning_rate": 8.860622381807302e-06, + "loss": 4.6717, + "step": 225725 + }, + { + "epoch": 20.264811490125673, + "grad_norm": 16.587636947631836, + "learning_rate": 8.860373030121684e-06, + "loss": 4.4549, + "step": 225750 + }, + { + "epoch": 20.26705565529623, + "grad_norm": 18.169273376464844, + "learning_rate": 8.860123678436067e-06, + "loss": 4.3845, + "step": 225775 + }, + { + "epoch": 20.269299820466788, + "grad_norm": 19.084375381469727, + "learning_rate": 8.859874326750449e-06, + "loss": 4.527, + "step": 225800 + }, + { + "epoch": 20.271543985637344, + "grad_norm": 16.753623962402344, + "learning_rate": 8.859624975064831e-06, + "loss": 4.6069, + "step": 225825 + }, + { + "epoch": 20.2737881508079, + "grad_norm": 20.138233184814453, + "learning_rate": 8.859375623379215e-06, + "loss": 4.4809, + "step": 225850 + }, + { + "epoch": 20.276032315978455, + "grad_norm": 18.198423385620117, + "learning_rate": 8.859126271693598e-06, + "loss": 4.5291, + "step": 225875 + }, + { + "epoch": 20.278276481149014, + "grad_norm": 19.347320556640625, + "learning_rate": 8.85887692000798e-06, + "loss": 4.4374, + "step": 225900 + }, + { + "epoch": 20.28052064631957, + "grad_norm": 15.136480331420898, + "learning_rate": 8.858627568322362e-06, + "loss": 4.5118, + "step": 225925 + }, + { + "epoch": 20.282764811490125, + "grad_norm": 17.955432891845703, + "learning_rate": 8.858378216636745e-06, + "loss": 4.4502, + "step": 225950 + }, + { + "epoch": 20.28500897666068, + "grad_norm": 19.82448387145996, + "learning_rate": 8.858128864951127e-06, + "loss": 4.59, + "step": 225975 + }, + { + "epoch": 20.28725314183124, + "grad_norm": 18.59618377685547, + "learning_rate": 8.857879513265511e-06, + "loss": 4.4864, + "step": 226000 + }, + { + "epoch": 20.289497307001795, + "grad_norm": 19.707983016967773, + "learning_rate": 8.857630161579893e-06, + "loss": 4.4693, + "step": 226025 + }, + { + "epoch": 20.29174147217235, + "grad_norm": 16.884071350097656, + "learning_rate": 8.857380809894276e-06, + "loss": 4.3813, + "step": 226050 + }, + { + "epoch": 20.29398563734291, + "grad_norm": 17.97606086730957, + "learning_rate": 8.857131458208658e-06, + "loss": 4.6897, + "step": 226075 + }, + { + "epoch": 20.296229802513466, + "grad_norm": 20.47897720336914, + "learning_rate": 8.85688210652304e-06, + "loss": 4.5228, + "step": 226100 + }, + { + "epoch": 20.29847396768402, + "grad_norm": 19.458528518676758, + "learning_rate": 8.856632754837423e-06, + "loss": 4.5597, + "step": 226125 + }, + { + "epoch": 20.300718132854577, + "grad_norm": 18.815523147583008, + "learning_rate": 8.856383403151807e-06, + "loss": 4.4445, + "step": 226150 + }, + { + "epoch": 20.302962298025136, + "grad_norm": 17.410118103027344, + "learning_rate": 8.856134051466189e-06, + "loss": 4.6586, + "step": 226175 + }, + { + "epoch": 20.30520646319569, + "grad_norm": 18.51643180847168, + "learning_rate": 8.855884699780571e-06, + "loss": 4.7057, + "step": 226200 + }, + { + "epoch": 20.307450628366247, + "grad_norm": 22.540010452270508, + "learning_rate": 8.855635348094954e-06, + "loss": 4.5505, + "step": 226225 + }, + { + "epoch": 20.309694793536803, + "grad_norm": 15.74378490447998, + "learning_rate": 8.855385996409338e-06, + "loss": 4.2339, + "step": 226250 + }, + { + "epoch": 20.311938958707362, + "grad_norm": 17.848268508911133, + "learning_rate": 8.855136644723718e-06, + "loss": 4.5737, + "step": 226275 + }, + { + "epoch": 20.314183123877918, + "grad_norm": 20.0689640045166, + "learning_rate": 8.854887293038102e-06, + "loss": 4.6026, + "step": 226300 + }, + { + "epoch": 20.316427289048473, + "grad_norm": 20.594675064086914, + "learning_rate": 8.854637941352484e-06, + "loss": 4.3764, + "step": 226325 + }, + { + "epoch": 20.31867145421903, + "grad_norm": 19.329118728637695, + "learning_rate": 8.854388589666867e-06, + "loss": 4.9158, + "step": 226350 + }, + { + "epoch": 20.320915619389588, + "grad_norm": 19.039737701416016, + "learning_rate": 8.854139237981249e-06, + "loss": 4.7363, + "step": 226375 + }, + { + "epoch": 20.323159784560143, + "grad_norm": 17.90771484375, + "learning_rate": 8.853889886295633e-06, + "loss": 4.7223, + "step": 226400 + }, + { + "epoch": 20.3254039497307, + "grad_norm": 16.761301040649414, + "learning_rate": 8.853640534610015e-06, + "loss": 4.9311, + "step": 226425 + }, + { + "epoch": 20.32764811490126, + "grad_norm": 16.910415649414062, + "learning_rate": 8.853391182924396e-06, + "loss": 4.6325, + "step": 226450 + }, + { + "epoch": 20.329892280071814, + "grad_norm": 19.347816467285156, + "learning_rate": 8.85314183123878e-06, + "loss": 4.5485, + "step": 226475 + }, + { + "epoch": 20.33213644524237, + "grad_norm": 21.66509437561035, + "learning_rate": 8.852892479553162e-06, + "loss": 4.5732, + "step": 226500 + }, + { + "epoch": 20.334380610412925, + "grad_norm": 17.501161575317383, + "learning_rate": 8.852643127867545e-06, + "loss": 4.3721, + "step": 226525 + }, + { + "epoch": 20.336624775583484, + "grad_norm": 15.887932777404785, + "learning_rate": 8.852393776181927e-06, + "loss": 4.654, + "step": 226550 + }, + { + "epoch": 20.33886894075404, + "grad_norm": 19.537797927856445, + "learning_rate": 8.852144424496311e-06, + "loss": 4.5858, + "step": 226575 + }, + { + "epoch": 20.341113105924595, + "grad_norm": 18.898611068725586, + "learning_rate": 8.851895072810693e-06, + "loss": 4.7079, + "step": 226600 + }, + { + "epoch": 20.34335727109515, + "grad_norm": 19.292816162109375, + "learning_rate": 8.851645721125076e-06, + "loss": 4.731, + "step": 226625 + }, + { + "epoch": 20.34560143626571, + "grad_norm": 17.6402587890625, + "learning_rate": 8.851396369439458e-06, + "loss": 4.6172, + "step": 226650 + }, + { + "epoch": 20.347845601436266, + "grad_norm": 17.0201473236084, + "learning_rate": 8.85114701775384e-06, + "loss": 4.7686, + "step": 226675 + }, + { + "epoch": 20.35008976660682, + "grad_norm": 17.20569610595703, + "learning_rate": 8.850897666068223e-06, + "loss": 4.8388, + "step": 226700 + }, + { + "epoch": 20.35233393177738, + "grad_norm": 18.086889266967773, + "learning_rate": 8.850648314382607e-06, + "loss": 4.4589, + "step": 226725 + }, + { + "epoch": 20.354578096947936, + "grad_norm": 17.267459869384766, + "learning_rate": 8.850398962696989e-06, + "loss": 4.4568, + "step": 226750 + }, + { + "epoch": 20.35682226211849, + "grad_norm": 18.32021713256836, + "learning_rate": 8.850149611011371e-06, + "loss": 4.4641, + "step": 226775 + }, + { + "epoch": 20.359066427289047, + "grad_norm": 22.58913803100586, + "learning_rate": 8.849900259325754e-06, + "loss": 4.5665, + "step": 226800 + }, + { + "epoch": 20.361310592459606, + "grad_norm": 20.042604446411133, + "learning_rate": 8.849650907640136e-06, + "loss": 4.5663, + "step": 226825 + }, + { + "epoch": 20.363554757630162, + "grad_norm": 17.779699325561523, + "learning_rate": 8.849401555954518e-06, + "loss": 4.1857, + "step": 226850 + }, + { + "epoch": 20.365798922800717, + "grad_norm": 19.88434600830078, + "learning_rate": 8.849152204268902e-06, + "loss": 4.5326, + "step": 226875 + }, + { + "epoch": 20.368043087971273, + "grad_norm": 18.94249725341797, + "learning_rate": 8.848902852583285e-06, + "loss": 4.3947, + "step": 226900 + }, + { + "epoch": 20.370287253141832, + "grad_norm": 22.156646728515625, + "learning_rate": 8.848653500897667e-06, + "loss": 4.4807, + "step": 226925 + }, + { + "epoch": 20.372531418312388, + "grad_norm": 17.89682388305664, + "learning_rate": 8.84840414921205e-06, + "loss": 4.6299, + "step": 226950 + }, + { + "epoch": 20.374775583482943, + "grad_norm": 20.08428955078125, + "learning_rate": 8.848154797526431e-06, + "loss": 4.4933, + "step": 226975 + }, + { + "epoch": 20.377019748653503, + "grad_norm": 17.74971580505371, + "learning_rate": 8.84791541990824e-06, + "loss": 4.4637, + "step": 227000 + }, + { + "epoch": 20.379263913824058, + "grad_norm": 19.651399612426758, + "learning_rate": 8.847666068222622e-06, + "loss": 4.4994, + "step": 227025 + }, + { + "epoch": 20.381508078994614, + "grad_norm": 20.60396957397461, + "learning_rate": 8.847416716537005e-06, + "loss": 4.6621, + "step": 227050 + }, + { + "epoch": 20.38375224416517, + "grad_norm": 20.93037223815918, + "learning_rate": 8.847167364851387e-06, + "loss": 4.8043, + "step": 227075 + }, + { + "epoch": 20.38599640933573, + "grad_norm": 19.037479400634766, + "learning_rate": 8.84691801316577e-06, + "loss": 4.4669, + "step": 227100 + }, + { + "epoch": 20.388240574506284, + "grad_norm": 18.68936538696289, + "learning_rate": 8.846668661480152e-06, + "loss": 4.5123, + "step": 227125 + }, + { + "epoch": 20.39048473967684, + "grad_norm": 21.45209503173828, + "learning_rate": 8.846419309794536e-06, + "loss": 4.4724, + "step": 227150 + }, + { + "epoch": 20.392728904847395, + "grad_norm": 19.50929069519043, + "learning_rate": 8.846169958108918e-06, + "loss": 4.4283, + "step": 227175 + }, + { + "epoch": 20.394973070017954, + "grad_norm": 17.401275634765625, + "learning_rate": 8.8459206064233e-06, + "loss": 4.4101, + "step": 227200 + }, + { + "epoch": 20.39721723518851, + "grad_norm": 21.458555221557617, + "learning_rate": 8.845671254737683e-06, + "loss": 4.6605, + "step": 227225 + }, + { + "epoch": 20.399461400359066, + "grad_norm": 22.85089683532715, + "learning_rate": 8.845421903052065e-06, + "loss": 4.5869, + "step": 227250 + }, + { + "epoch": 20.401705565529625, + "grad_norm": 17.661575317382812, + "learning_rate": 8.845172551366447e-06, + "loss": 4.4791, + "step": 227275 + }, + { + "epoch": 20.40394973070018, + "grad_norm": 17.10017967224121, + "learning_rate": 8.844923199680831e-06, + "loss": 4.7072, + "step": 227300 + }, + { + "epoch": 20.406193895870736, + "grad_norm": 18.45982551574707, + "learning_rate": 8.844673847995214e-06, + "loss": 4.5266, + "step": 227325 + }, + { + "epoch": 20.40843806104129, + "grad_norm": 19.533159255981445, + "learning_rate": 8.844424496309596e-06, + "loss": 4.3404, + "step": 227350 + }, + { + "epoch": 20.41068222621185, + "grad_norm": 20.919294357299805, + "learning_rate": 8.844175144623978e-06, + "loss": 4.4592, + "step": 227375 + }, + { + "epoch": 20.412926391382406, + "grad_norm": 19.614713668823242, + "learning_rate": 8.843925792938362e-06, + "loss": 4.6138, + "step": 227400 + }, + { + "epoch": 20.41517055655296, + "grad_norm": 20.417335510253906, + "learning_rate": 8.843676441252743e-06, + "loss": 4.721, + "step": 227425 + }, + { + "epoch": 20.417414721723517, + "grad_norm": 21.649633407592773, + "learning_rate": 8.843427089567125e-06, + "loss": 4.662, + "step": 227450 + }, + { + "epoch": 20.419658886894076, + "grad_norm": 16.880220413208008, + "learning_rate": 8.84317773788151e-06, + "loss": 4.5279, + "step": 227475 + }, + { + "epoch": 20.421903052064632, + "grad_norm": 19.384313583374023, + "learning_rate": 8.842928386195892e-06, + "loss": 4.4513, + "step": 227500 + }, + { + "epoch": 20.424147217235188, + "grad_norm": 17.49518394470215, + "learning_rate": 8.842679034510274e-06, + "loss": 4.7047, + "step": 227525 + }, + { + "epoch": 20.426391382405747, + "grad_norm": 19.003337860107422, + "learning_rate": 8.842429682824656e-06, + "loss": 4.5466, + "step": 227550 + }, + { + "epoch": 20.428635547576302, + "grad_norm": 18.535049438476562, + "learning_rate": 8.84218033113904e-06, + "loss": 4.7478, + "step": 227575 + }, + { + "epoch": 20.430879712746858, + "grad_norm": 18.078554153442383, + "learning_rate": 8.84193097945342e-06, + "loss": 4.5876, + "step": 227600 + }, + { + "epoch": 20.433123877917414, + "grad_norm": 16.19063377380371, + "learning_rate": 8.841681627767805e-06, + "loss": 4.5666, + "step": 227625 + }, + { + "epoch": 20.435368043087973, + "grad_norm": 16.59459686279297, + "learning_rate": 8.841432276082187e-06, + "loss": 4.5261, + "step": 227650 + }, + { + "epoch": 20.43761220825853, + "grad_norm": 19.661964416503906, + "learning_rate": 8.84118292439657e-06, + "loss": 4.688, + "step": 227675 + }, + { + "epoch": 20.439856373429084, + "grad_norm": 16.803468704223633, + "learning_rate": 8.840933572710952e-06, + "loss": 4.5186, + "step": 227700 + }, + { + "epoch": 20.44210053859964, + "grad_norm": 20.593088150024414, + "learning_rate": 8.840684221025336e-06, + "loss": 4.7351, + "step": 227725 + }, + { + "epoch": 20.4443447037702, + "grad_norm": 19.682435989379883, + "learning_rate": 8.840434869339718e-06, + "loss": 4.3168, + "step": 227750 + }, + { + "epoch": 20.446588868940754, + "grad_norm": 22.47095489501953, + "learning_rate": 8.8401855176541e-06, + "loss": 4.6211, + "step": 227775 + }, + { + "epoch": 20.44883303411131, + "grad_norm": 21.54058265686035, + "learning_rate": 8.839936165968483e-06, + "loss": 4.3891, + "step": 227800 + }, + { + "epoch": 20.451077199281865, + "grad_norm": 19.96416664123535, + "learning_rate": 8.839686814282865e-06, + "loss": 4.4917, + "step": 227825 + }, + { + "epoch": 20.453321364452425, + "grad_norm": 19.44461441040039, + "learning_rate": 8.839437462597247e-06, + "loss": 4.4827, + "step": 227850 + }, + { + "epoch": 20.45556552962298, + "grad_norm": 16.84109878540039, + "learning_rate": 8.839188110911631e-06, + "loss": 4.4125, + "step": 227875 + }, + { + "epoch": 20.457809694793536, + "grad_norm": 19.33616065979004, + "learning_rate": 8.838938759226014e-06, + "loss": 4.6677, + "step": 227900 + }, + { + "epoch": 20.460053859964095, + "grad_norm": 21.49380111694336, + "learning_rate": 8.838689407540396e-06, + "loss": 4.6688, + "step": 227925 + }, + { + "epoch": 20.46229802513465, + "grad_norm": 19.721345901489258, + "learning_rate": 8.838440055854778e-06, + "loss": 4.5291, + "step": 227950 + }, + { + "epoch": 20.464542190305206, + "grad_norm": 19.110496520996094, + "learning_rate": 8.83819070416916e-06, + "loss": 4.5626, + "step": 227975 + }, + { + "epoch": 20.46678635547576, + "grad_norm": 17.309412002563477, + "learning_rate": 8.837941352483543e-06, + "loss": 4.4593, + "step": 228000 + }, + { + "epoch": 20.46903052064632, + "grad_norm": 17.715423583984375, + "learning_rate": 8.837692000797927e-06, + "loss": 4.5088, + "step": 228025 + }, + { + "epoch": 20.471274685816876, + "grad_norm": 20.072189331054688, + "learning_rate": 8.83744264911231e-06, + "loss": 4.486, + "step": 228050 + }, + { + "epoch": 20.473518850987432, + "grad_norm": 19.15359115600586, + "learning_rate": 8.837193297426692e-06, + "loss": 4.6864, + "step": 228075 + }, + { + "epoch": 20.475763016157988, + "grad_norm": 17.732999801635742, + "learning_rate": 8.836943945741074e-06, + "loss": 4.3387, + "step": 228100 + }, + { + "epoch": 20.478007181328547, + "grad_norm": 17.15180015563965, + "learning_rate": 8.836694594055456e-06, + "loss": 4.7068, + "step": 228125 + }, + { + "epoch": 20.480251346499102, + "grad_norm": 20.083263397216797, + "learning_rate": 8.836445242369839e-06, + "loss": 4.6104, + "step": 228150 + }, + { + "epoch": 20.482495511669658, + "grad_norm": 16.604412078857422, + "learning_rate": 8.83619589068422e-06, + "loss": 4.3027, + "step": 228175 + }, + { + "epoch": 20.484739676840217, + "grad_norm": 20.596065521240234, + "learning_rate": 8.835946538998605e-06, + "loss": 4.7166, + "step": 228200 + }, + { + "epoch": 20.486983842010773, + "grad_norm": 19.685405731201172, + "learning_rate": 8.835697187312987e-06, + "loss": 4.5663, + "step": 228225 + }, + { + "epoch": 20.489228007181328, + "grad_norm": 21.50383186340332, + "learning_rate": 8.83544783562737e-06, + "loss": 4.5002, + "step": 228250 + }, + { + "epoch": 20.491472172351884, + "grad_norm": 20.50653076171875, + "learning_rate": 8.835198483941752e-06, + "loss": 4.5094, + "step": 228275 + }, + { + "epoch": 20.493716337522443, + "grad_norm": 19.673715591430664, + "learning_rate": 8.834949132256134e-06, + "loss": 4.5624, + "step": 228300 + }, + { + "epoch": 20.495960502693, + "grad_norm": 21.73993682861328, + "learning_rate": 8.834699780570516e-06, + "loss": 4.5446, + "step": 228325 + }, + { + "epoch": 20.498204667863554, + "grad_norm": 18.118398666381836, + "learning_rate": 8.8344504288849e-06, + "loss": 4.3893, + "step": 228350 + }, + { + "epoch": 20.50044883303411, + "grad_norm": 20.005233764648438, + "learning_rate": 8.834201077199283e-06, + "loss": 4.6543, + "step": 228375 + }, + { + "epoch": 20.50269299820467, + "grad_norm": 19.899810791015625, + "learning_rate": 8.833951725513665e-06, + "loss": 4.7117, + "step": 228400 + }, + { + "epoch": 20.504937163375224, + "grad_norm": 23.52919578552246, + "learning_rate": 8.833702373828047e-06, + "loss": 4.5019, + "step": 228425 + }, + { + "epoch": 20.50718132854578, + "grad_norm": 19.72957992553711, + "learning_rate": 8.833453022142431e-06, + "loss": 4.6114, + "step": 228450 + }, + { + "epoch": 20.50942549371634, + "grad_norm": 21.274690628051758, + "learning_rate": 8.833203670456812e-06, + "loss": 4.6809, + "step": 228475 + }, + { + "epoch": 20.511669658886895, + "grad_norm": 17.880277633666992, + "learning_rate": 8.832954318771196e-06, + "loss": 4.4539, + "step": 228500 + }, + { + "epoch": 20.51391382405745, + "grad_norm": 20.487857818603516, + "learning_rate": 8.832704967085578e-06, + "loss": 4.54, + "step": 228525 + }, + { + "epoch": 20.516157989228006, + "grad_norm": 21.497053146362305, + "learning_rate": 8.83245561539996e-06, + "loss": 4.4624, + "step": 228550 + }, + { + "epoch": 20.518402154398565, + "grad_norm": 19.41450309753418, + "learning_rate": 8.832206263714343e-06, + "loss": 4.7431, + "step": 228575 + }, + { + "epoch": 20.52064631956912, + "grad_norm": 20.97969627380371, + "learning_rate": 8.831956912028727e-06, + "loss": 4.6225, + "step": 228600 + }, + { + "epoch": 20.522890484739676, + "grad_norm": 17.48153305053711, + "learning_rate": 8.83170756034311e-06, + "loss": 4.7544, + "step": 228625 + }, + { + "epoch": 20.525134649910232, + "grad_norm": 17.208999633789062, + "learning_rate": 8.83145820865749e-06, + "loss": 4.5026, + "step": 228650 + }, + { + "epoch": 20.52737881508079, + "grad_norm": 20.808269500732422, + "learning_rate": 8.831208856971874e-06, + "loss": 4.5043, + "step": 228675 + }, + { + "epoch": 20.529622980251347, + "grad_norm": 19.173683166503906, + "learning_rate": 8.830959505286256e-06, + "loss": 4.5534, + "step": 228700 + }, + { + "epoch": 20.531867145421902, + "grad_norm": 21.72273826599121, + "learning_rate": 8.830710153600639e-06, + "loss": 4.6915, + "step": 228725 + }, + { + "epoch": 20.53411131059246, + "grad_norm": 19.780208587646484, + "learning_rate": 8.830460801915023e-06, + "loss": 4.5084, + "step": 228750 + }, + { + "epoch": 20.536355475763017, + "grad_norm": 14.360075950622559, + "learning_rate": 8.830211450229405e-06, + "loss": 4.5599, + "step": 228775 + }, + { + "epoch": 20.538599640933572, + "grad_norm": 19.005474090576172, + "learning_rate": 8.829962098543787e-06, + "loss": 4.5862, + "step": 228800 + }, + { + "epoch": 20.540843806104128, + "grad_norm": 18.84044647216797, + "learning_rate": 8.82971274685817e-06, + "loss": 4.6868, + "step": 228825 + }, + { + "epoch": 20.543087971274687, + "grad_norm": 15.819036483764648, + "learning_rate": 8.829463395172552e-06, + "loss": 4.5751, + "step": 228850 + }, + { + "epoch": 20.545332136445243, + "grad_norm": 16.797121047973633, + "learning_rate": 8.829214043486934e-06, + "loss": 4.4593, + "step": 228875 + }, + { + "epoch": 20.5475763016158, + "grad_norm": 18.54521942138672, + "learning_rate": 8.828964691801316e-06, + "loss": 4.4416, + "step": 228900 + }, + { + "epoch": 20.549820466786354, + "grad_norm": 17.639562606811523, + "learning_rate": 8.8287153401157e-06, + "loss": 4.2321, + "step": 228925 + }, + { + "epoch": 20.552064631956913, + "grad_norm": 19.226892471313477, + "learning_rate": 8.828465988430083e-06, + "loss": 4.8667, + "step": 228950 + }, + { + "epoch": 20.55430879712747, + "grad_norm": 17.692283630371094, + "learning_rate": 8.828216636744465e-06, + "loss": 4.4922, + "step": 228975 + }, + { + "epoch": 20.556552962298024, + "grad_norm": 19.56973648071289, + "learning_rate": 8.827967285058847e-06, + "loss": 4.4249, + "step": 229000 + }, + { + "epoch": 20.55879712746858, + "grad_norm": 15.46180534362793, + "learning_rate": 8.82771793337323e-06, + "loss": 4.6137, + "step": 229025 + }, + { + "epoch": 20.56104129263914, + "grad_norm": 18.472545623779297, + "learning_rate": 8.827468581687612e-06, + "loss": 4.6, + "step": 229050 + }, + { + "epoch": 20.563285457809695, + "grad_norm": 14.984911918640137, + "learning_rate": 8.827219230001996e-06, + "loss": 4.3417, + "step": 229075 + }, + { + "epoch": 20.56552962298025, + "grad_norm": 18.800046920776367, + "learning_rate": 8.826969878316378e-06, + "loss": 4.7307, + "step": 229100 + }, + { + "epoch": 20.56777378815081, + "grad_norm": 19.587278366088867, + "learning_rate": 8.82672052663076e-06, + "loss": 4.6191, + "step": 229125 + }, + { + "epoch": 20.570017953321365, + "grad_norm": 17.443281173706055, + "learning_rate": 8.826471174945143e-06, + "loss": 4.3878, + "step": 229150 + }, + { + "epoch": 20.57226211849192, + "grad_norm": 19.084243774414062, + "learning_rate": 8.826221823259527e-06, + "loss": 4.4805, + "step": 229175 + }, + { + "epoch": 20.574506283662476, + "grad_norm": 19.228382110595703, + "learning_rate": 8.825972471573908e-06, + "loss": 4.7779, + "step": 229200 + }, + { + "epoch": 20.576750448833035, + "grad_norm": 19.373960494995117, + "learning_rate": 8.825723119888292e-06, + "loss": 4.4646, + "step": 229225 + }, + { + "epoch": 20.57899461400359, + "grad_norm": 19.399324417114258, + "learning_rate": 8.825473768202674e-06, + "loss": 4.5753, + "step": 229250 + }, + { + "epoch": 20.581238779174146, + "grad_norm": 15.86722469329834, + "learning_rate": 8.825224416517056e-06, + "loss": 4.479, + "step": 229275 + }, + { + "epoch": 20.583482944344702, + "grad_norm": 17.64829444885254, + "learning_rate": 8.824975064831439e-06, + "loss": 4.5972, + "step": 229300 + }, + { + "epoch": 20.58572710951526, + "grad_norm": 17.636253356933594, + "learning_rate": 8.824725713145823e-06, + "loss": 4.51, + "step": 229325 + }, + { + "epoch": 20.587971274685817, + "grad_norm": 20.56565284729004, + "learning_rate": 8.824476361460205e-06, + "loss": 4.5668, + "step": 229350 + }, + { + "epoch": 20.590215439856372, + "grad_norm": 22.282651901245117, + "learning_rate": 8.824227009774586e-06, + "loss": 4.9913, + "step": 229375 + }, + { + "epoch": 20.59245960502693, + "grad_norm": 18.409507751464844, + "learning_rate": 8.82397765808897e-06, + "loss": 4.6049, + "step": 229400 + }, + { + "epoch": 20.594703770197487, + "grad_norm": 19.427034378051758, + "learning_rate": 8.823728306403352e-06, + "loss": 4.8496, + "step": 229425 + }, + { + "epoch": 20.596947935368043, + "grad_norm": 17.36878204345703, + "learning_rate": 8.823478954717734e-06, + "loss": 4.6905, + "step": 229450 + }, + { + "epoch": 20.5991921005386, + "grad_norm": 19.360694885253906, + "learning_rate": 8.823229603032118e-06, + "loss": 4.6529, + "step": 229475 + }, + { + "epoch": 20.601436265709157, + "grad_norm": 22.409074783325195, + "learning_rate": 8.8229802513465e-06, + "loss": 4.6879, + "step": 229500 + }, + { + "epoch": 20.603680430879713, + "grad_norm": 16.994979858398438, + "learning_rate": 8.822730899660883e-06, + "loss": 4.6647, + "step": 229525 + }, + { + "epoch": 20.60592459605027, + "grad_norm": 20.156278610229492, + "learning_rate": 8.822481547975265e-06, + "loss": 4.5556, + "step": 229550 + }, + { + "epoch": 20.608168761220824, + "grad_norm": 19.85907745361328, + "learning_rate": 8.822232196289647e-06, + "loss": 4.6131, + "step": 229575 + }, + { + "epoch": 20.610412926391383, + "grad_norm": 18.659137725830078, + "learning_rate": 8.82198284460403e-06, + "loss": 4.6713, + "step": 229600 + }, + { + "epoch": 20.61265709156194, + "grad_norm": 16.977514266967773, + "learning_rate": 8.821733492918412e-06, + "loss": 4.4294, + "step": 229625 + }, + { + "epoch": 20.614901256732495, + "grad_norm": 19.94139289855957, + "learning_rate": 8.821484141232796e-06, + "loss": 4.4483, + "step": 229650 + }, + { + "epoch": 20.617145421903054, + "grad_norm": 17.34031867980957, + "learning_rate": 8.821234789547178e-06, + "loss": 4.3663, + "step": 229675 + }, + { + "epoch": 20.61938958707361, + "grad_norm": 20.4258975982666, + "learning_rate": 8.82098543786156e-06, + "loss": 4.6196, + "step": 229700 + }, + { + "epoch": 20.621633752244165, + "grad_norm": 25.004344940185547, + "learning_rate": 8.820736086175943e-06, + "loss": 4.6929, + "step": 229725 + }, + { + "epoch": 20.62387791741472, + "grad_norm": 22.986785888671875, + "learning_rate": 8.820486734490325e-06, + "loss": 4.6264, + "step": 229750 + }, + { + "epoch": 20.62612208258528, + "grad_norm": 19.43357276916504, + "learning_rate": 8.820237382804708e-06, + "loss": 4.7261, + "step": 229775 + }, + { + "epoch": 20.628366247755835, + "grad_norm": 22.157203674316406, + "learning_rate": 8.819988031119092e-06, + "loss": 4.6006, + "step": 229800 + }, + { + "epoch": 20.63061041292639, + "grad_norm": 18.688838958740234, + "learning_rate": 8.819738679433474e-06, + "loss": 4.5436, + "step": 229825 + }, + { + "epoch": 20.632854578096946, + "grad_norm": 17.565013885498047, + "learning_rate": 8.819489327747856e-06, + "loss": 4.5331, + "step": 229850 + }, + { + "epoch": 20.635098743267505, + "grad_norm": 15.008822441101074, + "learning_rate": 8.819239976062239e-06, + "loss": 4.6482, + "step": 229875 + }, + { + "epoch": 20.63734290843806, + "grad_norm": 19.994647979736328, + "learning_rate": 8.818990624376621e-06, + "loss": 4.7359, + "step": 229900 + }, + { + "epoch": 20.639587073608617, + "grad_norm": 16.98655891418457, + "learning_rate": 8.818741272691003e-06, + "loss": 4.4606, + "step": 229925 + }, + { + "epoch": 20.641831238779176, + "grad_norm": 17.41816520690918, + "learning_rate": 8.818491921005387e-06, + "loss": 4.7311, + "step": 229950 + }, + { + "epoch": 20.64407540394973, + "grad_norm": 16.35725975036621, + "learning_rate": 8.81824256931977e-06, + "loss": 4.6804, + "step": 229975 + }, + { + "epoch": 20.646319569120287, + "grad_norm": 15.512984275817871, + "learning_rate": 8.817993217634152e-06, + "loss": 4.6655, + "step": 230000 + }, + { + "epoch": 20.648563734290843, + "grad_norm": 16.722314834594727, + "learning_rate": 8.817743865948534e-06, + "loss": 4.7786, + "step": 230025 + }, + { + "epoch": 20.6508078994614, + "grad_norm": 21.073652267456055, + "learning_rate": 8.817494514262918e-06, + "loss": 4.7254, + "step": 230050 + }, + { + "epoch": 20.653052064631957, + "grad_norm": 17.864042282104492, + "learning_rate": 8.817245162577299e-06, + "loss": 4.6704, + "step": 230075 + }, + { + "epoch": 20.655296229802513, + "grad_norm": 17.952213287353516, + "learning_rate": 8.816995810891681e-06, + "loss": 4.6007, + "step": 230100 + }, + { + "epoch": 20.65754039497307, + "grad_norm": 15.968402862548828, + "learning_rate": 8.816746459206065e-06, + "loss": 4.4279, + "step": 230125 + }, + { + "epoch": 20.659784560143628, + "grad_norm": 20.299283981323242, + "learning_rate": 8.816497107520448e-06, + "loss": 4.3253, + "step": 230150 + }, + { + "epoch": 20.662028725314183, + "grad_norm": 20.322633743286133, + "learning_rate": 8.81624775583483e-06, + "loss": 4.5436, + "step": 230175 + }, + { + "epoch": 20.66427289048474, + "grad_norm": 19.871580123901367, + "learning_rate": 8.815998404149214e-06, + "loss": 4.5676, + "step": 230200 + }, + { + "epoch": 20.666517055655298, + "grad_norm": 19.55272674560547, + "learning_rate": 8.815749052463596e-06, + "loss": 4.5357, + "step": 230225 + }, + { + "epoch": 20.668761220825854, + "grad_norm": 19.30452537536621, + "learning_rate": 8.815499700777977e-06, + "loss": 4.6479, + "step": 230250 + }, + { + "epoch": 20.67100538599641, + "grad_norm": 16.24127960205078, + "learning_rate": 8.81525034909236e-06, + "loss": 4.6063, + "step": 230275 + }, + { + "epoch": 20.673249551166965, + "grad_norm": 18.643800735473633, + "learning_rate": 8.815000997406743e-06, + "loss": 4.5746, + "step": 230300 + }, + { + "epoch": 20.675493716337524, + "grad_norm": 21.855915069580078, + "learning_rate": 8.814751645721125e-06, + "loss": 4.5856, + "step": 230325 + }, + { + "epoch": 20.67773788150808, + "grad_norm": 17.423738479614258, + "learning_rate": 8.814502294035508e-06, + "loss": 4.8659, + "step": 230350 + }, + { + "epoch": 20.679982046678635, + "grad_norm": 17.285877227783203, + "learning_rate": 8.814252942349892e-06, + "loss": 4.6709, + "step": 230375 + }, + { + "epoch": 20.68222621184919, + "grad_norm": 18.871803283691406, + "learning_rate": 8.814003590664274e-06, + "loss": 4.5016, + "step": 230400 + }, + { + "epoch": 20.68447037701975, + "grad_norm": 19.157485961914062, + "learning_rate": 8.813754238978656e-06, + "loss": 4.5116, + "step": 230425 + }, + { + "epoch": 20.686714542190305, + "grad_norm": 18.280786514282227, + "learning_rate": 8.813504887293039e-06, + "loss": 4.5214, + "step": 230450 + }, + { + "epoch": 20.68895870736086, + "grad_norm": 14.802830696105957, + "learning_rate": 8.813255535607421e-06, + "loss": 4.6606, + "step": 230475 + }, + { + "epoch": 20.69120287253142, + "grad_norm": 20.63130760192871, + "learning_rate": 8.813006183921803e-06, + "loss": 4.5141, + "step": 230500 + }, + { + "epoch": 20.693447037701976, + "grad_norm": 26.01654815673828, + "learning_rate": 8.812756832236187e-06, + "loss": 4.5956, + "step": 230525 + }, + { + "epoch": 20.69569120287253, + "grad_norm": 18.80040168762207, + "learning_rate": 8.81250748055057e-06, + "loss": 4.6104, + "step": 230550 + }, + { + "epoch": 20.697935368043087, + "grad_norm": 19.151771545410156, + "learning_rate": 8.812258128864952e-06, + "loss": 4.3492, + "step": 230575 + }, + { + "epoch": 20.700179533213646, + "grad_norm": 17.275787353515625, + "learning_rate": 8.812008777179334e-06, + "loss": 4.6124, + "step": 230600 + }, + { + "epoch": 20.7024236983842, + "grad_norm": 20.307571411132812, + "learning_rate": 8.811759425493717e-06, + "loss": 4.5298, + "step": 230625 + }, + { + "epoch": 20.704667863554757, + "grad_norm": 18.818180084228516, + "learning_rate": 8.811510073808099e-06, + "loss": 4.6024, + "step": 230650 + }, + { + "epoch": 20.706912028725313, + "grad_norm": 19.524662017822266, + "learning_rate": 8.811260722122483e-06, + "loss": 4.6894, + "step": 230675 + }, + { + "epoch": 20.709156193895872, + "grad_norm": 17.25592613220215, + "learning_rate": 8.811011370436865e-06, + "loss": 4.791, + "step": 230700 + }, + { + "epoch": 20.711400359066428, + "grad_norm": 22.10040283203125, + "learning_rate": 8.810762018751248e-06, + "loss": 4.3788, + "step": 230725 + }, + { + "epoch": 20.713644524236983, + "grad_norm": 19.336822509765625, + "learning_rate": 8.81051266706563e-06, + "loss": 4.9221, + "step": 230750 + }, + { + "epoch": 20.71588868940754, + "grad_norm": 20.830612182617188, + "learning_rate": 8.810263315380014e-06, + "loss": 4.6067, + "step": 230775 + }, + { + "epoch": 20.718132854578098, + "grad_norm": 19.873769760131836, + "learning_rate": 8.810013963694395e-06, + "loss": 4.6775, + "step": 230800 + }, + { + "epoch": 20.720377019748653, + "grad_norm": 16.86178970336914, + "learning_rate": 8.809764612008777e-06, + "loss": 4.7591, + "step": 230825 + }, + { + "epoch": 20.72262118491921, + "grad_norm": 19.999553680419922, + "learning_rate": 8.80951526032316e-06, + "loss": 4.7115, + "step": 230850 + }, + { + "epoch": 20.724865350089768, + "grad_norm": 17.321674346923828, + "learning_rate": 8.809265908637543e-06, + "loss": 4.3901, + "step": 230875 + }, + { + "epoch": 20.727109515260324, + "grad_norm": 18.532726287841797, + "learning_rate": 8.809016556951925e-06, + "loss": 4.5801, + "step": 230900 + }, + { + "epoch": 20.72935368043088, + "grad_norm": 21.5196533203125, + "learning_rate": 8.80876720526631e-06, + "loss": 4.6552, + "step": 230925 + }, + { + "epoch": 20.731597845601435, + "grad_norm": 21.891063690185547, + "learning_rate": 8.808517853580692e-06, + "loss": 4.5669, + "step": 230950 + }, + { + "epoch": 20.733842010771994, + "grad_norm": 17.732120513916016, + "learning_rate": 8.808268501895072e-06, + "loss": 4.8195, + "step": 230975 + }, + { + "epoch": 20.73608617594255, + "grad_norm": 19.779634475708008, + "learning_rate": 8.808019150209456e-06, + "loss": 4.4723, + "step": 231000 + }, + { + "epoch": 20.738330341113105, + "grad_norm": 20.772031784057617, + "learning_rate": 8.807779772591263e-06, + "loss": 4.7618, + "step": 231025 + }, + { + "epoch": 20.74057450628366, + "grad_norm": 18.232215881347656, + "learning_rate": 8.807530420905646e-06, + "loss": 4.7804, + "step": 231050 + }, + { + "epoch": 20.74281867145422, + "grad_norm": 16.998205184936523, + "learning_rate": 8.807281069220028e-06, + "loss": 4.6904, + "step": 231075 + }, + { + "epoch": 20.745062836624776, + "grad_norm": 23.132896423339844, + "learning_rate": 8.80703171753441e-06, + "loss": 4.6933, + "step": 231100 + }, + { + "epoch": 20.74730700179533, + "grad_norm": 16.325414657592773, + "learning_rate": 8.806782365848794e-06, + "loss": 4.4958, + "step": 231125 + }, + { + "epoch": 20.74955116696589, + "grad_norm": 18.77460479736328, + "learning_rate": 8.806533014163177e-06, + "loss": 4.7026, + "step": 231150 + }, + { + "epoch": 20.751795332136446, + "grad_norm": 21.678714752197266, + "learning_rate": 8.806283662477559e-06, + "loss": 4.6275, + "step": 231175 + }, + { + "epoch": 20.754039497307, + "grad_norm": 19.034631729125977, + "learning_rate": 8.806034310791943e-06, + "loss": 4.5722, + "step": 231200 + }, + { + "epoch": 20.756283662477557, + "grad_norm": 19.308441162109375, + "learning_rate": 8.805784959106324e-06, + "loss": 4.8484, + "step": 231225 + }, + { + "epoch": 20.758527827648116, + "grad_norm": 18.1337947845459, + "learning_rate": 8.805535607420706e-06, + "loss": 4.6023, + "step": 231250 + }, + { + "epoch": 20.760771992818672, + "grad_norm": 20.35565948486328, + "learning_rate": 8.80528625573509e-06, + "loss": 4.6648, + "step": 231275 + }, + { + "epoch": 20.763016157989227, + "grad_norm": 21.11745262145996, + "learning_rate": 8.805036904049472e-06, + "loss": 4.6213, + "step": 231300 + }, + { + "epoch": 20.765260323159783, + "grad_norm": 19.788240432739258, + "learning_rate": 8.804787552363855e-06, + "loss": 4.7212, + "step": 231325 + }, + { + "epoch": 20.767504488330342, + "grad_norm": 20.036928176879883, + "learning_rate": 8.804538200678237e-06, + "loss": 4.6623, + "step": 231350 + }, + { + "epoch": 20.769748653500898, + "grad_norm": 17.582136154174805, + "learning_rate": 8.804288848992621e-06, + "loss": 4.6007, + "step": 231375 + }, + { + "epoch": 20.771992818671453, + "grad_norm": 21.285640716552734, + "learning_rate": 8.804039497307002e-06, + "loss": 4.6804, + "step": 231400 + }, + { + "epoch": 20.774236983842012, + "grad_norm": 17.83155632019043, + "learning_rate": 8.80380011968881e-06, + "loss": 4.7388, + "step": 231425 + }, + { + "epoch": 20.776481149012568, + "grad_norm": 18.999801635742188, + "learning_rate": 8.803550768003192e-06, + "loss": 4.4715, + "step": 231450 + }, + { + "epoch": 20.778725314183124, + "grad_norm": 18.824783325195312, + "learning_rate": 8.803301416317575e-06, + "loss": 4.7966, + "step": 231475 + }, + { + "epoch": 20.78096947935368, + "grad_norm": 17.262266159057617, + "learning_rate": 8.803052064631957e-06, + "loss": 4.8524, + "step": 231500 + }, + { + "epoch": 20.78321364452424, + "grad_norm": 17.627653121948242, + "learning_rate": 8.80280271294634e-06, + "loss": 4.5867, + "step": 231525 + }, + { + "epoch": 20.785457809694794, + "grad_norm": 21.62139892578125, + "learning_rate": 8.802553361260723e-06, + "loss": 4.6834, + "step": 231550 + }, + { + "epoch": 20.78770197486535, + "grad_norm": 19.277769088745117, + "learning_rate": 8.802304009575106e-06, + "loss": 4.6946, + "step": 231575 + }, + { + "epoch": 20.789946140035905, + "grad_norm": 18.595720291137695, + "learning_rate": 8.802054657889488e-06, + "loss": 4.7125, + "step": 231600 + }, + { + "epoch": 20.792190305206464, + "grad_norm": 17.297321319580078, + "learning_rate": 8.80180530620387e-06, + "loss": 4.5788, + "step": 231625 + }, + { + "epoch": 20.79443447037702, + "grad_norm": 20.485576629638672, + "learning_rate": 8.801555954518253e-06, + "loss": 4.6473, + "step": 231650 + }, + { + "epoch": 20.796678635547575, + "grad_norm": 18.549072265625, + "learning_rate": 8.801306602832635e-06, + "loss": 4.4372, + "step": 231675 + }, + { + "epoch": 20.79892280071813, + "grad_norm": 17.65496063232422, + "learning_rate": 8.801057251147019e-06, + "loss": 4.6729, + "step": 231700 + }, + { + "epoch": 20.80116696588869, + "grad_norm": 19.09524154663086, + "learning_rate": 8.800807899461401e-06, + "loss": 4.7877, + "step": 231725 + }, + { + "epoch": 20.803411131059246, + "grad_norm": 22.104965209960938, + "learning_rate": 8.800558547775784e-06, + "loss": 4.7663, + "step": 231750 + }, + { + "epoch": 20.8056552962298, + "grad_norm": 16.261735916137695, + "learning_rate": 8.800309196090166e-06, + "loss": 4.5639, + "step": 231775 + }, + { + "epoch": 20.80789946140036, + "grad_norm": 19.685501098632812, + "learning_rate": 8.80005984440455e-06, + "loss": 4.6023, + "step": 231800 + }, + { + "epoch": 20.810143626570916, + "grad_norm": 17.098072052001953, + "learning_rate": 8.79981049271893e-06, + "loss": 4.5891, + "step": 231825 + }, + { + "epoch": 20.81238779174147, + "grad_norm": 22.06791114807129, + "learning_rate": 8.799561141033313e-06, + "loss": 4.512, + "step": 231850 + }, + { + "epoch": 20.814631956912027, + "grad_norm": 19.91753387451172, + "learning_rate": 8.799311789347697e-06, + "loss": 4.6517, + "step": 231875 + }, + { + "epoch": 20.816876122082586, + "grad_norm": 24.467824935913086, + "learning_rate": 8.79906243766208e-06, + "loss": 4.8138, + "step": 231900 + }, + { + "epoch": 20.819120287253142, + "grad_norm": 20.281328201293945, + "learning_rate": 8.798813085976462e-06, + "loss": 4.6358, + "step": 231925 + }, + { + "epoch": 20.821364452423698, + "grad_norm": 19.13373565673828, + "learning_rate": 8.798563734290846e-06, + "loss": 4.5355, + "step": 231950 + }, + { + "epoch": 20.823608617594253, + "grad_norm": 20.336088180541992, + "learning_rate": 8.798314382605228e-06, + "loss": 4.7566, + "step": 231975 + }, + { + "epoch": 20.825852782764812, + "grad_norm": 18.207395553588867, + "learning_rate": 8.79806503091961e-06, + "loss": 4.807, + "step": 232000 + }, + { + "epoch": 20.828096947935368, + "grad_norm": 20.51740264892578, + "learning_rate": 8.797815679233993e-06, + "loss": 4.5956, + "step": 232025 + }, + { + "epoch": 20.830341113105924, + "grad_norm": 20.446935653686523, + "learning_rate": 8.797566327548375e-06, + "loss": 4.6437, + "step": 232050 + }, + { + "epoch": 20.832585278276483, + "grad_norm": 18.707902908325195, + "learning_rate": 8.797316975862757e-06, + "loss": 4.5428, + "step": 232075 + }, + { + "epoch": 20.83482944344704, + "grad_norm": 18.539045333862305, + "learning_rate": 8.79706762417714e-06, + "loss": 4.4277, + "step": 232100 + }, + { + "epoch": 20.837073608617594, + "grad_norm": 18.893644332885742, + "learning_rate": 8.796818272491523e-06, + "loss": 4.724, + "step": 232125 + }, + { + "epoch": 20.83931777378815, + "grad_norm": 16.507064819335938, + "learning_rate": 8.796568920805906e-06, + "loss": 4.4332, + "step": 232150 + }, + { + "epoch": 20.84156193895871, + "grad_norm": 16.962663650512695, + "learning_rate": 8.796319569120288e-06, + "loss": 4.6923, + "step": 232175 + }, + { + "epoch": 20.843806104129264, + "grad_norm": 20.316238403320312, + "learning_rate": 8.79607021743467e-06, + "loss": 4.616, + "step": 232200 + }, + { + "epoch": 20.84605026929982, + "grad_norm": 18.81438446044922, + "learning_rate": 8.795820865749053e-06, + "loss": 4.7129, + "step": 232225 + }, + { + "epoch": 20.848294434470375, + "grad_norm": 18.513362884521484, + "learning_rate": 8.795571514063435e-06, + "loss": 4.7791, + "step": 232250 + }, + { + "epoch": 20.850538599640934, + "grad_norm": 18.032926559448242, + "learning_rate": 8.795322162377819e-06, + "loss": 4.5644, + "step": 232275 + }, + { + "epoch": 20.85278276481149, + "grad_norm": 20.288761138916016, + "learning_rate": 8.795072810692201e-06, + "loss": 4.5374, + "step": 232300 + }, + { + "epoch": 20.855026929982046, + "grad_norm": 20.81064224243164, + "learning_rate": 8.794823459006584e-06, + "loss": 4.6664, + "step": 232325 + }, + { + "epoch": 20.857271095152605, + "grad_norm": 19.874847412109375, + "learning_rate": 8.794574107320966e-06, + "loss": 4.661, + "step": 232350 + }, + { + "epoch": 20.85951526032316, + "grad_norm": 17.628917694091797, + "learning_rate": 8.794324755635348e-06, + "loss": 4.6889, + "step": 232375 + }, + { + "epoch": 20.861759425493716, + "grad_norm": 20.38092041015625, + "learning_rate": 8.79407540394973e-06, + "loss": 4.7479, + "step": 232400 + }, + { + "epoch": 20.86400359066427, + "grad_norm": 18.573619842529297, + "learning_rate": 8.793826052264115e-06, + "loss": 4.6067, + "step": 232425 + }, + { + "epoch": 20.86624775583483, + "grad_norm": 19.2459659576416, + "learning_rate": 8.793576700578497e-06, + "loss": 4.6074, + "step": 232450 + }, + { + "epoch": 20.868491921005386, + "grad_norm": 19.318246841430664, + "learning_rate": 8.79332734889288e-06, + "loss": 4.6579, + "step": 232475 + }, + { + "epoch": 20.870736086175942, + "grad_norm": 17.37032699584961, + "learning_rate": 8.793077997207262e-06, + "loss": 4.5995, + "step": 232500 + }, + { + "epoch": 20.872980251346497, + "grad_norm": 21.55072784423828, + "learning_rate": 8.792828645521646e-06, + "loss": 4.7463, + "step": 232525 + }, + { + "epoch": 20.875224416517057, + "grad_norm": 19.61673927307129, + "learning_rate": 8.792579293836026e-06, + "loss": 4.5998, + "step": 232550 + }, + { + "epoch": 20.877468581687612, + "grad_norm": 20.84162139892578, + "learning_rate": 8.792329942150409e-06, + "loss": 4.6079, + "step": 232575 + }, + { + "epoch": 20.879712746858168, + "grad_norm": 18.13399887084961, + "learning_rate": 8.792080590464793e-06, + "loss": 4.5808, + "step": 232600 + }, + { + "epoch": 20.881956912028727, + "grad_norm": 14.672592163085938, + "learning_rate": 8.791831238779175e-06, + "loss": 4.7511, + "step": 232625 + }, + { + "epoch": 20.884201077199283, + "grad_norm": 17.801315307617188, + "learning_rate": 8.791581887093557e-06, + "loss": 4.5296, + "step": 232650 + }, + { + "epoch": 20.886445242369838, + "grad_norm": 19.48407745361328, + "learning_rate": 8.791332535407941e-06, + "loss": 4.808, + "step": 232675 + }, + { + "epoch": 20.888689407540394, + "grad_norm": 16.485937118530273, + "learning_rate": 8.791083183722324e-06, + "loss": 4.5297, + "step": 232700 + }, + { + "epoch": 20.890933572710953, + "grad_norm": 21.2692928314209, + "learning_rate": 8.790833832036704e-06, + "loss": 5.0886, + "step": 232725 + }, + { + "epoch": 20.89317773788151, + "grad_norm": 18.588773727416992, + "learning_rate": 8.790584480351088e-06, + "loss": 4.6954, + "step": 232750 + }, + { + "epoch": 20.895421903052064, + "grad_norm": 19.08291244506836, + "learning_rate": 8.79033512866547e-06, + "loss": 4.5388, + "step": 232775 + }, + { + "epoch": 20.89766606822262, + "grad_norm": 18.51504898071289, + "learning_rate": 8.790085776979853e-06, + "loss": 4.565, + "step": 232800 + }, + { + "epoch": 20.89991023339318, + "grad_norm": 17.860933303833008, + "learning_rate": 8.789836425294235e-06, + "loss": 4.5724, + "step": 232825 + }, + { + "epoch": 20.902154398563734, + "grad_norm": 17.252702713012695, + "learning_rate": 8.789587073608619e-06, + "loss": 4.6647, + "step": 232850 + }, + { + "epoch": 20.90439856373429, + "grad_norm": 18.806833267211914, + "learning_rate": 8.789337721923001e-06, + "loss": 4.6109, + "step": 232875 + }, + { + "epoch": 20.90664272890485, + "grad_norm": 18.398820877075195, + "learning_rate": 8.789088370237384e-06, + "loss": 4.592, + "step": 232900 + }, + { + "epoch": 20.908886894075405, + "grad_norm": 15.938538551330566, + "learning_rate": 8.788839018551766e-06, + "loss": 4.5446, + "step": 232925 + }, + { + "epoch": 20.91113105924596, + "grad_norm": 18.22700309753418, + "learning_rate": 8.788589666866148e-06, + "loss": 4.9913, + "step": 232950 + }, + { + "epoch": 20.913375224416516, + "grad_norm": 16.263540267944336, + "learning_rate": 8.78834031518053e-06, + "loss": 4.6816, + "step": 232975 + }, + { + "epoch": 20.915619389587075, + "grad_norm": 20.932342529296875, + "learning_rate": 8.788090963494915e-06, + "loss": 4.3449, + "step": 233000 + }, + { + "epoch": 20.91786355475763, + "grad_norm": 18.895008087158203, + "learning_rate": 8.787841611809297e-06, + "loss": 4.6902, + "step": 233025 + }, + { + "epoch": 20.920107719928186, + "grad_norm": 18.50002098083496, + "learning_rate": 8.78759226012368e-06, + "loss": 4.4144, + "step": 233050 + }, + { + "epoch": 20.92235188509874, + "grad_norm": 19.333303451538086, + "learning_rate": 8.787342908438062e-06, + "loss": 4.7756, + "step": 233075 + }, + { + "epoch": 20.9245960502693, + "grad_norm": 22.388399124145508, + "learning_rate": 8.787093556752444e-06, + "loss": 4.5757, + "step": 233100 + }, + { + "epoch": 20.926840215439857, + "grad_norm": 16.773609161376953, + "learning_rate": 8.786844205066826e-06, + "loss": 4.6101, + "step": 233125 + }, + { + "epoch": 20.929084380610412, + "grad_norm": 21.43154525756836, + "learning_rate": 8.78659485338121e-06, + "loss": 4.4998, + "step": 233150 + }, + { + "epoch": 20.93132854578097, + "grad_norm": 17.469619750976562, + "learning_rate": 8.786345501695593e-06, + "loss": 4.8407, + "step": 233175 + }, + { + "epoch": 20.933572710951527, + "grad_norm": 19.096160888671875, + "learning_rate": 8.786096150009975e-06, + "loss": 4.411, + "step": 233200 + }, + { + "epoch": 20.935816876122082, + "grad_norm": 17.297706604003906, + "learning_rate": 8.785846798324357e-06, + "loss": 4.5789, + "step": 233225 + }, + { + "epoch": 20.938061041292638, + "grad_norm": 18.90047836303711, + "learning_rate": 8.78559744663874e-06, + "loss": 4.6582, + "step": 233250 + }, + { + "epoch": 20.940305206463197, + "grad_norm": 18.36685562133789, + "learning_rate": 8.785348094953122e-06, + "loss": 4.6339, + "step": 233275 + }, + { + "epoch": 20.942549371633753, + "grad_norm": 19.230154037475586, + "learning_rate": 8.785098743267504e-06, + "loss": 4.7271, + "step": 233300 + }, + { + "epoch": 20.94479353680431, + "grad_norm": 17.68512725830078, + "learning_rate": 8.784849391581888e-06, + "loss": 4.6385, + "step": 233325 + }, + { + "epoch": 20.947037701974864, + "grad_norm": 19.840167999267578, + "learning_rate": 8.78460003989627e-06, + "loss": 4.6877, + "step": 233350 + }, + { + "epoch": 20.949281867145423, + "grad_norm": 25.881132125854492, + "learning_rate": 8.784350688210653e-06, + "loss": 4.6598, + "step": 233375 + }, + { + "epoch": 20.95152603231598, + "grad_norm": 17.012405395507812, + "learning_rate": 8.784101336525037e-06, + "loss": 4.6165, + "step": 233400 + }, + { + "epoch": 20.953770197486534, + "grad_norm": 17.588518142700195, + "learning_rate": 8.783851984839417e-06, + "loss": 4.386, + "step": 233425 + }, + { + "epoch": 20.956014362657093, + "grad_norm": 19.696401596069336, + "learning_rate": 8.7836026331538e-06, + "loss": 4.7154, + "step": 233450 + }, + { + "epoch": 20.95825852782765, + "grad_norm": 16.959842681884766, + "learning_rate": 8.783353281468184e-06, + "loss": 4.4799, + "step": 233475 + }, + { + "epoch": 20.960502692998205, + "grad_norm": 19.774002075195312, + "learning_rate": 8.783103929782566e-06, + "loss": 4.946, + "step": 233500 + }, + { + "epoch": 20.96274685816876, + "grad_norm": 16.93734359741211, + "learning_rate": 8.782854578096948e-06, + "loss": 4.5574, + "step": 233525 + }, + { + "epoch": 20.96499102333932, + "grad_norm": 20.510486602783203, + "learning_rate": 8.78260522641133e-06, + "loss": 4.5781, + "step": 233550 + }, + { + "epoch": 20.967235188509875, + "grad_norm": 17.662044525146484, + "learning_rate": 8.782355874725715e-06, + "loss": 4.6291, + "step": 233575 + }, + { + "epoch": 20.96947935368043, + "grad_norm": 14.499275207519531, + "learning_rate": 8.782106523040095e-06, + "loss": 4.8344, + "step": 233600 + }, + { + "epoch": 20.971723518850986, + "grad_norm": 17.00281524658203, + "learning_rate": 8.78185717135448e-06, + "loss": 4.6658, + "step": 233625 + }, + { + "epoch": 20.973967684021545, + "grad_norm": 17.887847900390625, + "learning_rate": 8.781607819668862e-06, + "loss": 4.6315, + "step": 233650 + }, + { + "epoch": 20.9762118491921, + "grad_norm": 19.470333099365234, + "learning_rate": 8.781358467983244e-06, + "loss": 4.8315, + "step": 233675 + }, + { + "epoch": 20.978456014362656, + "grad_norm": 17.038108825683594, + "learning_rate": 8.781109116297626e-06, + "loss": 4.8428, + "step": 233700 + }, + { + "epoch": 20.980700179533212, + "grad_norm": 19.464794158935547, + "learning_rate": 8.78085976461201e-06, + "loss": 4.6827, + "step": 233725 + }, + { + "epoch": 20.98294434470377, + "grad_norm": 17.71124267578125, + "learning_rate": 8.780610412926393e-06, + "loss": 4.6676, + "step": 233750 + }, + { + "epoch": 20.985188509874327, + "grad_norm": 19.307973861694336, + "learning_rate": 8.780361061240775e-06, + "loss": 4.7285, + "step": 233775 + }, + { + "epoch": 20.987432675044882, + "grad_norm": 15.731684684753418, + "learning_rate": 8.780111709555157e-06, + "loss": 4.6405, + "step": 233800 + }, + { + "epoch": 20.98967684021544, + "grad_norm": 20.461673736572266, + "learning_rate": 8.77986235786954e-06, + "loss": 4.7345, + "step": 233825 + }, + { + "epoch": 20.991921005385997, + "grad_norm": 21.583240509033203, + "learning_rate": 8.779613006183922e-06, + "loss": 4.7345, + "step": 233850 + }, + { + "epoch": 20.994165170556553, + "grad_norm": 16.004873275756836, + "learning_rate": 8.779363654498306e-06, + "loss": 4.6579, + "step": 233875 + }, + { + "epoch": 20.996409335727108, + "grad_norm": 19.849475860595703, + "learning_rate": 8.779114302812688e-06, + "loss": 4.654, + "step": 233900 + }, + { + "epoch": 20.998653500897667, + "grad_norm": 17.51142120361328, + "learning_rate": 8.77886495112707e-06, + "loss": 4.7358, + "step": 233925 + }, + { + "epoch": 21.0, + "eval_accuracy": 0.0655299486376568, + "eval_f1_macro": 0.009154084372371071, + "eval_f1_micro": 0.0655299486376568, + "eval_f1_weighted": 0.04114495570822452, + "eval_loss": 6.821298122406006, + "eval_precision_macro": 0.008234313526010402, + "eval_precision_micro": 0.0655299486376568, + "eval_precision_weighted": 0.03448553563582171, + "eval_recall_macro": 0.01448097555648022, + "eval_recall_micro": 0.0655299486376568, + "eval_recall_weighted": 0.0655299486376568, + "eval_runtime": 128.7651, + "eval_samples_per_second": 406.733, + "eval_steps_per_second": 12.713, + "step": 233940 + }, + { + "epoch": 21.000897666068223, + "grad_norm": 20.333375930786133, + "learning_rate": 8.778615599441453e-06, + "loss": 4.3004, + "step": 233950 + }, + { + "epoch": 21.00314183123878, + "grad_norm": 18.68164825439453, + "learning_rate": 8.778366247755835e-06, + "loss": 4.4586, + "step": 233975 + }, + { + "epoch": 21.005385996409334, + "grad_norm": 18.440034866333008, + "learning_rate": 8.778116896070218e-06, + "loss": 4.324, + "step": 234000 + }, + { + "epoch": 21.007630161579893, + "grad_norm": 14.78779125213623, + "learning_rate": 8.7778675443846e-06, + "loss": 4.268, + "step": 234025 + }, + { + "epoch": 21.00987432675045, + "grad_norm": 14.094012260437012, + "learning_rate": 8.777628166766407e-06, + "loss": 4.3589, + "step": 234050 + }, + { + "epoch": 21.012118491921004, + "grad_norm": 17.734405517578125, + "learning_rate": 8.77737881508079e-06, + "loss": 4.454, + "step": 234075 + }, + { + "epoch": 21.014362657091564, + "grad_norm": 17.69460105895996, + "learning_rate": 8.777129463395173e-06, + "loss": 4.2401, + "step": 234100 + }, + { + "epoch": 21.01660682226212, + "grad_norm": 19.57148551940918, + "learning_rate": 8.776880111709555e-06, + "loss": 4.286, + "step": 234125 + }, + { + "epoch": 21.018850987432675, + "grad_norm": 21.417482376098633, + "learning_rate": 8.77663076002394e-06, + "loss": 4.6292, + "step": 234150 + }, + { + "epoch": 21.02109515260323, + "grad_norm": 17.032974243164062, + "learning_rate": 8.776381408338322e-06, + "loss": 4.4745, + "step": 234175 + }, + { + "epoch": 21.02333931777379, + "grad_norm": 19.92683219909668, + "learning_rate": 8.776132056652704e-06, + "loss": 4.223, + "step": 234200 + }, + { + "epoch": 21.025583482944345, + "grad_norm": 17.201528549194336, + "learning_rate": 8.775882704967086e-06, + "loss": 4.1402, + "step": 234225 + }, + { + "epoch": 21.0278276481149, + "grad_norm": 16.361425399780273, + "learning_rate": 8.775633353281469e-06, + "loss": 4.2421, + "step": 234250 + }, + { + "epoch": 21.030071813285456, + "grad_norm": 18.289348602294922, + "learning_rate": 8.775384001595851e-06, + "loss": 4.5326, + "step": 234275 + }, + { + "epoch": 21.032315978456015, + "grad_norm": 19.838998794555664, + "learning_rate": 8.775134649910233e-06, + "loss": 4.5372, + "step": 234300 + }, + { + "epoch": 21.03456014362657, + "grad_norm": 19.08919334411621, + "learning_rate": 8.774885298224617e-06, + "loss": 4.2772, + "step": 234325 + }, + { + "epoch": 21.036804308797127, + "grad_norm": 20.695274353027344, + "learning_rate": 8.774635946539e-06, + "loss": 4.4358, + "step": 234350 + }, + { + "epoch": 21.039048473967686, + "grad_norm": 22.743167877197266, + "learning_rate": 8.774386594853382e-06, + "loss": 4.2978, + "step": 234375 + }, + { + "epoch": 21.04129263913824, + "grad_norm": 19.09190559387207, + "learning_rate": 8.774137243167764e-06, + "loss": 4.4936, + "step": 234400 + }, + { + "epoch": 21.043536804308797, + "grad_norm": 18.965545654296875, + "learning_rate": 8.773887891482147e-06, + "loss": 4.0932, + "step": 234425 + }, + { + "epoch": 21.045780969479353, + "grad_norm": 18.046524047851562, + "learning_rate": 8.773638539796529e-06, + "loss": 4.4035, + "step": 234450 + }, + { + "epoch": 21.04802513464991, + "grad_norm": 18.095375061035156, + "learning_rate": 8.773389188110913e-06, + "loss": 4.3513, + "step": 234475 + }, + { + "epoch": 21.050269299820467, + "grad_norm": 17.29137420654297, + "learning_rate": 8.773139836425295e-06, + "loss": 4.1317, + "step": 234500 + }, + { + "epoch": 21.052513464991023, + "grad_norm": 19.199081420898438, + "learning_rate": 8.772890484739678e-06, + "loss": 4.3434, + "step": 234525 + }, + { + "epoch": 21.05475763016158, + "grad_norm": 20.52240562438965, + "learning_rate": 8.77264113305406e-06, + "loss": 4.1636, + "step": 234550 + }, + { + "epoch": 21.057001795332138, + "grad_norm": 18.833402633666992, + "learning_rate": 8.772391781368442e-06, + "loss": 4.2097, + "step": 234575 + }, + { + "epoch": 21.059245960502693, + "grad_norm": 15.778203010559082, + "learning_rate": 8.772142429682825e-06, + "loss": 4.3194, + "step": 234600 + }, + { + "epoch": 21.06149012567325, + "grad_norm": 17.949581146240234, + "learning_rate": 8.771893077997209e-06, + "loss": 4.3954, + "step": 234625 + }, + { + "epoch": 21.063734290843804, + "grad_norm": 17.946895599365234, + "learning_rate": 8.77164372631159e-06, + "loss": 4.0788, + "step": 234650 + }, + { + "epoch": 21.065978456014363, + "grad_norm": 18.467472076416016, + "learning_rate": 8.771394374625973e-06, + "loss": 4.4359, + "step": 234675 + }, + { + "epoch": 21.06822262118492, + "grad_norm": 23.313899993896484, + "learning_rate": 8.771145022940355e-06, + "loss": 4.3774, + "step": 234700 + }, + { + "epoch": 21.070466786355475, + "grad_norm": 24.70026397705078, + "learning_rate": 8.77089567125474e-06, + "loss": 4.2816, + "step": 234725 + }, + { + "epoch": 21.072710951526034, + "grad_norm": 18.248783111572266, + "learning_rate": 8.77064631956912e-06, + "loss": 4.5884, + "step": 234750 + }, + { + "epoch": 21.07495511669659, + "grad_norm": 15.739548683166504, + "learning_rate": 8.770396967883502e-06, + "loss": 4.5788, + "step": 234775 + }, + { + "epoch": 21.077199281867145, + "grad_norm": 19.3920955657959, + "learning_rate": 8.770147616197886e-06, + "loss": 4.3652, + "step": 234800 + }, + { + "epoch": 21.0794434470377, + "grad_norm": 17.624921798706055, + "learning_rate": 8.769898264512269e-06, + "loss": 4.3802, + "step": 234825 + }, + { + "epoch": 21.08168761220826, + "grad_norm": 21.203189849853516, + "learning_rate": 8.769648912826651e-06, + "loss": 4.4567, + "step": 234850 + }, + { + "epoch": 21.083931777378815, + "grad_norm": 19.578012466430664, + "learning_rate": 8.769399561141035e-06, + "loss": 4.34, + "step": 234875 + }, + { + "epoch": 21.08617594254937, + "grad_norm": 21.040212631225586, + "learning_rate": 8.769150209455417e-06, + "loss": 4.3569, + "step": 234900 + }, + { + "epoch": 21.088420107719926, + "grad_norm": 19.906742095947266, + "learning_rate": 8.768900857769798e-06, + "loss": 4.2562, + "step": 234925 + }, + { + "epoch": 21.090664272890486, + "grad_norm": 21.548215866088867, + "learning_rate": 8.768651506084182e-06, + "loss": 4.5245, + "step": 234950 + }, + { + "epoch": 21.09290843806104, + "grad_norm": 19.97476577758789, + "learning_rate": 8.768402154398564e-06, + "loss": 4.2774, + "step": 234975 + }, + { + "epoch": 21.095152603231597, + "grad_norm": 22.545936584472656, + "learning_rate": 8.768152802712947e-06, + "loss": 4.3322, + "step": 235000 + }, + { + "epoch": 21.097396768402156, + "grad_norm": 17.92310905456543, + "learning_rate": 8.767903451027329e-06, + "loss": 4.3764, + "step": 235025 + }, + { + "epoch": 21.09964093357271, + "grad_norm": 22.198461532592773, + "learning_rate": 8.767654099341713e-06, + "loss": 4.2792, + "step": 235050 + }, + { + "epoch": 21.101885098743267, + "grad_norm": 19.09475326538086, + "learning_rate": 8.767404747656095e-06, + "loss": 4.4525, + "step": 235075 + }, + { + "epoch": 21.104129263913823, + "grad_norm": 17.67413902282715, + "learning_rate": 8.767155395970478e-06, + "loss": 4.3218, + "step": 235100 + }, + { + "epoch": 21.106373429084382, + "grad_norm": 19.194887161254883, + "learning_rate": 8.76690604428486e-06, + "loss": 4.393, + "step": 235125 + }, + { + "epoch": 21.108617594254937, + "grad_norm": 18.558626174926758, + "learning_rate": 8.766656692599242e-06, + "loss": 4.287, + "step": 235150 + }, + { + "epoch": 21.110861759425493, + "grad_norm": 20.844242095947266, + "learning_rate": 8.766407340913625e-06, + "loss": 4.1014, + "step": 235175 + }, + { + "epoch": 21.11310592459605, + "grad_norm": 16.74549674987793, + "learning_rate": 8.766157989228009e-06, + "loss": 4.3463, + "step": 235200 + }, + { + "epoch": 21.115350089766608, + "grad_norm": 14.034250259399414, + "learning_rate": 8.765908637542391e-06, + "loss": 4.5435, + "step": 235225 + }, + { + "epoch": 21.117594254937163, + "grad_norm": 18.676061630249023, + "learning_rate": 8.765659285856773e-06, + "loss": 4.2299, + "step": 235250 + }, + { + "epoch": 21.11983842010772, + "grad_norm": 18.326061248779297, + "learning_rate": 8.765409934171156e-06, + "loss": 4.1683, + "step": 235275 + }, + { + "epoch": 21.122082585278278, + "grad_norm": 18.58392333984375, + "learning_rate": 8.765160582485538e-06, + "loss": 4.0342, + "step": 235300 + }, + { + "epoch": 21.124326750448834, + "grad_norm": 19.153366088867188, + "learning_rate": 8.76491123079992e-06, + "loss": 4.0842, + "step": 235325 + }, + { + "epoch": 21.12657091561939, + "grad_norm": 18.79571533203125, + "learning_rate": 8.764661879114304e-06, + "loss": 4.2875, + "step": 235350 + }, + { + "epoch": 21.128815080789945, + "grad_norm": 18.099872589111328, + "learning_rate": 8.764412527428686e-06, + "loss": 4.2876, + "step": 235375 + }, + { + "epoch": 21.131059245960504, + "grad_norm": 21.98067283630371, + "learning_rate": 8.764163175743069e-06, + "loss": 4.3375, + "step": 235400 + }, + { + "epoch": 21.13330341113106, + "grad_norm": 16.14326286315918, + "learning_rate": 8.763913824057451e-06, + "loss": 4.3526, + "step": 235425 + }, + { + "epoch": 21.135547576301615, + "grad_norm": 19.477670669555664, + "learning_rate": 8.763664472371835e-06, + "loss": 4.4101, + "step": 235450 + }, + { + "epoch": 21.13779174147217, + "grad_norm": 17.500791549682617, + "learning_rate": 8.763415120686216e-06, + "loss": 4.6018, + "step": 235475 + }, + { + "epoch": 21.14003590664273, + "grad_norm": 17.02172088623047, + "learning_rate": 8.7631657690006e-06, + "loss": 4.3369, + "step": 235500 + }, + { + "epoch": 21.142280071813286, + "grad_norm": 18.654733657836914, + "learning_rate": 8.762916417314982e-06, + "loss": 4.4945, + "step": 235525 + }, + { + "epoch": 21.14452423698384, + "grad_norm": 17.95917510986328, + "learning_rate": 8.762667065629364e-06, + "loss": 4.469, + "step": 235550 + }, + { + "epoch": 21.1467684021544, + "grad_norm": 18.97679328918457, + "learning_rate": 8.762417713943747e-06, + "loss": 4.309, + "step": 235575 + }, + { + "epoch": 21.149012567324956, + "grad_norm": 16.538972854614258, + "learning_rate": 8.76216836225813e-06, + "loss": 4.4164, + "step": 235600 + }, + { + "epoch": 21.15125673249551, + "grad_norm": 18.92995834350586, + "learning_rate": 8.761919010572513e-06, + "loss": 4.2407, + "step": 235625 + }, + { + "epoch": 21.153500897666067, + "grad_norm": 17.983928680419922, + "learning_rate": 8.761669658886894e-06, + "loss": 4.4289, + "step": 235650 + }, + { + "epoch": 21.155745062836626, + "grad_norm": 19.361352920532227, + "learning_rate": 8.761420307201278e-06, + "loss": 4.1704, + "step": 235675 + }, + { + "epoch": 21.15798922800718, + "grad_norm": 16.015567779541016, + "learning_rate": 8.76117095551566e-06, + "loss": 4.3574, + "step": 235700 + }, + { + "epoch": 21.160233393177737, + "grad_norm": 18.717138290405273, + "learning_rate": 8.760921603830042e-06, + "loss": 4.4637, + "step": 235725 + }, + { + "epoch": 21.162477558348293, + "grad_norm": 19.18358612060547, + "learning_rate": 8.760672252144425e-06, + "loss": 4.4431, + "step": 235750 + }, + { + "epoch": 21.164721723518852, + "grad_norm": 18.86152458190918, + "learning_rate": 8.760422900458809e-06, + "loss": 4.3523, + "step": 235775 + }, + { + "epoch": 21.166965888689408, + "grad_norm": 19.78553581237793, + "learning_rate": 8.760173548773191e-06, + "loss": 4.2316, + "step": 235800 + }, + { + "epoch": 21.169210053859963, + "grad_norm": 20.6082820892334, + "learning_rate": 8.759924197087573e-06, + "loss": 4.3947, + "step": 235825 + }, + { + "epoch": 21.171454219030522, + "grad_norm": 20.452373504638672, + "learning_rate": 8.759674845401956e-06, + "loss": 4.5109, + "step": 235850 + }, + { + "epoch": 21.173698384201078, + "grad_norm": 17.432809829711914, + "learning_rate": 8.759425493716338e-06, + "loss": 4.409, + "step": 235875 + }, + { + "epoch": 21.175942549371634, + "grad_norm": 17.29524040222168, + "learning_rate": 8.75917614203072e-06, + "loss": 4.1638, + "step": 235900 + }, + { + "epoch": 21.17818671454219, + "grad_norm": 15.897664070129395, + "learning_rate": 8.758926790345104e-06, + "loss": 4.4141, + "step": 235925 + }, + { + "epoch": 21.18043087971275, + "grad_norm": 18.397567749023438, + "learning_rate": 8.758677438659487e-06, + "loss": 4.4455, + "step": 235950 + }, + { + "epoch": 21.182675044883304, + "grad_norm": 19.956199645996094, + "learning_rate": 8.758428086973869e-06, + "loss": 4.3373, + "step": 235975 + }, + { + "epoch": 21.18491921005386, + "grad_norm": 17.770503997802734, + "learning_rate": 8.758178735288251e-06, + "loss": 4.4725, + "step": 236000 + }, + { + "epoch": 21.187163375224415, + "grad_norm": 16.542329788208008, + "learning_rate": 8.757929383602633e-06, + "loss": 4.4537, + "step": 236025 + }, + { + "epoch": 21.189407540394974, + "grad_norm": 18.8156795501709, + "learning_rate": 8.757680031917016e-06, + "loss": 4.2403, + "step": 236050 + }, + { + "epoch": 21.19165170556553, + "grad_norm": 16.797386169433594, + "learning_rate": 8.7574306802314e-06, + "loss": 4.5081, + "step": 236075 + }, + { + "epoch": 21.193895870736085, + "grad_norm": 19.727840423583984, + "learning_rate": 8.757181328545782e-06, + "loss": 4.5762, + "step": 236100 + }, + { + "epoch": 21.19614003590664, + "grad_norm": 21.040266036987305, + "learning_rate": 8.756931976860164e-06, + "loss": 4.3517, + "step": 236125 + }, + { + "epoch": 21.1983842010772, + "grad_norm": 20.662456512451172, + "learning_rate": 8.756682625174547e-06, + "loss": 4.3788, + "step": 236150 + }, + { + "epoch": 21.200628366247756, + "grad_norm": 19.008703231811523, + "learning_rate": 8.756433273488929e-06, + "loss": 4.4178, + "step": 236175 + }, + { + "epoch": 21.20287253141831, + "grad_norm": 18.73773765563965, + "learning_rate": 8.756183921803311e-06, + "loss": 4.1506, + "step": 236200 + }, + { + "epoch": 21.20511669658887, + "grad_norm": 20.627853393554688, + "learning_rate": 8.755934570117695e-06, + "loss": 4.2923, + "step": 236225 + }, + { + "epoch": 21.207360861759426, + "grad_norm": 21.04833221435547, + "learning_rate": 8.755685218432078e-06, + "loss": 4.7347, + "step": 236250 + }, + { + "epoch": 21.20960502692998, + "grad_norm": 19.50849151611328, + "learning_rate": 8.75543586674646e-06, + "loss": 4.4393, + "step": 236275 + }, + { + "epoch": 21.211849192100537, + "grad_norm": 20.028470993041992, + "learning_rate": 8.755186515060842e-06, + "loss": 4.3279, + "step": 236300 + }, + { + "epoch": 21.214093357271096, + "grad_norm": 19.492652893066406, + "learning_rate": 8.754937163375226e-06, + "loss": 4.4097, + "step": 236325 + }, + { + "epoch": 21.216337522441652, + "grad_norm": 20.940635681152344, + "learning_rate": 8.754687811689607e-06, + "loss": 4.3368, + "step": 236350 + }, + { + "epoch": 21.218581687612208, + "grad_norm": 17.904335021972656, + "learning_rate": 8.75443846000399e-06, + "loss": 4.524, + "step": 236375 + }, + { + "epoch": 21.220825852782763, + "grad_norm": 18.691200256347656, + "learning_rate": 8.754199082385798e-06, + "loss": 4.5763, + "step": 236400 + }, + { + "epoch": 21.223070017953322, + "grad_norm": 18.172697067260742, + "learning_rate": 8.75394973070018e-06, + "loss": 4.3652, + "step": 236425 + }, + { + "epoch": 21.225314183123878, + "grad_norm": 20.50137710571289, + "learning_rate": 8.753700379014563e-06, + "loss": 4.4574, + "step": 236450 + }, + { + "epoch": 21.227558348294433, + "grad_norm": 19.308395385742188, + "learning_rate": 8.753451027328945e-06, + "loss": 4.405, + "step": 236475 + }, + { + "epoch": 21.229802513464993, + "grad_norm": 17.99346160888672, + "learning_rate": 8.753201675643327e-06, + "loss": 4.3802, + "step": 236500 + }, + { + "epoch": 21.232046678635548, + "grad_norm": 23.67854118347168, + "learning_rate": 8.752952323957711e-06, + "loss": 4.5026, + "step": 236525 + }, + { + "epoch": 21.234290843806104, + "grad_norm": 17.162992477416992, + "learning_rate": 8.752702972272094e-06, + "loss": 4.2031, + "step": 236550 + }, + { + "epoch": 21.23653500897666, + "grad_norm": 17.53830909729004, + "learning_rate": 8.752453620586476e-06, + "loss": 4.5343, + "step": 236575 + }, + { + "epoch": 21.23877917414722, + "grad_norm": 20.230358123779297, + "learning_rate": 8.752204268900858e-06, + "loss": 4.351, + "step": 236600 + }, + { + "epoch": 21.241023339317774, + "grad_norm": 15.890538215637207, + "learning_rate": 8.75195491721524e-06, + "loss": 4.4262, + "step": 236625 + }, + { + "epoch": 21.24326750448833, + "grad_norm": 21.822547912597656, + "learning_rate": 8.751705565529623e-06, + "loss": 4.4162, + "step": 236650 + }, + { + "epoch": 21.245511669658885, + "grad_norm": 15.433343887329102, + "learning_rate": 8.751456213844007e-06, + "loss": 4.4975, + "step": 236675 + }, + { + "epoch": 21.247755834829444, + "grad_norm": 19.349651336669922, + "learning_rate": 8.751206862158389e-06, + "loss": 4.3796, + "step": 236700 + }, + { + "epoch": 21.25, + "grad_norm": 19.241111755371094, + "learning_rate": 8.750957510472771e-06, + "loss": 4.4382, + "step": 236725 + }, + { + "epoch": 21.252244165170556, + "grad_norm": 16.946317672729492, + "learning_rate": 8.750708158787154e-06, + "loss": 4.5268, + "step": 236750 + }, + { + "epoch": 21.254488330341115, + "grad_norm": 17.428977966308594, + "learning_rate": 8.750458807101536e-06, + "loss": 4.3586, + "step": 236775 + }, + { + "epoch": 21.25673249551167, + "grad_norm": 16.873641967773438, + "learning_rate": 8.750209455415918e-06, + "loss": 4.4921, + "step": 236800 + }, + { + "epoch": 21.258976660682226, + "grad_norm": 22.30611228942871, + "learning_rate": 8.749960103730302e-06, + "loss": 4.2924, + "step": 236825 + }, + { + "epoch": 21.26122082585278, + "grad_norm": 18.135589599609375, + "learning_rate": 8.749710752044685e-06, + "loss": 4.4927, + "step": 236850 + }, + { + "epoch": 21.26346499102334, + "grad_norm": 21.322065353393555, + "learning_rate": 8.749461400359067e-06, + "loss": 4.2813, + "step": 236875 + }, + { + "epoch": 21.265709156193896, + "grad_norm": 21.50674057006836, + "learning_rate": 8.74921204867345e-06, + "loss": 4.257, + "step": 236900 + }, + { + "epoch": 21.267953321364452, + "grad_norm": 22.905332565307617, + "learning_rate": 8.748962696987833e-06, + "loss": 4.4077, + "step": 236925 + }, + { + "epoch": 21.270197486535007, + "grad_norm": 16.075576782226562, + "learning_rate": 8.748713345302216e-06, + "loss": 4.4094, + "step": 236950 + }, + { + "epoch": 21.272441651705567, + "grad_norm": 16.551223754882812, + "learning_rate": 8.748463993616598e-06, + "loss": 4.3796, + "step": 236975 + }, + { + "epoch": 21.274685816876122, + "grad_norm": 18.02849578857422, + "learning_rate": 8.74821464193098e-06, + "loss": 4.4379, + "step": 237000 + }, + { + "epoch": 21.276929982046678, + "grad_norm": 21.8756046295166, + "learning_rate": 8.747965290245363e-06, + "loss": 4.6686, + "step": 237025 + }, + { + "epoch": 21.279174147217237, + "grad_norm": 19.305288314819336, + "learning_rate": 8.747715938559745e-06, + "loss": 4.1081, + "step": 237050 + }, + { + "epoch": 21.281418312387792, + "grad_norm": 18.007190704345703, + "learning_rate": 8.747466586874129e-06, + "loss": 4.5605, + "step": 237075 + }, + { + "epoch": 21.283662477558348, + "grad_norm": 21.078521728515625, + "learning_rate": 8.747217235188511e-06, + "loss": 4.2132, + "step": 237100 + }, + { + "epoch": 21.285906642728904, + "grad_norm": 18.750154495239258, + "learning_rate": 8.746967883502894e-06, + "loss": 4.3721, + "step": 237125 + }, + { + "epoch": 21.288150807899463, + "grad_norm": 21.317293167114258, + "learning_rate": 8.746718531817276e-06, + "loss": 4.4553, + "step": 237150 + }, + { + "epoch": 21.29039497307002, + "grad_norm": 24.1552791595459, + "learning_rate": 8.746469180131658e-06, + "loss": 4.5487, + "step": 237175 + }, + { + "epoch": 21.292639138240574, + "grad_norm": 20.096946716308594, + "learning_rate": 8.74621982844604e-06, + "loss": 4.2725, + "step": 237200 + }, + { + "epoch": 21.29488330341113, + "grad_norm": 17.643997192382812, + "learning_rate": 8.745970476760423e-06, + "loss": 4.7165, + "step": 237225 + }, + { + "epoch": 21.29712746858169, + "grad_norm": 20.25861930847168, + "learning_rate": 8.745721125074807e-06, + "loss": 4.4809, + "step": 237250 + }, + { + "epoch": 21.299371633752244, + "grad_norm": 20.846149444580078, + "learning_rate": 8.745471773389189e-06, + "loss": 4.3543, + "step": 237275 + }, + { + "epoch": 21.3016157989228, + "grad_norm": 18.511394500732422, + "learning_rate": 8.745222421703571e-06, + "loss": 4.3331, + "step": 237300 + }, + { + "epoch": 21.303859964093355, + "grad_norm": 19.75503158569336, + "learning_rate": 8.744973070017954e-06, + "loss": 4.4262, + "step": 237325 + }, + { + "epoch": 21.306104129263915, + "grad_norm": 22.359071731567383, + "learning_rate": 8.744723718332336e-06, + "loss": 4.4577, + "step": 237350 + }, + { + "epoch": 21.30834829443447, + "grad_norm": 16.443309783935547, + "learning_rate": 8.744474366646718e-06, + "loss": 4.3552, + "step": 237375 + }, + { + "epoch": 21.310592459605026, + "grad_norm": 20.399494171142578, + "learning_rate": 8.744225014961102e-06, + "loss": 4.2384, + "step": 237400 + }, + { + "epoch": 21.312836624775585, + "grad_norm": 18.587512969970703, + "learning_rate": 8.743975663275485e-06, + "loss": 4.4637, + "step": 237425 + }, + { + "epoch": 21.31508078994614, + "grad_norm": 17.20473289489746, + "learning_rate": 8.743726311589867e-06, + "loss": 4.3289, + "step": 237450 + }, + { + "epoch": 21.317324955116696, + "grad_norm": 18.484601974487305, + "learning_rate": 8.74347695990425e-06, + "loss": 4.5995, + "step": 237475 + }, + { + "epoch": 21.31956912028725, + "grad_norm": 20.05165672302246, + "learning_rate": 8.743227608218632e-06, + "loss": 4.3645, + "step": 237500 + }, + { + "epoch": 21.32181328545781, + "grad_norm": 17.607961654663086, + "learning_rate": 8.742978256533014e-06, + "loss": 4.3501, + "step": 237525 + }, + { + "epoch": 21.324057450628366, + "grad_norm": 20.61247444152832, + "learning_rate": 8.742728904847398e-06, + "loss": 4.5426, + "step": 237550 + }, + { + "epoch": 21.326301615798922, + "grad_norm": 22.783817291259766, + "learning_rate": 8.74247955316178e-06, + "loss": 4.1867, + "step": 237575 + }, + { + "epoch": 21.328545780969478, + "grad_norm": 18.809368133544922, + "learning_rate": 8.742230201476163e-06, + "loss": 4.6797, + "step": 237600 + }, + { + "epoch": 21.330789946140037, + "grad_norm": 19.10387420654297, + "learning_rate": 8.741980849790545e-06, + "loss": 4.255, + "step": 237625 + }, + { + "epoch": 21.333034111310592, + "grad_norm": 18.121702194213867, + "learning_rate": 8.741731498104929e-06, + "loss": 4.6105, + "step": 237650 + }, + { + "epoch": 21.335278276481148, + "grad_norm": 17.192602157592773, + "learning_rate": 8.74148214641931e-06, + "loss": 4.4044, + "step": 237675 + }, + { + "epoch": 21.337522441651707, + "grad_norm": 18.420711517333984, + "learning_rate": 8.741232794733694e-06, + "loss": 4.5375, + "step": 237700 + }, + { + "epoch": 21.339766606822263, + "grad_norm": 14.582747459411621, + "learning_rate": 8.740983443048076e-06, + "loss": 4.6181, + "step": 237725 + }, + { + "epoch": 21.34201077199282, + "grad_norm": 18.78200912475586, + "learning_rate": 8.740734091362458e-06, + "loss": 4.3834, + "step": 237750 + }, + { + "epoch": 21.344254937163374, + "grad_norm": 21.580747604370117, + "learning_rate": 8.74048473967684e-06, + "loss": 4.3797, + "step": 237775 + }, + { + "epoch": 21.346499102333933, + "grad_norm": 21.147178649902344, + "learning_rate": 8.740235387991225e-06, + "loss": 4.4642, + "step": 237800 + }, + { + "epoch": 21.34874326750449, + "grad_norm": 16.22683334350586, + "learning_rate": 8.739986036305607e-06, + "loss": 4.3562, + "step": 237825 + }, + { + "epoch": 21.350987432675044, + "grad_norm": 19.686111450195312, + "learning_rate": 8.739746658687414e-06, + "loss": 4.3572, + "step": 237850 + }, + { + "epoch": 21.3532315978456, + "grad_norm": 20.028884887695312, + "learning_rate": 8.739497307001796e-06, + "loss": 4.5353, + "step": 237875 + }, + { + "epoch": 21.35547576301616, + "grad_norm": 17.04547119140625, + "learning_rate": 8.739247955316178e-06, + "loss": 4.4707, + "step": 237900 + }, + { + "epoch": 21.357719928186714, + "grad_norm": 22.59800910949707, + "learning_rate": 8.73899860363056e-06, + "loss": 4.6009, + "step": 237925 + }, + { + "epoch": 21.35996409335727, + "grad_norm": 18.48118782043457, + "learning_rate": 8.738749251944943e-06, + "loss": 4.3062, + "step": 237950 + }, + { + "epoch": 21.36220825852783, + "grad_norm": 20.505008697509766, + "learning_rate": 8.738499900259327e-06, + "loss": 4.4944, + "step": 237975 + }, + { + "epoch": 21.364452423698385, + "grad_norm": 18.45110321044922, + "learning_rate": 8.73825054857371e-06, + "loss": 4.2947, + "step": 238000 + }, + { + "epoch": 21.36669658886894, + "grad_norm": 21.69849967956543, + "learning_rate": 8.738001196888092e-06, + "loss": 4.6001, + "step": 238025 + }, + { + "epoch": 21.368940754039496, + "grad_norm": 20.32794952392578, + "learning_rate": 8.737751845202474e-06, + "loss": 4.2697, + "step": 238050 + }, + { + "epoch": 21.371184919210055, + "grad_norm": 18.83197593688965, + "learning_rate": 8.737502493516858e-06, + "loss": 4.6032, + "step": 238075 + }, + { + "epoch": 21.37342908438061, + "grad_norm": 18.606239318847656, + "learning_rate": 8.737253141831239e-06, + "loss": 4.4486, + "step": 238100 + }, + { + "epoch": 21.375673249551166, + "grad_norm": 19.912513732910156, + "learning_rate": 8.737003790145621e-06, + "loss": 4.6743, + "step": 238125 + }, + { + "epoch": 21.377917414721722, + "grad_norm": 19.261268615722656, + "learning_rate": 8.736754438460005e-06, + "loss": 4.3604, + "step": 238150 + }, + { + "epoch": 21.38016157989228, + "grad_norm": 16.80694580078125, + "learning_rate": 8.736505086774387e-06, + "loss": 4.4396, + "step": 238175 + }, + { + "epoch": 21.382405745062837, + "grad_norm": 19.316862106323242, + "learning_rate": 8.73625573508877e-06, + "loss": 4.2869, + "step": 238200 + }, + { + "epoch": 21.384649910233392, + "grad_norm": 17.649452209472656, + "learning_rate": 8.736006383403152e-06, + "loss": 4.2896, + "step": 238225 + }, + { + "epoch": 21.38689407540395, + "grad_norm": 20.566606521606445, + "learning_rate": 8.735757031717536e-06, + "loss": 4.4277, + "step": 238250 + }, + { + "epoch": 21.389138240574507, + "grad_norm": 19.422330856323242, + "learning_rate": 8.735507680031917e-06, + "loss": 4.5872, + "step": 238275 + }, + { + "epoch": 21.391382405745063, + "grad_norm": 19.128747940063477, + "learning_rate": 8.7352583283463e-06, + "loss": 4.1027, + "step": 238300 + }, + { + "epoch": 21.393626570915618, + "grad_norm": 21.751384735107422, + "learning_rate": 8.735008976660683e-06, + "loss": 4.562, + "step": 238325 + }, + { + "epoch": 21.395870736086177, + "grad_norm": 21.905651092529297, + "learning_rate": 8.734759624975065e-06, + "loss": 4.5255, + "step": 238350 + }, + { + "epoch": 21.398114901256733, + "grad_norm": 19.301189422607422, + "learning_rate": 8.734510273289448e-06, + "loss": 4.6194, + "step": 238375 + }, + { + "epoch": 21.40035906642729, + "grad_norm": 18.99312400817871, + "learning_rate": 8.734260921603832e-06, + "loss": 4.5211, + "step": 238400 + }, + { + "epoch": 21.402603231597844, + "grad_norm": 19.307729721069336, + "learning_rate": 8.734011569918214e-06, + "loss": 4.4392, + "step": 238425 + }, + { + "epoch": 21.404847396768403, + "grad_norm": 20.420434951782227, + "learning_rate": 8.733762218232596e-06, + "loss": 4.4093, + "step": 238450 + }, + { + "epoch": 21.40709156193896, + "grad_norm": 20.626976013183594, + "learning_rate": 8.733512866546978e-06, + "loss": 4.3063, + "step": 238475 + }, + { + "epoch": 21.409335727109514, + "grad_norm": 22.1422119140625, + "learning_rate": 8.73326351486136e-06, + "loss": 4.4484, + "step": 238500 + }, + { + "epoch": 21.411579892280074, + "grad_norm": 20.88422203063965, + "learning_rate": 8.733014163175743e-06, + "loss": 4.4862, + "step": 238525 + }, + { + "epoch": 21.41382405745063, + "grad_norm": 19.453113555908203, + "learning_rate": 8.732764811490127e-06, + "loss": 4.1315, + "step": 238550 + }, + { + "epoch": 21.416068222621185, + "grad_norm": 21.084806442260742, + "learning_rate": 8.73251545980451e-06, + "loss": 4.3733, + "step": 238575 + }, + { + "epoch": 21.41831238779174, + "grad_norm": 19.515520095825195, + "learning_rate": 8.732266108118892e-06, + "loss": 4.3561, + "step": 238600 + }, + { + "epoch": 21.4205565529623, + "grad_norm": 15.185651779174805, + "learning_rate": 8.732016756433274e-06, + "loss": 4.2657, + "step": 238625 + }, + { + "epoch": 21.422800718132855, + "grad_norm": 19.501840591430664, + "learning_rate": 8.731767404747656e-06, + "loss": 4.3943, + "step": 238650 + }, + { + "epoch": 21.42504488330341, + "grad_norm": 14.528517723083496, + "learning_rate": 8.731518053062039e-06, + "loss": 4.4664, + "step": 238675 + }, + { + "epoch": 21.427289048473966, + "grad_norm": 18.866134643554688, + "learning_rate": 8.731268701376423e-06, + "loss": 4.5112, + "step": 238700 + }, + { + "epoch": 21.429533213644525, + "grad_norm": 17.516565322875977, + "learning_rate": 8.731019349690805e-06, + "loss": 4.3068, + "step": 238725 + }, + { + "epoch": 21.43177737881508, + "grad_norm": 18.014307022094727, + "learning_rate": 8.730769998005187e-06, + "loss": 4.3236, + "step": 238750 + }, + { + "epoch": 21.434021543985637, + "grad_norm": 18.548885345458984, + "learning_rate": 8.73052064631957e-06, + "loss": 4.4582, + "step": 238775 + }, + { + "epoch": 21.436265709156196, + "grad_norm": 17.03150177001953, + "learning_rate": 8.730271294633954e-06, + "loss": 4.284, + "step": 238800 + }, + { + "epoch": 21.43850987432675, + "grad_norm": 18.699127197265625, + "learning_rate": 8.730021942948334e-06, + "loss": 4.4435, + "step": 238825 + }, + { + "epoch": 21.440754039497307, + "grad_norm": 18.243938446044922, + "learning_rate": 8.729772591262717e-06, + "loss": 4.6763, + "step": 238850 + }, + { + "epoch": 21.442998204667862, + "grad_norm": 18.059314727783203, + "learning_rate": 8.7295232395771e-06, + "loss": 4.5802, + "step": 238875 + }, + { + "epoch": 21.44524236983842, + "grad_norm": 20.682701110839844, + "learning_rate": 8.729273887891483e-06, + "loss": 4.4199, + "step": 238900 + }, + { + "epoch": 21.447486535008977, + "grad_norm": 20.060949325561523, + "learning_rate": 8.729024536205865e-06, + "loss": 4.1893, + "step": 238925 + }, + { + "epoch": 21.449730700179533, + "grad_norm": 15.33145809173584, + "learning_rate": 8.728775184520248e-06, + "loss": 4.6457, + "step": 238950 + }, + { + "epoch": 21.45197486535009, + "grad_norm": 17.72092628479004, + "learning_rate": 8.728525832834632e-06, + "loss": 4.6294, + "step": 238975 + }, + { + "epoch": 21.454219030520647, + "grad_norm": 19.169164657592773, + "learning_rate": 8.728276481149012e-06, + "loss": 4.4696, + "step": 239000 + }, + { + "epoch": 21.456463195691203, + "grad_norm": 18.632688522338867, + "learning_rate": 8.728027129463396e-06, + "loss": 4.6528, + "step": 239025 + }, + { + "epoch": 21.45870736086176, + "grad_norm": 18.035804748535156, + "learning_rate": 8.727777777777779e-06, + "loss": 4.2938, + "step": 239050 + }, + { + "epoch": 21.460951526032314, + "grad_norm": 17.56698989868164, + "learning_rate": 8.727528426092161e-06, + "loss": 4.5968, + "step": 239075 + }, + { + "epoch": 21.463195691202873, + "grad_norm": 17.89370346069336, + "learning_rate": 8.727279074406543e-06, + "loss": 4.639, + "step": 239100 + }, + { + "epoch": 21.46543985637343, + "grad_norm": 21.13718605041504, + "learning_rate": 8.727029722720927e-06, + "loss": 4.383, + "step": 239125 + }, + { + "epoch": 21.467684021543985, + "grad_norm": 23.55045509338379, + "learning_rate": 8.72678037103531e-06, + "loss": 4.5183, + "step": 239150 + }, + { + "epoch": 21.469928186714544, + "grad_norm": 20.697471618652344, + "learning_rate": 8.726531019349692e-06, + "loss": 4.5213, + "step": 239175 + }, + { + "epoch": 21.4721723518851, + "grad_norm": 20.37091827392578, + "learning_rate": 8.726281667664074e-06, + "loss": 4.8952, + "step": 239200 + }, + { + "epoch": 21.474416517055655, + "grad_norm": 20.059667587280273, + "learning_rate": 8.726032315978456e-06, + "loss": 4.4649, + "step": 239225 + }, + { + "epoch": 21.47666068222621, + "grad_norm": 16.118242263793945, + "learning_rate": 8.725782964292839e-06, + "loss": 4.3784, + "step": 239250 + }, + { + "epoch": 21.47890484739677, + "grad_norm": 17.394132614135742, + "learning_rate": 8.725533612607223e-06, + "loss": 4.6691, + "step": 239275 + }, + { + "epoch": 21.481149012567325, + "grad_norm": 21.4489688873291, + "learning_rate": 8.725284260921605e-06, + "loss": 4.4575, + "step": 239300 + }, + { + "epoch": 21.48339317773788, + "grad_norm": 17.24399757385254, + "learning_rate": 8.725034909235987e-06, + "loss": 4.4971, + "step": 239325 + }, + { + "epoch": 21.485637342908436, + "grad_norm": 19.24429702758789, + "learning_rate": 8.72478555755037e-06, + "loss": 4.401, + "step": 239350 + }, + { + "epoch": 21.487881508078996, + "grad_norm": 19.910985946655273, + "learning_rate": 8.724536205864752e-06, + "loss": 4.5819, + "step": 239375 + }, + { + "epoch": 21.49012567324955, + "grad_norm": 18.23456382751465, + "learning_rate": 8.724286854179134e-06, + "loss": 4.3641, + "step": 239400 + }, + { + "epoch": 21.492369838420107, + "grad_norm": 18.840383529663086, + "learning_rate": 8.724037502493518e-06, + "loss": 4.4174, + "step": 239425 + }, + { + "epoch": 21.494614003590666, + "grad_norm": 17.946277618408203, + "learning_rate": 8.7237881508079e-06, + "loss": 4.7615, + "step": 239450 + }, + { + "epoch": 21.49685816876122, + "grad_norm": 21.08812141418457, + "learning_rate": 8.723538799122283e-06, + "loss": 4.484, + "step": 239475 + }, + { + "epoch": 21.499102333931777, + "grad_norm": 20.03944206237793, + "learning_rate": 8.723289447436665e-06, + "loss": 4.2546, + "step": 239500 + }, + { + "epoch": 21.501346499102333, + "grad_norm": 17.46189308166504, + "learning_rate": 8.723040095751048e-06, + "loss": 4.8744, + "step": 239525 + }, + { + "epoch": 21.503590664272892, + "grad_norm": 17.10010528564453, + "learning_rate": 8.72279074406543e-06, + "loss": 4.4888, + "step": 239550 + }, + { + "epoch": 21.505834829443447, + "grad_norm": 15.925742149353027, + "learning_rate": 8.722541392379812e-06, + "loss": 4.3831, + "step": 239575 + }, + { + "epoch": 21.508078994614003, + "grad_norm": 21.496944427490234, + "learning_rate": 8.722292040694196e-06, + "loss": 4.3685, + "step": 239600 + }, + { + "epoch": 21.51032315978456, + "grad_norm": 22.9089412689209, + "learning_rate": 8.722042689008579e-06, + "loss": 4.5143, + "step": 239625 + }, + { + "epoch": 21.512567324955118, + "grad_norm": 17.4807071685791, + "learning_rate": 8.721793337322961e-06, + "loss": 4.5316, + "step": 239650 + }, + { + "epoch": 21.514811490125673, + "grad_norm": 18.692428588867188, + "learning_rate": 8.721543985637343e-06, + "loss": 4.5967, + "step": 239675 + }, + { + "epoch": 21.51705565529623, + "grad_norm": 18.85250473022461, + "learning_rate": 8.721294633951726e-06, + "loss": 4.7321, + "step": 239700 + }, + { + "epoch": 21.519299820466788, + "grad_norm": 20.64775276184082, + "learning_rate": 8.721045282266108e-06, + "loss": 4.7471, + "step": 239725 + }, + { + "epoch": 21.521543985637344, + "grad_norm": 18.382761001586914, + "learning_rate": 8.720795930580492e-06, + "loss": 4.5325, + "step": 239750 + }, + { + "epoch": 21.5237881508079, + "grad_norm": 22.012283325195312, + "learning_rate": 8.720546578894874e-06, + "loss": 4.3854, + "step": 239775 + }, + { + "epoch": 21.526032315978455, + "grad_norm": 15.598360061645508, + "learning_rate": 8.720297227209256e-06, + "loss": 4.6019, + "step": 239800 + }, + { + "epoch": 21.528276481149014, + "grad_norm": 21.596006393432617, + "learning_rate": 8.720047875523639e-06, + "loss": 4.3754, + "step": 239825 + }, + { + "epoch": 21.53052064631957, + "grad_norm": 22.844375610351562, + "learning_rate": 8.719798523838023e-06, + "loss": 4.2172, + "step": 239850 + }, + { + "epoch": 21.532764811490125, + "grad_norm": 16.747507095336914, + "learning_rate": 8.719549172152403e-06, + "loss": 4.5781, + "step": 239875 + }, + { + "epoch": 21.53500897666068, + "grad_norm": 19.5283145904541, + "learning_rate": 8.719299820466787e-06, + "loss": 4.5089, + "step": 239900 + }, + { + "epoch": 21.53725314183124, + "grad_norm": 17.474275588989258, + "learning_rate": 8.71905046878117e-06, + "loss": 4.5654, + "step": 239925 + }, + { + "epoch": 21.539497307001795, + "grad_norm": 17.93681526184082, + "learning_rate": 8.718801117095552e-06, + "loss": 4.6429, + "step": 239950 + }, + { + "epoch": 21.54174147217235, + "grad_norm": 19.61073112487793, + "learning_rate": 8.718551765409934e-06, + "loss": 4.5744, + "step": 239975 + }, + { + "epoch": 21.543985637342907, + "grad_norm": 20.798044204711914, + "learning_rate": 8.718302413724318e-06, + "loss": 4.5017, + "step": 240000 + }, + { + "epoch": 21.546229802513466, + "grad_norm": 25.273969650268555, + "learning_rate": 8.7180530620387e-06, + "loss": 4.5384, + "step": 240025 + }, + { + "epoch": 21.54847396768402, + "grad_norm": 19.411514282226562, + "learning_rate": 8.717803710353081e-06, + "loss": 4.583, + "step": 240050 + }, + { + "epoch": 21.550718132854577, + "grad_norm": 15.992470741271973, + "learning_rate": 8.717554358667465e-06, + "loss": 4.6113, + "step": 240075 + }, + { + "epoch": 21.552962298025136, + "grad_norm": 18.005231857299805, + "learning_rate": 8.717305006981848e-06, + "loss": 4.3983, + "step": 240100 + }, + { + "epoch": 21.55520646319569, + "grad_norm": 19.316265106201172, + "learning_rate": 8.71705565529623e-06, + "loss": 4.4494, + "step": 240125 + }, + { + "epoch": 21.557450628366247, + "grad_norm": 20.479324340820312, + "learning_rate": 8.716806303610614e-06, + "loss": 4.2952, + "step": 240150 + }, + { + "epoch": 21.559694793536803, + "grad_norm": 20.472848892211914, + "learning_rate": 8.716556951924996e-06, + "loss": 4.2981, + "step": 240175 + }, + { + "epoch": 21.561938958707362, + "grad_norm": 18.433019638061523, + "learning_rate": 8.716307600239379e-06, + "loss": 4.5421, + "step": 240200 + }, + { + "epoch": 21.564183123877918, + "grad_norm": 21.375551223754883, + "learning_rate": 8.716058248553761e-06, + "loss": 4.4433, + "step": 240225 + }, + { + "epoch": 21.566427289048473, + "grad_norm": 20.312999725341797, + "learning_rate": 8.715808896868143e-06, + "loss": 4.2648, + "step": 240250 + }, + { + "epoch": 21.56867145421903, + "grad_norm": 19.96982765197754, + "learning_rate": 8.715559545182526e-06, + "loss": 4.684, + "step": 240275 + }, + { + "epoch": 21.570915619389588, + "grad_norm": 15.988472938537598, + "learning_rate": 8.715310193496908e-06, + "loss": 4.3174, + "step": 240300 + }, + { + "epoch": 21.573159784560143, + "grad_norm": 22.0460205078125, + "learning_rate": 8.715060841811292e-06, + "loss": 4.519, + "step": 240325 + }, + { + "epoch": 21.5754039497307, + "grad_norm": 22.421398162841797, + "learning_rate": 8.714811490125674e-06, + "loss": 4.4145, + "step": 240350 + }, + { + "epoch": 21.57764811490126, + "grad_norm": 17.104068756103516, + "learning_rate": 8.714562138440057e-06, + "loss": 4.589, + "step": 240375 + }, + { + "epoch": 21.579892280071814, + "grad_norm": 27.256576538085938, + "learning_rate": 8.71431278675444e-06, + "loss": 4.3264, + "step": 240400 + }, + { + "epoch": 21.58213644524237, + "grad_norm": 21.984886169433594, + "learning_rate": 8.714063435068821e-06, + "loss": 4.23, + "step": 240425 + }, + { + "epoch": 21.584380610412925, + "grad_norm": 19.81584930419922, + "learning_rate": 8.713814083383203e-06, + "loss": 4.7801, + "step": 240450 + }, + { + "epoch": 21.586624775583484, + "grad_norm": 18.481731414794922, + "learning_rate": 8.713564731697587e-06, + "loss": 4.3908, + "step": 240475 + }, + { + "epoch": 21.58886894075404, + "grad_norm": 18.43737030029297, + "learning_rate": 8.71331538001197e-06, + "loss": 4.4089, + "step": 240500 + }, + { + "epoch": 21.591113105924595, + "grad_norm": 18.614906311035156, + "learning_rate": 8.713066028326352e-06, + "loss": 4.4911, + "step": 240525 + }, + { + "epoch": 21.59335727109515, + "grad_norm": 20.295284271240234, + "learning_rate": 8.712816676640734e-06, + "loss": 4.4702, + "step": 240550 + }, + { + "epoch": 21.59560143626571, + "grad_norm": 18.20414161682129, + "learning_rate": 8.712567324955118e-06, + "loss": 4.4292, + "step": 240575 + }, + { + "epoch": 21.597845601436266, + "grad_norm": 18.452938079833984, + "learning_rate": 8.712317973269499e-06, + "loss": 4.4991, + "step": 240600 + }, + { + "epoch": 21.60008976660682, + "grad_norm": 18.341306686401367, + "learning_rate": 8.712068621583883e-06, + "loss": 4.5145, + "step": 240625 + }, + { + "epoch": 21.60233393177738, + "grad_norm": 20.061059951782227, + "learning_rate": 8.711819269898265e-06, + "loss": 4.4436, + "step": 240650 + }, + { + "epoch": 21.604578096947936, + "grad_norm": 18.201108932495117, + "learning_rate": 8.711569918212648e-06, + "loss": 4.6868, + "step": 240675 + }, + { + "epoch": 21.60682226211849, + "grad_norm": 17.556751251220703, + "learning_rate": 8.71132056652703e-06, + "loss": 4.6009, + "step": 240700 + }, + { + "epoch": 21.609066427289047, + "grad_norm": 18.40970802307129, + "learning_rate": 8.711071214841414e-06, + "loss": 4.6551, + "step": 240725 + }, + { + "epoch": 21.611310592459606, + "grad_norm": 19.434322357177734, + "learning_rate": 8.710821863155796e-06, + "loss": 4.6332, + "step": 240750 + }, + { + "epoch": 21.613554757630162, + "grad_norm": 17.803165435791016, + "learning_rate": 8.710572511470177e-06, + "loss": 4.5457, + "step": 240775 + }, + { + "epoch": 21.615798922800717, + "grad_norm": 23.337820053100586, + "learning_rate": 8.710323159784561e-06, + "loss": 4.4975, + "step": 240800 + }, + { + "epoch": 21.618043087971273, + "grad_norm": 16.923337936401367, + "learning_rate": 8.710073808098943e-06, + "loss": 4.4986, + "step": 240825 + }, + { + "epoch": 21.620287253141832, + "grad_norm": 19.99466896057129, + "learning_rate": 8.709824456413326e-06, + "loss": 4.2827, + "step": 240850 + }, + { + "epoch": 21.622531418312388, + "grad_norm": 17.620712280273438, + "learning_rate": 8.70957510472771e-06, + "loss": 4.5199, + "step": 240875 + }, + { + "epoch": 21.624775583482943, + "grad_norm": 20.14613914489746, + "learning_rate": 8.709325753042092e-06, + "loss": 4.4406, + "step": 240900 + }, + { + "epoch": 21.627019748653503, + "grad_norm": 19.474281311035156, + "learning_rate": 8.709076401356474e-06, + "loss": 4.5428, + "step": 240925 + }, + { + "epoch": 21.629263913824058, + "grad_norm": 17.63496971130371, + "learning_rate": 8.708827049670857e-06, + "loss": 4.4176, + "step": 240950 + }, + { + "epoch": 21.631508078994614, + "grad_norm": 18.550458908081055, + "learning_rate": 8.708577697985239e-06, + "loss": 4.3337, + "step": 240975 + }, + { + "epoch": 21.63375224416517, + "grad_norm": 18.426265716552734, + "learning_rate": 8.708328346299621e-06, + "loss": 4.3958, + "step": 241000 + }, + { + "epoch": 21.63599640933573, + "grad_norm": 22.086814880371094, + "learning_rate": 8.708078994614004e-06, + "loss": 4.3948, + "step": 241025 + }, + { + "epoch": 21.638240574506284, + "grad_norm": 19.887916564941406, + "learning_rate": 8.707829642928388e-06, + "loss": 4.6324, + "step": 241050 + }, + { + "epoch": 21.64048473967684, + "grad_norm": 20.390663146972656, + "learning_rate": 8.70758029124277e-06, + "loss": 4.566, + "step": 241075 + }, + { + "epoch": 21.642728904847395, + "grad_norm": 21.139545440673828, + "learning_rate": 8.707330939557152e-06, + "loss": 4.4987, + "step": 241100 + }, + { + "epoch": 21.644973070017954, + "grad_norm": 18.630348205566406, + "learning_rate": 8.707081587871534e-06, + "loss": 4.3501, + "step": 241125 + }, + { + "epoch": 21.64721723518851, + "grad_norm": 21.851755142211914, + "learning_rate": 8.706832236185917e-06, + "loss": 4.5069, + "step": 241150 + }, + { + "epoch": 21.649461400359066, + "grad_norm": 22.21521759033203, + "learning_rate": 8.706582884500299e-06, + "loss": 4.4474, + "step": 241175 + }, + { + "epoch": 21.651705565529625, + "grad_norm": 21.578466415405273, + "learning_rate": 8.706333532814683e-06, + "loss": 4.6957, + "step": 241200 + }, + { + "epoch": 21.65394973070018, + "grad_norm": 15.798752784729004, + "learning_rate": 8.706084181129065e-06, + "loss": 4.8185, + "step": 241225 + }, + { + "epoch": 21.656193895870736, + "grad_norm": 20.80021095275879, + "learning_rate": 8.705834829443448e-06, + "loss": 4.5452, + "step": 241250 + }, + { + "epoch": 21.65843806104129, + "grad_norm": 19.639877319335938, + "learning_rate": 8.70558547775783e-06, + "loss": 4.4684, + "step": 241275 + }, + { + "epoch": 21.66068222621185, + "grad_norm": 18.74321174621582, + "learning_rate": 8.705336126072212e-06, + "loss": 4.4268, + "step": 241300 + }, + { + "epoch": 21.662926391382406, + "grad_norm": 20.05789566040039, + "learning_rate": 8.705086774386595e-06, + "loss": 4.3978, + "step": 241325 + }, + { + "epoch": 21.66517055655296, + "grad_norm": 20.559614181518555, + "learning_rate": 8.704837422700979e-06, + "loss": 4.648, + "step": 241350 + }, + { + "epoch": 21.667414721723517, + "grad_norm": 17.822582244873047, + "learning_rate": 8.704588071015361e-06, + "loss": 4.702, + "step": 241375 + }, + { + "epoch": 21.669658886894076, + "grad_norm": 17.110578536987305, + "learning_rate": 8.704338719329743e-06, + "loss": 4.6026, + "step": 241400 + }, + { + "epoch": 21.671903052064632, + "grad_norm": 16.461626052856445, + "learning_rate": 8.704089367644126e-06, + "loss": 4.1302, + "step": 241425 + }, + { + "epoch": 21.674147217235188, + "grad_norm": 19.1634521484375, + "learning_rate": 8.70384001595851e-06, + "loss": 4.7378, + "step": 241450 + }, + { + "epoch": 21.676391382405747, + "grad_norm": 22.346172332763672, + "learning_rate": 8.70359066427289e-06, + "loss": 4.4146, + "step": 241475 + }, + { + "epoch": 21.678635547576302, + "grad_norm": 23.80390167236328, + "learning_rate": 8.703341312587273e-06, + "loss": 4.4469, + "step": 241500 + }, + { + "epoch": 21.680879712746858, + "grad_norm": 19.42636489868164, + "learning_rate": 8.703091960901657e-06, + "loss": 4.6059, + "step": 241525 + }, + { + "epoch": 21.683123877917414, + "grad_norm": 19.81976318359375, + "learning_rate": 8.702842609216039e-06, + "loss": 4.6539, + "step": 241550 + }, + { + "epoch": 21.685368043087973, + "grad_norm": 19.234230041503906, + "learning_rate": 8.702593257530421e-06, + "loss": 4.2755, + "step": 241575 + }, + { + "epoch": 21.68761220825853, + "grad_norm": 21.0791015625, + "learning_rate": 8.702343905844805e-06, + "loss": 4.4447, + "step": 241600 + }, + { + "epoch": 21.689856373429084, + "grad_norm": 18.535234451293945, + "learning_rate": 8.702094554159188e-06, + "loss": 4.5955, + "step": 241625 + }, + { + "epoch": 21.69210053859964, + "grad_norm": 20.523094177246094, + "learning_rate": 8.701845202473568e-06, + "loss": 4.432, + "step": 241650 + }, + { + "epoch": 21.6943447037702, + "grad_norm": 21.135223388671875, + "learning_rate": 8.701595850787952e-06, + "loss": 4.562, + "step": 241675 + }, + { + "epoch": 21.696588868940754, + "grad_norm": 23.307676315307617, + "learning_rate": 8.701346499102335e-06, + "loss": 4.373, + "step": 241700 + }, + { + "epoch": 21.69883303411131, + "grad_norm": 19.997915267944336, + "learning_rate": 8.701097147416717e-06, + "loss": 4.4476, + "step": 241725 + }, + { + "epoch": 21.70107719928187, + "grad_norm": 16.80611228942871, + "learning_rate": 8.7008477957311e-06, + "loss": 4.3985, + "step": 241750 + }, + { + "epoch": 21.703321364452425, + "grad_norm": 19.622329711914062, + "learning_rate": 8.700598444045483e-06, + "loss": 4.5182, + "step": 241775 + }, + { + "epoch": 21.70556552962298, + "grad_norm": 19.39972496032715, + "learning_rate": 8.700349092359865e-06, + "loss": 4.6484, + "step": 241800 + }, + { + "epoch": 21.707809694793536, + "grad_norm": 17.98261070251465, + "learning_rate": 8.700099740674248e-06, + "loss": 4.5656, + "step": 241825 + }, + { + "epoch": 21.710053859964095, + "grad_norm": 19.8470516204834, + "learning_rate": 8.69985038898863e-06, + "loss": 4.5289, + "step": 241850 + }, + { + "epoch": 21.71229802513465, + "grad_norm": 19.286972045898438, + "learning_rate": 8.699611011370439e-06, + "loss": 4.617, + "step": 241875 + }, + { + "epoch": 21.714542190305206, + "grad_norm": 17.85266876220703, + "learning_rate": 8.699361659684821e-06, + "loss": 4.4506, + "step": 241900 + }, + { + "epoch": 21.71678635547576, + "grad_norm": 23.532482147216797, + "learning_rate": 8.699112307999202e-06, + "loss": 4.6934, + "step": 241925 + }, + { + "epoch": 21.71903052064632, + "grad_norm": 18.777549743652344, + "learning_rate": 8.698862956313586e-06, + "loss": 4.4909, + "step": 241950 + }, + { + "epoch": 21.721274685816876, + "grad_norm": 19.373491287231445, + "learning_rate": 8.698613604627968e-06, + "loss": 4.7576, + "step": 241975 + }, + { + "epoch": 21.723518850987432, + "grad_norm": 18.694849014282227, + "learning_rate": 8.69836425294235e-06, + "loss": 4.6139, + "step": 242000 + }, + { + "epoch": 21.725763016157988, + "grad_norm": 17.52950668334961, + "learning_rate": 8.698114901256733e-06, + "loss": 4.3508, + "step": 242025 + }, + { + "epoch": 21.728007181328547, + "grad_norm": 19.606861114501953, + "learning_rate": 8.697865549571117e-06, + "loss": 4.5141, + "step": 242050 + }, + { + "epoch": 21.730251346499102, + "grad_norm": 19.17592430114746, + "learning_rate": 8.697616197885499e-06, + "loss": 4.2916, + "step": 242075 + }, + { + "epoch": 21.732495511669658, + "grad_norm": 18.868267059326172, + "learning_rate": 8.697366846199881e-06, + "loss": 4.3396, + "step": 242100 + }, + { + "epoch": 21.734739676840217, + "grad_norm": 23.28089714050293, + "learning_rate": 8.697117494514264e-06, + "loss": 4.5846, + "step": 242125 + }, + { + "epoch": 21.736983842010773, + "grad_norm": 19.028282165527344, + "learning_rate": 8.696868142828646e-06, + "loss": 4.6452, + "step": 242150 + }, + { + "epoch": 21.739228007181328, + "grad_norm": 20.359375, + "learning_rate": 8.696618791143028e-06, + "loss": 4.6198, + "step": 242175 + }, + { + "epoch": 21.741472172351884, + "grad_norm": 19.86206817626953, + "learning_rate": 8.696369439457412e-06, + "loss": 4.4221, + "step": 242200 + }, + { + "epoch": 21.743716337522443, + "grad_norm": 19.309640884399414, + "learning_rate": 8.696120087771795e-06, + "loss": 4.5544, + "step": 242225 + }, + { + "epoch": 21.745960502693, + "grad_norm": 16.521833419799805, + "learning_rate": 8.695870736086177e-06, + "loss": 4.2963, + "step": 242250 + }, + { + "epoch": 21.748204667863554, + "grad_norm": 19.85814666748047, + "learning_rate": 8.69562138440056e-06, + "loss": 4.5036, + "step": 242275 + }, + { + "epoch": 21.75044883303411, + "grad_norm": 19.096410751342773, + "learning_rate": 8.695372032714942e-06, + "loss": 4.4007, + "step": 242300 + }, + { + "epoch": 21.75269299820467, + "grad_norm": 18.0024356842041, + "learning_rate": 8.695122681029324e-06, + "loss": 4.8607, + "step": 242325 + }, + { + "epoch": 21.754937163375224, + "grad_norm": 20.556699752807617, + "learning_rate": 8.694873329343708e-06, + "loss": 4.3665, + "step": 242350 + }, + { + "epoch": 21.75718132854578, + "grad_norm": 16.623279571533203, + "learning_rate": 8.69462397765809e-06, + "loss": 4.3289, + "step": 242375 + }, + { + "epoch": 21.75942549371634, + "grad_norm": 18.63771629333496, + "learning_rate": 8.694374625972472e-06, + "loss": 4.4224, + "step": 242400 + }, + { + "epoch": 21.761669658886895, + "grad_norm": 17.14535140991211, + "learning_rate": 8.694125274286855e-06, + "loss": 4.7297, + "step": 242425 + }, + { + "epoch": 21.76391382405745, + "grad_norm": 19.625244140625, + "learning_rate": 8.693875922601237e-06, + "loss": 4.4907, + "step": 242450 + }, + { + "epoch": 21.766157989228006, + "grad_norm": 16.292076110839844, + "learning_rate": 8.69362657091562e-06, + "loss": 4.3828, + "step": 242475 + }, + { + "epoch": 21.768402154398565, + "grad_norm": 21.909290313720703, + "learning_rate": 8.693377219230002e-06, + "loss": 4.4659, + "step": 242500 + }, + { + "epoch": 21.77064631956912, + "grad_norm": 22.815521240234375, + "learning_rate": 8.693127867544386e-06, + "loss": 4.328, + "step": 242525 + }, + { + "epoch": 21.772890484739676, + "grad_norm": 18.047748565673828, + "learning_rate": 8.692878515858768e-06, + "loss": 4.1298, + "step": 242550 + }, + { + "epoch": 21.775134649910232, + "grad_norm": 16.21000862121582, + "learning_rate": 8.69262916417315e-06, + "loss": 4.3187, + "step": 242575 + }, + { + "epoch": 21.77737881508079, + "grad_norm": 18.493412017822266, + "learning_rate": 8.692379812487534e-06, + "loss": 4.4866, + "step": 242600 + }, + { + "epoch": 21.779622980251347, + "grad_norm": 21.196285247802734, + "learning_rate": 8.692130460801915e-06, + "loss": 4.4539, + "step": 242625 + }, + { + "epoch": 21.781867145421902, + "grad_norm": 20.36894989013672, + "learning_rate": 8.691881109116297e-06, + "loss": 4.5848, + "step": 242650 + }, + { + "epoch": 21.78411131059246, + "grad_norm": 19.095611572265625, + "learning_rate": 8.691631757430681e-06, + "loss": 4.9336, + "step": 242675 + }, + { + "epoch": 21.786355475763017, + "grad_norm": 18.025409698486328, + "learning_rate": 8.691382405745064e-06, + "loss": 4.6141, + "step": 242700 + }, + { + "epoch": 21.788599640933572, + "grad_norm": 20.7681827545166, + "learning_rate": 8.691133054059446e-06, + "loss": 4.5991, + "step": 242725 + }, + { + "epoch": 21.790843806104128, + "grad_norm": 18.533475875854492, + "learning_rate": 8.690883702373828e-06, + "loss": 4.5907, + "step": 242750 + }, + { + "epoch": 21.793087971274687, + "grad_norm": 22.364816665649414, + "learning_rate": 8.690634350688212e-06, + "loss": 4.6418, + "step": 242775 + }, + { + "epoch": 21.795332136445243, + "grad_norm": 18.514652252197266, + "learning_rate": 8.690384999002593e-06, + "loss": 4.3887, + "step": 242800 + }, + { + "epoch": 21.7975763016158, + "grad_norm": 18.08380699157715, + "learning_rate": 8.690135647316977e-06, + "loss": 4.5199, + "step": 242825 + }, + { + "epoch": 21.799820466786354, + "grad_norm": 20.676050186157227, + "learning_rate": 8.68988629563136e-06, + "loss": 4.2826, + "step": 242850 + }, + { + "epoch": 21.802064631956913, + "grad_norm": 19.09082794189453, + "learning_rate": 8.689636943945742e-06, + "loss": 4.5896, + "step": 242875 + }, + { + "epoch": 21.80430879712747, + "grad_norm": 18.528562545776367, + "learning_rate": 8.689387592260124e-06, + "loss": 4.4314, + "step": 242900 + }, + { + "epoch": 21.806552962298024, + "grad_norm": 19.677255630493164, + "learning_rate": 8.689138240574508e-06, + "loss": 4.7274, + "step": 242925 + }, + { + "epoch": 21.80879712746858, + "grad_norm": 18.004329681396484, + "learning_rate": 8.68888888888889e-06, + "loss": 4.804, + "step": 242950 + }, + { + "epoch": 21.81104129263914, + "grad_norm": 18.48574447631836, + "learning_rate": 8.688639537203273e-06, + "loss": 4.4604, + "step": 242975 + }, + { + "epoch": 21.813285457809695, + "grad_norm": 22.3065185546875, + "learning_rate": 8.688390185517655e-06, + "loss": 4.4858, + "step": 243000 + }, + { + "epoch": 21.81552962298025, + "grad_norm": 20.837387084960938, + "learning_rate": 8.688140833832037e-06, + "loss": 4.3548, + "step": 243025 + }, + { + "epoch": 21.81777378815081, + "grad_norm": 16.49390983581543, + "learning_rate": 8.68789148214642e-06, + "loss": 4.5285, + "step": 243050 + }, + { + "epoch": 21.820017953321365, + "grad_norm": 17.694990158081055, + "learning_rate": 8.687642130460803e-06, + "loss": 4.4262, + "step": 243075 + }, + { + "epoch": 21.82226211849192, + "grad_norm": 18.868743896484375, + "learning_rate": 8.687392778775186e-06, + "loss": 4.4713, + "step": 243100 + }, + { + "epoch": 21.824506283662476, + "grad_norm": 19.85360336303711, + "learning_rate": 8.687143427089568e-06, + "loss": 4.2799, + "step": 243125 + }, + { + "epoch": 21.826750448833035, + "grad_norm": 16.28440284729004, + "learning_rate": 8.68689407540395e-06, + "loss": 4.691, + "step": 243150 + }, + { + "epoch": 21.82899461400359, + "grad_norm": 16.51926040649414, + "learning_rate": 8.686644723718333e-06, + "loss": 4.5917, + "step": 243175 + }, + { + "epoch": 21.831238779174146, + "grad_norm": 15.436100959777832, + "learning_rate": 8.686395372032715e-06, + "loss": 4.5648, + "step": 243200 + }, + { + "epoch": 21.833482944344702, + "grad_norm": 18.465139389038086, + "learning_rate": 8.686146020347097e-06, + "loss": 4.4268, + "step": 243225 + }, + { + "epoch": 21.83572710951526, + "grad_norm": 18.313396453857422, + "learning_rate": 8.685896668661481e-06, + "loss": 4.487, + "step": 243250 + }, + { + "epoch": 21.837971274685817, + "grad_norm": 21.1453914642334, + "learning_rate": 8.685647316975864e-06, + "loss": 4.3729, + "step": 243275 + }, + { + "epoch": 21.840215439856372, + "grad_norm": 19.104162216186523, + "learning_rate": 8.685397965290246e-06, + "loss": 4.5468, + "step": 243300 + }, + { + "epoch": 21.84245960502693, + "grad_norm": 16.58429718017578, + "learning_rate": 8.685148613604628e-06, + "loss": 4.4603, + "step": 243325 + }, + { + "epoch": 21.844703770197487, + "grad_norm": 17.237844467163086, + "learning_rate": 8.68489926191901e-06, + "loss": 4.6112, + "step": 243350 + }, + { + "epoch": 21.846947935368043, + "grad_norm": 17.997838973999023, + "learning_rate": 8.684649910233393e-06, + "loss": 4.5457, + "step": 243375 + }, + { + "epoch": 21.8491921005386, + "grad_norm": 19.90052604675293, + "learning_rate": 8.684400558547777e-06, + "loss": 4.5719, + "step": 243400 + }, + { + "epoch": 21.851436265709157, + "grad_norm": 17.159637451171875, + "learning_rate": 8.68415120686216e-06, + "loss": 4.4712, + "step": 243425 + }, + { + "epoch": 21.853680430879713, + "grad_norm": 19.114534378051758, + "learning_rate": 8.683901855176542e-06, + "loss": 4.4005, + "step": 243450 + }, + { + "epoch": 21.85592459605027, + "grad_norm": 19.894723892211914, + "learning_rate": 8.683652503490924e-06, + "loss": 4.559, + "step": 243475 + }, + { + "epoch": 21.858168761220824, + "grad_norm": 21.413352966308594, + "learning_rate": 8.683403151805308e-06, + "loss": 4.4814, + "step": 243500 + }, + { + "epoch": 21.860412926391383, + "grad_norm": 18.443994522094727, + "learning_rate": 8.683153800119689e-06, + "loss": 4.4847, + "step": 243525 + }, + { + "epoch": 21.86265709156194, + "grad_norm": 20.654064178466797, + "learning_rate": 8.682904448434073e-06, + "loss": 4.5547, + "step": 243550 + }, + { + "epoch": 21.864901256732495, + "grad_norm": 19.682401657104492, + "learning_rate": 8.682655096748455e-06, + "loss": 4.484, + "step": 243575 + }, + { + "epoch": 21.867145421903054, + "grad_norm": 19.308835983276367, + "learning_rate": 8.682405745062837e-06, + "loss": 4.7744, + "step": 243600 + }, + { + "epoch": 21.86938958707361, + "grad_norm": 18.39275550842285, + "learning_rate": 8.68215639337722e-06, + "loss": 4.5306, + "step": 243625 + }, + { + "epoch": 21.871633752244165, + "grad_norm": 16.422626495361328, + "learning_rate": 8.681907041691604e-06, + "loss": 4.473, + "step": 243650 + }, + { + "epoch": 21.87387791741472, + "grad_norm": 19.68478012084961, + "learning_rate": 8.681657690005986e-06, + "loss": 4.2195, + "step": 243675 + }, + { + "epoch": 21.87612208258528, + "grad_norm": 18.239131927490234, + "learning_rate": 8.681408338320368e-06, + "loss": 4.6253, + "step": 243700 + }, + { + "epoch": 21.878366247755835, + "grad_norm": 19.18603515625, + "learning_rate": 8.68115898663475e-06, + "loss": 4.5858, + "step": 243725 + }, + { + "epoch": 21.88061041292639, + "grad_norm": 21.413053512573242, + "learning_rate": 8.680909634949133e-06, + "loss": 4.5587, + "step": 243750 + }, + { + "epoch": 21.882854578096946, + "grad_norm": 17.32509994506836, + "learning_rate": 8.680660283263515e-06, + "loss": 4.4855, + "step": 243775 + }, + { + "epoch": 21.885098743267505, + "grad_norm": 21.576265335083008, + "learning_rate": 8.680410931577899e-06, + "loss": 4.4876, + "step": 243800 + }, + { + "epoch": 21.88734290843806, + "grad_norm": 21.067625045776367, + "learning_rate": 8.680161579892281e-06, + "loss": 4.2508, + "step": 243825 + }, + { + "epoch": 21.889587073608617, + "grad_norm": 18.317415237426758, + "learning_rate": 8.679912228206664e-06, + "loss": 4.7473, + "step": 243850 + }, + { + "epoch": 21.891831238779176, + "grad_norm": Infinity, + "learning_rate": 8.67967285058847e-06, + "loss": 4.4986, + "step": 243875 + }, + { + "epoch": 21.89407540394973, + "grad_norm": 23.279495239257812, + "learning_rate": 8.679423498902853e-06, + "loss": 4.4625, + "step": 243900 + }, + { + "epoch": 21.896319569120287, + "grad_norm": 18.928176879882812, + "learning_rate": 8.679174147217237e-06, + "loss": 4.4684, + "step": 243925 + }, + { + "epoch": 21.898563734290843, + "grad_norm": 20.65825843811035, + "learning_rate": 8.678924795531618e-06, + "loss": 4.3761, + "step": 243950 + }, + { + "epoch": 21.9008078994614, + "grad_norm": 22.215957641601562, + "learning_rate": 8.678675443846e-06, + "loss": 4.8057, + "step": 243975 + }, + { + "epoch": 21.903052064631957, + "grad_norm": 18.910858154296875, + "learning_rate": 8.678426092160384e-06, + "loss": 4.4925, + "step": 244000 + }, + { + "epoch": 21.905296229802513, + "grad_norm": 18.93661117553711, + "learning_rate": 8.678176740474766e-06, + "loss": 4.7577, + "step": 244025 + }, + { + "epoch": 21.90754039497307, + "grad_norm": 19.968082427978516, + "learning_rate": 8.677927388789149e-06, + "loss": 4.3187, + "step": 244050 + }, + { + "epoch": 21.909784560143628, + "grad_norm": 19.68869400024414, + "learning_rate": 8.677678037103533e-06, + "loss": 4.7488, + "step": 244075 + }, + { + "epoch": 21.912028725314183, + "grad_norm": 16.891544342041016, + "learning_rate": 8.677428685417915e-06, + "loss": 4.608, + "step": 244100 + }, + { + "epoch": 21.91427289048474, + "grad_norm": 18.52802276611328, + "learning_rate": 8.677179333732296e-06, + "loss": 4.4861, + "step": 244125 + }, + { + "epoch": 21.916517055655298, + "grad_norm": 19.677003860473633, + "learning_rate": 8.67692998204668e-06, + "loss": 4.5845, + "step": 244150 + }, + { + "epoch": 21.918761220825854, + "grad_norm": 17.516599655151367, + "learning_rate": 8.676680630361062e-06, + "loss": 4.2284, + "step": 244175 + }, + { + "epoch": 21.92100538599641, + "grad_norm": 19.954652786254883, + "learning_rate": 8.676431278675444e-06, + "loss": 4.5921, + "step": 244200 + }, + { + "epoch": 21.923249551166965, + "grad_norm": 16.39237403869629, + "learning_rate": 8.676181926989827e-06, + "loss": 4.5809, + "step": 244225 + }, + { + "epoch": 21.925493716337524, + "grad_norm": 16.3067684173584, + "learning_rate": 8.67593257530421e-06, + "loss": 4.8684, + "step": 244250 + }, + { + "epoch": 21.92773788150808, + "grad_norm": 22.1051082611084, + "learning_rate": 8.675683223618593e-06, + "loss": 4.6929, + "step": 244275 + }, + { + "epoch": 21.929982046678635, + "grad_norm": 18.08682632446289, + "learning_rate": 8.675433871932975e-06, + "loss": 4.6658, + "step": 244300 + }, + { + "epoch": 21.93222621184919, + "grad_norm": 17.739521026611328, + "learning_rate": 8.675184520247357e-06, + "loss": 4.3673, + "step": 244325 + }, + { + "epoch": 21.93447037701975, + "grad_norm": 21.0284481048584, + "learning_rate": 8.67493516856174e-06, + "loss": 4.5431, + "step": 244350 + }, + { + "epoch": 21.936714542190305, + "grad_norm": 17.84334945678711, + "learning_rate": 8.674685816876122e-06, + "loss": 4.8367, + "step": 244375 + }, + { + "epoch": 21.93895870736086, + "grad_norm": 19.2390193939209, + "learning_rate": 8.674436465190506e-06, + "loss": 4.4831, + "step": 244400 + }, + { + "epoch": 21.94120287253142, + "grad_norm": 16.166343688964844, + "learning_rate": 8.674187113504888e-06, + "loss": 4.3671, + "step": 244425 + }, + { + "epoch": 21.943447037701976, + "grad_norm": 17.09248924255371, + "learning_rate": 8.67393776181927e-06, + "loss": 4.5026, + "step": 244450 + }, + { + "epoch": 21.94569120287253, + "grad_norm": 17.133914947509766, + "learning_rate": 8.673688410133653e-06, + "loss": 4.6265, + "step": 244475 + }, + { + "epoch": 21.947935368043087, + "grad_norm": 25.716135025024414, + "learning_rate": 8.673439058448035e-06, + "loss": 4.7046, + "step": 244500 + }, + { + "epoch": 21.950179533213646, + "grad_norm": 29.101810455322266, + "learning_rate": 8.673189706762418e-06, + "loss": 4.619, + "step": 244525 + }, + { + "epoch": 21.9524236983842, + "grad_norm": 20.193981170654297, + "learning_rate": 8.672940355076802e-06, + "loss": 4.6725, + "step": 244550 + }, + { + "epoch": 21.954667863554757, + "grad_norm": 18.999130249023438, + "learning_rate": 8.672691003391184e-06, + "loss": 4.5667, + "step": 244575 + }, + { + "epoch": 21.956912028725313, + "grad_norm": 19.32090187072754, + "learning_rate": 8.672441651705566e-06, + "loss": 4.7482, + "step": 244600 + }, + { + "epoch": 21.959156193895872, + "grad_norm": 18.6471004486084, + "learning_rate": 8.672192300019949e-06, + "loss": 4.5518, + "step": 244625 + }, + { + "epoch": 21.961400359066428, + "grad_norm": 19.41020393371582, + "learning_rate": 8.671942948334331e-06, + "loss": 4.6384, + "step": 244650 + }, + { + "epoch": 21.963644524236983, + "grad_norm": 17.786293029785156, + "learning_rate": 8.671693596648713e-06, + "loss": 4.5489, + "step": 244675 + }, + { + "epoch": 21.96588868940754, + "grad_norm": 18.560775756835938, + "learning_rate": 8.671444244963096e-06, + "loss": 4.5622, + "step": 244700 + }, + { + "epoch": 21.968132854578098, + "grad_norm": 21.79495620727539, + "learning_rate": 8.67119489327748e-06, + "loss": 4.445, + "step": 244725 + }, + { + "epoch": 21.970377019748653, + "grad_norm": 22.46162986755371, + "learning_rate": 8.670945541591862e-06, + "loss": 4.7246, + "step": 244750 + }, + { + "epoch": 21.97262118491921, + "grad_norm": 21.057037353515625, + "learning_rate": 8.670696189906244e-06, + "loss": 4.6035, + "step": 244775 + }, + { + "epoch": 21.974865350089768, + "grad_norm": 20.343332290649414, + "learning_rate": 8.670446838220628e-06, + "loss": 4.6943, + "step": 244800 + }, + { + "epoch": 21.977109515260324, + "grad_norm": 15.908320426940918, + "learning_rate": 8.670197486535009e-06, + "loss": 4.5566, + "step": 244825 + }, + { + "epoch": 21.97935368043088, + "grad_norm": 17.51407814025879, + "learning_rate": 8.669948134849391e-06, + "loss": 4.4305, + "step": 244850 + }, + { + "epoch": 21.981597845601435, + "grad_norm": 17.883455276489258, + "learning_rate": 8.669698783163775e-06, + "loss": 4.8602, + "step": 244875 + }, + { + "epoch": 21.983842010771994, + "grad_norm": 20.50774574279785, + "learning_rate": 8.669449431478158e-06, + "loss": 4.4229, + "step": 244900 + }, + { + "epoch": 21.98608617594255, + "grad_norm": 19.13277816772461, + "learning_rate": 8.66920007979254e-06, + "loss": 4.7263, + "step": 244925 + }, + { + "epoch": 21.988330341113105, + "grad_norm": 19.258831024169922, + "learning_rate": 8.668950728106922e-06, + "loss": 4.5759, + "step": 244950 + }, + { + "epoch": 21.99057450628366, + "grad_norm": 15.804481506347656, + "learning_rate": 8.668701376421306e-06, + "loss": 4.7401, + "step": 244975 + }, + { + "epoch": 21.99281867145422, + "grad_norm": 19.62061309814453, + "learning_rate": 8.668452024735688e-06, + "loss": 4.5366, + "step": 245000 + }, + { + "epoch": 21.995062836624776, + "grad_norm": 21.840190887451172, + "learning_rate": 8.66820267305007e-06, + "loss": 4.5878, + "step": 245025 + }, + { + "epoch": 21.99730700179533, + "grad_norm": 17.47736930847168, + "learning_rate": 8.667953321364453e-06, + "loss": 4.6725, + "step": 245050 + }, + { + "epoch": 21.99955116696589, + "grad_norm": 20.71500587463379, + "learning_rate": 8.667703969678835e-06, + "loss": 4.3129, + "step": 245075 + }, + { + "epoch": 22.0, + "eval_accuracy": 0.06276134649533156, + "eval_f1_macro": 0.009477047596877624, + "eval_f1_micro": 0.06276134649533156, + "eval_f1_weighted": 0.03996332128759549, + "eval_loss": 6.8204345703125, + "eval_precision_macro": 0.008540631772865619, + "eval_precision_micro": 0.06276134649533156, + "eval_precision_weighted": 0.03304113276853252, + "eval_recall_macro": 0.014575594682055309, + "eval_recall_micro": 0.06276134649533156, + "eval_recall_weighted": 0.06276134649533156, + "eval_runtime": 129.3436, + "eval_samples_per_second": 404.914, + "eval_steps_per_second": 12.656, + "step": 245080 + } + ], + "logging_steps": 25, + "max_steps": 1114000, + "num_input_tokens_seen": 0, + "num_train_epochs": 100, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.01 + }, + "attributes": { + "early_stopping_patience_counter": 5 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.2324352877312082e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}