{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.802729044834308, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.031189083820662766, "grad_norm": 126.5, "learning_rate": 8.000000000000001e-07, "loss": 0.5428, "step": 10 }, { "epoch": 0.06237816764132553, "grad_norm": 326.0, "learning_rate": 1.6000000000000001e-06, "loss": 0.4758, "step": 20 }, { "epoch": 0.0935672514619883, "grad_norm": 524.0, "learning_rate": 2.4000000000000003e-06, "loss": 0.5854, "step": 30 }, { "epoch": 0.12475633528265107, "grad_norm": 446.0, "learning_rate": 3.2000000000000003e-06, "loss": 0.8299, "step": 40 }, { "epoch": 0.15594541910331383, "grad_norm": 268.0, "learning_rate": 4.000000000000001e-06, "loss": 1.0183, "step": 50 }, { "epoch": 0.1871345029239766, "grad_norm": 1592.0, "learning_rate": 4.800000000000001e-06, "loss": 1.316, "step": 60 }, { "epoch": 0.21832358674463936, "grad_norm": 139.0, "learning_rate": 5.600000000000001e-06, "loss": 1.5705, "step": 70 }, { "epoch": 0.24951267056530213, "grad_norm": 374.0, "learning_rate": 6.4000000000000006e-06, "loss": 5.897, "step": 80 }, { "epoch": 0.2807017543859649, "grad_norm": 115.0, "learning_rate": 7.2000000000000005e-06, "loss": 1.1907, "step": 90 }, { "epoch": 0.31189083820662766, "grad_norm": 824.0, "learning_rate": 8.000000000000001e-06, "loss": 1.602, "step": 100 }, { "epoch": 0.34307992202729043, "grad_norm": 74.5, "learning_rate": 8.8e-06, "loss": 1.3299, "step": 110 }, { "epoch": 0.3742690058479532, "grad_norm": 129.0, "learning_rate": 9.600000000000001e-06, "loss": 1.106, "step": 120 }, { "epoch": 0.40545808966861596, "grad_norm": 168.0, "learning_rate": 1.04e-05, "loss": 1.5673, "step": 130 }, { "epoch": 0.43664717348927873, "grad_norm": 52.75, "learning_rate": 1.1200000000000001e-05, "loss": 0.9792, "step": 140 }, { "epoch": 0.4678362573099415, "grad_norm": 95.5, "learning_rate": 1.2e-05, "loss": 0.9398, "step": 150 }, { "epoch": 0.49902534113060426, "grad_norm": 66.0, "learning_rate": 1.2800000000000001e-05, "loss": 0.8719, "step": 160 }, { "epoch": 0.530214424951267, "grad_norm": 46.75, "learning_rate": 1.3600000000000002e-05, "loss": 1.0627, "step": 170 }, { "epoch": 0.5614035087719298, "grad_norm": 41.75, "learning_rate": 1.4400000000000001e-05, "loss": 0.8263, "step": 180 }, { "epoch": 0.5925925925925926, "grad_norm": 45.5, "learning_rate": 1.5200000000000002e-05, "loss": 0.8172, "step": 190 }, { "epoch": 0.6237816764132553, "grad_norm": 41.5, "learning_rate": 1.6000000000000003e-05, "loss": 0.8474, "step": 200 }, { "epoch": 0.6549707602339181, "grad_norm": 41.75, "learning_rate": 1.6800000000000002e-05, "loss": 0.8815, "step": 210 }, { "epoch": 0.6861598440545809, "grad_norm": 45.75, "learning_rate": 1.76e-05, "loss": 0.8097, "step": 220 }, { "epoch": 0.7173489278752436, "grad_norm": 412.0, "learning_rate": 1.8400000000000003e-05, "loss": 1.0962, "step": 230 }, { "epoch": 0.7485380116959064, "grad_norm": 52.25, "learning_rate": 1.9200000000000003e-05, "loss": 10.0622, "step": 240 }, { "epoch": 0.7797270955165692, "grad_norm": 38.0, "learning_rate": 2e-05, "loss": 1.358, "step": 250 }, { "epoch": 0.8109161793372319, "grad_norm": 47.25, "learning_rate": 1.9999025240093045e-05, "loss": 1.0034, "step": 260 }, { "epoch": 0.8421052631578947, "grad_norm": 22.5, "learning_rate": 1.9996101150403543e-05, "loss": 0.7018, "step": 270 }, { "epoch": 0.8732943469785575, "grad_norm": 42.0, "learning_rate": 1.9991228300988586e-05, "loss": 0.6776, "step": 280 }, { "epoch": 0.9044834307992202, "grad_norm": 29.0, "learning_rate": 1.9984407641819812e-05, "loss": 0.7348, "step": 290 }, { "epoch": 0.935672514619883, "grad_norm": 42.5, "learning_rate": 1.9975640502598243e-05, "loss": 0.6959, "step": 300 }, { "epoch": 0.9668615984405458, "grad_norm": 45.75, "learning_rate": 1.9964928592495046e-05, "loss": 0.7085, "step": 310 }, { "epoch": 0.9980506822612085, "grad_norm": 38.0, "learning_rate": 1.9952273999818312e-05, "loss": 0.6367, "step": 320 }, { "epoch": 1.0300194931773878, "grad_norm": 44.0, "learning_rate": 1.9937679191605964e-05, "loss": 0.6069, "step": 330 }, { "epoch": 1.0612085769980506, "grad_norm": 17.625, "learning_rate": 1.9921147013144782e-05, "loss": 0.6435, "step": 340 }, { "epoch": 1.0923976608187134, "grad_norm": 26.875, "learning_rate": 1.9902680687415704e-05, "loss": 0.5493, "step": 350 }, { "epoch": 1.1235867446393761, "grad_norm": 33.5, "learning_rate": 1.988228381446553e-05, "loss": 0.5624, "step": 360 }, { "epoch": 1.154775828460039, "grad_norm": 40.75, "learning_rate": 1.985996037070505e-05, "loss": 0.5981, "step": 370 }, { "epoch": 1.1859649122807017, "grad_norm": 17.125, "learning_rate": 1.983571470813386e-05, "loss": 0.656, "step": 380 }, { "epoch": 1.2171539961013644, "grad_norm": 21.5, "learning_rate": 1.9809551553491918e-05, "loss": 0.4947, "step": 390 }, { "epoch": 1.2483430799220272, "grad_norm": 29.0, "learning_rate": 1.9781476007338058e-05, "loss": 0.562, "step": 400 }, { "epoch": 1.2795321637426902, "grad_norm": 8.1875, "learning_rate": 1.9751493543055634e-05, "loss": 0.4717, "step": 410 }, { "epoch": 1.310721247563353, "grad_norm": 41.5, "learning_rate": 1.9719610005785466e-05, "loss": 0.5865, "step": 420 }, { "epoch": 1.3419103313840157, "grad_norm": 23.375, "learning_rate": 1.9685831611286312e-05, "loss": 0.4994, "step": 430 }, { "epoch": 1.3730994152046785, "grad_norm": 18.375, "learning_rate": 1.9650164944723116e-05, "loss": 0.4635, "step": 440 }, { "epoch": 1.4042884990253413, "grad_norm": 25.0, "learning_rate": 1.961261695938319e-05, "loss": 0.5149, "step": 450 }, { "epoch": 1.435477582846004, "grad_norm": 37.5, "learning_rate": 1.9573194975320672e-05, "loss": 0.5077, "step": 460 }, { "epoch": 1.4666666666666668, "grad_norm": 21.0, "learning_rate": 1.9531906677929472e-05, "loss": 0.5361, "step": 470 }, { "epoch": 1.4978557504873296, "grad_norm": 11.3125, "learning_rate": 1.9488760116444966e-05, "loss": 0.5036, "step": 480 }, { "epoch": 1.5290448343079923, "grad_norm": 39.75, "learning_rate": 1.944376370237481e-05, "loss": 0.4732, "step": 490 }, { "epoch": 1.560233918128655, "grad_norm": 17.5, "learning_rate": 1.9396926207859085e-05, "loss": 0.4622, "step": 500 }, { "epoch": 1.5914230019493179, "grad_norm": 13.9375, "learning_rate": 1.9348256763960146e-05, "loss": 0.4519, "step": 510 }, { "epoch": 1.6226120857699806, "grad_norm": 21.375, "learning_rate": 1.9297764858882516e-05, "loss": 0.5471, "step": 520 }, { "epoch": 1.6538011695906434, "grad_norm": 34.25, "learning_rate": 1.9245460336123136e-05, "loss": 0.4767, "step": 530 }, { "epoch": 1.6849902534113061, "grad_norm": 23.625, "learning_rate": 1.9191353392552346e-05, "loss": 0.4921, "step": 540 }, { "epoch": 1.716179337231969, "grad_norm": 21.125, "learning_rate": 1.913545457642601e-05, "loss": 0.4537, "step": 550 }, { "epoch": 1.7473684210526317, "grad_norm": 28.5, "learning_rate": 1.907777478532909e-05, "loss": 0.466, "step": 560 }, { "epoch": 1.7785575048732944, "grad_norm": 15.375, "learning_rate": 1.901832526405114e-05, "loss": 0.4803, "step": 570 }, { "epoch": 1.8097465886939572, "grad_norm": 12.625, "learning_rate": 1.895711760239413e-05, "loss": 0.4508, "step": 580 }, { "epoch": 1.84093567251462, "grad_norm": 11.25, "learning_rate": 1.889416373291298e-05, "loss": 0.4905, "step": 590 }, { "epoch": 1.8721247563352827, "grad_norm": 11.375, "learning_rate": 1.8829475928589272e-05, "loss": 0.4482, "step": 600 }, { "epoch": 1.9033138401559455, "grad_norm": 43.25, "learning_rate": 1.8763066800438638e-05, "loss": 0.4531, "step": 610 }, { "epoch": 1.9345029239766083, "grad_norm": 24.75, "learning_rate": 1.869494929505219e-05, "loss": 0.4268, "step": 620 }, { "epoch": 1.965692007797271, "grad_norm": 49.5, "learning_rate": 1.8625136692072577e-05, "loss": 0.4737, "step": 630 }, { "epoch": 1.9968810916179338, "grad_norm": 22.5, "learning_rate": 1.855364260160507e-05, "loss": 0.487, "step": 640 }, { "epoch": 2.028849902534113, "grad_norm": 15.1875, "learning_rate": 1.848048096156426e-05, "loss": 0.4299, "step": 650 }, { "epoch": 2.0600389863547757, "grad_norm": 8.5, "learning_rate": 1.8405666034956842e-05, "loss": 0.4246, "step": 660 }, { "epoch": 2.0912280701754384, "grad_norm": 7.78125, "learning_rate": 1.8329212407100996e-05, "loss": 0.4017, "step": 670 }, { "epoch": 2.122417153996101, "grad_norm": 23.75, "learning_rate": 1.8251134982782952e-05, "loss": 0.4378, "step": 680 }, { "epoch": 2.153606237816764, "grad_norm": 17.125, "learning_rate": 1.8171448983351284e-05, "loss": 0.4024, "step": 690 }, { "epoch": 2.1847953216374267, "grad_norm": 19.125, "learning_rate": 1.8090169943749477e-05, "loss": 0.395, "step": 700 }, { "epoch": 2.2159844054580895, "grad_norm": 21.125, "learning_rate": 1.8007313709487334e-05, "loss": 0.3597, "step": 710 }, { "epoch": 2.2471734892787523, "grad_norm": 4.6875, "learning_rate": 1.792289643355191e-05, "loss": 0.3673, "step": 720 }, { "epoch": 2.278362573099415, "grad_norm": 8.625, "learning_rate": 1.78369345732584e-05, "loss": 0.3961, "step": 730 }, { "epoch": 2.309551656920078, "grad_norm": 12.875, "learning_rate": 1.7749444887041797e-05, "loss": 0.3944, "step": 740 }, { "epoch": 2.3407407407407406, "grad_norm": 17.375, "learning_rate": 1.766044443118978e-05, "loss": 0.3777, "step": 750 }, { "epoch": 2.3719298245614033, "grad_norm": 12.375, "learning_rate": 1.7569950556517566e-05, "loss": 0.3799, "step": 760 }, { "epoch": 2.403118908382066, "grad_norm": 23.625, "learning_rate": 1.747798090498532e-05, "loss": 0.374, "step": 770 }, { "epoch": 2.434307992202729, "grad_norm": 23.125, "learning_rate": 1.7384553406258842e-05, "loss": 0.3891, "step": 780 }, { "epoch": 2.4654970760233916, "grad_norm": 15.1875, "learning_rate": 1.7289686274214116e-05, "loss": 0.3792, "step": 790 }, { "epoch": 2.4966861598440544, "grad_norm": 34.5, "learning_rate": 1.7193398003386514e-05, "loss": 0.374, "step": 800 }, { "epoch": 2.5278752436647176, "grad_norm": 15.875, "learning_rate": 1.709570736536521e-05, "loss": 0.3616, "step": 810 }, { "epoch": 2.5590643274853804, "grad_norm": 5.09375, "learning_rate": 1.6996633405133656e-05, "loss": 0.3745, "step": 820 }, { "epoch": 2.590253411306043, "grad_norm": 20.125, "learning_rate": 1.68961954373567e-05, "loss": 0.3756, "step": 830 }, { "epoch": 2.621442495126706, "grad_norm": 17.875, "learning_rate": 1.6794413042615168e-05, "loss": 0.3736, "step": 840 }, { "epoch": 2.6526315789473687, "grad_norm": 7.84375, "learning_rate": 1.6691306063588583e-05, "loss": 0.3827, "step": 850 }, { "epoch": 2.6838206627680314, "grad_norm": 16.625, "learning_rate": 1.6586894601186804e-05, "loss": 0.376, "step": 860 }, { "epoch": 2.715009746588694, "grad_norm": 20.125, "learning_rate": 1.6481199010631312e-05, "loss": 0.3706, "step": 870 }, { "epoch": 2.746198830409357, "grad_norm": 13.6875, "learning_rate": 1.63742398974869e-05, "loss": 0.3692, "step": 880 }, { "epoch": 2.7773879142300197, "grad_norm": 10.5625, "learning_rate": 1.6266038113644605e-05, "loss": 0.3232, "step": 890 }, { "epoch": 2.8085769980506825, "grad_norm": 11.75, "learning_rate": 1.6156614753256583e-05, "loss": 0.3514, "step": 900 }, { "epoch": 2.8397660818713453, "grad_norm": 16.625, "learning_rate": 1.6045991148623752e-05, "loss": 0.3483, "step": 910 }, { "epoch": 2.870955165692008, "grad_norm": 6.9375, "learning_rate": 1.5934188866037017e-05, "loss": 0.3546, "step": 920 }, { "epoch": 2.902144249512671, "grad_norm": 28.0, "learning_rate": 1.5821229701572897e-05, "loss": 0.3382, "step": 930 }, { "epoch": 2.9333333333333336, "grad_norm": 4.9375, "learning_rate": 1.570713567684432e-05, "loss": 0.327, "step": 940 }, { "epoch": 2.9645224171539963, "grad_norm": 6.71875, "learning_rate": 1.5591929034707468e-05, "loss": 0.3372, "step": 950 }, { "epoch": 2.995711500974659, "grad_norm": 14.375, "learning_rate": 1.5475632234925505e-05, "loss": 0.3392, "step": 960 }, { "epoch": 3.027680311890838, "grad_norm": 21.0, "learning_rate": 1.5358267949789968e-05, "loss": 0.3177, "step": 970 }, { "epoch": 3.058869395711501, "grad_norm": 17.0, "learning_rate": 1.5239859059700794e-05, "loss": 0.3119, "step": 980 }, { "epoch": 3.0900584795321637, "grad_norm": 8.1875, "learning_rate": 1.5120428648705716e-05, "loss": 0.304, "step": 990 }, { "epoch": 3.1212475633528265, "grad_norm": 10.75, "learning_rate": 1.5000000000000002e-05, "loss": 0.2962, "step": 1000 }, { "epoch": 3.1524366471734893, "grad_norm": 17.25, "learning_rate": 1.4878596591387329e-05, "loss": 0.2975, "step": 1010 }, { "epoch": 3.183625730994152, "grad_norm": 2.25, "learning_rate": 1.4756242090702756e-05, "loss": 0.2932, "step": 1020 }, { "epoch": 3.214814814814815, "grad_norm": 16.75, "learning_rate": 1.463296035119862e-05, "loss": 0.2956, "step": 1030 }, { "epoch": 3.2460038986354776, "grad_norm": 17.75, "learning_rate": 1.4508775406894308e-05, "loss": 0.2861, "step": 1040 }, { "epoch": 3.2771929824561403, "grad_norm": 15.0, "learning_rate": 1.4383711467890776e-05, "loss": 0.2926, "step": 1050 }, { "epoch": 3.308382066276803, "grad_norm": 16.0, "learning_rate": 1.4257792915650728e-05, "loss": 0.3076, "step": 1060 }, { "epoch": 3.339571150097466, "grad_norm": 15.8125, "learning_rate": 1.413104429824542e-05, "loss": 0.2804, "step": 1070 }, { "epoch": 3.3707602339181286, "grad_norm": 3.75, "learning_rate": 1.4003490325568953e-05, "loss": 0.2651, "step": 1080 }, { "epoch": 3.4019493177387914, "grad_norm": 12.75, "learning_rate": 1.3875155864521031e-05, "loss": 0.2951, "step": 1090 }, { "epoch": 3.433138401559454, "grad_norm": 9.5625, "learning_rate": 1.3746065934159123e-05, "loss": 0.2963, "step": 1100 }, { "epoch": 3.464327485380117, "grad_norm": 14.0, "learning_rate": 1.3616245700820922e-05, "loss": 0.2999, "step": 1110 }, { "epoch": 3.4955165692007797, "grad_norm": 9.6875, "learning_rate": 1.3485720473218153e-05, "loss": 0.2867, "step": 1120 }, { "epoch": 3.5267056530214425, "grad_norm": 11.4375, "learning_rate": 1.3354515697502552e-05, "loss": 0.2846, "step": 1130 }, { "epoch": 3.557894736842105, "grad_norm": 10.875, "learning_rate": 1.3222656952305113e-05, "loss": 0.2934, "step": 1140 }, { "epoch": 3.589083820662768, "grad_norm": 12.1875, "learning_rate": 1.3090169943749475e-05, "loss": 0.2819, "step": 1150 }, { "epoch": 3.6202729044834308, "grad_norm": 4.0625, "learning_rate": 1.2957080500440469e-05, "loss": 0.2896, "step": 1160 }, { "epoch": 3.6514619883040935, "grad_norm": 6.90625, "learning_rate": 1.2823414568428767e-05, "loss": 0.2913, "step": 1170 }, { "epoch": 3.6826510721247563, "grad_norm": 7.96875, "learning_rate": 1.2689198206152657e-05, "loss": 0.2945, "step": 1180 }, { "epoch": 3.713840155945419, "grad_norm": 6.25, "learning_rate": 1.2554457579357906e-05, "loss": 0.2989, "step": 1190 }, { "epoch": 3.745029239766082, "grad_norm": 4.9375, "learning_rate": 1.2419218955996677e-05, "loss": 0.3023, "step": 1200 }, { "epoch": 3.7762183235867446, "grad_norm": 14.0, "learning_rate": 1.2283508701106559e-05, "loss": 0.2751, "step": 1210 }, { "epoch": 3.8074074074074074, "grad_norm": 11.0625, "learning_rate": 1.2147353271670634e-05, "loss": 0.2741, "step": 1220 }, { "epoch": 3.83859649122807, "grad_norm": 6.875, "learning_rate": 1.2010779211459649e-05, "loss": 0.2744, "step": 1230 }, { "epoch": 3.869785575048733, "grad_norm": 4.71875, "learning_rate": 1.187381314585725e-05, "loss": 0.283, "step": 1240 }, { "epoch": 3.9009746588693957, "grad_norm": 6.78125, "learning_rate": 1.1736481776669307e-05, "loss": 0.2737, "step": 1250 }, { "epoch": 3.9321637426900584, "grad_norm": 4.625, "learning_rate": 1.159881187691835e-05, "loss": 0.2486, "step": 1260 }, { "epoch": 3.963352826510721, "grad_norm": 6.625, "learning_rate": 1.1460830285624119e-05, "loss": 0.2598, "step": 1270 }, { "epoch": 3.994541910331384, "grad_norm": 9.6875, "learning_rate": 1.1322563902571227e-05, "loss": 0.2669, "step": 1280 }, { "epoch": 4.026510721247563, "grad_norm": 7.46875, "learning_rate": 1.1184039683065014e-05, "loss": 0.2389, "step": 1290 }, { "epoch": 4.057699805068226, "grad_norm": 6.46875, "learning_rate": 1.1045284632676535e-05, "loss": 0.2191, "step": 1300 }, { "epoch": 4.088888888888889, "grad_norm": 6.25, "learning_rate": 1.0906325801977804e-05, "loss": 0.2313, "step": 1310 }, { "epoch": 4.120077972709551, "grad_norm": 3.5, "learning_rate": 1.0767190281268187e-05, "loss": 0.2376, "step": 1320 }, { "epoch": 4.151267056530214, "grad_norm": 6.125, "learning_rate": 1.0627905195293135e-05, "loss": 0.2448, "step": 1330 }, { "epoch": 4.182456140350877, "grad_norm": 7.5, "learning_rate": 1.0488497697956134e-05, "loss": 0.2349, "step": 1340 }, { "epoch": 4.21364522417154, "grad_norm": 7.78125, "learning_rate": 1.0348994967025012e-05, "loss": 0.2252, "step": 1350 }, { "epoch": 4.244834307992202, "grad_norm": 2.578125, "learning_rate": 1.0209424198833571e-05, "loss": 0.2309, "step": 1360 }, { "epoch": 4.276023391812865, "grad_norm": 8.0, "learning_rate": 1.0069812602979617e-05, "loss": 0.2137, "step": 1370 }, { "epoch": 4.307212475633528, "grad_norm": 4.78125, "learning_rate": 9.930187397020385e-06, "loss": 0.2394, "step": 1380 }, { "epoch": 4.338401559454191, "grad_norm": 8.4375, "learning_rate": 9.790575801166432e-06, "loss": 0.2256, "step": 1390 }, { "epoch": 4.3695906432748535, "grad_norm": 5.4375, "learning_rate": 9.651005032974994e-06, "loss": 0.2273, "step": 1400 }, { "epoch": 4.400779727095516, "grad_norm": 5.25, "learning_rate": 9.511502302043867e-06, "loss": 0.2177, "step": 1410 }, { "epoch": 4.431968810916179, "grad_norm": 4.8125, "learning_rate": 9.372094804706867e-06, "loss": 0.2323, "step": 1420 }, { "epoch": 4.463157894736842, "grad_norm": 2.34375, "learning_rate": 9.232809718731815e-06, "loss": 0.2425, "step": 1430 }, { "epoch": 4.4943469785575045, "grad_norm": 6.53125, "learning_rate": 9.093674198022201e-06, "loss": 0.2248, "step": 1440 }, { "epoch": 4.525536062378167, "grad_norm": 2.671875, "learning_rate": 8.954715367323468e-06, "loss": 0.2083, "step": 1450 }, { "epoch": 4.55672514619883, "grad_norm": 5.15625, "learning_rate": 8.815960316934991e-06, "loss": 0.2291, "step": 1460 }, { "epoch": 4.587914230019493, "grad_norm": 7.28125, "learning_rate": 8.677436097428775e-06, "loss": 0.2425, "step": 1470 }, { "epoch": 4.619103313840156, "grad_norm": 3.546875, "learning_rate": 8.539169714375885e-06, "loss": 0.2285, "step": 1480 }, { "epoch": 4.650292397660818, "grad_norm": 6.25, "learning_rate": 8.401188123081653e-06, "loss": 0.2291, "step": 1490 }, { "epoch": 4.681481481481481, "grad_norm": 5.46875, "learning_rate": 8.263518223330698e-06, "loss": 0.2253, "step": 1500 }, { "epoch": 4.712670565302144, "grad_norm": 3.03125, "learning_rate": 8.126186854142752e-06, "loss": 0.2278, "step": 1510 }, { "epoch": 4.743859649122807, "grad_norm": 2.109375, "learning_rate": 7.989220788540356e-06, "loss": 0.2349, "step": 1520 }, { "epoch": 4.775048732943469, "grad_norm": 2.234375, "learning_rate": 7.852646728329368e-06, "loss": 0.2424, "step": 1530 }, { "epoch": 4.806237816764132, "grad_norm": 4.84375, "learning_rate": 7.716491298893443e-06, "loss": 0.2168, "step": 1540 }, { "epoch": 4.837426900584795, "grad_norm": 4.75, "learning_rate": 7.580781044003324e-06, "loss": 0.2063, "step": 1550 }, { "epoch": 4.868615984405458, "grad_norm": 2.578125, "learning_rate": 7.445542420642097e-06, "loss": 0.2291, "step": 1560 }, { "epoch": 4.8998050682261205, "grad_norm": 2.453125, "learning_rate": 7.310801793847344e-06, "loss": 0.2117, "step": 1570 }, { "epoch": 4.930994152046783, "grad_norm": 2.640625, "learning_rate": 7.176585431571235e-06, "loss": 0.2152, "step": 1580 }, { "epoch": 4.962183235867446, "grad_norm": 6.90625, "learning_rate": 7.042919499559538e-06, "loss": 0.2092, "step": 1590 }, { "epoch": 4.993372319688109, "grad_norm": 9.0625, "learning_rate": 6.909830056250527e-06, "loss": 0.2142, "step": 1600 }, { "epoch": 5.025341130604288, "grad_norm": 4.53125, "learning_rate": 6.777343047694891e-06, "loss": 0.1957, "step": 1610 }, { "epoch": 5.056530214424951, "grad_norm": 2.984375, "learning_rate": 6.645484302497452e-06, "loss": 0.1895, "step": 1620 }, { "epoch": 5.087719298245614, "grad_norm": 2.671875, "learning_rate": 6.5142795267818505e-06, "loss": 0.1966, "step": 1630 }, { "epoch": 5.118908382066277, "grad_norm": 4.25, "learning_rate": 6.383754299179079e-06, "loss": 0.1837, "step": 1640 }, { "epoch": 5.150097465886939, "grad_norm": 3.359375, "learning_rate": 6.25393406584088e-06, "loss": 0.1886, "step": 1650 }, { "epoch": 5.181286549707602, "grad_norm": 4.46875, "learning_rate": 6.124844135478971e-06, "loss": 0.2011, "step": 1660 }, { "epoch": 5.212475633528265, "grad_norm": 6.03125, "learning_rate": 5.996509674431053e-06, "loss": 0.194, "step": 1670 }, { "epoch": 5.243664717348928, "grad_norm": 3.875, "learning_rate": 5.868955701754584e-06, "loss": 0.1921, "step": 1680 }, { "epoch": 5.2748538011695905, "grad_norm": 1.390625, "learning_rate": 5.742207084349274e-06, "loss": 0.1889, "step": 1690 }, { "epoch": 5.306042884990253, "grad_norm": 4.5, "learning_rate": 5.616288532109225e-06, "loss": 0.2059, "step": 1700 }, { "epoch": 5.337231968810916, "grad_norm": 2.671875, "learning_rate": 5.491224593105695e-06, "loss": 0.1957, "step": 1710 }, { "epoch": 5.368421052631579, "grad_norm": 3.875, "learning_rate": 5.367039648801386e-06, "loss": 0.201, "step": 1720 }, { "epoch": 5.3996101364522415, "grad_norm": 2.4375, "learning_rate": 5.243757909297247e-06, "loss": 0.2037, "step": 1730 }, { "epoch": 5.430799220272904, "grad_norm": 2.859375, "learning_rate": 5.121403408612672e-06, "loss": 0.1872, "step": 1740 }, { "epoch": 5.461988304093567, "grad_norm": 1.640625, "learning_rate": 5.000000000000003e-06, "loss": 0.182, "step": 1750 }, { "epoch": 5.49317738791423, "grad_norm": 1.984375, "learning_rate": 4.879571351294287e-06, "loss": 0.1959, "step": 1760 }, { "epoch": 5.524366471734893, "grad_norm": 1.328125, "learning_rate": 4.76014094029921e-06, "loss": 0.1848, "step": 1770 }, { "epoch": 5.555555555555555, "grad_norm": 1.65625, "learning_rate": 4.641732050210032e-06, "loss": 0.1791, "step": 1780 }, { "epoch": 5.586744639376218, "grad_norm": 2.734375, "learning_rate": 4.524367765074499e-06, "loss": 0.1885, "step": 1790 }, { "epoch": 5.617933723196881, "grad_norm": 4.5, "learning_rate": 4.408070965292534e-06, "loss": 0.2202, "step": 1800 }, { "epoch": 5.649122807017544, "grad_norm": 1.65625, "learning_rate": 4.292864323155684e-06, "loss": 0.1871, "step": 1810 }, { "epoch": 5.680311890838206, "grad_norm": 1.28125, "learning_rate": 4.178770298427107e-06, "loss": 0.1998, "step": 1820 }, { "epoch": 5.711500974658869, "grad_norm": 3.359375, "learning_rate": 4.065811133962987e-06, "loss": 0.1949, "step": 1830 }, { "epoch": 5.742690058479532, "grad_norm": 1.8203125, "learning_rate": 3.954008851376252e-06, "loss": 0.1873, "step": 1840 }, { "epoch": 5.773879142300195, "grad_norm": 1.359375, "learning_rate": 3.8433852467434175e-06, "loss": 0.1836, "step": 1850 }, { "epoch": 5.8050682261208575, "grad_norm": 2.375, "learning_rate": 3.7339618863553983e-06, "loss": 0.1755, "step": 1860 }, { "epoch": 5.83625730994152, "grad_norm": 1.5703125, "learning_rate": 3.625760102513103e-06, "loss": 0.1909, "step": 1870 }, { "epoch": 5.867446393762183, "grad_norm": 1.84375, "learning_rate": 3.5188009893686916e-06, "loss": 0.1815, "step": 1880 }, { "epoch": 5.898635477582846, "grad_norm": 1.703125, "learning_rate": 3.4131053988131947e-06, "loss": 0.1897, "step": 1890 }, { "epoch": 5.9298245614035086, "grad_norm": 1.2265625, "learning_rate": 3.308693936411421e-06, "loss": 0.183, "step": 1900 }, { "epoch": 5.961013645224171, "grad_norm": 1.2109375, "learning_rate": 3.2055869573848374e-06, "loss": 0.1865, "step": 1910 }, { "epoch": 5.992202729044834, "grad_norm": 1.4921875, "learning_rate": 3.103804562643302e-06, "loss": 0.197, "step": 1920 }, { "epoch": 6.024171539961014, "grad_norm": 0.9921875, "learning_rate": 3.003366594866345e-06, "loss": 0.1904, "step": 1930 }, { "epoch": 6.055360623781676, "grad_norm": 1.46875, "learning_rate": 2.9042926346347932e-06, "loss": 0.178, "step": 1940 }, { "epoch": 6.086549707602339, "grad_norm": 1.734375, "learning_rate": 2.8066019966134907e-06, "loss": 0.1877, "step": 1950 }, { "epoch": 6.117738791423002, "grad_norm": 1.0546875, "learning_rate": 2.7103137257858867e-06, "loss": 0.1877, "step": 1960 }, { "epoch": 6.148927875243665, "grad_norm": 1.5390625, "learning_rate": 2.615446593741161e-06, "loss": 0.185, "step": 1970 }, { "epoch": 6.1801169590643275, "grad_norm": 1.5, "learning_rate": 2.522019095014683e-06, "loss": 0.1695, "step": 1980 }, { "epoch": 6.21130604288499, "grad_norm": 0.90234375, "learning_rate": 2.4300494434824373e-06, "loss": 0.1652, "step": 1990 }, { "epoch": 6.242495126705653, "grad_norm": 1.7734375, "learning_rate": 2.339555568810221e-06, "loss": 0.1775, "step": 2000 }, { "epoch": 6.273684210526316, "grad_norm": 3.609375, "learning_rate": 2.2505551129582047e-06, "loss": 0.1856, "step": 2010 }, { "epoch": 6.3048732943469785, "grad_norm": 2.640625, "learning_rate": 2.163065426741603e-06, "loss": 0.1714, "step": 2020 }, { "epoch": 6.336062378167641, "grad_norm": 1.125, "learning_rate": 2.0771035664480944e-06, "loss": 0.1763, "step": 2030 }, { "epoch": 6.367251461988304, "grad_norm": 1.4375, "learning_rate": 1.9926862905126663e-06, "loss": 0.178, "step": 2040 }, { "epoch": 6.398440545808967, "grad_norm": 1.09375, "learning_rate": 1.9098300562505266e-06, "loss": 0.1843, "step": 2050 }, { "epoch": 6.42962962962963, "grad_norm": 2.78125, "learning_rate": 1.8285510166487154e-06, "loss": 0.183, "step": 2060 }, { "epoch": 6.460818713450292, "grad_norm": 0.98046875, "learning_rate": 1.7488650172170496e-06, "loss": 0.1708, "step": 2070 }, { "epoch": 6.492007797270955, "grad_norm": 1.453125, "learning_rate": 1.6707875928990059e-06, "loss": 0.179, "step": 2080 }, { "epoch": 6.523196881091618, "grad_norm": 1.0703125, "learning_rate": 1.5943339650431578e-06, "loss": 0.1779, "step": 2090 }, { "epoch": 6.554385964912281, "grad_norm": 0.921875, "learning_rate": 1.5195190384357405e-06, "loss": 0.1668, "step": 2100 }, { "epoch": 6.585575048732943, "grad_norm": 1.2890625, "learning_rate": 1.446357398394934e-06, "loss": 0.1707, "step": 2110 }, { "epoch": 6.616764132553606, "grad_norm": 1.015625, "learning_rate": 1.3748633079274254e-06, "loss": 0.1726, "step": 2120 }, { "epoch": 6.647953216374269, "grad_norm": 1.5, "learning_rate": 1.30505070494781e-06, "loss": 0.1827, "step": 2130 }, { "epoch": 6.679142300194932, "grad_norm": 2.25, "learning_rate": 1.2369331995613664e-06, "loss": 0.1832, "step": 2140 }, { "epoch": 6.7103313840155945, "grad_norm": 1.5078125, "learning_rate": 1.1705240714107301e-06, "loss": 0.1884, "step": 2150 }, { "epoch": 6.741520467836257, "grad_norm": 1.9921875, "learning_rate": 1.1058362670870248e-06, "loss": 0.1884, "step": 2160 }, { "epoch": 6.77270955165692, "grad_norm": 1.0703125, "learning_rate": 1.042882397605871e-06, "loss": 0.1829, "step": 2170 }, { "epoch": 6.803898635477583, "grad_norm": 2.875, "learning_rate": 9.816747359488632e-07, "loss": 0.1791, "step": 2180 }, { "epoch": 6.8350877192982455, "grad_norm": 1.2578125, "learning_rate": 9.222252146709143e-07, "loss": 0.1748, "step": 2190 }, { "epoch": 6.866276803118908, "grad_norm": 1.171875, "learning_rate": 8.645454235739903e-07, "loss": 0.1666, "step": 2200 }, { "epoch": 6.897465886939571, "grad_norm": 2.546875, "learning_rate": 8.086466074476562e-07, "loss": 0.161, "step": 2210 }, { "epoch": 6.928654970760234, "grad_norm": 1.09375, "learning_rate": 7.545396638768698e-07, "loss": 0.1811, "step": 2220 }, { "epoch": 6.959844054580897, "grad_norm": 2.28125, "learning_rate": 7.022351411174866e-07, "loss": 0.1817, "step": 2230 }, { "epoch": 6.991033138401559, "grad_norm": 1.921875, "learning_rate": 6.517432360398556e-07, "loss": 0.1775, "step": 2240 }, { "epoch": 7.023001949317739, "grad_norm": 1.21875, "learning_rate": 6.030737921409169e-07, "loss": 0.1643, "step": 2250 }, { "epoch": 7.054191033138402, "grad_norm": 1.296875, "learning_rate": 5.562362976251901e-07, "loss": 0.1754, "step": 2260 }, { "epoch": 7.0853801169590644, "grad_norm": 1.25, "learning_rate": 5.112398835550348e-07, "loss": 0.1782, "step": 2270 }, { "epoch": 7.116569200779727, "grad_norm": 1.515625, "learning_rate": 4.6809332207053083e-07, "loss": 0.1734, "step": 2280 }, { "epoch": 7.14775828460039, "grad_norm": 1.0625, "learning_rate": 4.268050246793276e-07, "loss": 0.1805, "step": 2290 }, { "epoch": 7.178947368421053, "grad_norm": 1.484375, "learning_rate": 3.8738304061681107e-07, "loss": 0.176, "step": 2300 }, { "epoch": 7.2101364522417155, "grad_norm": 1.40625, "learning_rate": 3.498350552768859e-07, "loss": 0.1778, "step": 2310 }, { "epoch": 7.241325536062378, "grad_norm": 1.953125, "learning_rate": 3.1416838871368925e-07, "loss": 0.1722, "step": 2320 }, { "epoch": 7.272514619883041, "grad_norm": 1.4296875, "learning_rate": 2.8038999421453827e-07, "loss": 0.1726, "step": 2330 }, { "epoch": 7.303703703703704, "grad_norm": 1.7265625, "learning_rate": 2.4850645694436736e-07, "loss": 0.1867, "step": 2340 }, { "epoch": 7.334892787524367, "grad_norm": 1.0078125, "learning_rate": 2.1852399266194312e-07, "loss": 0.1828, "step": 2350 }, { "epoch": 7.366081871345029, "grad_norm": 1.4609375, "learning_rate": 1.9044844650808468e-07, "loss": 0.1706, "step": 2360 }, { "epoch": 7.397270955165692, "grad_norm": 0.9765625, "learning_rate": 1.6428529186614195e-07, "loss": 0.18, "step": 2370 }, { "epoch": 7.428460038986355, "grad_norm": 0.99609375, "learning_rate": 1.400396292949513e-07, "loss": 0.1686, "step": 2380 }, { "epoch": 7.459649122807018, "grad_norm": 1.3828125, "learning_rate": 1.1771618553447217e-07, "loss": 0.1755, "step": 2390 }, { "epoch": 7.49083820662768, "grad_norm": 1.2890625, "learning_rate": 9.731931258429638e-08, "loss": 0.1805, "step": 2400 }, { "epoch": 7.522027290448343, "grad_norm": 1.390625, "learning_rate": 7.885298685522235e-08, "loss": 0.1749, "step": 2410 }, { "epoch": 7.553216374269006, "grad_norm": 1.0390625, "learning_rate": 6.232080839403631e-08, "loss": 0.1756, "step": 2420 }, { "epoch": 7.584405458089669, "grad_norm": 0.91015625, "learning_rate": 4.772600018168816e-08, "loss": 0.1692, "step": 2430 }, { "epoch": 7.6155945419103315, "grad_norm": 1.5625, "learning_rate": 3.50714075049563e-08, "loss": 0.1676, "step": 2440 }, { "epoch": 7.646783625730994, "grad_norm": 2.953125, "learning_rate": 2.4359497401758026e-08, "loss": 0.1811, "step": 2450 }, { "epoch": 7.677972709551657, "grad_norm": 1.3046875, "learning_rate": 1.5592358180189782e-08, "loss": 0.18, "step": 2460 }, { "epoch": 7.70916179337232, "grad_norm": 1.4453125, "learning_rate": 8.771699011416169e-09, "loss": 0.1801, "step": 2470 }, { "epoch": 7.7403508771929825, "grad_norm": 1.0625, "learning_rate": 3.898849596456477e-09, "loss": 0.1835, "step": 2480 }, { "epoch": 7.771539961013645, "grad_norm": 1.4140625, "learning_rate": 9.74759906957612e-10, "loss": 0.1731, "step": 2490 }, { "epoch": 7.802729044834308, "grad_norm": 1.59375, "learning_rate": 0.0, "loss": 0.1847, "step": 2500 } ], "logging_steps": 10, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.5664155394048e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }