{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 13.8996138996139, |
|
"eval_steps": 500, |
|
"global_step": 3600, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.019305019305019305, |
|
"grad_norm": 11.880151748657227, |
|
"learning_rate": 4.999988505504495e-05, |
|
"loss": 1.2597, |
|
"num_input_tokens_seen": 133888, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.03861003861003861, |
|
"grad_norm": 11.379286766052246, |
|
"learning_rate": 4.999954022123679e-05, |
|
"loss": 1.1508, |
|
"num_input_tokens_seen": 234752, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.05791505791505792, |
|
"grad_norm": 9.279751777648926, |
|
"learning_rate": 4.999896550174647e-05, |
|
"loss": 1.4957, |
|
"num_input_tokens_seen": 388352, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.07722007722007722, |
|
"grad_norm": 4.549979209899902, |
|
"learning_rate": 4.999816090185888e-05, |
|
"loss": 1.1843, |
|
"num_input_tokens_seen": 542720, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.09652509652509653, |
|
"grad_norm": 5.948971748352051, |
|
"learning_rate": 4.999712642897278e-05, |
|
"loss": 1.1623, |
|
"num_input_tokens_seen": 642048, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.11583011583011583, |
|
"grad_norm": 6.370548725128174, |
|
"learning_rate": 4.9995862092600796e-05, |
|
"loss": 1.2511, |
|
"num_input_tokens_seen": 761344, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.13513513513513514, |
|
"grad_norm": 4.528784275054932, |
|
"learning_rate": 4.999436790436924e-05, |
|
"loss": 1.0512, |
|
"num_input_tokens_seen": 905984, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.15444015444015444, |
|
"grad_norm": 4.6348114013671875, |
|
"learning_rate": 4.9992643878018054e-05, |
|
"loss": 1.2519, |
|
"num_input_tokens_seen": 1040896, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.17374517374517376, |
|
"grad_norm": 5.525028705596924, |
|
"learning_rate": 4.99906900294007e-05, |
|
"loss": 1.2093, |
|
"num_input_tokens_seen": 1163776, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.19305019305019305, |
|
"grad_norm": 5.8078718185424805, |
|
"learning_rate": 4.998850637648398e-05, |
|
"loss": 1.3795, |
|
"num_input_tokens_seen": 1277440, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.21235521235521235, |
|
"grad_norm": 6.931970596313477, |
|
"learning_rate": 4.998609293934789e-05, |
|
"loss": 1.1255, |
|
"num_input_tokens_seen": 1412352, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.23166023166023167, |
|
"grad_norm": 5.5319976806640625, |
|
"learning_rate": 4.99834497401854e-05, |
|
"loss": 1.1894, |
|
"num_input_tokens_seen": 1551360, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.25096525096525096, |
|
"grad_norm": 5.610212326049805, |
|
"learning_rate": 4.998057680330233e-05, |
|
"loss": 1.2108, |
|
"num_input_tokens_seen": 1715968, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.2702702702702703, |
|
"grad_norm": 5.512667179107666, |
|
"learning_rate": 4.9977474155117045e-05, |
|
"loss": 1.1484, |
|
"num_input_tokens_seen": 1836800, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.28957528957528955, |
|
"grad_norm": 5.64188289642334, |
|
"learning_rate": 4.9974141824160224e-05, |
|
"loss": 1.0656, |
|
"num_input_tokens_seen": 1982720, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.3088803088803089, |
|
"grad_norm": 5.340020179748535, |
|
"learning_rate": 4.997057984107466e-05, |
|
"loss": 1.0068, |
|
"num_input_tokens_seen": 2115840, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.3281853281853282, |
|
"grad_norm": 5.499743938446045, |
|
"learning_rate": 4.9966788238614905e-05, |
|
"loss": 1.142, |
|
"num_input_tokens_seen": 2244608, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.3474903474903475, |
|
"grad_norm": 5.120875835418701, |
|
"learning_rate": 4.9962767051647006e-05, |
|
"loss": 1.2448, |
|
"num_input_tokens_seen": 2355200, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3667953667953668, |
|
"grad_norm": 5.501336574554443, |
|
"learning_rate": 4.995851631714816e-05, |
|
"loss": 1.0841, |
|
"num_input_tokens_seen": 2491392, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.3861003861003861, |
|
"grad_norm": 5.672695636749268, |
|
"learning_rate": 4.995403607420644e-05, |
|
"loss": 1.079, |
|
"num_input_tokens_seen": 2605568, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.40540540540540543, |
|
"grad_norm": 4.757706165313721, |
|
"learning_rate": 4.9949326364020314e-05, |
|
"loss": 1.2072, |
|
"num_input_tokens_seen": 2754560, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.4247104247104247, |
|
"grad_norm": 5.0469489097595215, |
|
"learning_rate": 4.99443872298984e-05, |
|
"loss": 1.388, |
|
"num_input_tokens_seen": 2893312, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.444015444015444, |
|
"grad_norm": 4.751271724700928, |
|
"learning_rate": 4.9939218717258976e-05, |
|
"loss": 1.1421, |
|
"num_input_tokens_seen": 3045632, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.46332046332046334, |
|
"grad_norm": 4.551059722900391, |
|
"learning_rate": 4.993382087362959e-05, |
|
"loss": 1.1926, |
|
"num_input_tokens_seen": 3194624, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4826254826254826, |
|
"grad_norm": 4.361262321472168, |
|
"learning_rate": 4.992819374864665e-05, |
|
"loss": 1.3206, |
|
"num_input_tokens_seen": 3323904, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.5019305019305019, |
|
"grad_norm": 4.295720100402832, |
|
"learning_rate": 4.992233739405492e-05, |
|
"loss": 1.3139, |
|
"num_input_tokens_seen": 3435008, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.5212355212355212, |
|
"grad_norm": 9.746088027954102, |
|
"learning_rate": 4.9916251863707056e-05, |
|
"loss": 1.2343, |
|
"num_input_tokens_seen": 3552768, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.5405405405405406, |
|
"grad_norm": 5.580798625946045, |
|
"learning_rate": 4.9909937213563165e-05, |
|
"loss": 1.2867, |
|
"num_input_tokens_seen": 3655936, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.5598455598455598, |
|
"grad_norm": 3.9572153091430664, |
|
"learning_rate": 4.990339350169021e-05, |
|
"loss": 1.2402, |
|
"num_input_tokens_seen": 3779328, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.5791505791505791, |
|
"grad_norm": 4.077531337738037, |
|
"learning_rate": 4.989662078826152e-05, |
|
"loss": 1.1501, |
|
"num_input_tokens_seen": 3911168, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5984555984555985, |
|
"grad_norm": 3.8401482105255127, |
|
"learning_rate": 4.988961913555623e-05, |
|
"loss": 1.0678, |
|
"num_input_tokens_seen": 4035328, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.6177606177606177, |
|
"grad_norm": 4.454199314117432, |
|
"learning_rate": 4.988238860795873e-05, |
|
"loss": 1.1792, |
|
"num_input_tokens_seen": 4161792, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.637065637065637, |
|
"grad_norm": 4.723377704620361, |
|
"learning_rate": 4.9874929271958026e-05, |
|
"loss": 1.2038, |
|
"num_input_tokens_seen": 4281088, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.6563706563706564, |
|
"grad_norm": 3.8252053260803223, |
|
"learning_rate": 4.986724119614715e-05, |
|
"loss": 1.21, |
|
"num_input_tokens_seen": 4445952, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.6756756756756757, |
|
"grad_norm": 5.56161642074585, |
|
"learning_rate": 4.985932445122257e-05, |
|
"loss": 1.1328, |
|
"num_input_tokens_seen": 4618496, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.694980694980695, |
|
"grad_norm": 4.20256233215332, |
|
"learning_rate": 4.985117910998345e-05, |
|
"loss": 1.0232, |
|
"num_input_tokens_seen": 4784384, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 4.422672271728516, |
|
"learning_rate": 4.984280524733107e-05, |
|
"loss": 1.0596, |
|
"num_input_tokens_seen": 4925440, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.7335907335907336, |
|
"grad_norm": 4.174430847167969, |
|
"learning_rate": 4.983420294026809e-05, |
|
"loss": 1.246, |
|
"num_input_tokens_seen": 5068544, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.752895752895753, |
|
"grad_norm": 5.4516191482543945, |
|
"learning_rate": 4.982537226789786e-05, |
|
"loss": 1.2178, |
|
"num_input_tokens_seen": 5196032, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.7722007722007722, |
|
"grad_norm": 5.097752571105957, |
|
"learning_rate": 4.981631331142368e-05, |
|
"loss": 1.1424, |
|
"num_input_tokens_seen": 5289216, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.7915057915057915, |
|
"grad_norm": 2.8934011459350586, |
|
"learning_rate": 4.9807026154148054e-05, |
|
"loss": 1.1518, |
|
"num_input_tokens_seen": 5451520, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.8108108108108109, |
|
"grad_norm": 5.223976135253906, |
|
"learning_rate": 4.979751088147192e-05, |
|
"loss": 1.1354, |
|
"num_input_tokens_seen": 5612544, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.8301158301158301, |
|
"grad_norm": 4.850491523742676, |
|
"learning_rate": 4.978776758089391e-05, |
|
"loss": 1.1915, |
|
"num_input_tokens_seen": 5757696, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.8494208494208494, |
|
"grad_norm": 6.363794803619385, |
|
"learning_rate": 4.977779634200947e-05, |
|
"loss": 1.2171, |
|
"num_input_tokens_seen": 5895168, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.8687258687258688, |
|
"grad_norm": 4.063634395599365, |
|
"learning_rate": 4.976759725651008e-05, |
|
"loss": 1.1818, |
|
"num_input_tokens_seen": 6004480, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.888030888030888, |
|
"grad_norm": 4.925548553466797, |
|
"learning_rate": 4.975717041818242e-05, |
|
"loss": 1.3625, |
|
"num_input_tokens_seen": 6158080, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.9073359073359073, |
|
"grad_norm": 4.7595977783203125, |
|
"learning_rate": 4.97465159229075e-05, |
|
"loss": 1.1483, |
|
"num_input_tokens_seen": 6270208, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.9266409266409267, |
|
"grad_norm": 4.2403974533081055, |
|
"learning_rate": 4.973563386865974e-05, |
|
"loss": 0.9918, |
|
"num_input_tokens_seen": 6400512, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.9459459459459459, |
|
"grad_norm": 3.330521583557129, |
|
"learning_rate": 4.972452435550614e-05, |
|
"loss": 1.0481, |
|
"num_input_tokens_seen": 6534144, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.9652509652509652, |
|
"grad_norm": 4.355147361755371, |
|
"learning_rate": 4.971318748560527e-05, |
|
"loss": 1.1627, |
|
"num_input_tokens_seen": 6689280, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.9845559845559846, |
|
"grad_norm": 3.845254421234131, |
|
"learning_rate": 4.970162336320644e-05, |
|
"loss": 1.1168, |
|
"num_input_tokens_seen": 6830848, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.0038610038610039, |
|
"grad_norm": 8.39128303527832, |
|
"learning_rate": 4.968983209464863e-05, |
|
"loss": 1.1015, |
|
"num_input_tokens_seen": 6950984, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.0231660231660231, |
|
"grad_norm": 5.653963565826416, |
|
"learning_rate": 4.9677813788359595e-05, |
|
"loss": 0.6397, |
|
"num_input_tokens_seen": 7077192, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.0424710424710424, |
|
"grad_norm": 6.64690637588501, |
|
"learning_rate": 4.9665568554854834e-05, |
|
"loss": 0.6662, |
|
"num_input_tokens_seen": 7194696, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.0617760617760619, |
|
"grad_norm": 3.9684646129608154, |
|
"learning_rate": 4.965309650673656e-05, |
|
"loss": 0.7584, |
|
"num_input_tokens_seen": 7325512, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.0810810810810811, |
|
"grad_norm": 4.258115768432617, |
|
"learning_rate": 4.9640397758692715e-05, |
|
"loss": 0.6332, |
|
"num_input_tokens_seen": 7464776, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.1003861003861004, |
|
"grad_norm": 3.913597822189331, |
|
"learning_rate": 4.962747242749584e-05, |
|
"loss": 0.7527, |
|
"num_input_tokens_seen": 7630664, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.1196911196911197, |
|
"grad_norm": 4.672424793243408, |
|
"learning_rate": 4.961432063200207e-05, |
|
"loss": 0.7667, |
|
"num_input_tokens_seen": 7771976, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.138996138996139, |
|
"grad_norm": 3.960838556289673, |
|
"learning_rate": 4.960094249315002e-05, |
|
"loss": 0.6909, |
|
"num_input_tokens_seen": 7877192, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.1583011583011582, |
|
"grad_norm": 3.6881680488586426, |
|
"learning_rate": 4.9587338133959625e-05, |
|
"loss": 0.611, |
|
"num_input_tokens_seen": 8005448, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.1776061776061777, |
|
"grad_norm": 3.072396993637085, |
|
"learning_rate": 4.957350767953112e-05, |
|
"loss": 0.6579, |
|
"num_input_tokens_seen": 8132168, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.196911196911197, |
|
"grad_norm": 3.8796889781951904, |
|
"learning_rate": 4.9559451257043754e-05, |
|
"loss": 0.6889, |
|
"num_input_tokens_seen": 8265544, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.2162162162162162, |
|
"grad_norm": 3.125763177871704, |
|
"learning_rate": 4.954516899575473e-05, |
|
"loss": 0.5874, |
|
"num_input_tokens_seen": 8390728, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.2355212355212355, |
|
"grad_norm": 5.750874996185303, |
|
"learning_rate": 4.953066102699795e-05, |
|
"loss": 0.632, |
|
"num_input_tokens_seen": 8513864, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.2548262548262548, |
|
"grad_norm": 4.728124141693115, |
|
"learning_rate": 4.951592748418284e-05, |
|
"loss": 0.6939, |
|
"num_input_tokens_seen": 8650824, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.2741312741312742, |
|
"grad_norm": 3.5621907711029053, |
|
"learning_rate": 4.9500968502793125e-05, |
|
"loss": 0.7251, |
|
"num_input_tokens_seen": 8773704, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.2934362934362935, |
|
"grad_norm": 5.006427764892578, |
|
"learning_rate": 4.948578422038555e-05, |
|
"loss": 0.6997, |
|
"num_input_tokens_seen": 8894280, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.3127413127413128, |
|
"grad_norm": 4.801665782928467, |
|
"learning_rate": 4.947037477658865e-05, |
|
"loss": 0.5608, |
|
"num_input_tokens_seen": 9058376, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.332046332046332, |
|
"grad_norm": 5.262493133544922, |
|
"learning_rate": 4.945474031310144e-05, |
|
"loss": 0.6575, |
|
"num_input_tokens_seen": 9163592, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.3513513513513513, |
|
"grad_norm": 3.481433868408203, |
|
"learning_rate": 4.943888097369216e-05, |
|
"loss": 0.7387, |
|
"num_input_tokens_seen": 9288008, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.3706563706563706, |
|
"grad_norm": 4.59250545501709, |
|
"learning_rate": 4.942279690419687e-05, |
|
"loss": 0.7204, |
|
"num_input_tokens_seen": 9382216, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.3899613899613898, |
|
"grad_norm": 4.549391746520996, |
|
"learning_rate": 4.9406488252518203e-05, |
|
"loss": 0.7472, |
|
"num_input_tokens_seen": 9507912, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.4092664092664093, |
|
"grad_norm": 6.062809944152832, |
|
"learning_rate": 4.9389955168623914e-05, |
|
"loss": 0.7806, |
|
"num_input_tokens_seen": 9632584, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 6.251070976257324, |
|
"learning_rate": 4.937319780454559e-05, |
|
"loss": 0.7466, |
|
"num_input_tokens_seen": 9756232, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.4478764478764479, |
|
"grad_norm": 8.911005020141602, |
|
"learning_rate": 4.9356216314377184e-05, |
|
"loss": 0.7491, |
|
"num_input_tokens_seen": 9953096, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.4671814671814671, |
|
"grad_norm": 6.960048675537109, |
|
"learning_rate": 4.933901085427362e-05, |
|
"loss": 0.6792, |
|
"num_input_tokens_seen": 10094408, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.4864864864864864, |
|
"grad_norm": 3.882781505584717, |
|
"learning_rate": 4.9321581582449365e-05, |
|
"loss": 0.652, |
|
"num_input_tokens_seen": 10183496, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.505791505791506, |
|
"grad_norm": 4.664374828338623, |
|
"learning_rate": 4.930392865917698e-05, |
|
"loss": 0.6968, |
|
"num_input_tokens_seen": 10299720, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.525096525096525, |
|
"grad_norm": 3.1740968227386475, |
|
"learning_rate": 4.928605224678561e-05, |
|
"loss": 0.6024, |
|
"num_input_tokens_seen": 10453576, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.5444015444015444, |
|
"grad_norm": 5.918027400970459, |
|
"learning_rate": 4.926795250965952e-05, |
|
"loss": 0.6757, |
|
"num_input_tokens_seen": 10564168, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.5637065637065637, |
|
"grad_norm": 5.5165696144104, |
|
"learning_rate": 4.924962961423662e-05, |
|
"loss": 0.7303, |
|
"num_input_tokens_seen": 10680392, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.583011583011583, |
|
"grad_norm": 4.745772838592529, |
|
"learning_rate": 4.9231083729006825e-05, |
|
"loss": 0.7291, |
|
"num_input_tokens_seen": 10797128, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.6023166023166024, |
|
"grad_norm": 3.4109997749328613, |
|
"learning_rate": 4.921231502451064e-05, |
|
"loss": 0.6801, |
|
"num_input_tokens_seen": 10911048, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.6216216216216215, |
|
"grad_norm": 4.954616069793701, |
|
"learning_rate": 4.9193323673337476e-05, |
|
"loss": 0.766, |
|
"num_input_tokens_seen": 11030088, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.640926640926641, |
|
"grad_norm": 3.547844648361206, |
|
"learning_rate": 4.917410985012414e-05, |
|
"loss": 0.7338, |
|
"num_input_tokens_seen": 11202888, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.6602316602316602, |
|
"grad_norm": 4.377864360809326, |
|
"learning_rate": 4.91546737315532e-05, |
|
"loss": 0.669, |
|
"num_input_tokens_seen": 11353416, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.6795366795366795, |
|
"grad_norm": 4.632313251495361, |
|
"learning_rate": 4.913501549635136e-05, |
|
"loss": 0.7093, |
|
"num_input_tokens_seen": 11472968, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.698841698841699, |
|
"grad_norm": 5.683695316314697, |
|
"learning_rate": 4.91151353252878e-05, |
|
"loss": 0.7509, |
|
"num_input_tokens_seen": 11581512, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.718146718146718, |
|
"grad_norm": 3.844526767730713, |
|
"learning_rate": 4.9095033401172565e-05, |
|
"loss": 0.6265, |
|
"num_input_tokens_seen": 11741512, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.7374517374517375, |
|
"grad_norm": 4.572934150695801, |
|
"learning_rate": 4.907470990885483e-05, |
|
"loss": 0.7188, |
|
"num_input_tokens_seen": 11902536, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.7567567567567568, |
|
"grad_norm": 5.532532215118408, |
|
"learning_rate": 4.9054165035221236e-05, |
|
"loss": 0.7237, |
|
"num_input_tokens_seen": 12052296, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.776061776061776, |
|
"grad_norm": 3.139909505844116, |
|
"learning_rate": 4.9033398969194145e-05, |
|
"loss": 0.6446, |
|
"num_input_tokens_seen": 12253512, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.7953667953667953, |
|
"grad_norm": 3.506171703338623, |
|
"learning_rate": 4.901241190172991e-05, |
|
"loss": 0.6381, |
|
"num_input_tokens_seen": 12343624, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.8146718146718146, |
|
"grad_norm": 4.847969055175781, |
|
"learning_rate": 4.899120402581715e-05, |
|
"loss": 0.7995, |
|
"num_input_tokens_seen": 12481864, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.833976833976834, |
|
"grad_norm": 4.391269207000732, |
|
"learning_rate": 4.8969775536474915e-05, |
|
"loss": 0.7226, |
|
"num_input_tokens_seen": 12638792, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.8532818532818531, |
|
"grad_norm": 3.4892685413360596, |
|
"learning_rate": 4.894812663075095e-05, |
|
"loss": 0.5672, |
|
"num_input_tokens_seen": 12810824, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.8725868725868726, |
|
"grad_norm": 4.9667277336120605, |
|
"learning_rate": 4.8926257507719865e-05, |
|
"loss": 0.607, |
|
"num_input_tokens_seen": 12970312, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.8918918918918919, |
|
"grad_norm": 5.739168167114258, |
|
"learning_rate": 4.890416836848127e-05, |
|
"loss": 0.7329, |
|
"num_input_tokens_seen": 13082696, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.9111969111969112, |
|
"grad_norm": 4.744792938232422, |
|
"learning_rate": 4.8881859416158e-05, |
|
"loss": 0.8327, |
|
"num_input_tokens_seen": 13192008, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.9305019305019306, |
|
"grad_norm": 5.711423873901367, |
|
"learning_rate": 4.8859330855894156e-05, |
|
"loss": 0.6468, |
|
"num_input_tokens_seen": 13347912, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.9498069498069497, |
|
"grad_norm": 3.4863574504852295, |
|
"learning_rate": 4.883658289485329e-05, |
|
"loss": 0.6951, |
|
"num_input_tokens_seen": 13500488, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.9691119691119692, |
|
"grad_norm": 5.2073564529418945, |
|
"learning_rate": 4.881361574221648e-05, |
|
"loss": 0.7812, |
|
"num_input_tokens_seen": 13641800, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.9884169884169884, |
|
"grad_norm": 7.231164455413818, |
|
"learning_rate": 4.8790429609180375e-05, |
|
"loss": 0.8085, |
|
"num_input_tokens_seen": 13757000, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 2.0077220077220077, |
|
"grad_norm": 3.289991617202759, |
|
"learning_rate": 4.876702470895531e-05, |
|
"loss": 0.6533, |
|
"num_input_tokens_seen": 13843120, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.027027027027027, |
|
"grad_norm": 5.630355358123779, |
|
"learning_rate": 4.87434012567633e-05, |
|
"loss": 0.3901, |
|
"num_input_tokens_seen": 14011568, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 2.0463320463320462, |
|
"grad_norm": 11.086012840270996, |
|
"learning_rate": 4.871955946983607e-05, |
|
"loss": 0.344, |
|
"num_input_tokens_seen": 14202800, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.0656370656370657, |
|
"grad_norm": 6.8246259689331055, |
|
"learning_rate": 4.8695499567413086e-05, |
|
"loss": 0.3463, |
|
"num_input_tokens_seen": 14395824, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 2.0849420849420848, |
|
"grad_norm": 3.6909987926483154, |
|
"learning_rate": 4.867122177073948e-05, |
|
"loss": 0.3611, |
|
"num_input_tokens_seen": 14516400, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.1042471042471043, |
|
"grad_norm": 4.255818843841553, |
|
"learning_rate": 4.864672630306408e-05, |
|
"loss": 0.3644, |
|
"num_input_tokens_seen": 14627760, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 2.1235521235521237, |
|
"grad_norm": 6.667994022369385, |
|
"learning_rate": 4.862201338963733e-05, |
|
"loss": 0.3669, |
|
"num_input_tokens_seen": 14724016, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.142857142857143, |
|
"grad_norm": 3.5708060264587402, |
|
"learning_rate": 4.8597083257709194e-05, |
|
"loss": 0.3485, |
|
"num_input_tokens_seen": 14861488, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 2.1621621621621623, |
|
"grad_norm": 3.337921380996704, |
|
"learning_rate": 4.857193613652711e-05, |
|
"loss": 0.3734, |
|
"num_input_tokens_seen": 14983088, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.1814671814671813, |
|
"grad_norm": 6.495638847351074, |
|
"learning_rate": 4.854657225733385e-05, |
|
"loss": 0.3867, |
|
"num_input_tokens_seen": 15077296, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 2.200772200772201, |
|
"grad_norm": 4.791080951690674, |
|
"learning_rate": 4.8520991853365414e-05, |
|
"loss": 0.4532, |
|
"num_input_tokens_seen": 15202736, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.2200772200772203, |
|
"grad_norm": 2.633500814437866, |
|
"learning_rate": 4.849519515984888e-05, |
|
"loss": 0.3848, |
|
"num_input_tokens_seen": 15337392, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 2.2393822393822393, |
|
"grad_norm": 5.552781105041504, |
|
"learning_rate": 4.846918241400022e-05, |
|
"loss": 0.3595, |
|
"num_input_tokens_seen": 15466672, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.258687258687259, |
|
"grad_norm": 4.466195106506348, |
|
"learning_rate": 4.844295385502215e-05, |
|
"loss": 0.434, |
|
"num_input_tokens_seen": 15617456, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 2.277992277992278, |
|
"grad_norm": 5.135293483734131, |
|
"learning_rate": 4.841650972410191e-05, |
|
"loss": 0.4374, |
|
"num_input_tokens_seen": 15723952, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.2972972972972974, |
|
"grad_norm": 4.918574810028076, |
|
"learning_rate": 4.838985026440905e-05, |
|
"loss": 0.3616, |
|
"num_input_tokens_seen": 15867824, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 2.3166023166023164, |
|
"grad_norm": 5.5822224617004395, |
|
"learning_rate": 4.836297572109323e-05, |
|
"loss": 0.3864, |
|
"num_input_tokens_seen": 15970992, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.335907335907336, |
|
"grad_norm": 3.984456777572632, |
|
"learning_rate": 4.833588634128187e-05, |
|
"loss": 0.3897, |
|
"num_input_tokens_seen": 16082864, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 2.3552123552123554, |
|
"grad_norm": 4.147584915161133, |
|
"learning_rate": 4.8308582374077984e-05, |
|
"loss": 0.3853, |
|
"num_input_tokens_seen": 16218032, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.3745173745173744, |
|
"grad_norm": 3.50801682472229, |
|
"learning_rate": 4.8281064070557833e-05, |
|
"loss": 0.4045, |
|
"num_input_tokens_seen": 16359856, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 2.393822393822394, |
|
"grad_norm": 3.857002019882202, |
|
"learning_rate": 4.825333168376864e-05, |
|
"loss": 0.4303, |
|
"num_input_tokens_seen": 16484272, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.413127413127413, |
|
"grad_norm": 3.6967315673828125, |
|
"learning_rate": 4.8225385468726234e-05, |
|
"loss": 0.3753, |
|
"num_input_tokens_seen": 16618928, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 2.4324324324324325, |
|
"grad_norm": 5.856339931488037, |
|
"learning_rate": 4.819722568241274e-05, |
|
"loss": 0.4107, |
|
"num_input_tokens_seen": 16734896, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.4517374517374515, |
|
"grad_norm": 3.909010410308838, |
|
"learning_rate": 4.8168852583774166e-05, |
|
"loss": 0.4277, |
|
"num_input_tokens_seen": 16850096, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 2.471042471042471, |
|
"grad_norm": 3.580345630645752, |
|
"learning_rate": 4.8140266433718095e-05, |
|
"loss": 0.3309, |
|
"num_input_tokens_seen": 16977840, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.4903474903474905, |
|
"grad_norm": 3.9844889640808105, |
|
"learning_rate": 4.811146749511121e-05, |
|
"loss": 0.3572, |
|
"num_input_tokens_seen": 17111984, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 2.5096525096525095, |
|
"grad_norm": 5.305726051330566, |
|
"learning_rate": 4.808245603277695e-05, |
|
"loss": 0.4373, |
|
"num_input_tokens_seen": 17226160, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.528957528957529, |
|
"grad_norm": 3.5857083797454834, |
|
"learning_rate": 4.805323231349299e-05, |
|
"loss": 0.3996, |
|
"num_input_tokens_seen": 17384880, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 2.5482625482625485, |
|
"grad_norm": 4.2644219398498535, |
|
"learning_rate": 4.8023796605988866e-05, |
|
"loss": 0.3748, |
|
"num_input_tokens_seen": 17510064, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.5675675675675675, |
|
"grad_norm": 2.571721076965332, |
|
"learning_rate": 4.799414918094347e-05, |
|
"loss": 0.4303, |
|
"num_input_tokens_seen": 17668784, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 2.586872586872587, |
|
"grad_norm": 3.5632381439208984, |
|
"learning_rate": 4.796429031098255e-05, |
|
"loss": 0.4342, |
|
"num_input_tokens_seen": 17817776, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.606177606177606, |
|
"grad_norm": 3.475538492202759, |
|
"learning_rate": 4.7934220270676237e-05, |
|
"loss": 0.3935, |
|
"num_input_tokens_seen": 17979056, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 2.6254826254826256, |
|
"grad_norm": 3.3301711082458496, |
|
"learning_rate": 4.790393933653646e-05, |
|
"loss": 0.3469, |
|
"num_input_tokens_seen": 18119344, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.6447876447876446, |
|
"grad_norm": 3.781501531600952, |
|
"learning_rate": 4.787344778701449e-05, |
|
"loss": 0.4097, |
|
"num_input_tokens_seen": 18222256, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 2.664092664092664, |
|
"grad_norm": 3.3143699169158936, |
|
"learning_rate": 4.78427459024983e-05, |
|
"loss": 0.4291, |
|
"num_input_tokens_seen": 18360752, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.6833976833976836, |
|
"grad_norm": 4.394753456115723, |
|
"learning_rate": 4.781183396531004e-05, |
|
"loss": 0.4713, |
|
"num_input_tokens_seen": 18491824, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 2.7027027027027026, |
|
"grad_norm": 3.8386504650115967, |
|
"learning_rate": 4.77807122597034e-05, |
|
"loss": 0.3772, |
|
"num_input_tokens_seen": 18634672, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.722007722007722, |
|
"grad_norm": 4.388941764831543, |
|
"learning_rate": 4.774938107186102e-05, |
|
"loss": 0.4281, |
|
"num_input_tokens_seen": 18778544, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 2.741312741312741, |
|
"grad_norm": 6.152458667755127, |
|
"learning_rate": 4.771784068989186e-05, |
|
"loss": 0.358, |
|
"num_input_tokens_seen": 18870960, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.7606177606177607, |
|
"grad_norm": 3.7352821826934814, |
|
"learning_rate": 4.768609140382855e-05, |
|
"loss": 0.3852, |
|
"num_input_tokens_seen": 19034032, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 2.7799227799227797, |
|
"grad_norm": 6.073156833648682, |
|
"learning_rate": 4.7654133505624695e-05, |
|
"loss": 0.4125, |
|
"num_input_tokens_seen": 19186096, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.799227799227799, |
|
"grad_norm": 3.3675029277801514, |
|
"learning_rate": 4.7621967289152256e-05, |
|
"loss": 0.4029, |
|
"num_input_tokens_seen": 19319728, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 2.8185328185328187, |
|
"grad_norm": 3.7507877349853516, |
|
"learning_rate": 4.7589593050198754e-05, |
|
"loss": 0.3858, |
|
"num_input_tokens_seen": 19463856, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.8378378378378377, |
|
"grad_norm": 4.324283599853516, |
|
"learning_rate": 4.755701108646463e-05, |
|
"loss": 0.3679, |
|
"num_input_tokens_seen": 19587504, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 5.601120471954346, |
|
"learning_rate": 4.752422169756048e-05, |
|
"loss": 0.3863, |
|
"num_input_tokens_seen": 19732912, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.8764478764478767, |
|
"grad_norm": 3.6340267658233643, |
|
"learning_rate": 4.7491225185004286e-05, |
|
"loss": 0.382, |
|
"num_input_tokens_seen": 19833520, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 2.8957528957528957, |
|
"grad_norm": 4.500921249389648, |
|
"learning_rate": 4.745802185221866e-05, |
|
"loss": 0.4239, |
|
"num_input_tokens_seen": 19932592, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.915057915057915, |
|
"grad_norm": 3.6028401851654053, |
|
"learning_rate": 4.742461200452804e-05, |
|
"loss": 0.4577, |
|
"num_input_tokens_seen": 20090544, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 2.9343629343629343, |
|
"grad_norm": 3.114858627319336, |
|
"learning_rate": 4.7390995949155915e-05, |
|
"loss": 0.4031, |
|
"num_input_tokens_seen": 20210096, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.9536679536679538, |
|
"grad_norm": 3.504840135574341, |
|
"learning_rate": 4.735717399522196e-05, |
|
"loss": 0.3664, |
|
"num_input_tokens_seen": 20329392, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 2.972972972972973, |
|
"grad_norm": 3.960780143737793, |
|
"learning_rate": 4.732314645373921e-05, |
|
"loss": 0.424, |
|
"num_input_tokens_seen": 20448944, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.9922779922779923, |
|
"grad_norm": 3.825749635696411, |
|
"learning_rate": 4.728891363761121e-05, |
|
"loss": 0.3959, |
|
"num_input_tokens_seen": 20582320, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 3.011583011583012, |
|
"grad_norm": 4.023091793060303, |
|
"learning_rate": 4.725447586162911e-05, |
|
"loss": 0.2985, |
|
"num_input_tokens_seen": 20708384, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 3.030888030888031, |
|
"grad_norm": 3.257418394088745, |
|
"learning_rate": 4.721983344246881e-05, |
|
"loss": 0.2133, |
|
"num_input_tokens_seen": 20832032, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 3.0501930501930503, |
|
"grad_norm": 5.765536785125732, |
|
"learning_rate": 4.7184986698688e-05, |
|
"loss": 0.214, |
|
"num_input_tokens_seen": 20974880, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 3.0694980694980694, |
|
"grad_norm": 3.13777232170105, |
|
"learning_rate": 4.714993595072329e-05, |
|
"loss": 0.1982, |
|
"num_input_tokens_seen": 21092896, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 3.088803088803089, |
|
"grad_norm": 5.349308490753174, |
|
"learning_rate": 4.711468152088719e-05, |
|
"loss": 0.2139, |
|
"num_input_tokens_seen": 21228576, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.108108108108108, |
|
"grad_norm": 4.700284957885742, |
|
"learning_rate": 4.7079223733365237e-05, |
|
"loss": 0.2146, |
|
"num_input_tokens_seen": 21333536, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 3.1274131274131274, |
|
"grad_norm": 3.779158115386963, |
|
"learning_rate": 4.704356291421291e-05, |
|
"loss": 0.1931, |
|
"num_input_tokens_seen": 21462560, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 3.146718146718147, |
|
"grad_norm": 3.8744430541992188, |
|
"learning_rate": 4.700769939135272e-05, |
|
"loss": 0.2137, |
|
"num_input_tokens_seen": 21596960, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 3.166023166023166, |
|
"grad_norm": 4.513326644897461, |
|
"learning_rate": 4.697163349457115e-05, |
|
"loss": 0.2343, |
|
"num_input_tokens_seen": 21721888, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 3.1853281853281854, |
|
"grad_norm": 3.671565294265747, |
|
"learning_rate": 4.693536555551562e-05, |
|
"loss": 0.2048, |
|
"num_input_tokens_seen": 21824800, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 3.2046332046332044, |
|
"grad_norm": 6.427221775054932, |
|
"learning_rate": 4.689889590769147e-05, |
|
"loss": 0.2383, |
|
"num_input_tokens_seen": 21950240, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 3.223938223938224, |
|
"grad_norm": 2.976755142211914, |
|
"learning_rate": 4.686222488645886e-05, |
|
"loss": 0.2171, |
|
"num_input_tokens_seen": 22089760, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 3.2432432432432434, |
|
"grad_norm": 3.2848925590515137, |
|
"learning_rate": 4.6825352829029705e-05, |
|
"loss": 0.2066, |
|
"num_input_tokens_seen": 22233120, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 3.2625482625482625, |
|
"grad_norm": 3.1143906116485596, |
|
"learning_rate": 4.6788280074464556e-05, |
|
"loss": 0.207, |
|
"num_input_tokens_seen": 22371872, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 3.281853281853282, |
|
"grad_norm": 5.183198928833008, |
|
"learning_rate": 4.675100696366951e-05, |
|
"loss": 0.2136, |
|
"num_input_tokens_seen": 22465568, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 3.301158301158301, |
|
"grad_norm": 3.4606399536132812, |
|
"learning_rate": 4.6713533839393045e-05, |
|
"loss": 0.2255, |
|
"num_input_tokens_seen": 22617632, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 3.3204633204633205, |
|
"grad_norm": 3.6124610900878906, |
|
"learning_rate": 4.667586104622288e-05, |
|
"loss": 0.2291, |
|
"num_input_tokens_seen": 22805792, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 3.33976833976834, |
|
"grad_norm": 4.528263092041016, |
|
"learning_rate": 4.663798893058283e-05, |
|
"loss": 0.2313, |
|
"num_input_tokens_seen": 22970144, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 3.359073359073359, |
|
"grad_norm": 4.979870796203613, |
|
"learning_rate": 4.659991784072959e-05, |
|
"loss": 0.2286, |
|
"num_input_tokens_seen": 23086880, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 3.3783783783783785, |
|
"grad_norm": 4.131895542144775, |
|
"learning_rate": 4.656164812674951e-05, |
|
"loss": 0.2387, |
|
"num_input_tokens_seen": 23219488, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 3.3976833976833976, |
|
"grad_norm": 3.2840301990509033, |
|
"learning_rate": 4.652318014055547e-05, |
|
"loss": 0.2334, |
|
"num_input_tokens_seen": 23334176, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 3.416988416988417, |
|
"grad_norm": 5.544443130493164, |
|
"learning_rate": 4.6484514235883514e-05, |
|
"loss": 0.2209, |
|
"num_input_tokens_seen": 23431456, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 3.436293436293436, |
|
"grad_norm": 3.8010241985321045, |
|
"learning_rate": 4.644565076828972e-05, |
|
"loss": 0.2373, |
|
"num_input_tokens_seen": 23554848, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 3.4555984555984556, |
|
"grad_norm": 4.628957748413086, |
|
"learning_rate": 4.640659009514683e-05, |
|
"loss": 0.237, |
|
"num_input_tokens_seen": 23684128, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 3.474903474903475, |
|
"grad_norm": 3.2354142665863037, |
|
"learning_rate": 4.636733257564104e-05, |
|
"loss": 0.2482, |
|
"num_input_tokens_seen": 23807008, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.494208494208494, |
|
"grad_norm": 3.5726277828216553, |
|
"learning_rate": 4.632787857076866e-05, |
|
"loss": 0.2736, |
|
"num_input_tokens_seen": 23894560, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 3.5135135135135136, |
|
"grad_norm": 2.902233600616455, |
|
"learning_rate": 4.628822844333278e-05, |
|
"loss": 0.2166, |
|
"num_input_tokens_seen": 24036128, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 3.532818532818533, |
|
"grad_norm": 3.0207438468933105, |
|
"learning_rate": 4.624838255793999e-05, |
|
"loss": 0.2357, |
|
"num_input_tokens_seen": 24181792, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 3.552123552123552, |
|
"grad_norm": 3.2155232429504395, |
|
"learning_rate": 4.620834128099696e-05, |
|
"loss": 0.2558, |
|
"num_input_tokens_seen": 24347424, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 3.571428571428571, |
|
"grad_norm": 6.303427219390869, |
|
"learning_rate": 4.6168104980707107e-05, |
|
"loss": 0.2441, |
|
"num_input_tokens_seen": 24476704, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 3.5907335907335907, |
|
"grad_norm": 4.327144622802734, |
|
"learning_rate": 4.612767402706721e-05, |
|
"loss": 0.2573, |
|
"num_input_tokens_seen": 24624928, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 3.61003861003861, |
|
"grad_norm": 3.255530595779419, |
|
"learning_rate": 4.608704879186402e-05, |
|
"loss": 0.245, |
|
"num_input_tokens_seen": 24820512, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 3.629343629343629, |
|
"grad_norm": 4.7047648429870605, |
|
"learning_rate": 4.604622964867078e-05, |
|
"loss": 0.2203, |
|
"num_input_tokens_seen": 24944416, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 3.6486486486486487, |
|
"grad_norm": 4.091588973999023, |
|
"learning_rate": 4.6005216972843864e-05, |
|
"loss": 0.2233, |
|
"num_input_tokens_seen": 25067808, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 3.667953667953668, |
|
"grad_norm": 3.490816116333008, |
|
"learning_rate": 4.596401114151929e-05, |
|
"loss": 0.224, |
|
"num_input_tokens_seen": 25216032, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 3.687258687258687, |
|
"grad_norm": 3.7286484241485596, |
|
"learning_rate": 4.592261253360924e-05, |
|
"loss": 0.2285, |
|
"num_input_tokens_seen": 25306656, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 3.7065637065637067, |
|
"grad_norm": 3.884098529815674, |
|
"learning_rate": 4.588102152979863e-05, |
|
"loss": 0.2492, |
|
"num_input_tokens_seen": 25468704, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 3.7258687258687258, |
|
"grad_norm": 2.801973342895508, |
|
"learning_rate": 4.583923851254152e-05, |
|
"loss": 0.253, |
|
"num_input_tokens_seen": 25622816, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 3.7451737451737452, |
|
"grad_norm": 3.9415650367736816, |
|
"learning_rate": 4.579726386605768e-05, |
|
"loss": 0.2327, |
|
"num_input_tokens_seen": 25739040, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 3.7644787644787643, |
|
"grad_norm": 3.1281697750091553, |
|
"learning_rate": 4.575509797632903e-05, |
|
"loss": 0.2495, |
|
"num_input_tokens_seen": 25860896, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 3.7837837837837838, |
|
"grad_norm": 2.4093031883239746, |
|
"learning_rate": 4.571274123109606e-05, |
|
"loss": 0.2771, |
|
"num_input_tokens_seen": 25998368, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.8030888030888033, |
|
"grad_norm": 4.196908950805664, |
|
"learning_rate": 4.567019401985431e-05, |
|
"loss": 0.243, |
|
"num_input_tokens_seen": 26126368, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 3.8223938223938223, |
|
"grad_norm": 3.5428357124328613, |
|
"learning_rate": 4.562745673385075e-05, |
|
"loss": 0.2759, |
|
"num_input_tokens_seen": 26260512, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 3.841698841698842, |
|
"grad_norm": 2.962770462036133, |
|
"learning_rate": 4.5584529766080236e-05, |
|
"loss": 0.2757, |
|
"num_input_tokens_seen": 26393376, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 3.861003861003861, |
|
"grad_norm": 3.8009259700775146, |
|
"learning_rate": 4.5541413511281826e-05, |
|
"loss": 0.215, |
|
"num_input_tokens_seen": 26544416, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.8803088803088803, |
|
"grad_norm": 4.168282985687256, |
|
"learning_rate": 4.54981083659352e-05, |
|
"loss": 0.2404, |
|
"num_input_tokens_seen": 26653984, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 3.8996138996138994, |
|
"grad_norm": 4.866374492645264, |
|
"learning_rate": 4.5454614728256995e-05, |
|
"loss": 0.2511, |
|
"num_input_tokens_seen": 26750496, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 3.918918918918919, |
|
"grad_norm": 4.477441310882568, |
|
"learning_rate": 4.541093299819714e-05, |
|
"loss": 0.2207, |
|
"num_input_tokens_seen": 26895392, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 3.9382239382239383, |
|
"grad_norm": 5.003627300262451, |
|
"learning_rate": 4.536706357743522e-05, |
|
"loss": 0.2757, |
|
"num_input_tokens_seen": 27054880, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 3.9575289575289574, |
|
"grad_norm": 5.468934059143066, |
|
"learning_rate": 4.5323006869376697e-05, |
|
"loss": 0.2604, |
|
"num_input_tokens_seen": 27183648, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 3.976833976833977, |
|
"grad_norm": 4.3826093673706055, |
|
"learning_rate": 4.527876327914929e-05, |
|
"loss": 0.2568, |
|
"num_input_tokens_seen": 27320864, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 3.9961389961389964, |
|
"grad_norm": 3.077738046646118, |
|
"learning_rate": 4.52343332135992e-05, |
|
"loss": 0.2112, |
|
"num_input_tokens_seen": 27465760, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 4.015444015444015, |
|
"grad_norm": 2.550122022628784, |
|
"learning_rate": 4.518971708128737e-05, |
|
"loss": 0.142, |
|
"num_input_tokens_seen": 27625568, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 4.0347490347490345, |
|
"grad_norm": 2.2519214153289795, |
|
"learning_rate": 4.514491529248576e-05, |
|
"loss": 0.1238, |
|
"num_input_tokens_seen": 27748704, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 4.054054054054054, |
|
"grad_norm": 4.56045389175415, |
|
"learning_rate": 4.5099928259173516e-05, |
|
"loss": 0.1417, |
|
"num_input_tokens_seen": 27849056, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 4.073359073359073, |
|
"grad_norm": 2.8091559410095215, |
|
"learning_rate": 4.505475639503326e-05, |
|
"loss": 0.1447, |
|
"num_input_tokens_seen": 27950176, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 4.0926640926640925, |
|
"grad_norm": 5.7186150550842285, |
|
"learning_rate": 4.5009400115447206e-05, |
|
"loss": 0.1619, |
|
"num_input_tokens_seen": 28102752, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 4.1119691119691115, |
|
"grad_norm": 5.198718547821045, |
|
"learning_rate": 4.4963859837493404e-05, |
|
"loss": 0.1592, |
|
"num_input_tokens_seen": 28251232, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 4.1312741312741315, |
|
"grad_norm": 3.646064519882202, |
|
"learning_rate": 4.491813597994188e-05, |
|
"loss": 0.1573, |
|
"num_input_tokens_seen": 28363616, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 4.1505791505791505, |
|
"grad_norm": 3.22397780418396, |
|
"learning_rate": 4.4872228963250764e-05, |
|
"loss": 0.1302, |
|
"num_input_tokens_seen": 28512352, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 4.1698841698841695, |
|
"grad_norm": 4.661962985992432, |
|
"learning_rate": 4.4826139209562444e-05, |
|
"loss": 0.1337, |
|
"num_input_tokens_seen": 28634208, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 4.1891891891891895, |
|
"grad_norm": 2.3518221378326416, |
|
"learning_rate": 4.477986714269972e-05, |
|
"loss": 0.1445, |
|
"num_input_tokens_seen": 28745568, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 4.2084942084942085, |
|
"grad_norm": 4.337996006011963, |
|
"learning_rate": 4.473341318816181e-05, |
|
"loss": 0.1382, |
|
"num_input_tokens_seen": 28834144, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 4.227799227799228, |
|
"grad_norm": 2.566878318786621, |
|
"learning_rate": 4.468677777312056e-05, |
|
"loss": 0.1618, |
|
"num_input_tokens_seen": 28962912, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 4.2471042471042475, |
|
"grad_norm": 3.2163915634155273, |
|
"learning_rate": 4.463996132641641e-05, |
|
"loss": 0.145, |
|
"num_input_tokens_seen": 29112160, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 4.2664092664092665, |
|
"grad_norm": 5.36053991317749, |
|
"learning_rate": 4.45929642785545e-05, |
|
"loss": 0.1368, |
|
"num_input_tokens_seen": 29233760, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 4.285714285714286, |
|
"grad_norm": 6.08097505569458, |
|
"learning_rate": 4.454578706170075e-05, |
|
"loss": 0.1568, |
|
"num_input_tokens_seen": 29349728, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 4.305019305019305, |
|
"grad_norm": 3.5987799167633057, |
|
"learning_rate": 4.4498430109677766e-05, |
|
"loss": 0.1717, |
|
"num_input_tokens_seen": 29490528, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 4.324324324324325, |
|
"grad_norm": 2.7298388481140137, |
|
"learning_rate": 4.445089385796099e-05, |
|
"loss": 0.1484, |
|
"num_input_tokens_seen": 29643616, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 4.343629343629344, |
|
"grad_norm": 3.550762414932251, |
|
"learning_rate": 4.4403178743674594e-05, |
|
"loss": 0.1582, |
|
"num_input_tokens_seen": 29750112, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 4.362934362934363, |
|
"grad_norm": 2.97074818611145, |
|
"learning_rate": 4.435528520558752e-05, |
|
"loss": 0.145, |
|
"num_input_tokens_seen": 29865824, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 4.382239382239383, |
|
"grad_norm": 5.895785331726074, |
|
"learning_rate": 4.430721368410941e-05, |
|
"loss": 0.1651, |
|
"num_input_tokens_seen": 30009440, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 4.401544401544402, |
|
"grad_norm": 3.1574273109436035, |
|
"learning_rate": 4.4258964621286584e-05, |
|
"loss": 0.1463, |
|
"num_input_tokens_seen": 30108000, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 4.420849420849421, |
|
"grad_norm": 2.566192388534546, |
|
"learning_rate": 4.421053846079793e-05, |
|
"loss": 0.1601, |
|
"num_input_tokens_seen": 30271584, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 4.440154440154441, |
|
"grad_norm": 3.3316359519958496, |
|
"learning_rate": 4.4161935647950894e-05, |
|
"loss": 0.1546, |
|
"num_input_tokens_seen": 30400352, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 4.45945945945946, |
|
"grad_norm": 4.07961893081665, |
|
"learning_rate": 4.4113156629677316e-05, |
|
"loss": 0.1908, |
|
"num_input_tokens_seen": 30517856, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 4.478764478764479, |
|
"grad_norm": 3.392979860305786, |
|
"learning_rate": 4.4064201854529365e-05, |
|
"loss": 0.1697, |
|
"num_input_tokens_seen": 30701664, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 4.498069498069498, |
|
"grad_norm": 7.8615922927856445, |
|
"learning_rate": 4.40150717726754e-05, |
|
"loss": 0.1531, |
|
"num_input_tokens_seen": 30829664, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 4.517374517374518, |
|
"grad_norm": 4.743879318237305, |
|
"learning_rate": 4.396576683589582e-05, |
|
"loss": 0.1479, |
|
"num_input_tokens_seen": 30961760, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 4.536679536679537, |
|
"grad_norm": 3.9626708030700684, |
|
"learning_rate": 4.391628749757892e-05, |
|
"loss": 0.1732, |
|
"num_input_tokens_seen": 31134304, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 4.555984555984556, |
|
"grad_norm": 6.22069787979126, |
|
"learning_rate": 4.386663421271674e-05, |
|
"loss": 0.1845, |
|
"num_input_tokens_seen": 31266656, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 4.575289575289576, |
|
"grad_norm": 3.347773790359497, |
|
"learning_rate": 4.381680743790083e-05, |
|
"loss": 0.1527, |
|
"num_input_tokens_seen": 31394656, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 4.594594594594595, |
|
"grad_norm": 3.2894535064697266, |
|
"learning_rate": 4.3766807631318106e-05, |
|
"loss": 0.1559, |
|
"num_input_tokens_seen": 31586144, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 4.613899613899614, |
|
"grad_norm": 3.1221001148223877, |
|
"learning_rate": 4.3716635252746625e-05, |
|
"loss": 0.174, |
|
"num_input_tokens_seen": 31715168, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 4.633204633204633, |
|
"grad_norm": 2.959454298019409, |
|
"learning_rate": 4.366629076355131e-05, |
|
"loss": 0.1638, |
|
"num_input_tokens_seen": 31828832, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 4.652509652509653, |
|
"grad_norm": 3.2776925563812256, |
|
"learning_rate": 4.361577462667978e-05, |
|
"loss": 0.1582, |
|
"num_input_tokens_seen": 31939424, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 4.671814671814672, |
|
"grad_norm": 2.5983834266662598, |
|
"learning_rate": 4.356508730665804e-05, |
|
"loss": 0.1507, |
|
"num_input_tokens_seen": 32057952, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 4.691119691119691, |
|
"grad_norm": 2.471327066421509, |
|
"learning_rate": 4.351422926958621e-05, |
|
"loss": 0.1523, |
|
"num_input_tokens_seen": 32194144, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 4.710424710424711, |
|
"grad_norm": 3.6747171878814697, |
|
"learning_rate": 4.346320098313429e-05, |
|
"loss": 0.156, |
|
"num_input_tokens_seen": 32324192, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 4.72972972972973, |
|
"grad_norm": 2.855123281478882, |
|
"learning_rate": 4.341200291653781e-05, |
|
"loss": 0.1439, |
|
"num_input_tokens_seen": 32441696, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 4.749034749034749, |
|
"grad_norm": 3.59421706199646, |
|
"learning_rate": 4.336063554059351e-05, |
|
"loss": 0.1728, |
|
"num_input_tokens_seen": 32564320, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 4.768339768339768, |
|
"grad_norm": 3.7414238452911377, |
|
"learning_rate": 4.3309099327655064e-05, |
|
"loss": 0.1633, |
|
"num_input_tokens_seen": 32660320, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 4.787644787644788, |
|
"grad_norm": 2.5379812717437744, |
|
"learning_rate": 4.3257394751628686e-05, |
|
"loss": 0.1648, |
|
"num_input_tokens_seen": 32821344, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 4.806949806949807, |
|
"grad_norm": 2.9361348152160645, |
|
"learning_rate": 4.320552228796878e-05, |
|
"loss": 0.1596, |
|
"num_input_tokens_seen": 32965728, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 4.826254826254826, |
|
"grad_norm": 3.6422150135040283, |
|
"learning_rate": 4.315348241367358e-05, |
|
"loss": 0.145, |
|
"num_input_tokens_seen": 33093984, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 4.845559845559846, |
|
"grad_norm": 3.705981731414795, |
|
"learning_rate": 4.310127560728079e-05, |
|
"loss": 0.1693, |
|
"num_input_tokens_seen": 33215840, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 4.864864864864865, |
|
"grad_norm": 3.614696979522705, |
|
"learning_rate": 4.3048902348863116e-05, |
|
"loss": 0.1555, |
|
"num_input_tokens_seen": 33311840, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 4.884169884169884, |
|
"grad_norm": 3.013524055480957, |
|
"learning_rate": 4.2996363120023904e-05, |
|
"loss": 0.1429, |
|
"num_input_tokens_seen": 33480032, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 4.903474903474903, |
|
"grad_norm": 3.562551736831665, |
|
"learning_rate": 4.29436584038927e-05, |
|
"loss": 0.1509, |
|
"num_input_tokens_seen": 33604704, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 4.922779922779923, |
|
"grad_norm": 2.640592336654663, |
|
"learning_rate": 4.2890788685120804e-05, |
|
"loss": 0.1553, |
|
"num_input_tokens_seen": 33735520, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 4.942084942084942, |
|
"grad_norm": 3.140902280807495, |
|
"learning_rate": 4.283775444987681e-05, |
|
"loss": 0.16, |
|
"num_input_tokens_seen": 33850976, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 4.961389961389961, |
|
"grad_norm": 3.582165479660034, |
|
"learning_rate": 4.278455618584214e-05, |
|
"loss": 0.1737, |
|
"num_input_tokens_seen": 33996384, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 4.980694980694981, |
|
"grad_norm": 3.9491639137268066, |
|
"learning_rate": 4.273119438220656e-05, |
|
"loss": 0.1818, |
|
"num_input_tokens_seen": 34131808, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 3.7889578342437744, |
|
"learning_rate": 4.267766952966369e-05, |
|
"loss": 0.15, |
|
"num_input_tokens_seen": 34283512, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 5.019305019305019, |
|
"grad_norm": 2.997573137283325, |
|
"learning_rate": 4.262398212040646e-05, |
|
"loss": 0.0875, |
|
"num_input_tokens_seen": 34411256, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 5.038610038610039, |
|
"grad_norm": 3.7864131927490234, |
|
"learning_rate": 4.257013264812261e-05, |
|
"loss": 0.0836, |
|
"num_input_tokens_seen": 34543864, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 5.057915057915058, |
|
"grad_norm": 2.8357129096984863, |
|
"learning_rate": 4.2516121607990175e-05, |
|
"loss": 0.092, |
|
"num_input_tokens_seen": 34658040, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 5.077220077220077, |
|
"grad_norm": 2.736976385116577, |
|
"learning_rate": 4.246194949667286e-05, |
|
"loss": 0.0912, |
|
"num_input_tokens_seen": 34776568, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 5.096525096525096, |
|
"grad_norm": 3.070627212524414, |
|
"learning_rate": 4.2407616812315554e-05, |
|
"loss": 0.0946, |
|
"num_input_tokens_seen": 34904824, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 5.115830115830116, |
|
"grad_norm": 2.968640089035034, |
|
"learning_rate": 4.2353124054539684e-05, |
|
"loss": 0.1137, |
|
"num_input_tokens_seen": 35038456, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 5.135135135135135, |
|
"grad_norm": 2.9958269596099854, |
|
"learning_rate": 4.229847172443866e-05, |
|
"loss": 0.1395, |
|
"num_input_tokens_seen": 35153912, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 5.154440154440154, |
|
"grad_norm": 2.4986655712127686, |
|
"learning_rate": 4.2243660324573255e-05, |
|
"loss": 0.1074, |
|
"num_input_tokens_seen": 35257336, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 5.173745173745174, |
|
"grad_norm": 3.3315913677215576, |
|
"learning_rate": 4.218869035896697e-05, |
|
"loss": 0.105, |
|
"num_input_tokens_seen": 35385336, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 5.193050193050193, |
|
"grad_norm": 3.6041369438171387, |
|
"learning_rate": 4.213356233310144e-05, |
|
"loss": 0.1244, |
|
"num_input_tokens_seen": 35536632, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 5.212355212355212, |
|
"grad_norm": 3.3627777099609375, |
|
"learning_rate": 4.2078276753911725e-05, |
|
"loss": 0.1129, |
|
"num_input_tokens_seen": 35660536, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 5.231660231660232, |
|
"grad_norm": 7.004786014556885, |
|
"learning_rate": 4.202283412978171e-05, |
|
"loss": 0.1096, |
|
"num_input_tokens_seen": 35796728, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 5.250965250965251, |
|
"grad_norm": 2.9665541648864746, |
|
"learning_rate": 4.1967234970539384e-05, |
|
"loss": 0.1104, |
|
"num_input_tokens_seen": 36003320, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 5.27027027027027, |
|
"grad_norm": 2.722766160964966, |
|
"learning_rate": 4.191147978745218e-05, |
|
"loss": 0.1068, |
|
"num_input_tokens_seen": 36131832, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 5.289575289575289, |
|
"grad_norm": 2.683607816696167, |
|
"learning_rate": 4.1855569093222245e-05, |
|
"loss": 0.1167, |
|
"num_input_tokens_seen": 36269816, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 5.308880308880309, |
|
"grad_norm": 2.957512378692627, |
|
"learning_rate": 4.179950340198178e-05, |
|
"loss": 0.1397, |
|
"num_input_tokens_seen": 36395512, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 5.328185328185328, |
|
"grad_norm": 2.8095273971557617, |
|
"learning_rate": 4.1743283229288245e-05, |
|
"loss": 0.1096, |
|
"num_input_tokens_seen": 36510200, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 5.347490347490347, |
|
"grad_norm": 2.9579036235809326, |
|
"learning_rate": 4.168690909211965e-05, |
|
"loss": 0.1133, |
|
"num_input_tokens_seen": 36632312, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 5.366795366795367, |
|
"grad_norm": 3.1217041015625, |
|
"learning_rate": 4.163038150886982e-05, |
|
"loss": 0.1092, |
|
"num_input_tokens_seen": 36754424, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 5.386100386100386, |
|
"grad_norm": 2.8517022132873535, |
|
"learning_rate": 4.157370099934358e-05, |
|
"loss": 0.1059, |
|
"num_input_tokens_seen": 36881656, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 5.405405405405405, |
|
"grad_norm": 2.803497314453125, |
|
"learning_rate": 4.151686808475204e-05, |
|
"loss": 0.113, |
|
"num_input_tokens_seen": 37017592, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 5.424710424710424, |
|
"grad_norm": 2.8086516857147217, |
|
"learning_rate": 4.145988328770773e-05, |
|
"loss": 0.1008, |
|
"num_input_tokens_seen": 37153528, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 5.444015444015444, |
|
"grad_norm": 2.5405125617980957, |
|
"learning_rate": 4.140274713221985e-05, |
|
"loss": 0.1082, |
|
"num_input_tokens_seen": 37245688, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 5.463320463320463, |
|
"grad_norm": 5.254634380340576, |
|
"learning_rate": 4.1345460143689425e-05, |
|
"loss": 0.1089, |
|
"num_input_tokens_seen": 37376248, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 5.482625482625482, |
|
"grad_norm": 2.1900696754455566, |
|
"learning_rate": 4.1288022848904485e-05, |
|
"loss": 0.1082, |
|
"num_input_tokens_seen": 37524472, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 5.501930501930502, |
|
"grad_norm": 2.8734242916107178, |
|
"learning_rate": 4.12304357760352e-05, |
|
"loss": 0.1256, |
|
"num_input_tokens_seen": 37648376, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 5.521235521235521, |
|
"grad_norm": 2.449732780456543, |
|
"learning_rate": 4.117269945462906e-05, |
|
"loss": 0.1112, |
|
"num_input_tokens_seen": 37798392, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 5.54054054054054, |
|
"grad_norm": 2.664689064025879, |
|
"learning_rate": 4.111481441560598e-05, |
|
"loss": 0.1139, |
|
"num_input_tokens_seen": 37922808, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 5.559845559845559, |
|
"grad_norm": 2.6877481937408447, |
|
"learning_rate": 4.10567811912534e-05, |
|
"loss": 0.1214, |
|
"num_input_tokens_seen": 38035448, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 5.579150579150579, |
|
"grad_norm": 2.6928939819335938, |
|
"learning_rate": 4.0998600315221445e-05, |
|
"loss": 0.104, |
|
"num_input_tokens_seen": 38186232, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 5.598455598455598, |
|
"grad_norm": 2.930649995803833, |
|
"learning_rate": 4.094027232251796e-05, |
|
"loss": 0.1154, |
|
"num_input_tokens_seen": 38296568, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 5.617760617760617, |
|
"grad_norm": 2.5673305988311768, |
|
"learning_rate": 4.0881797749503626e-05, |
|
"loss": 0.129, |
|
"num_input_tokens_seen": 38388216, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 5.637065637065637, |
|
"grad_norm": 2.376112937927246, |
|
"learning_rate": 4.082317713388702e-05, |
|
"loss": 0.1464, |
|
"num_input_tokens_seen": 38552312, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 5.656370656370656, |
|
"grad_norm": 3.658287525177002, |
|
"learning_rate": 4.076441101471966e-05, |
|
"loss": 0.1149, |
|
"num_input_tokens_seen": 38649592, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 5.675675675675675, |
|
"grad_norm": 2.58902907371521, |
|
"learning_rate": 4.070549993239106e-05, |
|
"loss": 0.1261, |
|
"num_input_tokens_seen": 38768888, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 5.694980694980695, |
|
"grad_norm": 2.975992202758789, |
|
"learning_rate": 4.064644442862376e-05, |
|
"loss": 0.1285, |
|
"num_input_tokens_seen": 38868472, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 5.714285714285714, |
|
"grad_norm": 3.739966869354248, |
|
"learning_rate": 4.058724504646834e-05, |
|
"loss": 0.1033, |
|
"num_input_tokens_seen": 38983416, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 5.7335907335907335, |
|
"grad_norm": 4.107306003570557, |
|
"learning_rate": 4.0527902330298425e-05, |
|
"loss": 0.117, |
|
"num_input_tokens_seen": 39105272, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 5.752895752895753, |
|
"grad_norm": 2.5064680576324463, |
|
"learning_rate": 4.0468416825805676e-05, |
|
"loss": 0.1298, |
|
"num_input_tokens_seen": 39264504, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 5.772200772200772, |
|
"grad_norm": 2.3775291442871094, |
|
"learning_rate": 4.040878907999479e-05, |
|
"loss": 0.1225, |
|
"num_input_tokens_seen": 39388408, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 5.7915057915057915, |
|
"grad_norm": 3.883429765701294, |
|
"learning_rate": 4.0349019641178443e-05, |
|
"loss": 0.1052, |
|
"num_input_tokens_seen": 39496696, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 5.8108108108108105, |
|
"grad_norm": 3.047959566116333, |
|
"learning_rate": 4.028910905897229e-05, |
|
"loss": 0.1047, |
|
"num_input_tokens_seen": 39624952, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 5.8301158301158305, |
|
"grad_norm": 3.1065351963043213, |
|
"learning_rate": 4.022905788428985e-05, |
|
"loss": 0.119, |
|
"num_input_tokens_seen": 39768568, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 5.8494208494208495, |
|
"grad_norm": 2.1394753456115723, |
|
"learning_rate": 4.016886666933749e-05, |
|
"loss": 0.1255, |
|
"num_input_tokens_seen": 39937272, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 5.8687258687258685, |
|
"grad_norm": 4.255035877227783, |
|
"learning_rate": 4.010853596760933e-05, |
|
"loss": 0.1314, |
|
"num_input_tokens_seen": 40050424, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 5.8880308880308885, |
|
"grad_norm": 3.418353319168091, |
|
"learning_rate": 4.004806633388216e-05, |
|
"loss": 0.116, |
|
"num_input_tokens_seen": 40194808, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 5.9073359073359075, |
|
"grad_norm": 2.7976632118225098, |
|
"learning_rate": 3.998745832421033e-05, |
|
"loss": 0.1095, |
|
"num_input_tokens_seen": 40311544, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 5.926640926640927, |
|
"grad_norm": 4.011565208435059, |
|
"learning_rate": 3.9926712495920624e-05, |
|
"loss": 0.1161, |
|
"num_input_tokens_seen": 40435448, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 5.945945945945946, |
|
"grad_norm": 1.9062412977218628, |
|
"learning_rate": 3.986582940760717e-05, |
|
"loss": 0.1162, |
|
"num_input_tokens_seen": 40597496, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 5.9652509652509655, |
|
"grad_norm": 1.8253803253173828, |
|
"learning_rate": 3.980480961912627e-05, |
|
"loss": 0.1022, |
|
"num_input_tokens_seen": 40749048, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 5.984555984555985, |
|
"grad_norm": 3.512676239013672, |
|
"learning_rate": 3.9743653691591277e-05, |
|
"loss": 0.0992, |
|
"num_input_tokens_seen": 40854008, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 6.003861003861004, |
|
"grad_norm": 2.4813055992126465, |
|
"learning_rate": 3.968236218736741e-05, |
|
"loss": 0.0932, |
|
"num_input_tokens_seen": 40982240, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 6.023166023166024, |
|
"grad_norm": 3.3331120014190674, |
|
"learning_rate": 3.962093567006662e-05, |
|
"loss": 0.0791, |
|
"num_input_tokens_seen": 41121248, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 6.042471042471043, |
|
"grad_norm": 2.6241912841796875, |
|
"learning_rate": 3.955937470454235e-05, |
|
"loss": 0.0681, |
|
"num_input_tokens_seen": 41241824, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 6.061776061776062, |
|
"grad_norm": 2.7598159313201904, |
|
"learning_rate": 3.9497679856884403e-05, |
|
"loss": 0.0772, |
|
"num_input_tokens_seen": 41367776, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 6.081081081081081, |
|
"grad_norm": 3.0244016647338867, |
|
"learning_rate": 3.94358516944137e-05, |
|
"loss": 0.0669, |
|
"num_input_tokens_seen": 41479904, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 6.100386100386101, |
|
"grad_norm": 2.4088172912597656, |
|
"learning_rate": 3.937389078567708e-05, |
|
"loss": 0.082, |
|
"num_input_tokens_seen": 41627104, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 6.11969111969112, |
|
"grad_norm": 2.2643067836761475, |
|
"learning_rate": 3.931179770044202e-05, |
|
"loss": 0.0843, |
|
"num_input_tokens_seen": 41745632, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 6.138996138996139, |
|
"grad_norm": 3.209613800048828, |
|
"learning_rate": 3.924957300969151e-05, |
|
"loss": 0.0958, |
|
"num_input_tokens_seen": 41869024, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 6.158301158301159, |
|
"grad_norm": 1.9632810354232788, |
|
"learning_rate": 3.918721728561866e-05, |
|
"loss": 0.0729, |
|
"num_input_tokens_seen": 42028512, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 6.177606177606178, |
|
"grad_norm": 3.151571750640869, |
|
"learning_rate": 3.912473110162156e-05, |
|
"loss": 0.0796, |
|
"num_input_tokens_seen": 42146272, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 6.196911196911197, |
|
"grad_norm": 2.506683111190796, |
|
"learning_rate": 3.9062115032297936e-05, |
|
"loss": 0.0828, |
|
"num_input_tokens_seen": 42270176, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 6.216216216216216, |
|
"grad_norm": 2.6251142024993896, |
|
"learning_rate": 3.899936965343989e-05, |
|
"loss": 0.0753, |
|
"num_input_tokens_seen": 42387936, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 6.235521235521236, |
|
"grad_norm": 3.599527597427368, |
|
"learning_rate": 3.8936495542028596e-05, |
|
"loss": 0.0761, |
|
"num_input_tokens_seen": 42500320, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 6.254826254826255, |
|
"grad_norm": 2.3818442821502686, |
|
"learning_rate": 3.887349327622901e-05, |
|
"loss": 0.092, |
|
"num_input_tokens_seen": 42613984, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 6.274131274131274, |
|
"grad_norm": 2.111990451812744, |
|
"learning_rate": 3.881036343538455e-05, |
|
"loss": 0.0733, |
|
"num_input_tokens_seen": 42769888, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 6.293436293436294, |
|
"grad_norm": 2.8767385482788086, |
|
"learning_rate": 3.874710660001174e-05, |
|
"loss": 0.0752, |
|
"num_input_tokens_seen": 42881248, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 6.312741312741313, |
|
"grad_norm": 2.8325531482696533, |
|
"learning_rate": 3.868372335179492e-05, |
|
"loss": 0.0935, |
|
"num_input_tokens_seen": 43014880, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 6.332046332046332, |
|
"grad_norm": 1.9607194662094116, |
|
"learning_rate": 3.8620214273580846e-05, |
|
"loss": 0.0853, |
|
"num_input_tokens_seen": 43172832, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 6.351351351351352, |
|
"grad_norm": 2.744037628173828, |
|
"learning_rate": 3.8556579949373384e-05, |
|
"loss": 0.0729, |
|
"num_input_tokens_seen": 43304672, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 6.370656370656371, |
|
"grad_norm": 2.235743284225464, |
|
"learning_rate": 3.849282096432808e-05, |
|
"loss": 0.0798, |
|
"num_input_tokens_seen": 43408864, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 6.38996138996139, |
|
"grad_norm": 2.214385986328125, |
|
"learning_rate": 3.842893790474684e-05, |
|
"loss": 0.0847, |
|
"num_input_tokens_seen": 43524832, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 6.409266409266409, |
|
"grad_norm": 2.995649576187134, |
|
"learning_rate": 3.83649313580725e-05, |
|
"loss": 0.09, |
|
"num_input_tokens_seen": 43662816, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 6.428571428571429, |
|
"grad_norm": 2.5828518867492676, |
|
"learning_rate": 3.830080191288342e-05, |
|
"loss": 0.0819, |
|
"num_input_tokens_seen": 43799008, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 6.447876447876448, |
|
"grad_norm": 3.808608293533325, |
|
"learning_rate": 3.8236550158888096e-05, |
|
"loss": 0.0797, |
|
"num_input_tokens_seen": 43905760, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 6.467181467181467, |
|
"grad_norm": 2.2933712005615234, |
|
"learning_rate": 3.8172176686919733e-05, |
|
"loss": 0.0816, |
|
"num_input_tokens_seen": 44093664, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 6.486486486486487, |
|
"grad_norm": 3.484246253967285, |
|
"learning_rate": 3.8107682088930794e-05, |
|
"loss": 0.0796, |
|
"num_input_tokens_seen": 44194016, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 6.505791505791506, |
|
"grad_norm": 2.982470750808716, |
|
"learning_rate": 3.804306695798757e-05, |
|
"loss": 0.0795, |
|
"num_input_tokens_seen": 44379104, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 6.525096525096525, |
|
"grad_norm": 2.1628623008728027, |
|
"learning_rate": 3.797833188826474e-05, |
|
"loss": 0.0858, |
|
"num_input_tokens_seen": 44527328, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 6.544401544401545, |
|
"grad_norm": 2.5524768829345703, |
|
"learning_rate": 3.791347747503987e-05, |
|
"loss": 0.0869, |
|
"num_input_tokens_seen": 44678624, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 6.563706563706564, |
|
"grad_norm": 1.862451434135437, |
|
"learning_rate": 3.784850431468795e-05, |
|
"loss": 0.077, |
|
"num_input_tokens_seen": 44812512, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 6.583011583011583, |
|
"grad_norm": 2.2355399131774902, |
|
"learning_rate": 3.778341300467597e-05, |
|
"loss": 0.0789, |
|
"num_input_tokens_seen": 44977120, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 6.602316602316602, |
|
"grad_norm": 2.9770116806030273, |
|
"learning_rate": 3.771820414355733e-05, |
|
"loss": 0.0837, |
|
"num_input_tokens_seen": 45079520, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 6.621621621621622, |
|
"grad_norm": 2.7328426837921143, |
|
"learning_rate": 3.7652878330966415e-05, |
|
"loss": 0.084, |
|
"num_input_tokens_seen": 45213408, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 6.640926640926641, |
|
"grad_norm": 2.8325905799865723, |
|
"learning_rate": 3.7587436167613e-05, |
|
"loss": 0.0696, |
|
"num_input_tokens_seen": 45358304, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 6.66023166023166, |
|
"grad_norm": 2.922762393951416, |
|
"learning_rate": 3.752187825527684e-05, |
|
"loss": 0.0864, |
|
"num_input_tokens_seen": 45498592, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 6.67953667953668, |
|
"grad_norm": 2.4982705116271973, |
|
"learning_rate": 3.7456205196802016e-05, |
|
"loss": 0.0911, |
|
"num_input_tokens_seen": 45614816, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 6.698841698841699, |
|
"grad_norm": 2.8256053924560547, |
|
"learning_rate": 3.739041759609148e-05, |
|
"loss": 0.076, |
|
"num_input_tokens_seen": 45747680, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 6.718146718146718, |
|
"grad_norm": 2.376085042953491, |
|
"learning_rate": 3.7324516058101454e-05, |
|
"loss": 0.0879, |
|
"num_input_tokens_seen": 45886176, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 6.737451737451737, |
|
"grad_norm": 2.991415023803711, |
|
"learning_rate": 3.7258501188835884e-05, |
|
"loss": 0.0834, |
|
"num_input_tokens_seen": 45995232, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 6.756756756756757, |
|
"grad_norm": 4.112320899963379, |
|
"learning_rate": 3.719237359534087e-05, |
|
"loss": 0.0899, |
|
"num_input_tokens_seen": 46125792, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 6.776061776061776, |
|
"grad_norm": 2.9766454696655273, |
|
"learning_rate": 3.712613388569905e-05, |
|
"loss": 0.0946, |
|
"num_input_tokens_seen": 46248160, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 6.795366795366795, |
|
"grad_norm": 2.4415359497070312, |
|
"learning_rate": 3.705978266902409e-05, |
|
"loss": 0.093, |
|
"num_input_tokens_seen": 46360544, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 6.814671814671815, |
|
"grad_norm": 2.493098258972168, |
|
"learning_rate": 3.6993320555454986e-05, |
|
"loss": 0.0862, |
|
"num_input_tokens_seen": 46486496, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 6.833976833976834, |
|
"grad_norm": 3.369009256362915, |
|
"learning_rate": 3.692674815615051e-05, |
|
"loss": 0.0852, |
|
"num_input_tokens_seen": 46621664, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 6.853281853281853, |
|
"grad_norm": 2.1404685974121094, |
|
"learning_rate": 3.6860066083283576e-05, |
|
"loss": 0.0869, |
|
"num_input_tokens_seen": 46741728, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 6.872586872586872, |
|
"grad_norm": 2.0776665210723877, |
|
"learning_rate": 3.679327495003561e-05, |
|
"loss": 0.092, |
|
"num_input_tokens_seen": 46917344, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 6.891891891891892, |
|
"grad_norm": 2.0455210208892822, |
|
"learning_rate": 3.672637537059093e-05, |
|
"loss": 0.0718, |
|
"num_input_tokens_seen": 47088352, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 6.911196911196911, |
|
"grad_norm": 2.2774274349212646, |
|
"learning_rate": 3.6659367960131056e-05, |
|
"loss": 0.0775, |
|
"num_input_tokens_seen": 47212768, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 6.93050193050193, |
|
"grad_norm": 2.9299464225769043, |
|
"learning_rate": 3.65922533348291e-05, |
|
"loss": 0.087, |
|
"num_input_tokens_seen": 47357152, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 6.94980694980695, |
|
"grad_norm": 2.3977270126342773, |
|
"learning_rate": 3.6525032111844054e-05, |
|
"loss": 0.078, |
|
"num_input_tokens_seen": 47488224, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 6.969111969111969, |
|
"grad_norm": 2.7028167247772217, |
|
"learning_rate": 3.645770490931517e-05, |
|
"loss": 0.0872, |
|
"num_input_tokens_seen": 47653088, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 6.988416988416988, |
|
"grad_norm": 2.393808603286743, |
|
"learning_rate": 3.6390272346356224e-05, |
|
"loss": 0.0887, |
|
"num_input_tokens_seen": 47770592, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 7.007722007722008, |
|
"grad_norm": 2.408655881881714, |
|
"learning_rate": 3.6322735043049846e-05, |
|
"loss": 0.0737, |
|
"num_input_tokens_seen": 47899880, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 7.027027027027027, |
|
"grad_norm": 1.941495418548584, |
|
"learning_rate": 3.6255093620441834e-05, |
|
"loss": 0.0626, |
|
"num_input_tokens_seen": 48011240, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 7.046332046332046, |
|
"grad_norm": 3.0053250789642334, |
|
"learning_rate": 3.618734870053539e-05, |
|
"loss": 0.0715, |
|
"num_input_tokens_seen": 48187368, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 7.065637065637065, |
|
"grad_norm": 1.5025749206542969, |
|
"learning_rate": 3.611950090628547e-05, |
|
"loss": 0.0485, |
|
"num_input_tokens_seen": 48299496, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 7.084942084942085, |
|
"grad_norm": 2.5862605571746826, |
|
"learning_rate": 3.6051550861593e-05, |
|
"loss": 0.0544, |
|
"num_input_tokens_seen": 48432104, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 7.104247104247104, |
|
"grad_norm": 1.4833030700683594, |
|
"learning_rate": 3.598349919129917e-05, |
|
"loss": 0.0515, |
|
"num_input_tokens_seen": 48571880, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 7.123552123552123, |
|
"grad_norm": 2.7165639400482178, |
|
"learning_rate": 3.5915346521179675e-05, |
|
"loss": 0.0564, |
|
"num_input_tokens_seen": 48697064, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 7.142857142857143, |
|
"grad_norm": 2.289926052093506, |
|
"learning_rate": 3.5847093477938956e-05, |
|
"loss": 0.0638, |
|
"num_input_tokens_seen": 48851432, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 7.162162162162162, |
|
"grad_norm": 2.0912668704986572, |
|
"learning_rate": 3.577874068920446e-05, |
|
"loss": 0.0572, |
|
"num_input_tokens_seen": 48962024, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 7.181467181467181, |
|
"grad_norm": 3.291398763656616, |
|
"learning_rate": 3.571028878352084e-05, |
|
"loss": 0.0577, |
|
"num_input_tokens_seen": 49042408, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 7.2007722007722, |
|
"grad_norm": 2.4217653274536133, |
|
"learning_rate": 3.56417383903442e-05, |
|
"loss": 0.0596, |
|
"num_input_tokens_seen": 49139688, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 7.22007722007722, |
|
"grad_norm": 1.8093854188919067, |
|
"learning_rate": 3.557309014003629e-05, |
|
"loss": 0.0713, |
|
"num_input_tokens_seen": 49296104, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 7.239382239382239, |
|
"grad_norm": 2.9671144485473633, |
|
"learning_rate": 3.55043446638587e-05, |
|
"loss": 0.062, |
|
"num_input_tokens_seen": 49423336, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 7.258687258687258, |
|
"grad_norm": 1.95906400680542, |
|
"learning_rate": 3.5435502593967104e-05, |
|
"loss": 0.0567, |
|
"num_input_tokens_seen": 49578472, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 7.277992277992278, |
|
"grad_norm": 2.1866114139556885, |
|
"learning_rate": 3.536656456340537e-05, |
|
"loss": 0.0867, |
|
"num_input_tokens_seen": 49733096, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 7.297297297297297, |
|
"grad_norm": 3.067368268966675, |
|
"learning_rate": 3.529753120609982e-05, |
|
"loss": 0.0687, |
|
"num_input_tokens_seen": 49866216, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 7.316602316602316, |
|
"grad_norm": 3.627135753631592, |
|
"learning_rate": 3.522840315685334e-05, |
|
"loss": 0.0819, |
|
"num_input_tokens_seen": 49972712, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 7.335907335907336, |
|
"grad_norm": 2.668987274169922, |
|
"learning_rate": 3.515918105133958e-05, |
|
"loss": 0.0774, |
|
"num_input_tokens_seen": 50125544, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 7.355212355212355, |
|
"grad_norm": 3.8022704124450684, |
|
"learning_rate": 3.5089865526097065e-05, |
|
"loss": 0.07, |
|
"num_input_tokens_seen": 50279656, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 7.374517374517374, |
|
"grad_norm": 1.9725759029388428, |
|
"learning_rate": 3.5020457218523405e-05, |
|
"loss": 0.0623, |
|
"num_input_tokens_seen": 50435048, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 7.3938223938223935, |
|
"grad_norm": 2.231874465942383, |
|
"learning_rate": 3.495095676686938e-05, |
|
"loss": 0.0652, |
|
"num_input_tokens_seen": 50559464, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 7.413127413127413, |
|
"grad_norm": 2.671466112136841, |
|
"learning_rate": 3.4881364810233095e-05, |
|
"loss": 0.0708, |
|
"num_input_tokens_seen": 50701544, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 7.4324324324324325, |
|
"grad_norm": 2.6696228981018066, |
|
"learning_rate": 3.481168198855409e-05, |
|
"loss": 0.0604, |
|
"num_input_tokens_seen": 50810600, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 7.4517374517374515, |
|
"grad_norm": 2.344695806503296, |
|
"learning_rate": 3.4741908942607485e-05, |
|
"loss": 0.0589, |
|
"num_input_tokens_seen": 50924520, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 7.471042471042471, |
|
"grad_norm": 1.9226641654968262, |
|
"learning_rate": 3.4672046313998036e-05, |
|
"loss": 0.0622, |
|
"num_input_tokens_seen": 51046376, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 7.4903474903474905, |
|
"grad_norm": 1.9699925184249878, |
|
"learning_rate": 3.460209474515428e-05, |
|
"loss": 0.0652, |
|
"num_input_tokens_seen": 51162600, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 7.5096525096525095, |
|
"grad_norm": 1.700432538986206, |
|
"learning_rate": 3.4532054879322604e-05, |
|
"loss": 0.0576, |
|
"num_input_tokens_seen": 51303912, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 7.528957528957529, |
|
"grad_norm": 2.434849739074707, |
|
"learning_rate": 3.446192736056138e-05, |
|
"loss": 0.0616, |
|
"num_input_tokens_seen": 51429864, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 7.5482625482625485, |
|
"grad_norm": 2.251537799835205, |
|
"learning_rate": 3.439171283373492e-05, |
|
"loss": 0.0713, |
|
"num_input_tokens_seen": 51565800, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 7.5675675675675675, |
|
"grad_norm": 2.6035313606262207, |
|
"learning_rate": 3.432141194450772e-05, |
|
"loss": 0.0572, |
|
"num_input_tokens_seen": 51688680, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 7.586872586872587, |
|
"grad_norm": 2.6067779064178467, |
|
"learning_rate": 3.425102533933835e-05, |
|
"loss": 0.0615, |
|
"num_input_tokens_seen": 51839976, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 7.6061776061776065, |
|
"grad_norm": 2.2776315212249756, |
|
"learning_rate": 3.418055366547364e-05, |
|
"loss": 0.0582, |
|
"num_input_tokens_seen": 51977704, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 7.625482625482626, |
|
"grad_norm": 1.6626644134521484, |
|
"learning_rate": 3.410999757094266e-05, |
|
"loss": 0.053, |
|
"num_input_tokens_seen": 52129256, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 7.644787644787645, |
|
"grad_norm": 2.7480499744415283, |
|
"learning_rate": 3.403935770455077e-05, |
|
"loss": 0.0602, |
|
"num_input_tokens_seen": 52280296, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 7.664092664092664, |
|
"grad_norm": 1.7971251010894775, |
|
"learning_rate": 3.396863471587368e-05, |
|
"loss": 0.0719, |
|
"num_input_tokens_seen": 52408296, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 7.683397683397684, |
|
"grad_norm": 2.8962650299072266, |
|
"learning_rate": 3.389782925525146e-05, |
|
"loss": 0.0626, |
|
"num_input_tokens_seen": 52542952, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 7.702702702702703, |
|
"grad_norm": 2.102598190307617, |
|
"learning_rate": 3.382694197378252e-05, |
|
"loss": 0.0636, |
|
"num_input_tokens_seen": 52664808, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 7.722007722007722, |
|
"grad_norm": 2.998412847518921, |
|
"learning_rate": 3.375597352331772e-05, |
|
"loss": 0.0638, |
|
"num_input_tokens_seen": 52813032, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 7.741312741312742, |
|
"grad_norm": 2.1801412105560303, |
|
"learning_rate": 3.368492455645427e-05, |
|
"loss": 0.0606, |
|
"num_input_tokens_seen": 52943336, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 7.760617760617761, |
|
"grad_norm": 1.6384451389312744, |
|
"learning_rate": 3.36137957265298e-05, |
|
"loss": 0.0628, |
|
"num_input_tokens_seen": 53109224, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 7.77992277992278, |
|
"grad_norm": 1.6432430744171143, |
|
"learning_rate": 3.3542587687616316e-05, |
|
"loss": 0.0615, |
|
"num_input_tokens_seen": 53237480, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 7.799227799227799, |
|
"grad_norm": 3.8239212036132812, |
|
"learning_rate": 3.347130109451422e-05, |
|
"loss": 0.0703, |
|
"num_input_tokens_seen": 53381608, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 7.818532818532819, |
|
"grad_norm": 2.5226223468780518, |
|
"learning_rate": 3.3399936602746243e-05, |
|
"loss": 0.0661, |
|
"num_input_tokens_seen": 53550824, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 7.837837837837838, |
|
"grad_norm": 2.168243169784546, |
|
"learning_rate": 3.332849486855144e-05, |
|
"loss": 0.066, |
|
"num_input_tokens_seen": 53660136, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 7.857142857142857, |
|
"grad_norm": 2.0156755447387695, |
|
"learning_rate": 3.3256976548879184e-05, |
|
"loss": 0.0575, |
|
"num_input_tokens_seen": 53799400, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 7.876447876447877, |
|
"grad_norm": 2.126399040222168, |
|
"learning_rate": 3.3185382301383064e-05, |
|
"loss": 0.0631, |
|
"num_input_tokens_seen": 53890280, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 7.895752895752896, |
|
"grad_norm": 2.2973220348358154, |
|
"learning_rate": 3.311371278441488e-05, |
|
"loss": 0.0549, |
|
"num_input_tokens_seen": 54021864, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 7.915057915057915, |
|
"grad_norm": 2.3345823287963867, |
|
"learning_rate": 3.30419686570186e-05, |
|
"loss": 0.0596, |
|
"num_input_tokens_seen": 54155240, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 7.934362934362935, |
|
"grad_norm": 2.446683406829834, |
|
"learning_rate": 3.297015057892425e-05, |
|
"loss": 0.0634, |
|
"num_input_tokens_seen": 54317544, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 7.953667953667954, |
|
"grad_norm": 2.26706600189209, |
|
"learning_rate": 3.28982592105419e-05, |
|
"loss": 0.0696, |
|
"num_input_tokens_seen": 54415080, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 7.972972972972973, |
|
"grad_norm": 2.6478688716888428, |
|
"learning_rate": 3.282629521295556e-05, |
|
"loss": 0.0536, |
|
"num_input_tokens_seen": 54536680, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 7.992277992277993, |
|
"grad_norm": 2.3253068923950195, |
|
"learning_rate": 3.2754259247917105e-05, |
|
"loss": 0.0574, |
|
"num_input_tokens_seen": 54664680, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 8.011583011583012, |
|
"grad_norm": 3.8490004539489746, |
|
"learning_rate": 3.268215197784019e-05, |
|
"loss": 0.0542, |
|
"num_input_tokens_seen": 54840696, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 8.03088803088803, |
|
"grad_norm": 3.325805902481079, |
|
"learning_rate": 3.260997406579417e-05, |
|
"loss": 0.0599, |
|
"num_input_tokens_seen": 54964088, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 8.05019305019305, |
|
"grad_norm": 1.5885120630264282, |
|
"learning_rate": 3.2537726175498e-05, |
|
"loss": 0.0375, |
|
"num_input_tokens_seen": 55137144, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 8.069498069498069, |
|
"grad_norm": 1.7649269104003906, |
|
"learning_rate": 3.246540897131412e-05, |
|
"loss": 0.0344, |
|
"num_input_tokens_seen": 55269752, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 8.088803088803088, |
|
"grad_norm": 1.779180884361267, |
|
"learning_rate": 3.239302311824234e-05, |
|
"loss": 0.0409, |
|
"num_input_tokens_seen": 55409016, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 8.108108108108109, |
|
"grad_norm": 2.212592840194702, |
|
"learning_rate": 3.232056928191376e-05, |
|
"loss": 0.0425, |
|
"num_input_tokens_seen": 55533176, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 8.127413127413128, |
|
"grad_norm": 2.6077914237976074, |
|
"learning_rate": 3.224804812858462e-05, |
|
"loss": 0.0516, |
|
"num_input_tokens_seen": 55657080, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 8.146718146718147, |
|
"grad_norm": 2.070462942123413, |
|
"learning_rate": 3.2175460325130176e-05, |
|
"loss": 0.0493, |
|
"num_input_tokens_seen": 55815544, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 8.166023166023166, |
|
"grad_norm": 1.979649543762207, |
|
"learning_rate": 3.2102806539038564e-05, |
|
"loss": 0.0401, |
|
"num_input_tokens_seen": 55959928, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 8.185328185328185, |
|
"grad_norm": 2.296283006668091, |
|
"learning_rate": 3.2030087438404685e-05, |
|
"loss": 0.0481, |
|
"num_input_tokens_seen": 56084344, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 8.204633204633204, |
|
"grad_norm": 1.7037519216537476, |
|
"learning_rate": 3.195730369192404e-05, |
|
"loss": 0.0408, |
|
"num_input_tokens_seen": 56190072, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 8.223938223938223, |
|
"grad_norm": 2.012681722640991, |
|
"learning_rate": 3.1884455968886585e-05, |
|
"loss": 0.0514, |
|
"num_input_tokens_seen": 56282232, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 8.243243243243244, |
|
"grad_norm": 1.4678640365600586, |
|
"learning_rate": 3.1811544939170575e-05, |
|
"loss": 0.0477, |
|
"num_input_tokens_seen": 56411256, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 8.262548262548263, |
|
"grad_norm": 2.1076767444610596, |
|
"learning_rate": 3.173857127323642e-05, |
|
"loss": 0.0389, |
|
"num_input_tokens_seen": 56533112, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 8.281853281853282, |
|
"grad_norm": 2.167950391769409, |
|
"learning_rate": 3.166553564212049e-05, |
|
"loss": 0.0512, |
|
"num_input_tokens_seen": 56680824, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 8.301158301158301, |
|
"grad_norm": 2.000339984893799, |
|
"learning_rate": 3.159243871742899e-05, |
|
"loss": 0.0465, |
|
"num_input_tokens_seen": 56779896, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 8.32046332046332, |
|
"grad_norm": 1.747617483139038, |
|
"learning_rate": 3.1519281171331715e-05, |
|
"loss": 0.0376, |
|
"num_input_tokens_seen": 56950392, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 8.339768339768339, |
|
"grad_norm": 2.9224774837493896, |
|
"learning_rate": 3.144606367655595e-05, |
|
"loss": 0.0646, |
|
"num_input_tokens_seen": 57071736, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 8.35907335907336, |
|
"grad_norm": 3.2689764499664307, |
|
"learning_rate": 3.137278690638022e-05, |
|
"loss": 0.0664, |
|
"num_input_tokens_seen": 57178744, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 8.378378378378379, |
|
"grad_norm": 1.766910433769226, |
|
"learning_rate": 3.1299451534628135e-05, |
|
"loss": 0.0483, |
|
"num_input_tokens_seen": 57295736, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 8.397683397683398, |
|
"grad_norm": 2.5547988414764404, |
|
"learning_rate": 3.1226058235662165e-05, |
|
"loss": 0.0387, |
|
"num_input_tokens_seen": 57400440, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 8.416988416988417, |
|
"grad_norm": 1.7486003637313843, |
|
"learning_rate": 3.115260768437747e-05, |
|
"loss": 0.0448, |
|
"num_input_tokens_seen": 57509752, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 8.436293436293436, |
|
"grad_norm": 1.807553768157959, |
|
"learning_rate": 3.1079100556195686e-05, |
|
"loss": 0.0589, |
|
"num_input_tokens_seen": 57704568, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 8.455598455598455, |
|
"grad_norm": 1.7607319355010986, |
|
"learning_rate": 3.1005537527058683e-05, |
|
"loss": 0.0418, |
|
"num_input_tokens_seen": 57840760, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 8.474903474903474, |
|
"grad_norm": 1.8397008180618286, |
|
"learning_rate": 3.093191927342239e-05, |
|
"loss": 0.0434, |
|
"num_input_tokens_seen": 57988728, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 8.494208494208495, |
|
"grad_norm": 1.910217523574829, |
|
"learning_rate": 3.085824647225056e-05, |
|
"loss": 0.0456, |
|
"num_input_tokens_seen": 58142328, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 8.513513513513514, |
|
"grad_norm": 1.5378773212432861, |
|
"learning_rate": 3.0784519801008546e-05, |
|
"loss": 0.0368, |
|
"num_input_tokens_seen": 58252920, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 8.532818532818533, |
|
"grad_norm": 1.6720370054244995, |
|
"learning_rate": 3.071073993765703e-05, |
|
"loss": 0.0472, |
|
"num_input_tokens_seen": 58374776, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 8.552123552123552, |
|
"grad_norm": 2.8116345405578613, |
|
"learning_rate": 3.06369075606459e-05, |
|
"loss": 0.0582, |
|
"num_input_tokens_seen": 58504312, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 8.571428571428571, |
|
"grad_norm": 2.262876510620117, |
|
"learning_rate": 3.056302334890786e-05, |
|
"loss": 0.0424, |
|
"num_input_tokens_seen": 58626936, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 8.59073359073359, |
|
"grad_norm": 1.5794037580490112, |
|
"learning_rate": 3.0489087981852326e-05, |
|
"loss": 0.0426, |
|
"num_input_tokens_seen": 58739320, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 8.61003861003861, |
|
"grad_norm": 1.4733036756515503, |
|
"learning_rate": 3.0415102139359087e-05, |
|
"loss": 0.0415, |
|
"num_input_tokens_seen": 58869112, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 8.62934362934363, |
|
"grad_norm": 1.8181804418563843, |
|
"learning_rate": 3.034106650177208e-05, |
|
"loss": 0.0382, |
|
"num_input_tokens_seen": 58967672, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 8.64864864864865, |
|
"grad_norm": 1.998101830482483, |
|
"learning_rate": 3.0266981749893157e-05, |
|
"loss": 0.0636, |
|
"num_input_tokens_seen": 59082872, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 8.667953667953668, |
|
"grad_norm": 2.1811866760253906, |
|
"learning_rate": 3.0192848564975802e-05, |
|
"loss": 0.0524, |
|
"num_input_tokens_seen": 59219320, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 8.687258687258687, |
|
"grad_norm": 2.4137532711029053, |
|
"learning_rate": 3.0118667628718848e-05, |
|
"loss": 0.0451, |
|
"num_input_tokens_seen": 59362936, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 8.706563706563706, |
|
"grad_norm": 1.908624291419983, |
|
"learning_rate": 3.0044439623260263e-05, |
|
"loss": 0.055, |
|
"num_input_tokens_seen": 59506808, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 8.725868725868725, |
|
"grad_norm": 1.9395898580551147, |
|
"learning_rate": 2.997016523117081e-05, |
|
"loss": 0.0464, |
|
"num_input_tokens_seen": 59652984, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 8.745173745173744, |
|
"grad_norm": 2.6006052494049072, |
|
"learning_rate": 2.9895845135447835e-05, |
|
"loss": 0.0547, |
|
"num_input_tokens_seen": 59805560, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 8.764478764478765, |
|
"grad_norm": 2.3580074310302734, |
|
"learning_rate": 2.9821480019508935e-05, |
|
"loss": 0.0456, |
|
"num_input_tokens_seen": 59937656, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 8.783783783783784, |
|
"grad_norm": 1.7670732736587524, |
|
"learning_rate": 2.974707056718571e-05, |
|
"loss": 0.043, |
|
"num_input_tokens_seen": 60078968, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 8.803088803088803, |
|
"grad_norm": 2.2447776794433594, |
|
"learning_rate": 2.9672617462717444e-05, |
|
"loss": 0.0437, |
|
"num_input_tokens_seen": 60211832, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 8.822393822393822, |
|
"grad_norm": 1.9530746936798096, |
|
"learning_rate": 2.959812139074484e-05, |
|
"loss": 0.0391, |
|
"num_input_tokens_seen": 60332664, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 8.841698841698841, |
|
"grad_norm": 1.9166369438171387, |
|
"learning_rate": 2.9523583036303713e-05, |
|
"loss": 0.0374, |
|
"num_input_tokens_seen": 60451448, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 8.86100386100386, |
|
"grad_norm": 2.464473247528076, |
|
"learning_rate": 2.9449003084818688e-05, |
|
"loss": 0.0399, |
|
"num_input_tokens_seen": 60596344, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 8.880308880308881, |
|
"grad_norm": 2.2919764518737793, |
|
"learning_rate": 2.9374382222096886e-05, |
|
"loss": 0.057, |
|
"num_input_tokens_seen": 60711544, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 8.8996138996139, |
|
"grad_norm": 2.0674142837524414, |
|
"learning_rate": 2.9299721134321662e-05, |
|
"loss": 0.0558, |
|
"num_input_tokens_seen": 60835192, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 8.91891891891892, |
|
"grad_norm": 2.8630294799804688, |
|
"learning_rate": 2.9225020508046232e-05, |
|
"loss": 0.0562, |
|
"num_input_tokens_seen": 60967032, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 8.938223938223938, |
|
"grad_norm": 1.54951810836792, |
|
"learning_rate": 2.915028103018741e-05, |
|
"loss": 0.0457, |
|
"num_input_tokens_seen": 61123448, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 8.957528957528957, |
|
"grad_norm": 2.8156898021698, |
|
"learning_rate": 2.9075503388019272e-05, |
|
"loss": 0.0523, |
|
"num_input_tokens_seen": 61280632, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 8.976833976833976, |
|
"grad_norm": 1.6042686700820923, |
|
"learning_rate": 2.9000688269166836e-05, |
|
"loss": 0.0512, |
|
"num_input_tokens_seen": 61427576, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 8.996138996138995, |
|
"grad_norm": 2.614938497543335, |
|
"learning_rate": 2.892583636159974e-05, |
|
"loss": 0.0446, |
|
"num_input_tokens_seen": 61573240, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 9.015444015444016, |
|
"grad_norm": 2.1098551750183105, |
|
"learning_rate": 2.885094835362591e-05, |
|
"loss": 0.0374, |
|
"num_input_tokens_seen": 61688312, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 9.034749034749035, |
|
"grad_norm": 1.2014209032058716, |
|
"learning_rate": 2.877602493388525e-05, |
|
"loss": 0.0329, |
|
"num_input_tokens_seen": 61788408, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 9.054054054054054, |
|
"grad_norm": 1.1981747150421143, |
|
"learning_rate": 2.8701066791343288e-05, |
|
"loss": 0.0231, |
|
"num_input_tokens_seen": 61935864, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 9.073359073359073, |
|
"grad_norm": 1.6186150312423706, |
|
"learning_rate": 2.8626074615284847e-05, |
|
"loss": 0.0384, |
|
"num_input_tokens_seen": 62054392, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 9.092664092664092, |
|
"grad_norm": 2.106605291366577, |
|
"learning_rate": 2.855104909530772e-05, |
|
"loss": 0.0422, |
|
"num_input_tokens_seen": 62181624, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 9.111969111969112, |
|
"grad_norm": 1.4917961359024048, |
|
"learning_rate": 2.8475990921316292e-05, |
|
"loss": 0.0343, |
|
"num_input_tokens_seen": 62302712, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 9.13127413127413, |
|
"grad_norm": 2.1147005558013916, |
|
"learning_rate": 2.8400900783515255e-05, |
|
"loss": 0.0291, |
|
"num_input_tokens_seen": 62461432, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 9.150579150579151, |
|
"grad_norm": 2.100454330444336, |
|
"learning_rate": 2.8325779372403194e-05, |
|
"loss": 0.0386, |
|
"num_input_tokens_seen": 62604280, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 9.16988416988417, |
|
"grad_norm": 1.4174185991287231, |
|
"learning_rate": 2.8250627378766297e-05, |
|
"loss": 0.037, |
|
"num_input_tokens_seen": 62740472, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 9.18918918918919, |
|
"grad_norm": 1.7185354232788086, |
|
"learning_rate": 2.8175445493671972e-05, |
|
"loss": 0.0346, |
|
"num_input_tokens_seen": 62860280, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 9.208494208494209, |
|
"grad_norm": 1.7431443929672241, |
|
"learning_rate": 2.8100234408462478e-05, |
|
"loss": 0.0302, |
|
"num_input_tokens_seen": 62966264, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 9.227799227799228, |
|
"grad_norm": 2.3386459350585938, |
|
"learning_rate": 2.8024994814748605e-05, |
|
"loss": 0.0348, |
|
"num_input_tokens_seen": 63144696, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 9.247104247104247, |
|
"grad_norm": 2.504485845565796, |
|
"learning_rate": 2.7949727404403302e-05, |
|
"loss": 0.0358, |
|
"num_input_tokens_seen": 63311096, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 9.266409266409266, |
|
"grad_norm": 1.7308130264282227, |
|
"learning_rate": 2.787443286955528e-05, |
|
"loss": 0.0316, |
|
"num_input_tokens_seen": 63436536, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 9.285714285714286, |
|
"grad_norm": 1.5274659395217896, |
|
"learning_rate": 2.7799111902582696e-05, |
|
"loss": 0.0401, |
|
"num_input_tokens_seen": 63570680, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 9.305019305019306, |
|
"grad_norm": 2.725780963897705, |
|
"learning_rate": 2.772376519610677e-05, |
|
"loss": 0.0458, |
|
"num_input_tokens_seen": 63676408, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 9.324324324324325, |
|
"grad_norm": 1.3256404399871826, |
|
"learning_rate": 2.7648393442985403e-05, |
|
"loss": 0.0403, |
|
"num_input_tokens_seen": 63788792, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 9.343629343629344, |
|
"grad_norm": 1.2829856872558594, |
|
"learning_rate": 2.7572997336306812e-05, |
|
"loss": 0.0324, |
|
"num_input_tokens_seen": 63911672, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 9.362934362934363, |
|
"grad_norm": 1.5341579914093018, |
|
"learning_rate": 2.749757756938317e-05, |
|
"loss": 0.0329, |
|
"num_input_tokens_seen": 64066296, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 9.382239382239382, |
|
"grad_norm": 2.627748966217041, |
|
"learning_rate": 2.7422134835744213e-05, |
|
"loss": 0.0336, |
|
"num_input_tokens_seen": 64216056, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 9.4015444015444, |
|
"grad_norm": 2.0893774032592773, |
|
"learning_rate": 2.7346669829130867e-05, |
|
"loss": 0.0349, |
|
"num_input_tokens_seen": 64307448, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 9.420849420849422, |
|
"grad_norm": 2.162018299102783, |
|
"learning_rate": 2.7271183243488878e-05, |
|
"loss": 0.0487, |
|
"num_input_tokens_seen": 64428536, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 9.44015444015444, |
|
"grad_norm": 2.446368932723999, |
|
"learning_rate": 2.7195675772962433e-05, |
|
"loss": 0.0393, |
|
"num_input_tokens_seen": 64535544, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 9.45945945945946, |
|
"grad_norm": 1.9724645614624023, |
|
"learning_rate": 2.7120148111887732e-05, |
|
"loss": 0.0457, |
|
"num_input_tokens_seen": 64653304, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 9.478764478764479, |
|
"grad_norm": 1.328217625617981, |
|
"learning_rate": 2.7044600954786687e-05, |
|
"loss": 0.0402, |
|
"num_input_tokens_seen": 64795896, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 9.498069498069498, |
|
"grad_norm": 2.314099073410034, |
|
"learning_rate": 2.696903499636045e-05, |
|
"loss": 0.044, |
|
"num_input_tokens_seen": 64955384, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 9.517374517374517, |
|
"grad_norm": 2.214271306991577, |
|
"learning_rate": 2.6893450931483083e-05, |
|
"loss": 0.0417, |
|
"num_input_tokens_seen": 65064696, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 9.536679536679536, |
|
"grad_norm": 1.3897863626480103, |
|
"learning_rate": 2.6817849455195133e-05, |
|
"loss": 0.0306, |
|
"num_input_tokens_seen": 65229304, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 9.555984555984557, |
|
"grad_norm": 2.2535767555236816, |
|
"learning_rate": 2.674223126269728e-05, |
|
"loss": 0.0257, |
|
"num_input_tokens_seen": 65346040, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 9.575289575289576, |
|
"grad_norm": 1.6450698375701904, |
|
"learning_rate": 2.6666597049343882e-05, |
|
"loss": 0.0359, |
|
"num_input_tokens_seen": 65495288, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 9.594594594594595, |
|
"grad_norm": 1.8544244766235352, |
|
"learning_rate": 2.659094751063666e-05, |
|
"loss": 0.0426, |
|
"num_input_tokens_seen": 65638392, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 9.613899613899614, |
|
"grad_norm": 2.5733869075775146, |
|
"learning_rate": 2.6515283342218228e-05, |
|
"loss": 0.0324, |
|
"num_input_tokens_seen": 65765624, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 9.633204633204633, |
|
"grad_norm": 1.7491555213928223, |
|
"learning_rate": 2.6439605239865745e-05, |
|
"loss": 0.0351, |
|
"num_input_tokens_seen": 65920248, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 9.652509652509652, |
|
"grad_norm": 1.1685649156570435, |
|
"learning_rate": 2.636391389948449e-05, |
|
"loss": 0.0367, |
|
"num_input_tokens_seen": 66074104, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 9.671814671814673, |
|
"grad_norm": 1.721147060394287, |
|
"learning_rate": 2.6288210017101488e-05, |
|
"loss": 0.0407, |
|
"num_input_tokens_seen": 66204408, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 9.691119691119692, |
|
"grad_norm": 2.34796404838562, |
|
"learning_rate": 2.621249428885908e-05, |
|
"loss": 0.0302, |
|
"num_input_tokens_seen": 66346744, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 9.71042471042471, |
|
"grad_norm": 1.8119391202926636, |
|
"learning_rate": 2.613676741100855e-05, |
|
"loss": 0.0332, |
|
"num_input_tokens_seen": 66492408, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 9.72972972972973, |
|
"grad_norm": 1.8855518102645874, |
|
"learning_rate": 2.606103007990371e-05, |
|
"loss": 0.0323, |
|
"num_input_tokens_seen": 66609144, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 9.749034749034749, |
|
"grad_norm": 1.2302645444869995, |
|
"learning_rate": 2.5985282991994482e-05, |
|
"loss": 0.0317, |
|
"num_input_tokens_seen": 66734584, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 9.768339768339768, |
|
"grad_norm": 2.0994198322296143, |
|
"learning_rate": 2.5909526843820508e-05, |
|
"loss": 0.0362, |
|
"num_input_tokens_seen": 66885624, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 9.787644787644787, |
|
"grad_norm": 1.5865612030029297, |
|
"learning_rate": 2.5833762332004768e-05, |
|
"loss": 0.0346, |
|
"num_input_tokens_seen": 67000056, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 9.806949806949808, |
|
"grad_norm": 1.5101972818374634, |
|
"learning_rate": 2.5757990153247124e-05, |
|
"loss": 0.0359, |
|
"num_input_tokens_seen": 67156728, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 9.826254826254827, |
|
"grad_norm": 1.7384374141693115, |
|
"learning_rate": 2.5682211004317953e-05, |
|
"loss": 0.0369, |
|
"num_input_tokens_seen": 67278072, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 9.845559845559846, |
|
"grad_norm": 1.505611777305603, |
|
"learning_rate": 2.5606425582051718e-05, |
|
"loss": 0.0371, |
|
"num_input_tokens_seen": 67396344, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 9.864864864864865, |
|
"grad_norm": 1.6645147800445557, |
|
"learning_rate": 2.5530634583340592e-05, |
|
"loss": 0.0404, |
|
"num_input_tokens_seen": 67537656, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 9.884169884169884, |
|
"grad_norm": 1.9146854877471924, |
|
"learning_rate": 2.5454838705127993e-05, |
|
"loss": 0.0332, |
|
"num_input_tokens_seen": 67656184, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 9.903474903474903, |
|
"grad_norm": 1.539021372795105, |
|
"learning_rate": 2.5379038644402235e-05, |
|
"loss": 0.0333, |
|
"num_input_tokens_seen": 67802872, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 9.922779922779922, |
|
"grad_norm": 2.631417751312256, |
|
"learning_rate": 2.5303235098190076e-05, |
|
"loss": 0.0275, |
|
"num_input_tokens_seen": 67920888, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 9.942084942084943, |
|
"grad_norm": 2.6899986267089844, |
|
"learning_rate": 2.5227428763550347e-05, |
|
"loss": 0.0431, |
|
"num_input_tokens_seen": 68055032, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 9.961389961389962, |
|
"grad_norm": 1.6248239278793335, |
|
"learning_rate": 2.5151620337567495e-05, |
|
"loss": 0.032, |
|
"num_input_tokens_seen": 68205816, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 9.980694980694981, |
|
"grad_norm": 1.8143270015716553, |
|
"learning_rate": 2.5075810517345212e-05, |
|
"loss": 0.0367, |
|
"num_input_tokens_seen": 68333304, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 1.5047626495361328, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.0363, |
|
"num_input_tokens_seen": 68448144, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 10.019305019305019, |
|
"grad_norm": 1.569034218788147, |
|
"learning_rate": 2.4924189482654794e-05, |
|
"loss": 0.0228, |
|
"num_input_tokens_seen": 68572304, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 10.038610038610038, |
|
"grad_norm": 1.3906745910644531, |
|
"learning_rate": 2.4848379662432515e-05, |
|
"loss": 0.0233, |
|
"num_input_tokens_seen": 68699792, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 10.057915057915057, |
|
"grad_norm": 1.429832100868225, |
|
"learning_rate": 2.477257123644966e-05, |
|
"loss": 0.0254, |
|
"num_input_tokens_seen": 68828304, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 10.077220077220078, |
|
"grad_norm": 1.1088787317276, |
|
"learning_rate": 2.4696764901809926e-05, |
|
"loss": 0.0302, |
|
"num_input_tokens_seen": 68985488, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 10.096525096525097, |
|
"grad_norm": 1.7969263792037964, |
|
"learning_rate": 2.462096135559777e-05, |
|
"loss": 0.041, |
|
"num_input_tokens_seen": 69069712, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 10.115830115830116, |
|
"grad_norm": 1.6021201610565186, |
|
"learning_rate": 2.4545161294872013e-05, |
|
"loss": 0.0268, |
|
"num_input_tokens_seen": 69187728, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 10.135135135135135, |
|
"grad_norm": 1.371846079826355, |
|
"learning_rate": 2.446936541665941e-05, |
|
"loss": 0.0252, |
|
"num_input_tokens_seen": 69336208, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 10.154440154440154, |
|
"grad_norm": 1.0030510425567627, |
|
"learning_rate": 2.4393574417948284e-05, |
|
"loss": 0.0224, |
|
"num_input_tokens_seen": 69449616, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 10.173745173745173, |
|
"grad_norm": 1.03380286693573, |
|
"learning_rate": 2.4317788995682052e-05, |
|
"loss": 0.0219, |
|
"num_input_tokens_seen": 69613456, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 10.193050193050192, |
|
"grad_norm": 0.9625149965286255, |
|
"learning_rate": 2.4242009846752885e-05, |
|
"loss": 0.0287, |
|
"num_input_tokens_seen": 69770128, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 10.212355212355213, |
|
"grad_norm": 1.572924017906189, |
|
"learning_rate": 2.4166237667995238e-05, |
|
"loss": 0.0242, |
|
"num_input_tokens_seen": 69901200, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 10.231660231660232, |
|
"grad_norm": 1.4663954973220825, |
|
"learning_rate": 2.4090473156179498e-05, |
|
"loss": 0.0226, |
|
"num_input_tokens_seen": 70050192, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 10.250965250965251, |
|
"grad_norm": 0.9331588745117188, |
|
"learning_rate": 2.4014717008005524e-05, |
|
"loss": 0.0351, |
|
"num_input_tokens_seen": 70166928, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 10.27027027027027, |
|
"grad_norm": 1.3637775182724, |
|
"learning_rate": 2.39389699200963e-05, |
|
"loss": 0.0209, |
|
"num_input_tokens_seen": 70283920, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 10.28957528957529, |
|
"grad_norm": 1.1673227548599243, |
|
"learning_rate": 2.386323258899145e-05, |
|
"loss": 0.0212, |
|
"num_input_tokens_seen": 70422928, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 10.308880308880308, |
|
"grad_norm": 1.4529045820236206, |
|
"learning_rate": 2.378750571114093e-05, |
|
"loss": 0.028, |
|
"num_input_tokens_seen": 70554512, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 10.328185328185327, |
|
"grad_norm": 1.3094784021377563, |
|
"learning_rate": 2.3711789982898518e-05, |
|
"loss": 0.021, |
|
"num_input_tokens_seen": 70661264, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 10.347490347490348, |
|
"grad_norm": 1.0893820524215698, |
|
"learning_rate": 2.3636086100515527e-05, |
|
"loss": 0.0283, |
|
"num_input_tokens_seen": 70817680, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 10.366795366795367, |
|
"grad_norm": 1.1798887252807617, |
|
"learning_rate": 2.3560394760134264e-05, |
|
"loss": 0.0228, |
|
"num_input_tokens_seen": 70970768, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 10.386100386100386, |
|
"grad_norm": 1.507263422012329, |
|
"learning_rate": 2.3484716657781785e-05, |
|
"loss": 0.0188, |
|
"num_input_tokens_seen": 71129488, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 10.405405405405405, |
|
"grad_norm": 1.5008866786956787, |
|
"learning_rate": 2.3409052489363342e-05, |
|
"loss": 0.0251, |
|
"num_input_tokens_seen": 71254928, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 10.424710424710424, |
|
"grad_norm": 1.0226303339004517, |
|
"learning_rate": 2.3333402950656124e-05, |
|
"loss": 0.0274, |
|
"num_input_tokens_seen": 71377296, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 10.444015444015443, |
|
"grad_norm": 2.0056560039520264, |
|
"learning_rate": 2.3257768737302728e-05, |
|
"loss": 0.0255, |
|
"num_input_tokens_seen": 71517072, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 10.463320463320464, |
|
"grad_norm": 0.8709852695465088, |
|
"learning_rate": 2.3182150544804876e-05, |
|
"loss": 0.017, |
|
"num_input_tokens_seen": 71660432, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 10.482625482625483, |
|
"grad_norm": 1.7117507457733154, |
|
"learning_rate": 2.3106549068516922e-05, |
|
"loss": 0.0275, |
|
"num_input_tokens_seen": 71778960, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 10.501930501930502, |
|
"grad_norm": 0.9321146011352539, |
|
"learning_rate": 2.3030965003639566e-05, |
|
"loss": 0.0426, |
|
"num_input_tokens_seen": 71877264, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 10.521235521235521, |
|
"grad_norm": 1.4264882802963257, |
|
"learning_rate": 2.295539904521332e-05, |
|
"loss": 0.0221, |
|
"num_input_tokens_seen": 72016272, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 10.54054054054054, |
|
"grad_norm": 1.4184226989746094, |
|
"learning_rate": 2.287985188811228e-05, |
|
"loss": 0.0264, |
|
"num_input_tokens_seen": 72151440, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 10.55984555984556, |
|
"grad_norm": 0.8701891899108887, |
|
"learning_rate": 2.2804324227037576e-05, |
|
"loss": 0.0195, |
|
"num_input_tokens_seen": 72298128, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 10.579150579150578, |
|
"grad_norm": 1.276286005973816, |
|
"learning_rate": 2.272881675651112e-05, |
|
"loss": 0.0228, |
|
"num_input_tokens_seen": 72453264, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 10.5984555984556, |
|
"grad_norm": 1.7602120637893677, |
|
"learning_rate": 2.2653330170869135e-05, |
|
"loss": 0.023, |
|
"num_input_tokens_seen": 72573072, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 10.617760617760618, |
|
"grad_norm": 1.3974103927612305, |
|
"learning_rate": 2.257786516425579e-05, |
|
"loss": 0.0253, |
|
"num_input_tokens_seen": 72700816, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 10.637065637065637, |
|
"grad_norm": 1.4042840003967285, |
|
"learning_rate": 2.2502422430616836e-05, |
|
"loss": 0.0234, |
|
"num_input_tokens_seen": 72825744, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 10.656370656370656, |
|
"grad_norm": 1.7441807985305786, |
|
"learning_rate": 2.242700266369319e-05, |
|
"loss": 0.0262, |
|
"num_input_tokens_seen": 72936336, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 10.675675675675675, |
|
"grad_norm": 2.0952086448669434, |
|
"learning_rate": 2.23516065570146e-05, |
|
"loss": 0.0299, |
|
"num_input_tokens_seen": 73027728, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 10.694980694980694, |
|
"grad_norm": 2.324326753616333, |
|
"learning_rate": 2.2276234803893232e-05, |
|
"loss": 0.0271, |
|
"num_input_tokens_seen": 73170320, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 10.714285714285714, |
|
"grad_norm": 1.1735119819641113, |
|
"learning_rate": 2.2200888097417307e-05, |
|
"loss": 0.0256, |
|
"num_input_tokens_seen": 73325456, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 10.733590733590734, |
|
"grad_norm": 1.1038674116134644, |
|
"learning_rate": 2.2125567130444724e-05, |
|
"loss": 0.0249, |
|
"num_input_tokens_seen": 73466512, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 10.752895752895753, |
|
"grad_norm": 1.9175952672958374, |
|
"learning_rate": 2.2050272595596704e-05, |
|
"loss": 0.0322, |
|
"num_input_tokens_seen": 73598608, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 10.772200772200772, |
|
"grad_norm": 1.6432002782821655, |
|
"learning_rate": 2.197500518525139e-05, |
|
"loss": 0.0274, |
|
"num_input_tokens_seen": 73708176, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 10.791505791505791, |
|
"grad_norm": 1.3228294849395752, |
|
"learning_rate": 2.1899765591537525e-05, |
|
"loss": 0.02, |
|
"num_input_tokens_seen": 73819536, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 10.81081081081081, |
|
"grad_norm": 1.7404805421829224, |
|
"learning_rate": 2.182455450632803e-05, |
|
"loss": 0.0243, |
|
"num_input_tokens_seen": 73968528, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 10.83011583011583, |
|
"grad_norm": 2.2002720832824707, |
|
"learning_rate": 2.174937262123371e-05, |
|
"loss": 0.0277, |
|
"num_input_tokens_seen": 74125968, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 10.849420849420849, |
|
"grad_norm": 1.1822292804718018, |
|
"learning_rate": 2.1674220627596812e-05, |
|
"loss": 0.0249, |
|
"num_input_tokens_seen": 74299792, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 10.86872586872587, |
|
"grad_norm": 1.8930922746658325, |
|
"learning_rate": 2.1599099216484757e-05, |
|
"loss": 0.0225, |
|
"num_input_tokens_seen": 74420880, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 10.888030888030888, |
|
"grad_norm": 1.2435505390167236, |
|
"learning_rate": 2.1524009078683717e-05, |
|
"loss": 0.0225, |
|
"num_input_tokens_seen": 74557072, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 10.907335907335908, |
|
"grad_norm": 1.4635958671569824, |
|
"learning_rate": 2.1448950904692293e-05, |
|
"loss": 0.02, |
|
"num_input_tokens_seen": 74673296, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 10.926640926640927, |
|
"grad_norm": 2.3589718341827393, |
|
"learning_rate": 2.137392538471516e-05, |
|
"loss": 0.024, |
|
"num_input_tokens_seen": 74808208, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 10.945945945945946, |
|
"grad_norm": 1.5779073238372803, |
|
"learning_rate": 2.1298933208656718e-05, |
|
"loss": 0.0255, |
|
"num_input_tokens_seen": 74922384, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 10.965250965250965, |
|
"grad_norm": 1.556203007698059, |
|
"learning_rate": 2.1223975066114754e-05, |
|
"loss": 0.0238, |
|
"num_input_tokens_seen": 75055248, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 10.984555984555985, |
|
"grad_norm": 1.1990100145339966, |
|
"learning_rate": 2.1149051646374098e-05, |
|
"loss": 0.0219, |
|
"num_input_tokens_seen": 75199376, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 11.003861003861005, |
|
"grad_norm": 1.4094477891921997, |
|
"learning_rate": 2.107416363840027e-05, |
|
"loss": 0.0212, |
|
"num_input_tokens_seen": 75316216, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 11.023166023166024, |
|
"grad_norm": 1.0576751232147217, |
|
"learning_rate": 2.0999311730833174e-05, |
|
"loss": 0.0188, |
|
"num_input_tokens_seen": 75449592, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 11.042471042471043, |
|
"grad_norm": 3.332852363586426, |
|
"learning_rate": 2.0924496611980734e-05, |
|
"loss": 0.0159, |
|
"num_input_tokens_seen": 75572216, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 11.061776061776062, |
|
"grad_norm": 0.4681335985660553, |
|
"learning_rate": 2.08497189698126e-05, |
|
"loss": 0.0135, |
|
"num_input_tokens_seen": 75717880, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 11.08108108108108, |
|
"grad_norm": 1.0423972606658936, |
|
"learning_rate": 2.0774979491953777e-05, |
|
"loss": 0.0202, |
|
"num_input_tokens_seen": 75864312, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 11.1003861003861, |
|
"grad_norm": 0.7003400325775146, |
|
"learning_rate": 2.070027886567835e-05, |
|
"loss": 0.0107, |
|
"num_input_tokens_seen": 75988472, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 11.11969111969112, |
|
"grad_norm": 1.0607558488845825, |
|
"learning_rate": 2.062561777790312e-05, |
|
"loss": 0.0193, |
|
"num_input_tokens_seen": 76111352, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 11.13899613899614, |
|
"grad_norm": 1.5472525358200073, |
|
"learning_rate": 2.0550996915181314e-05, |
|
"loss": 0.0187, |
|
"num_input_tokens_seen": 76215544, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 11.158301158301159, |
|
"grad_norm": 1.590269684791565, |
|
"learning_rate": 2.0476416963696292e-05, |
|
"loss": 0.0206, |
|
"num_input_tokens_seen": 76341496, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 11.177606177606178, |
|
"grad_norm": 1.5428239107131958, |
|
"learning_rate": 2.0401878609255158e-05, |
|
"loss": 0.0244, |
|
"num_input_tokens_seen": 76481272, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 11.196911196911197, |
|
"grad_norm": 2.3503377437591553, |
|
"learning_rate": 2.0327382537282562e-05, |
|
"loss": 0.0276, |
|
"num_input_tokens_seen": 76608760, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 11.216216216216216, |
|
"grad_norm": 1.3726725578308105, |
|
"learning_rate": 2.025292943281429e-05, |
|
"loss": 0.0206, |
|
"num_input_tokens_seen": 76791032, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 11.235521235521235, |
|
"grad_norm": 0.9943969249725342, |
|
"learning_rate": 2.017851998049107e-05, |
|
"loss": 0.0128, |
|
"num_input_tokens_seen": 76941304, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 11.254826254826256, |
|
"grad_norm": 3.1360983848571777, |
|
"learning_rate": 2.0104154864552168e-05, |
|
"loss": 0.0236, |
|
"num_input_tokens_seen": 77066744, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 11.274131274131275, |
|
"grad_norm": 1.0769143104553223, |
|
"learning_rate": 2.0029834768829196e-05, |
|
"loss": 0.0129, |
|
"num_input_tokens_seen": 77192184, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 11.293436293436294, |
|
"grad_norm": 1.3311856985092163, |
|
"learning_rate": 1.995556037673974e-05, |
|
"loss": 0.0317, |
|
"num_input_tokens_seen": 77322232, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 11.312741312741313, |
|
"grad_norm": 1.2713100910186768, |
|
"learning_rate": 1.9881332371281158e-05, |
|
"loss": 0.0215, |
|
"num_input_tokens_seen": 77459192, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 11.332046332046332, |
|
"grad_norm": 1.015021562576294, |
|
"learning_rate": 1.98071514350242e-05, |
|
"loss": 0.0165, |
|
"num_input_tokens_seen": 77550072, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 11.35135135135135, |
|
"grad_norm": 1.0809468030929565, |
|
"learning_rate": 1.973301825010685e-05, |
|
"loss": 0.0164, |
|
"num_input_tokens_seen": 77662968, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 11.37065637065637, |
|
"grad_norm": 1.120558738708496, |
|
"learning_rate": 1.9658933498227923e-05, |
|
"loss": 0.0179, |
|
"num_input_tokens_seen": 77792248, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 11.38996138996139, |
|
"grad_norm": 0.9761649966239929, |
|
"learning_rate": 1.9584897860640922e-05, |
|
"loss": 0.016, |
|
"num_input_tokens_seen": 77929208, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 11.40926640926641, |
|
"grad_norm": 1.1505382061004639, |
|
"learning_rate": 1.9510912018147677e-05, |
|
"loss": 0.0155, |
|
"num_input_tokens_seen": 78104824, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 11.428571428571429, |
|
"grad_norm": 1.3358957767486572, |
|
"learning_rate": 1.9436976651092144e-05, |
|
"loss": 0.0135, |
|
"num_input_tokens_seen": 78235384, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 11.447876447876448, |
|
"grad_norm": 0.9273285865783691, |
|
"learning_rate": 1.936309243935411e-05, |
|
"loss": 0.0161, |
|
"num_input_tokens_seen": 78378488, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 11.467181467181467, |
|
"grad_norm": 0.7956048250198364, |
|
"learning_rate": 1.928926006234297e-05, |
|
"loss": 0.0231, |
|
"num_input_tokens_seen": 78501112, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 11.486486486486486, |
|
"grad_norm": 0.9171015620231628, |
|
"learning_rate": 1.9215480198991466e-05, |
|
"loss": 0.0167, |
|
"num_input_tokens_seen": 78608376, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 11.505791505791505, |
|
"grad_norm": 0.8292757272720337, |
|
"learning_rate": 1.9141753527749443e-05, |
|
"loss": 0.02, |
|
"num_input_tokens_seen": 78749944, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 11.525096525096526, |
|
"grad_norm": 1.3520805835723877, |
|
"learning_rate": 1.906808072657761e-05, |
|
"loss": 0.0203, |
|
"num_input_tokens_seen": 78877432, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 11.544401544401545, |
|
"grad_norm": 1.324306845664978, |
|
"learning_rate": 1.8994462472941322e-05, |
|
"loss": 0.0147, |
|
"num_input_tokens_seen": 78997496, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 11.563706563706564, |
|
"grad_norm": 1.3603425025939941, |
|
"learning_rate": 1.892089944380432e-05, |
|
"loss": 0.0194, |
|
"num_input_tokens_seen": 79097080, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 11.583011583011583, |
|
"grad_norm": 0.8024409413337708, |
|
"learning_rate": 1.8847392315622535e-05, |
|
"loss": 0.0199, |
|
"num_input_tokens_seen": 79204600, |
|
"step": 3000 |
|
}, |
|
{
"epoch": 11.602316602316602,
"grad_norm": 1.7421422004699707,
"learning_rate": 1.877394176433784e-05,
"loss": 0.018,
"num_input_tokens_seen": 79323384,
"step": 3005
},
{
"epoch": 11.621621621621621,
"grad_norm": 1.2354506254196167,
"learning_rate": 1.8700548465371874e-05,
"loss": 0.0156,
"num_input_tokens_seen": 79470328,
"step": 3010
},
{
"epoch": 11.64092664092664,
"grad_norm": 0.9264888763427734,
"learning_rate": 1.8627213093619783e-05,
"loss": 0.0155,
"num_input_tokens_seen": 79587576,
"step": 3015
},
{
"epoch": 11.660231660231661,
"grad_norm": 1.572521448135376,
"learning_rate": 1.8553936323444058e-05,
"loss": 0.0173,
"num_input_tokens_seen": 79733752,
"step": 3020
},
{
"epoch": 11.67953667953668,
"grad_norm": 1.2958881855010986,
"learning_rate": 1.848071882866829e-05,
"loss": 0.0174,
"num_input_tokens_seen": 79845880,
"step": 3025
},
{
"epoch": 11.698841698841699,
"grad_norm": 1.1187669038772583,
"learning_rate": 1.8407561282571018e-05,
"loss": 0.0154,
"num_input_tokens_seen": 79975672,
"step": 3030
},
{
"epoch": 11.718146718146718,
"grad_norm": 2.266324043273926,
"learning_rate": 1.8334464357879515e-05,
"loss": 0.0211,
"num_input_tokens_seen": 80102136,
"step": 3035
},
{
"epoch": 11.737451737451737,
"grad_norm": 1.0018905401229858,
"learning_rate": 1.8261428726763584e-05,
"loss": 0.0165,
"num_input_tokens_seen": 80206584,
"step": 3040
},
{
"epoch": 11.756756756756756,
"grad_norm": 1.023120641708374,
"learning_rate": 1.818845506082943e-05,
"loss": 0.016,
"num_input_tokens_seen": 80338424,
"step": 3045
},
{
"epoch": 11.776061776061777,
"grad_norm": 2.093369960784912,
"learning_rate": 1.811554403111342e-05,
"loss": 0.0246,
"num_input_tokens_seen": 80482552,
"step": 3050
},
{
"epoch": 11.795366795366796,
"grad_norm": 0.853808581829071,
"learning_rate": 1.8042696308075968e-05,
"loss": 0.0168,
"num_input_tokens_seen": 80597752,
"step": 3055
},
{
"epoch": 11.814671814671815,
"grad_norm": 1.6099953651428223,
"learning_rate": 1.7969912561595317e-05,
"loss": 0.0181,
"num_input_tokens_seen": 80746744,
"step": 3060
},
{
"epoch": 11.833976833976834,
"grad_norm": 0.9269828200340271,
"learning_rate": 1.789719346096144e-05,
"loss": 0.0162,
"num_input_tokens_seen": 80899064,
"step": 3065
},
{
"epoch": 11.853281853281853,
"grad_norm": 0.6096843481063843,
"learning_rate": 1.7824539674869827e-05,
"loss": 0.0142,
"num_input_tokens_seen": 81060856,
"step": 3070
},
{
"epoch": 11.872586872586872,
"grad_norm": 0.7218270897865295,
"learning_rate": 1.7751951871415385e-05,
"loss": 0.017,
"num_input_tokens_seen": 81197816,
"step": 3075
},
{
"epoch": 11.891891891891891,
"grad_norm": 2.1457033157348633,
"learning_rate": 1.7679430718086243e-05,
"loss": 0.0243,
"num_input_tokens_seen": 81304312,
"step": 3080
},
{
"epoch": 11.91119691119691,
"grad_norm": 1.1789997816085815,
"learning_rate": 1.760697688175767e-05,
"loss": 0.0206,
"num_input_tokens_seen": 81434872,
"step": 3085
},
{
"epoch": 11.930501930501931,
"grad_norm": 1.7655112743377686,
"learning_rate": 1.7534591028685894e-05,
"loss": 0.0168,
"num_input_tokens_seen": 81596920,
"step": 3090
},
{
"epoch": 11.94980694980695,
"grad_norm": 1.3343671560287476,
"learning_rate": 1.746227382450201e-05,
"loss": 0.0173,
"num_input_tokens_seen": 81750008,
"step": 3095
},
{
"epoch": 11.96911196911197,
"grad_norm": 1.830177903175354,
"learning_rate": 1.7390025934205837e-05,
"loss": 0.0239,
"num_input_tokens_seen": 81850360,
"step": 3100
},
{
"epoch": 11.988416988416988,
"grad_norm": 1.2948367595672607,
"learning_rate": 1.7317848022159822e-05,
"loss": 0.0191,
"num_input_tokens_seen": 81999608,
"step": 3105
},
{
"epoch": 12.007722007722007,
"grad_norm": 0.61854487657547,
"learning_rate": 1.72457407520829e-05,
"loss": 0.0113,
"num_input_tokens_seen": 82144488,
"step": 3110
},
{
"epoch": 12.027027027027026,
"grad_norm": 0.6811046004295349,
"learning_rate": 1.7173704787044446e-05,
"loss": 0.0123,
"num_input_tokens_seen": 82274280,
"step": 3115
},
{
"epoch": 12.046332046332047,
"grad_norm": 0.5352727174758911,
"learning_rate": 1.7101740789458097e-05,
"loss": 0.0071,
"num_input_tokens_seen": 82384104,
"step": 3120
},
{
"epoch": 12.065637065637066,
"grad_norm": 1.288461446762085,
"learning_rate": 1.7029849421075757e-05,
"loss": 0.0128,
"num_input_tokens_seen": 82490344,
"step": 3125
},
{
"epoch": 12.084942084942085,
"grad_norm": 0.9903656840324402,
"learning_rate": 1.6958031342981405e-05,
"loss": 0.0127,
"num_input_tokens_seen": 82594536,
"step": 3130
},
{
"epoch": 12.104247104247104,
"grad_norm": 0.786799430847168,
"learning_rate": 1.6886287215585134e-05,
"loss": 0.0132,
"num_input_tokens_seen": 82700264,
"step": 3135
},
{
"epoch": 12.123552123552123,
"grad_norm": 0.7773774266242981,
"learning_rate": 1.6814617698616945e-05,
"loss": 0.0099,
"num_input_tokens_seen": 82830824,
"step": 3140
},
{
"epoch": 12.142857142857142,
"grad_norm": 1.209380030632019,
"learning_rate": 1.6743023451120832e-05,
"loss": 0.0111,
"num_input_tokens_seen": 82963944,
"step": 3145
},
{
"epoch": 12.162162162162161,
"grad_norm": 0.8817998170852661,
"learning_rate": 1.667150513144856e-05,
"loss": 0.0112,
"num_input_tokens_seen": 83074024,
"step": 3150
},
{
"epoch": 12.181467181467182,
"grad_norm": 1.7586342096328735,
"learning_rate": 1.660006339725377e-05,
"loss": 0.0133,
"num_input_tokens_seen": 83183080,
"step": 3155
},
{
"epoch": 12.200772200772201,
"grad_norm": 0.815854549407959,
"learning_rate": 1.6528698905485784e-05,
"loss": 0.0114,
"num_input_tokens_seen": 83314408,
"step": 3160
},
{
"epoch": 12.22007722007722,
"grad_norm": 1.0186066627502441,
"learning_rate": 1.645741231238369e-05,
"loss": 0.0151,
"num_input_tokens_seen": 83448808,
"step": 3165
},
{
"epoch": 12.23938223938224,
"grad_norm": 0.9455981254577637,
"learning_rate": 1.6386204273470208e-05,
"loss": 0.0081,
"num_input_tokens_seen": 83603688,
"step": 3170
},
{
"epoch": 12.258687258687258,
"grad_norm": 0.5357372164726257,
"learning_rate": 1.6315075443545734e-05,
"loss": 0.007,
"num_input_tokens_seen": 83779304,
"step": 3175
},
{
"epoch": 12.277992277992277,
"grad_norm": 1.0170356035232544,
"learning_rate": 1.624402647668229e-05,
"loss": 0.0123,
"num_input_tokens_seen": 83904232,
"step": 3180
},
{
"epoch": 12.297297297297296,
"grad_norm": 1.7153340578079224,
"learning_rate": 1.617305802621748e-05,
"loss": 0.0105,
"num_input_tokens_seen": 84064488,
"step": 3185
},
{
"epoch": 12.316602316602317,
"grad_norm": 0.8847137093544006,
"learning_rate": 1.610217074474855e-05,
"loss": 0.0082,
"num_input_tokens_seen": 84197352,
"step": 3190
},
{
"epoch": 12.335907335907336,
"grad_norm": 3.1383509635925293,
"learning_rate": 1.6031365284126314e-05,
"loss": 0.0168,
"num_input_tokens_seen": 84329704,
"step": 3195
},
{
"epoch": 12.355212355212355,
"grad_norm": 0.9392812252044678,
"learning_rate": 1.5960642295449228e-05,
"loss": 0.0092,
"num_input_tokens_seen": 84475112,
"step": 3200
},
{
"epoch": 12.374517374517374,
"grad_norm": 1.5533843040466309,
"learning_rate": 1.5890002429057344e-05,
"loss": 0.0131,
"num_input_tokens_seen": 84606440,
"step": 3205
},
{
"epoch": 12.393822393822393,
"grad_norm": 0.9457998275756836,
"learning_rate": 1.581944633452636e-05,
"loss": 0.0079,
"num_input_tokens_seen": 84771048,
"step": 3210
},
{
"epoch": 12.413127413127413,
"grad_norm": 0.1900276094675064,
"learning_rate": 1.5748974660661653e-05,
"loss": 0.0102,
"num_input_tokens_seen": 84870120,
"step": 3215
},
{
"epoch": 12.432432432432432,
"grad_norm": 1.335801362991333,
"learning_rate": 1.567858805549229e-05,
"loss": 0.0108,
"num_input_tokens_seen": 84991976,
"step": 3220
},
{
"epoch": 12.451737451737452,
"grad_norm": 1.124742865562439,
"learning_rate": 1.5608287166265075e-05,
"loss": 0.012,
"num_input_tokens_seen": 85147880,
"step": 3225
},
{
"epoch": 12.471042471042471,
"grad_norm": 1.095903992652893,
"learning_rate": 1.5538072639438633e-05,
"loss": 0.0097,
"num_input_tokens_seen": 85335016,
"step": 3230
},
{
"epoch": 12.49034749034749,
"grad_norm": 0.7687711715698242,
"learning_rate": 1.546794512067739e-05,
"loss": 0.0105,
"num_input_tokens_seen": 85456872,
"step": 3235
},
{
"epoch": 12.50965250965251,
"grad_norm": 0.6484484672546387,
"learning_rate": 1.539790525484573e-05,
"loss": 0.0086,
"num_input_tokens_seen": 85594344,
"step": 3240
},
{
"epoch": 12.528957528957529,
"grad_norm": 0.6688821315765381,
"learning_rate": 1.5327953686001973e-05,
"loss": 0.0177,
"num_input_tokens_seen": 85765352,
"step": 3245
},
{
"epoch": 12.548262548262548,
"grad_norm": 1.0532523393630981,
"learning_rate": 1.525809105739252e-05,
"loss": 0.0124,
"num_input_tokens_seen": 85894888,
"step": 3250
},
{
"epoch": 12.567567567567568,
"grad_norm": 0.633204460144043,
"learning_rate": 1.5188318011445906e-05,
"loss": 0.0121,
"num_input_tokens_seen": 86042856,
"step": 3255
},
{
"epoch": 12.586872586872587,
"grad_norm": 1.3112810850143433,
"learning_rate": 1.511863518976691e-05,
"loss": 0.0137,
"num_input_tokens_seen": 86182632,
"step": 3260
},
{
"epoch": 12.606177606177607,
"grad_norm": 0.8959654569625854,
"learning_rate": 1.5049043233130622e-05,
"loss": 0.0117,
"num_input_tokens_seen": 86324200,
"step": 3265
},
{
"epoch": 12.625482625482626,
"grad_norm": 1.571006417274475,
"learning_rate": 1.4979542781476601e-05,
"loss": 0.0116,
"num_input_tokens_seen": 86514152,
"step": 3270
},
{
"epoch": 12.644787644787645,
"grad_norm": 1.573019027709961,
"learning_rate": 1.4910134473902943e-05,
"loss": 0.0097,
"num_input_tokens_seen": 86638312,
"step": 3275
},
{
"epoch": 12.664092664092664,
"grad_norm": 1.1178486347198486,
"learning_rate": 1.4840818948660434e-05,
"loss": 0.0117,
"num_input_tokens_seen": 86778856,
"step": 3280
},
{
"epoch": 12.683397683397683,
"grad_norm": 0.44218119978904724,
"learning_rate": 1.4771596843146665e-05,
"loss": 0.0072,
"num_input_tokens_seen": 86894824,
"step": 3285
},
{
"epoch": 12.702702702702704,
"grad_norm": 0.4475366175174713,
"learning_rate": 1.4702468793900188e-05,
"loss": 0.0158,
"num_input_tokens_seen": 87012840,
"step": 3290
},
{
"epoch": 12.722007722007723,
"grad_norm": 0.7916795611381531,
"learning_rate": 1.4633435436594636e-05,
"loss": 0.0086,
"num_input_tokens_seen": 87125736,
"step": 3295
},
{
"epoch": 12.741312741312742,
"grad_norm": 0.755908191204071,
"learning_rate": 1.456449740603291e-05,
"loss": 0.0122,
"num_input_tokens_seen": 87274216,
"step": 3300
},
{
"epoch": 12.76061776061776,
"grad_norm": 0.6457445621490479,
"learning_rate": 1.4495655336141301e-05,
"loss": 0.0125,
"num_input_tokens_seen": 87417832,
"step": 3305
},
{
"epoch": 12.77992277992278,
"grad_norm": 0.6330153346061707,
"learning_rate": 1.4426909859963717e-05,
"loss": 0.0073,
"num_input_tokens_seen": 87560936,
"step": 3310
},
{
"epoch": 12.799227799227799,
"grad_norm": 4.419582843780518,
"learning_rate": 1.4358261609655804e-05,
"loss": 0.0335,
"num_input_tokens_seen": 87672040,
"step": 3315
},
{
"epoch": 12.818532818532818,
"grad_norm": 1.6640141010284424,
"learning_rate": 1.4289711216479156e-05,
"loss": 0.0126,
"num_input_tokens_seen": 87772648,
"step": 3320
},
{
"epoch": 12.837837837837839,
"grad_norm": 1.2241111993789673,
"learning_rate": 1.4221259310795543e-05,
"loss": 0.0121,
"num_input_tokens_seen": 87863272,
"step": 3325
},
{
"epoch": 12.857142857142858,
"grad_norm": 0.7807748913764954,
"learning_rate": 1.4152906522061048e-05,
"loss": 0.0099,
"num_input_tokens_seen": 87976424,
"step": 3330
},
{
"epoch": 12.876447876447877,
"grad_norm": 0.5787301659584045,
"learning_rate": 1.4084653478820336e-05,
"loss": 0.0141,
"num_input_tokens_seen": 88120808,
"step": 3335
},
{
"epoch": 12.895752895752896,
"grad_norm": 0.5152149796485901,
"learning_rate": 1.401650080870083e-05,
"loss": 0.0088,
"num_input_tokens_seen": 88256232,
"step": 3340
},
{
"epoch": 12.915057915057915,
"grad_norm": 1.5801312923431396,
"learning_rate": 1.3948449138407002e-05,
"loss": 0.0109,
"num_input_tokens_seen": 88392936,
"step": 3345
},
{
"epoch": 12.934362934362934,
"grad_norm": 1.0263210535049438,
"learning_rate": 1.3880499093714534e-05,
"loss": 0.0152,
"num_input_tokens_seen": 88529896,
"step": 3350
},
{
"epoch": 12.953667953667953,
"grad_norm": 1.541890263557434,
"learning_rate": 1.3812651299464612e-05,
"loss": 0.0123,
"num_input_tokens_seen": 88639208,
"step": 3355
},
{
"epoch": 12.972972972972974,
"grad_norm": 0.8043264746665955,
"learning_rate": 1.3744906379558165e-05,
"loss": 0.0141,
"num_input_tokens_seen": 88752872,
"step": 3360
},
{
"epoch": 12.992277992277993,
"grad_norm": 0.8968296051025391,
"learning_rate": 1.367726495695015e-05,
"loss": 0.0111,
"num_input_tokens_seen": 88867816,
"step": 3365
},
{
"epoch": 13.011583011583012,
"grad_norm": 0.6285333633422852,
"learning_rate": 1.3609727653643779e-05,
"loss": 0.0114,
"num_input_tokens_seen": 88965264,
"step": 3370
},
{
"epoch": 13.03088803088803,
"grad_norm": 0.5702968239784241,
"learning_rate": 1.3542295090684837e-05,
"loss": 0.0057,
"num_input_tokens_seen": 89141392,
"step": 3375
},
{
"epoch": 13.05019305019305,
"grad_norm": 1.4923161268234253,
"learning_rate": 1.3474967888155948e-05,
"loss": 0.006,
"num_input_tokens_seen": 89261200,
"step": 3380
},
{
"epoch": 13.069498069498069,
"grad_norm": 0.506509006023407,
"learning_rate": 1.3407746665170912e-05,
"loss": 0.0072,
"num_input_tokens_seen": 89401488,
"step": 3385
},
{
"epoch": 13.088803088803088,
"grad_norm": 1.0197346210479736,
"learning_rate": 1.3340632039868953e-05,
"loss": 0.0055,
"num_input_tokens_seen": 89504912,
"step": 3390
},
{
"epoch": 13.108108108108109,
"grad_norm": 0.270197331905365,
"learning_rate": 1.3273624629409082e-05,
"loss": 0.0039,
"num_input_tokens_seen": 89619856,
"step": 3395
},
{
"epoch": 13.127413127413128,
"grad_norm": 0.5792621374130249,
"learning_rate": 1.3206725049964392e-05,
"loss": 0.0044,
"num_input_tokens_seen": 89806992,
"step": 3400
},
{
"epoch": 13.146718146718147,
"grad_norm": 0.8487739562988281,
"learning_rate": 1.3139933916716435e-05,
"loss": 0.0084,
"num_input_tokens_seen": 89948304,
"step": 3405
},
{
"epoch": 13.166023166023166,
"grad_norm": 0.6044009327888489,
"learning_rate": 1.3073251843849501e-05,
"loss": 0.0074,
"num_input_tokens_seen": 90098320,
"step": 3410
},
{
"epoch": 13.185328185328185,
"grad_norm": 0.7097198367118835,
"learning_rate": 1.3006679444545025e-05,
"loss": 0.0066,
"num_input_tokens_seen": 90211216,
"step": 3415
},
{
"epoch": 13.204633204633204,
"grad_norm": 1.2196393013000488,
"learning_rate": 1.2940217330975912e-05,
"loss": 0.0054,
"num_input_tokens_seen": 90331024,
"step": 3420
},
{
"epoch": 13.223938223938223,
"grad_norm": 0.7763360142707825,
"learning_rate": 1.2873866114300951e-05,
"loss": 0.0115,
"num_input_tokens_seen": 90446224,
"step": 3425
},
{
"epoch": 13.243243243243244,
"grad_norm": 0.3707767128944397,
"learning_rate": 1.2807626404659142e-05,
"loss": 0.0082,
"num_input_tokens_seen": 90567568,
"step": 3430
},
{
"epoch": 13.262548262548263,
"grad_norm": 2.04919171333313,
"learning_rate": 1.2741498811164127e-05,
"loss": 0.012,
"num_input_tokens_seen": 90708368,
"step": 3435
},
{
"epoch": 13.281853281853282,
"grad_norm": 0.5803795456886292,
"learning_rate": 1.2675483941898548e-05,
"loss": 0.0122,
"num_input_tokens_seen": 90872976,
"step": 3440
},
{
"epoch": 13.301158301158301,
"grad_norm": 0.20803996920585632,
"learning_rate": 1.2609582403908526e-05,
"loss": 0.0045,
"num_input_tokens_seen": 90986128,
"step": 3445
},
{
"epoch": 13.32046332046332,
"grad_norm": 0.2549193203449249,
"learning_rate": 1.2543794803197995e-05,
"loss": 0.0065,
"num_input_tokens_seen": 91110544,
"step": 3450
},
{
"epoch": 13.339768339768339,
"grad_norm": 0.8609089851379395,
"learning_rate": 1.2478121744723164e-05,
"loss": 0.0102,
"num_input_tokens_seen": 91256720,
"step": 3455
},
{
"epoch": 13.35907335907336,
"grad_norm": 0.8688700199127197,
"learning_rate": 1.2412563832387003e-05,
"loss": 0.0035,
"num_input_tokens_seen": 91354512,
"step": 3460
},
{
"epoch": 13.378378378378379,
"grad_norm": 0.6687062978744507,
"learning_rate": 1.234712166903359e-05,
"loss": 0.0199,
"num_input_tokens_seen": 91484304,
"step": 3465
},
{
"epoch": 13.397683397683398,
"grad_norm": 1.2453186511993408,
"learning_rate": 1.2281795856442668e-05,
"loss": 0.0178,
"num_input_tokens_seen": 91621520,
"step": 3470
},
{
"epoch": 13.416988416988417,
"grad_norm": 0.5360046029090881,
"learning_rate": 1.2216586995324031e-05,
"loss": 0.008,
"num_input_tokens_seen": 91743888,
"step": 3475
},
{
"epoch": 13.436293436293436,
"grad_norm": 0.4313749372959137,
"learning_rate": 1.2151495685312054e-05,
"loss": 0.0084,
"num_input_tokens_seen": 91875728,
"step": 3480
},
{
"epoch": 13.455598455598455,
"grad_norm": 0.7261118292808533,
"learning_rate": 1.2086522524960137e-05,
"loss": 0.0061,
"num_input_tokens_seen": 91986576,
"step": 3485
},
{
"epoch": 13.474903474903474,
"grad_norm": 0.6693235635757446,
"learning_rate": 1.2021668111735263e-05,
"loss": 0.0104,
"num_input_tokens_seen": 92082320,
"step": 3490
},
{
"epoch": 13.494208494208495,
"grad_norm": 0.5525346994400024,
"learning_rate": 1.1956933042012427e-05,
"loss": 0.0073,
"num_input_tokens_seen": 92183696,
"step": 3495
},
{
"epoch": 13.513513513513514,
"grad_norm": 1.5677294731140137,
"learning_rate": 1.1892317911069212e-05,
"loss": 0.0055,
"num_input_tokens_seen": 92300432,
"step": 3500
},
{
"epoch": 13.532818532818533,
"grad_norm": 0.368313729763031,
"learning_rate": 1.1827823313080266e-05,
"loss": 0.0051,
"num_input_tokens_seen": 92447120,
"step": 3505
},
{
"epoch": 13.552123552123552,
"grad_norm": 0.3600405752658844,
"learning_rate": 1.1763449841111906e-05,
"loss": 0.0092,
"num_input_tokens_seen": 92584848,
"step": 3510
},
{
"epoch": 13.571428571428571,
"grad_norm": 0.3889321982860565,
"learning_rate": 1.1699198087116589e-05,
"loss": 0.0048,
"num_input_tokens_seen": 92709520,
"step": 3515
},
{
"epoch": 13.59073359073359,
"grad_norm": 0.35128799080848694,
"learning_rate": 1.163506864192751e-05,
"loss": 0.0075,
"num_input_tokens_seen": 92850832,
"step": 3520
},
{
"epoch": 13.61003861003861,
"grad_norm": 0.8390093445777893,
"learning_rate": 1.1571062095253157e-05,
"loss": 0.0116,
"num_input_tokens_seen": 92988560,
"step": 3525
},
{
"epoch": 13.62934362934363,
"grad_norm": 0.51105797290802,
"learning_rate": 1.1507179035671922e-05,
"loss": 0.0063,
"num_input_tokens_seen": 93109392,
"step": 3530
},
{
"epoch": 13.64864864864865,
"grad_norm": 0.18288812041282654,
"learning_rate": 1.1443420050626625e-05,
"loss": 0.004,
"num_input_tokens_seen": 93271440,
"step": 3535
},
{
"epoch": 13.667953667953668,
"grad_norm": 0.7214062809944153,
"learning_rate": 1.1379785726419162e-05,
"loss": 0.0105,
"num_input_tokens_seen": 93416592,
"step": 3540
},
{
"epoch": 13.687258687258687,
"grad_norm": 0.3515792489051819,
"learning_rate": 1.1316276648205085e-05,
"loss": 0.0089,
"num_input_tokens_seen": 93549200,
"step": 3545
},
{
"epoch": 13.706563706563706,
"grad_norm": 0.46057796478271484,
"learning_rate": 1.1252893399988263e-05,
"loss": 0.0087,
"num_input_tokens_seen": 93678224,
"step": 3550
},
{
"epoch": 13.725868725868725,
"grad_norm": 1.148747205734253,
"learning_rate": 1.1189636564615458e-05,
"loss": 0.0061,
"num_input_tokens_seen": 93810832,
"step": 3555
},
{
"epoch": 13.745173745173744,
"grad_norm": 0.8160545825958252,
"learning_rate": 1.1126506723770996e-05,
"loss": 0.0093,
"num_input_tokens_seen": 93951888,
"step": 3560
},
{
"epoch": 13.764478764478765,
"grad_norm": 0.3965890109539032,
"learning_rate": 1.1063504457971408e-05,
"loss": 0.0077,
"num_input_tokens_seen": 94042000,
"step": 3565
},
{
"epoch": 13.783783783783784,
"grad_norm": 0.2592761814594269,
"learning_rate": 1.1000630346560117e-05,
"loss": 0.0057,
"num_input_tokens_seen": 94174864,
"step": 3570
},
{
"epoch": 13.803088803088803,
"grad_norm": 0.4129311442375183,
"learning_rate": 1.0937884967702073e-05,
"loss": 0.0064,
"num_input_tokens_seen": 94305424,
"step": 3575
},
{
"epoch": 13.822393822393822,
"grad_norm": 0.2524395287036896,
"learning_rate": 1.087526889837845e-05,
"loss": 0.0091,
"num_input_tokens_seen": 94430352,
"step": 3580
},
{
"epoch": 13.841698841698841,
"grad_norm": 0.7542433142662048,
"learning_rate": 1.0812782714381342e-05,
"loss": 0.0053,
"num_input_tokens_seen": 94561424,
"step": 3585
},
{
"epoch": 13.86100386100386,
"grad_norm": 0.449748694896698,
"learning_rate": 1.0750426990308498e-05,
"loss": 0.0094,
"num_input_tokens_seen": 94686864,
"step": 3590
},
{
"epoch": 13.880308880308881,
"grad_norm": 0.30807429552078247,
"learning_rate": 1.0688202299557982e-05,
"loss": 0.0087,
"num_input_tokens_seen": 94844816,
"step": 3595
},
{
"epoch": 13.8996138996139,
"grad_norm": 1.2935056686401367,
"learning_rate": 1.0626109214322923e-05,
"loss": 0.0089,
"num_input_tokens_seen": 94953104,
"step": 3600
}
],
"logging_steps": 5,
"max_steps": 5180,
"num_input_tokens_seen": 94953104,
"num_train_epochs": 20,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.544194638795899e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}