|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 11, |
|
"global_step": 101, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.009900990099009901, |
|
"grad_norm": 1.9865820407867432, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8807, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.019801980198019802, |
|
"grad_norm": 1.6995460987091064, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7298, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0297029702970297, |
|
"grad_norm": 1.870273232460022, |
|
"learning_rate": 9.997482711915926e-06, |
|
"loss": 0.8256, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.039603960396039604, |
|
"grad_norm": 2.035698175430298, |
|
"learning_rate": 9.989933382359423e-06, |
|
"loss": 0.8423, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.04950495049504951, |
|
"grad_norm": 1.5802401304244995, |
|
"learning_rate": 9.977359612865424e-06, |
|
"loss": 1.0321, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0594059405940594, |
|
"grad_norm": 1.3238128423690796, |
|
"learning_rate": 9.959774064153977e-06, |
|
"loss": 0.7894, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.06930693069306931, |
|
"grad_norm": 1.3449586629867554, |
|
"learning_rate": 9.937194443381972e-06, |
|
"loss": 0.69, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.07920792079207921, |
|
"grad_norm": 1.215697169303894, |
|
"learning_rate": 9.909643486313533e-06, |
|
"loss": 0.8263, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0891089108910891, |
|
"grad_norm": 1.4947011470794678, |
|
"learning_rate": 9.877148934427037e-06, |
|
"loss": 0.7706, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.09900990099009901, |
|
"grad_norm": 0.9939355850219727, |
|
"learning_rate": 9.839743506981783e-06, |
|
"loss": 0.6854, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.10891089108910891, |
|
"grad_norm": 1.3050940036773682, |
|
"learning_rate": 9.797464868072489e-06, |
|
"loss": 0.9061, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.10891089108910891, |
|
"eval_loss": 0.7099791765213013, |
|
"eval_runtime": 1.6876, |
|
"eval_samples_per_second": 5.333, |
|
"eval_steps_per_second": 1.185, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.1188118811881188, |
|
"grad_norm": 1.0027287006378174, |
|
"learning_rate": 9.750355588704728e-06, |
|
"loss": 0.7043, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.12871287128712872, |
|
"grad_norm": 0.9371753334999084, |
|
"learning_rate": 9.698463103929542e-06, |
|
"loss": 0.6727, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.13861386138613863, |
|
"grad_norm": 0.9471062421798706, |
|
"learning_rate": 9.641839665080363e-06, |
|
"loss": 0.6772, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.1485148514851485, |
|
"grad_norm": 1.1542134284973145, |
|
"learning_rate": 9.580542287160348e-06, |
|
"loss": 0.7956, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.15841584158415842, |
|
"grad_norm": 1.1275238990783691, |
|
"learning_rate": 9.514632691433108e-06, |
|
"loss": 0.7486, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.16831683168316833, |
|
"grad_norm": 1.0453617572784424, |
|
"learning_rate": 9.444177243274619e-06, |
|
"loss": 0.659, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.1782178217821782, |
|
"grad_norm": 1.0331149101257324, |
|
"learning_rate": 9.369246885348926e-06, |
|
"loss": 0.7471, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.18811881188118812, |
|
"grad_norm": 0.8706634044647217, |
|
"learning_rate": 9.289917066174887e-06, |
|
"loss": 0.5547, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.19801980198019803, |
|
"grad_norm": 1.1781532764434814, |
|
"learning_rate": 9.206267664155906e-06, |
|
"loss": 0.6239, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.2079207920792079, |
|
"grad_norm": 1.2011443376541138, |
|
"learning_rate": 9.118382907149164e-06, |
|
"loss": 0.7847, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.21782178217821782, |
|
"grad_norm": 1.028536319732666, |
|
"learning_rate": 9.026351287655294e-06, |
|
"loss": 0.6565, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.21782178217821782, |
|
"eval_loss": 0.6684643030166626, |
|
"eval_runtime": 1.6848, |
|
"eval_samples_per_second": 5.342, |
|
"eval_steps_per_second": 1.187, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.22772277227722773, |
|
"grad_norm": 0.9103761315345764, |
|
"learning_rate": 8.930265473713939e-06, |
|
"loss": 0.6495, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.2376237623762376, |
|
"grad_norm": 1.0075730085372925, |
|
"learning_rate": 8.83022221559489e-06, |
|
"loss": 0.7109, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.24752475247524752, |
|
"grad_norm": 0.9617104530334473, |
|
"learning_rate": 8.726322248378775e-06, |
|
"loss": 0.6993, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.25742574257425743, |
|
"grad_norm": 1.072245717048645, |
|
"learning_rate": 8.61867019052535e-06, |
|
"loss": 0.8897, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.26732673267326734, |
|
"grad_norm": 1.0244859457015991, |
|
"learning_rate": 8.507374438531606e-06, |
|
"loss": 0.8158, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.27722772277227725, |
|
"grad_norm": 1.0862797498703003, |
|
"learning_rate": 8.392547057785662e-06, |
|
"loss": 0.7539, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.2871287128712871, |
|
"grad_norm": 0.9954370260238647, |
|
"learning_rate": 8.274303669726427e-06, |
|
"loss": 0.718, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.297029702970297, |
|
"grad_norm": 1.0018794536590576, |
|
"learning_rate": 8.152763335422612e-06, |
|
"loss": 0.7518, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.3069306930693069, |
|
"grad_norm": 0.9397566318511963, |
|
"learning_rate": 8.028048435688333e-06, |
|
"loss": 0.663, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.31683168316831684, |
|
"grad_norm": 0.8733705878257751, |
|
"learning_rate": 7.900284547855992e-06, |
|
"loss": 0.6565, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.32673267326732675, |
|
"grad_norm": 0.938222348690033, |
|
"learning_rate": 7.769600319330553e-06, |
|
"loss": 0.601, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.32673267326732675, |
|
"eval_loss": 0.648015022277832, |
|
"eval_runtime": 1.6839, |
|
"eval_samples_per_second": 5.345, |
|
"eval_steps_per_second": 1.188, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.33663366336633666, |
|
"grad_norm": 0.9610555768013, |
|
"learning_rate": 7.636127338052513e-06, |
|
"loss": 0.7488, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.3465346534653465, |
|
"grad_norm": 1.0051265954971313, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 0.7929, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.3564356435643564, |
|
"grad_norm": 0.8689800500869751, |
|
"learning_rate": 7.361355373863415e-06, |
|
"loss": 0.57, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.36633663366336633, |
|
"grad_norm": 1.0085370540618896, |
|
"learning_rate": 7.2203330630288714e-06, |
|
"loss": 0.672, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.37623762376237624, |
|
"grad_norm": 0.8325203061103821, |
|
"learning_rate": 7.0770750650094335e-06, |
|
"loss": 0.6143, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.38613861386138615, |
|
"grad_norm": 0.9245986342430115, |
|
"learning_rate": 6.931725628465643e-06, |
|
"loss": 0.7288, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.39603960396039606, |
|
"grad_norm": 1.1846799850463867, |
|
"learning_rate": 6.78443110795936e-06, |
|
"loss": 0.8207, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.40594059405940597, |
|
"grad_norm": 0.8949479460716248, |
|
"learning_rate": 6.635339816587109e-06, |
|
"loss": 0.6476, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.4158415841584158, |
|
"grad_norm": 0.9853402376174927, |
|
"learning_rate": 6.484601876641375e-06, |
|
"loss": 0.7146, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.42574257425742573, |
|
"grad_norm": 0.9564022421836853, |
|
"learning_rate": 6.332369068450175e-06, |
|
"loss": 0.7382, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.43564356435643564, |
|
"grad_norm": 1.0044441223144531, |
|
"learning_rate": 6.178794677547138e-06, |
|
"loss": 0.7668, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.43564356435643564, |
|
"eval_loss": 0.6342881917953491, |
|
"eval_runtime": 1.6847, |
|
"eval_samples_per_second": 5.342, |
|
"eval_steps_per_second": 1.187, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.44554455445544555, |
|
"grad_norm": 0.8848042488098145, |
|
"learning_rate": 6.024033340325954e-06, |
|
"loss": 0.6532, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.45544554455445546, |
|
"grad_norm": 0.8762900233268738, |
|
"learning_rate": 5.8682408883346535e-06, |
|
"loss": 0.674, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.46534653465346537, |
|
"grad_norm": 0.8674903512001038, |
|
"learning_rate": 5.711574191366427e-06, |
|
"loss": 0.6138, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.4752475247524752, |
|
"grad_norm": 0.9267687201499939, |
|
"learning_rate": 5.5541909995050554e-06, |
|
"loss": 0.6838, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.48514851485148514, |
|
"grad_norm": 0.8423399329185486, |
|
"learning_rate": 5.396249784283943e-06, |
|
"loss": 0.6081, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.49504950495049505, |
|
"grad_norm": 0.9809019565582275, |
|
"learning_rate": 5.237909579118713e-06, |
|
"loss": 0.7202, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.504950495049505, |
|
"grad_norm": 0.8688491582870483, |
|
"learning_rate": 5.07932981917404e-06, |
|
"loss": 0.6236, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.5148514851485149, |
|
"grad_norm": 0.9908064007759094, |
|
"learning_rate": 4.9206701808259605e-06, |
|
"loss": 0.7449, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.5247524752475248, |
|
"grad_norm": 0.935342013835907, |
|
"learning_rate": 4.762090420881289e-06, |
|
"loss": 0.6582, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.5346534653465347, |
|
"grad_norm": 1.0338435173034668, |
|
"learning_rate": 4.603750215716057e-06, |
|
"loss": 0.7808, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.5445544554455446, |
|
"grad_norm": 1.0253307819366455, |
|
"learning_rate": 4.445809000494945e-06, |
|
"loss": 0.8058, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.5445544554455446, |
|
"eval_loss": 0.6248189210891724, |
|
"eval_runtime": 1.6835, |
|
"eval_samples_per_second": 5.346, |
|
"eval_steps_per_second": 1.188, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.5544554455445545, |
|
"grad_norm": 0.9448875188827515, |
|
"learning_rate": 4.2884258086335755e-06, |
|
"loss": 0.7117, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.5643564356435643, |
|
"grad_norm": 0.8737557530403137, |
|
"learning_rate": 4.131759111665349e-06, |
|
"loss": 0.5988, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.5742574257425742, |
|
"grad_norm": 0.764798104763031, |
|
"learning_rate": 3.975966659674048e-06, |
|
"loss": 0.501, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.5841584158415841, |
|
"grad_norm": 0.9195338487625122, |
|
"learning_rate": 3.821205322452863e-06, |
|
"loss": 0.677, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.594059405940594, |
|
"grad_norm": 0.9102662801742554, |
|
"learning_rate": 3.667630931549826e-06, |
|
"loss": 0.6363, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.6039603960396039, |
|
"grad_norm": 1.1512144804000854, |
|
"learning_rate": 3.5153981233586277e-06, |
|
"loss": 0.7082, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.6138613861386139, |
|
"grad_norm": 1.0006070137023926, |
|
"learning_rate": 3.3646601834128924e-06, |
|
"loss": 0.6685, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.6237623762376238, |
|
"grad_norm": 0.9273307919502258, |
|
"learning_rate": 3.2155688920406415e-06, |
|
"loss": 0.6652, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.6336633663366337, |
|
"grad_norm": 0.9092214703559875, |
|
"learning_rate": 3.0682743715343565e-06, |
|
"loss": 0.6497, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.6435643564356436, |
|
"grad_norm": 0.8190207481384277, |
|
"learning_rate": 2.9229249349905686e-06, |
|
"loss": 0.5194, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.6534653465346535, |
|
"grad_norm": 0.9039291739463806, |
|
"learning_rate": 2.7796669369711294e-06, |
|
"loss": 0.675, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.6534653465346535, |
|
"eval_loss": 0.6190232038497925, |
|
"eval_runtime": 1.6846, |
|
"eval_samples_per_second": 5.342, |
|
"eval_steps_per_second": 1.187, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.6633663366336634, |
|
"grad_norm": 1.0311757326126099, |
|
"learning_rate": 2.6386446261365874e-06, |
|
"loss": 0.8244, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.6732673267326733, |
|
"grad_norm": 0.8113132119178772, |
|
"learning_rate": 2.5000000000000015e-06, |
|
"loss": 0.5488, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.6831683168316832, |
|
"grad_norm": 0.9003159403800964, |
|
"learning_rate": 2.363872661947488e-06, |
|
"loss": 0.7291, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.693069306930693, |
|
"grad_norm": 1.057133674621582, |
|
"learning_rate": 2.230399680669449e-06, |
|
"loss": 0.8625, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.7029702970297029, |
|
"grad_norm": 0.9384030699729919, |
|
"learning_rate": 2.09971545214401e-06, |
|
"loss": 0.6813, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.7128712871287128, |
|
"grad_norm": 0.8832238912582397, |
|
"learning_rate": 1.971951564311668e-06, |
|
"loss": 0.6099, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.7227722772277227, |
|
"grad_norm": 0.8297242522239685, |
|
"learning_rate": 1.8472366645773892e-06, |
|
"loss": 0.5465, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.7326732673267327, |
|
"grad_norm": 0.9183589816093445, |
|
"learning_rate": 1.7256963302735752e-06, |
|
"loss": 0.6971, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.7425742574257426, |
|
"grad_norm": 0.8353100419044495, |
|
"learning_rate": 1.6074529422143398e-06, |
|
"loss": 0.5426, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.7524752475247525, |
|
"grad_norm": 1.051756739616394, |
|
"learning_rate": 1.4926255614683931e-06, |
|
"loss": 0.7567, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.7623762376237624, |
|
"grad_norm": 0.8063908815383911, |
|
"learning_rate": 1.3813298094746491e-06, |
|
"loss": 0.5383, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.7623762376237624, |
|
"eval_loss": 0.6155756711959839, |
|
"eval_runtime": 1.6854, |
|
"eval_samples_per_second": 5.34, |
|
"eval_steps_per_second": 1.187, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.7722772277227723, |
|
"grad_norm": 0.8472511768341064, |
|
"learning_rate": 1.2736777516212267e-06, |
|
"loss": 0.5792, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.7821782178217822, |
|
"grad_norm": 0.8465410470962524, |
|
"learning_rate": 1.1697777844051105e-06, |
|
"loss": 0.6283, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.7920792079207921, |
|
"grad_norm": 0.7360741496086121, |
|
"learning_rate": 1.0697345262860638e-06, |
|
"loss": 0.4699, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.801980198019802, |
|
"grad_norm": 0.9224135875701904, |
|
"learning_rate": 9.73648712344707e-07, |
|
"loss": 0.6806, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.8118811881188119, |
|
"grad_norm": 0.871505081653595, |
|
"learning_rate": 8.816170928508367e-07, |
|
"loss": 0.6824, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.8217821782178217, |
|
"grad_norm": 0.8890396952629089, |
|
"learning_rate": 7.937323358440935e-07, |
|
"loss": 0.6427, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.8316831683168316, |
|
"grad_norm": 1.091757893562317, |
|
"learning_rate": 7.100829338251147e-07, |
|
"loss": 0.7535, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.8415841584158416, |
|
"grad_norm": 0.809833824634552, |
|
"learning_rate": 6.307531146510754e-07, |
|
"loss": 0.5965, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.8514851485148515, |
|
"grad_norm": 0.861588716506958, |
|
"learning_rate": 5.558227567253832e-07, |
|
"loss": 0.6629, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.8613861386138614, |
|
"grad_norm": 0.7921182513237, |
|
"learning_rate": 4.853673085668947e-07, |
|
"loss": 0.482, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.8712871287128713, |
|
"grad_norm": 0.7692471742630005, |
|
"learning_rate": 4.194577128396521e-07, |
|
"loss": 0.5044, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.8712871287128713, |
|
"eval_loss": 0.6139124035835266, |
|
"eval_runtime": 1.684, |
|
"eval_samples_per_second": 5.344, |
|
"eval_steps_per_second": 1.188, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.8811881188118812, |
|
"grad_norm": 0.8603846430778503, |
|
"learning_rate": 3.581603349196372e-07, |
|
"loss": 0.6439, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.8910891089108911, |
|
"grad_norm": 0.8953405022621155, |
|
"learning_rate": 3.015368960704584e-07, |
|
"loss": 0.6881, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.900990099009901, |
|
"grad_norm": 0.9108988046646118, |
|
"learning_rate": 2.4964441129527337e-07, |
|
"loss": 0.7118, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.9108910891089109, |
|
"grad_norm": 0.886647641658783, |
|
"learning_rate": 2.0253513192751374e-07, |
|
"loss": 0.7544, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.9207920792079208, |
|
"grad_norm": 0.943878173828125, |
|
"learning_rate": 1.6025649301821877e-07, |
|
"loss": 0.7527, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.9306930693069307, |
|
"grad_norm": 0.8354966044425964, |
|
"learning_rate": 1.2285106557296479e-07, |
|
"loss": 0.6079, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.9405940594059405, |
|
"grad_norm": 0.8121664524078369, |
|
"learning_rate": 9.035651368646647e-08, |
|
"loss": 0.5891, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.9504950495049505, |
|
"grad_norm": 0.9194057583808899, |
|
"learning_rate": 6.280555661802857e-08, |
|
"loss": 0.6953, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.9603960396039604, |
|
"grad_norm": 0.9447068572044373, |
|
"learning_rate": 4.02259358460233e-08, |
|
"loss": 0.7401, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.9702970297029703, |
|
"grad_norm": 0.8672422766685486, |
|
"learning_rate": 2.264038713457706e-08, |
|
"loss": 0.6349, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.9801980198019802, |
|
"grad_norm": 0.904771625995636, |
|
"learning_rate": 1.006661764057837e-08, |
|
"loss": 0.6322, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.9801980198019802, |
|
"eval_loss": 0.6136024594306946, |
|
"eval_runtime": 1.6864, |
|
"eval_samples_per_second": 5.337, |
|
"eval_steps_per_second": 1.186, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.9900990099009901, |
|
"grad_norm": 0.8689496517181396, |
|
"learning_rate": 2.5172880840745873e-09, |
|
"loss": 0.6191, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.8673611283302307, |
|
"learning_rate": 0.0, |
|
"loss": 0.5897, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 101, |
|
"total_flos": 1.5438974547499418e+17, |
|
"train_loss": 0.6889385657735391, |
|
"train_runtime": 730.4137, |
|
"train_samples_per_second": 1.101, |
|
"train_steps_per_second": 0.138 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 101, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 101, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.5438974547499418e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|