{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.999163179916318,
  "eval_steps": 500,
  "global_step": 420,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.07140864714086471,
      "grad_norm": 77.45325469970703,
      "learning_rate": 3.0769230769230774e-05,
      "loss": 0.6702,
      "step": 10
    },
    {
      "epoch": 0.14281729428172943,
      "grad_norm": 59.097469329833984,
      "learning_rate": 3.997081220115612e-05,
      "loss": 0.5793,
      "step": 20
    },
    {
      "epoch": 0.21422594142259413,
      "grad_norm": 52.32804870605469,
      "learning_rate": 3.982805660299152e-05,
      "loss": 0.5313,
      "step": 30
    },
    {
      "epoch": 0.28563458856345886,
      "grad_norm": 42.44932174682617,
      "learning_rate": 3.956722125241571e-05,
      "loss": 0.4916,
      "step": 40
    },
    {
      "epoch": 0.35704323570432356,
      "grad_norm": 44.81378936767578,
      "learning_rate": 3.9189859472289956e-05,
      "loss": 0.4705,
      "step": 50
    },
    {
      "epoch": 0.42845188284518826,
      "grad_norm": 47.092281341552734,
      "learning_rate": 3.869821852196291e-05,
      "loss": 0.4503,
      "step": 60
    },
    {
      "epoch": 0.499860529986053,
      "grad_norm": 49.76475143432617,
      "learning_rate": 3.809522621442463e-05,
      "loss": 0.4432,
      "step": 70
    },
    {
      "epoch": 0.5712691771269177,
      "grad_norm": 47.245887756347656,
      "learning_rate": 3.738447348063752e-05,
      "loss": 0.4199,
      "step": 80
    },
    {
      "epoch": 0.6426778242677824,
      "grad_norm": 53.50367736816406,
      "learning_rate": 3.657019298487685e-05,
      "loss": 0.4037,
      "step": 90
    },
    {
      "epoch": 0.7140864714086471,
      "grad_norm": 45.266780853271484,
      "learning_rate": 3.565723391843037e-05,
      "loss": 0.3886,
      "step": 100
    },
    {
      "epoch": 0.7854951185495118,
      "grad_norm": 42.24121856689453,
      "learning_rate": 3.465103312176541e-05,
      "loss": 0.382,
      "step": 110
    },
    {
      "epoch": 0.8569037656903765,
      "grad_norm": 46.39073181152344,
      "learning_rate": 3.35575827071361e-05,
      "loss": 0.366,
      "step": 120
    },
    {
      "epoch": 0.9283124128312413,
      "grad_norm": 44.36848831176758,
      "learning_rate": 3.238339437444418e-05,
      "loss": 0.3539,
      "step": 130
    },
    {
      "epoch": 0.999721059972106,
      "grad_norm": 43.62345504760742,
      "learning_rate": 3.113546063285907e-05,
      "loss": 0.348,
      "step": 140
    },
    {
      "epoch": 1.0711297071129706,
      "grad_norm": 42.84613037109375,
      "learning_rate": 2.9821213159129655e-05,
      "loss": 0.2955,
      "step": 150
    },
    {
      "epoch": 1.1425383542538354,
      "grad_norm": 47.24495315551758,
      "learning_rate": 2.8448478540571694e-05,
      "loss": 0.2896,
      "step": 160
    },
    {
      "epoch": 1.2139470013947,
      "grad_norm": 42.732933044433594,
      "learning_rate": 2.70254316662896e-05,
      "loss": 0.2889,
      "step": 170
    },
    {
      "epoch": 1.2853556485355648,
      "grad_norm": 44.07963180541992,
      "learning_rate": 2.5560547044196552e-05,
      "loss": 0.288,
      "step": 180
    },
    {
      "epoch": 1.3567642956764296,
      "grad_norm": 43.4594612121582,
      "learning_rate": 2.4062548333748996e-05,
      "loss": 0.2808,
      "step": 190
    },
    {
      "epoch": 1.4281729428172942,
      "grad_norm": 42.07709884643555,
      "learning_rate": 2.2540356394937577e-05,
      "loss": 0.2762,
      "step": 200
    },
    {
      "epoch": 1.499581589958159,
      "grad_norm": 41.83521270751953,
      "learning_rate": 2.1003036162912327e-05,
      "loss": 0.2798,
      "step": 210
    },
    {
      "epoch": 1.5709902370990236,
      "grad_norm": 40.76252365112305,
      "learning_rate": 1.945974266461355e-05,
      "loss": 0.2747,
      "step": 220
    },
    {
      "epoch": 1.6423988842398884,
      "grad_norm": 44.36031723022461,
      "learning_rate": 1.791966649888943e-05,
      "loss": 0.2585,
      "step": 230
    },
    {
      "epoch": 1.7138075313807533,
      "grad_norm": 40.18080139160156,
      "learning_rate": 1.639197910477628e-05,
      "loss": 0.2593,
      "step": 240
    },
    {
      "epoch": 1.7852161785216178,
      "grad_norm": 42.134490966796875,
      "learning_rate": 1.4885778143879096e-05,
      "loss": 0.2474,
      "step": 250
    },
    {
      "epoch": 1.8566248256624824,
      "grad_norm": 47.6600456237793,
      "learning_rate": 1.3410033322110323e-05,
      "loss": 0.2593,
      "step": 260
    },
    {
      "epoch": 1.9280334728033472,
      "grad_norm": 43.97560119628906,
      "learning_rate": 1.1973532973428536e-05,
      "loss": 0.2482,
      "step": 270
    },
    {
      "epoch": 1.999442119944212,
      "grad_norm": 41.20925521850586,
      "learning_rate": 1.05848317236807e-05,
      "loss": 0.2488,
      "step": 280
    },
    {
      "epoch": 2.070850767085077,
      "grad_norm": 44.23745346069336,
      "learning_rate": 9.25219954621956e-06,
      "loss": 0.196,
      "step": 290
    },
    {
      "epoch": 2.1422594142259412,
      "grad_norm": 45.39742660522461,
      "learning_rate": 7.983572512679384e-06,
      "loss": 0.1993,
      "step": 300
    },
    {
      "epoch": 2.213668061366806,
      "grad_norm": 43.84931182861328,
      "learning_rate": 6.7865055321983754e-06,
      "loss": 0.1937,
      "step": 310
    },
    {
      "epoch": 2.285076708507671,
      "grad_norm": 44.85445022583008,
      "learning_rate": 5.668127360534343e-06,
      "loss": 0.191,
      "step": 320
    },
    {
      "epoch": 2.3564853556485357,
      "grad_norm": 39.72176742553711,
      "learning_rate": 4.635098147002792e-06,
      "loss": 0.1931,
      "step": 330
    },
    {
      "epoch": 2.4278940027894,
      "grad_norm": 44.86882400512695,
      "learning_rate": 3.6935697720532095e-06,
      "loss": 0.1827,
      "step": 340
    },
    {
      "epoch": 2.499302649930265,
      "grad_norm": 42.152137756347656,
      "learning_rate": 2.849149211680693e-06,
      "loss": 0.1886,
      "step": 350
    },
    {
      "epoch": 2.5707112970711297,
      "grad_norm": 39.35033416748047,
      "learning_rate": 2.1068651468445546e-06,
      "loss": 0.19,
      "step": 360
    },
    {
      "epoch": 2.6421199442119945,
      "grad_norm": 40.978153228759766,
      "learning_rate": 1.4711380167411094e-06,
      "loss": 0.1912,
      "step": 370
    },
    {
      "epoch": 2.7135285913528593,
      "grad_norm": 44.58827209472656,
      "learning_rate": 9.45753694268885e-07,
      "loss": 0.1828,
      "step": 380
    },
    {
      "epoch": 2.7849372384937237,
      "grad_norm": 41.021331787109375,
      "learning_rate": 5.338409404537537e-07,
      "loss": 0.1808,
      "step": 390
    },
    {
      "epoch": 2.8563458856345885,
      "grad_norm": 42.61368179321289,
      "learning_rate": 2.3785277209707802e-07,
      "loss": 0.1946,
      "step": 400
    },
    {
      "epoch": 2.9277545327754533,
      "grad_norm": 39.83855438232422,
      "learning_rate": 5.9551853605968044e-08,
      "loss": 0.1858,
      "step": 410
    },
    {
      "epoch": 2.999163179916318,
      "grad_norm": 39.88948059082031,
      "learning_rate": 0.0,
      "loss": 0.1823,
      "step": 420
    },
    {
      "epoch": 2.999163179916318,
      "step": 420,
      "total_flos": 4.006659812993925e+17,
      "train_loss": 0.2970780080273038,
      "train_runtime": 9700.7388,
      "train_samples_per_second": 5.543,
      "train_steps_per_second": 0.043
    }
  ],
  "logging_steps": 10,
  "max_steps": 420,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.006659812993925e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}