CTMAE2_CS_V7_5 / trainer_state.json
beingbatman's picture
End of training
987e57b verified
{
"best_metric": 0.8666666666666667,
"best_model_checkpoint": "CTMAE2_CS_V7_5/checkpoint-1755",
"epoch": 49.01494845360825,
"eval_steps": 500,
"global_step": 9700,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010309278350515464,
"grad_norm": 5.501185894012451,
"learning_rate": 1.0309278350515465e-07,
"loss": 0.6908,
"step": 10
},
{
"epoch": 0.002061855670103093,
"grad_norm": 2.460876941680908,
"learning_rate": 2.061855670103093e-07,
"loss": 0.6912,
"step": 20
},
{
"epoch": 0.003092783505154639,
"grad_norm": 2.6134722232818604,
"learning_rate": 3.0927835051546394e-07,
"loss": 0.6906,
"step": 30
},
{
"epoch": 0.004123711340206186,
"grad_norm": 3.098764181137085,
"learning_rate": 4.123711340206186e-07,
"loss": 0.6779,
"step": 40
},
{
"epoch": 0.005154639175257732,
"grad_norm": 2.6916747093200684,
"learning_rate": 5.154639175257732e-07,
"loss": 0.688,
"step": 50
},
{
"epoch": 0.006185567010309278,
"grad_norm": 2.6864609718322754,
"learning_rate": 6.185567010309279e-07,
"loss": 0.6744,
"step": 60
},
{
"epoch": 0.007216494845360825,
"grad_norm": 1.6162508726119995,
"learning_rate": 7.216494845360824e-07,
"loss": 0.6806,
"step": 70
},
{
"epoch": 0.008247422680412371,
"grad_norm": 6.75019645690918,
"learning_rate": 8.247422680412372e-07,
"loss": 0.6594,
"step": 80
},
{
"epoch": 0.009278350515463918,
"grad_norm": 3.412571668624878,
"learning_rate": 9.278350515463919e-07,
"loss": 0.6602,
"step": 90
},
{
"epoch": 0.010309278350515464,
"grad_norm": 3.0713565349578857,
"learning_rate": 1.0309278350515464e-06,
"loss": 0.6957,
"step": 100
},
{
"epoch": 0.01134020618556701,
"grad_norm": 2.902609348297119,
"learning_rate": 1.134020618556701e-06,
"loss": 0.6819,
"step": 110
},
{
"epoch": 0.012371134020618556,
"grad_norm": 13.36193561553955,
"learning_rate": 1.2371134020618557e-06,
"loss": 0.6161,
"step": 120
},
{
"epoch": 0.013402061855670102,
"grad_norm": 5.576000213623047,
"learning_rate": 1.3402061855670104e-06,
"loss": 0.6261,
"step": 130
},
{
"epoch": 0.01443298969072165,
"grad_norm": 6.617053031921387,
"learning_rate": 1.4432989690721649e-06,
"loss": 0.529,
"step": 140
},
{
"epoch": 0.015463917525773196,
"grad_norm": 12.840880393981934,
"learning_rate": 1.5463917525773197e-06,
"loss": 0.7079,
"step": 150
},
{
"epoch": 0.016494845360824743,
"grad_norm": 10.174439430236816,
"learning_rate": 1.6494845360824744e-06,
"loss": 0.6972,
"step": 160
},
{
"epoch": 0.01752577319587629,
"grad_norm": 19.259891510009766,
"learning_rate": 1.7525773195876288e-06,
"loss": 0.54,
"step": 170
},
{
"epoch": 0.018556701030927835,
"grad_norm": 6.065194129943848,
"learning_rate": 1.8556701030927837e-06,
"loss": 0.646,
"step": 180
},
{
"epoch": 0.01958762886597938,
"grad_norm": 9.375090599060059,
"learning_rate": 1.9587628865979384e-06,
"loss": 0.6793,
"step": 190
},
{
"epoch": 0.020103092783505156,
"eval_accuracy": 0.4666666666666667,
"eval_loss": 0.7615872025489807,
"eval_runtime": 15.3942,
"eval_samples_per_second": 2.923,
"eval_steps_per_second": 0.78,
"step": 195
},
{
"epoch": 1.0005154639175258,
"grad_norm": 13.356775283813477,
"learning_rate": 2.061855670103093e-06,
"loss": 0.5875,
"step": 200
},
{
"epoch": 1.0015463917525773,
"grad_norm": 4.08266544342041,
"learning_rate": 2.1649484536082477e-06,
"loss": 0.6459,
"step": 210
},
{
"epoch": 1.0025773195876289,
"grad_norm": 6.374290466308594,
"learning_rate": 2.268041237113402e-06,
"loss": 0.6691,
"step": 220
},
{
"epoch": 1.0036082474226804,
"grad_norm": 5.553038597106934,
"learning_rate": 2.3711340206185566e-06,
"loss": 0.6392,
"step": 230
},
{
"epoch": 1.004639175257732,
"grad_norm": 19.62584686279297,
"learning_rate": 2.4742268041237115e-06,
"loss": 0.634,
"step": 240
},
{
"epoch": 1.0056701030927835,
"grad_norm": 8.070799827575684,
"learning_rate": 2.577319587628866e-06,
"loss": 0.6665,
"step": 250
},
{
"epoch": 1.006701030927835,
"grad_norm": 8.732718467712402,
"learning_rate": 2.680412371134021e-06,
"loss": 0.6228,
"step": 260
},
{
"epoch": 1.0077319587628866,
"grad_norm": 4.9655890464782715,
"learning_rate": 2.7835051546391757e-06,
"loss": 0.6189,
"step": 270
},
{
"epoch": 1.0087628865979381,
"grad_norm": 9.290786743164062,
"learning_rate": 2.8865979381443297e-06,
"loss": 0.7474,
"step": 280
},
{
"epoch": 1.0097938144329897,
"grad_norm": 7.63596248626709,
"learning_rate": 2.9896907216494846e-06,
"loss": 0.5911,
"step": 290
},
{
"epoch": 1.0108247422680412,
"grad_norm": 10.840821266174316,
"learning_rate": 3.0927835051546395e-06,
"loss": 0.6093,
"step": 300
},
{
"epoch": 1.0118556701030927,
"grad_norm": 4.624059200286865,
"learning_rate": 3.195876288659794e-06,
"loss": 0.7803,
"step": 310
},
{
"epoch": 1.0128865979381443,
"grad_norm": 5.822396755218506,
"learning_rate": 3.298969072164949e-06,
"loss": 0.598,
"step": 320
},
{
"epoch": 1.0139175257731958,
"grad_norm": 3.33036732673645,
"learning_rate": 3.4020618556701037e-06,
"loss": 0.5865,
"step": 330
},
{
"epoch": 1.0149484536082474,
"grad_norm": 5.949455261230469,
"learning_rate": 3.5051546391752577e-06,
"loss": 0.6764,
"step": 340
},
{
"epoch": 1.015979381443299,
"grad_norm": 6.306659698486328,
"learning_rate": 3.6082474226804126e-06,
"loss": 0.645,
"step": 350
},
{
"epoch": 1.0170103092783505,
"grad_norm": 17.39265251159668,
"learning_rate": 3.7113402061855674e-06,
"loss": 0.64,
"step": 360
},
{
"epoch": 1.018041237113402,
"grad_norm": 7.141329288482666,
"learning_rate": 3.814432989690722e-06,
"loss": 0.5337,
"step": 370
},
{
"epoch": 1.0190721649484535,
"grad_norm": 6.430778980255127,
"learning_rate": 3.917525773195877e-06,
"loss": 0.5442,
"step": 380
},
{
"epoch": 1.020103092783505,
"grad_norm": 15.002116203308105,
"learning_rate": 4.020618556701032e-06,
"loss": 0.5591,
"step": 390
},
{
"epoch": 1.020103092783505,
"eval_accuracy": 0.4666666666666667,
"eval_loss": 0.7040268182754517,
"eval_runtime": 12.9016,
"eval_samples_per_second": 3.488,
"eval_steps_per_second": 0.93,
"step": 390
},
{
"epoch": 2.0010309278350515,
"grad_norm": 9.877669334411621,
"learning_rate": 4.123711340206186e-06,
"loss": 0.5959,
"step": 400
},
{
"epoch": 2.002061855670103,
"grad_norm": 7.665306568145752,
"learning_rate": 4.2268041237113405e-06,
"loss": 0.6647,
"step": 410
},
{
"epoch": 2.0030927835051546,
"grad_norm": 19.635908126831055,
"learning_rate": 4.329896907216495e-06,
"loss": 0.5142,
"step": 420
},
{
"epoch": 2.004123711340206,
"grad_norm": 13.251553535461426,
"learning_rate": 4.4329896907216494e-06,
"loss": 0.7387,
"step": 430
},
{
"epoch": 2.0051546391752577,
"grad_norm": 33.24567794799805,
"learning_rate": 4.536082474226804e-06,
"loss": 0.5326,
"step": 440
},
{
"epoch": 2.0061855670103093,
"grad_norm": 17.012880325317383,
"learning_rate": 4.639175257731959e-06,
"loss": 0.6149,
"step": 450
},
{
"epoch": 2.007216494845361,
"grad_norm": 12.129619598388672,
"learning_rate": 4.742268041237113e-06,
"loss": 0.4566,
"step": 460
},
{
"epoch": 2.0082474226804123,
"grad_norm": 32.084442138671875,
"learning_rate": 4.845360824742268e-06,
"loss": 0.8027,
"step": 470
},
{
"epoch": 2.009278350515464,
"grad_norm": 6.839757442474365,
"learning_rate": 4.948453608247423e-06,
"loss": 0.5719,
"step": 480
},
{
"epoch": 2.0103092783505154,
"grad_norm": 18.979419708251953,
"learning_rate": 5.051546391752578e-06,
"loss": 0.5684,
"step": 490
},
{
"epoch": 2.011340206185567,
"grad_norm": 11.15890121459961,
"learning_rate": 5.154639175257732e-06,
"loss": 0.4082,
"step": 500
},
{
"epoch": 2.0123711340206185,
"grad_norm": 10.10776138305664,
"learning_rate": 5.257731958762888e-06,
"loss": 0.5722,
"step": 510
},
{
"epoch": 2.01340206185567,
"grad_norm": 14.203100204467773,
"learning_rate": 5.360824742268042e-06,
"loss": 0.4529,
"step": 520
},
{
"epoch": 2.0144329896907216,
"grad_norm": 13.093475341796875,
"learning_rate": 5.463917525773196e-06,
"loss": 0.5994,
"step": 530
},
{
"epoch": 2.015463917525773,
"grad_norm": 23.01972007751465,
"learning_rate": 5.567010309278351e-06,
"loss": 0.5696,
"step": 540
},
{
"epoch": 2.0164948453608247,
"grad_norm": 24.430442810058594,
"learning_rate": 5.670103092783505e-06,
"loss": 0.5652,
"step": 550
},
{
"epoch": 2.0175257731958762,
"grad_norm": 15.79038143157959,
"learning_rate": 5.7731958762886594e-06,
"loss": 0.5913,
"step": 560
},
{
"epoch": 2.0185567010309278,
"grad_norm": 21.04030990600586,
"learning_rate": 5.876288659793815e-06,
"loss": 0.484,
"step": 570
},
{
"epoch": 2.0195876288659793,
"grad_norm": 8.871047973632812,
"learning_rate": 5.979381443298969e-06,
"loss": 0.7211,
"step": 580
},
{
"epoch": 2.020103092783505,
"eval_accuracy": 0.8222222222222222,
"eval_loss": 0.4916439354419708,
"eval_runtime": 12.7637,
"eval_samples_per_second": 3.526,
"eval_steps_per_second": 0.94,
"step": 585
},
{
"epoch": 3.0005154639175258,
"grad_norm": 9.930680274963379,
"learning_rate": 6.082474226804124e-06,
"loss": 0.4872,
"step": 590
},
{
"epoch": 3.0015463917525773,
"grad_norm": 8.775280952453613,
"learning_rate": 6.185567010309279e-06,
"loss": 0.4803,
"step": 600
},
{
"epoch": 3.002577319587629,
"grad_norm": 16.243377685546875,
"learning_rate": 6.288659793814433e-06,
"loss": 0.4948,
"step": 610
},
{
"epoch": 3.0036082474226804,
"grad_norm": 43.30958557128906,
"learning_rate": 6.391752577319588e-06,
"loss": 0.6622,
"step": 620
},
{
"epoch": 3.004639175257732,
"grad_norm": 22.228527069091797,
"learning_rate": 6.494845360824743e-06,
"loss": 0.4748,
"step": 630
},
{
"epoch": 3.0056701030927835,
"grad_norm": 17.75151252746582,
"learning_rate": 6.597938144329898e-06,
"loss": 0.4999,
"step": 640
},
{
"epoch": 3.006701030927835,
"grad_norm": 11.695698738098145,
"learning_rate": 6.701030927835052e-06,
"loss": 0.259,
"step": 650
},
{
"epoch": 3.0077319587628866,
"grad_norm": 38.64352035522461,
"learning_rate": 6.804123711340207e-06,
"loss": 1.0941,
"step": 660
},
{
"epoch": 3.008762886597938,
"grad_norm": 13.178338050842285,
"learning_rate": 6.907216494845361e-06,
"loss": 0.9189,
"step": 670
},
{
"epoch": 3.0097938144329897,
"grad_norm": 18.1324462890625,
"learning_rate": 7.010309278350515e-06,
"loss": 0.6344,
"step": 680
},
{
"epoch": 3.010824742268041,
"grad_norm": 11.067273139953613,
"learning_rate": 7.113402061855671e-06,
"loss": 0.4916,
"step": 690
},
{
"epoch": 3.0118556701030927,
"grad_norm": 7.603753566741943,
"learning_rate": 7.216494845360825e-06,
"loss": 0.5001,
"step": 700
},
{
"epoch": 3.0128865979381443,
"grad_norm": 4.259734630584717,
"learning_rate": 7.319587628865979e-06,
"loss": 0.4538,
"step": 710
},
{
"epoch": 3.013917525773196,
"grad_norm": 19.640830993652344,
"learning_rate": 7.422680412371135e-06,
"loss": 0.5484,
"step": 720
},
{
"epoch": 3.0149484536082474,
"grad_norm": 20.696624755859375,
"learning_rate": 7.525773195876289e-06,
"loss": 0.5014,
"step": 730
},
{
"epoch": 3.015979381443299,
"grad_norm": 59.72232437133789,
"learning_rate": 7.628865979381444e-06,
"loss": 0.5271,
"step": 740
},
{
"epoch": 3.0170103092783505,
"grad_norm": 11.003219604492188,
"learning_rate": 7.731958762886599e-06,
"loss": 0.5286,
"step": 750
},
{
"epoch": 3.018041237113402,
"grad_norm": 13.869205474853516,
"learning_rate": 7.835051546391754e-06,
"loss": 0.4798,
"step": 760
},
{
"epoch": 3.0190721649484535,
"grad_norm": 12.36577320098877,
"learning_rate": 7.938144329896907e-06,
"loss": 0.6944,
"step": 770
},
{
"epoch": 3.020103092783505,
"grad_norm": 30.60397720336914,
"learning_rate": 8.041237113402063e-06,
"loss": 0.5544,
"step": 780
},
{
"epoch": 3.020103092783505,
"eval_accuracy": 0.5555555555555556,
"eval_loss": 0.7589893937110901,
"eval_runtime": 12.6816,
"eval_samples_per_second": 3.548,
"eval_steps_per_second": 0.946,
"step": 780
},
{
"epoch": 4.0010309278350515,
"grad_norm": 20.62627601623535,
"learning_rate": 8.144329896907216e-06,
"loss": 0.5034,
"step": 790
},
{
"epoch": 4.002061855670103,
"grad_norm": 4.391873836517334,
"learning_rate": 8.247422680412371e-06,
"loss": 0.7131,
"step": 800
},
{
"epoch": 4.003092783505155,
"grad_norm": 14.114368438720703,
"learning_rate": 8.350515463917526e-06,
"loss": 0.5573,
"step": 810
},
{
"epoch": 4.004123711340206,
"grad_norm": 3.9918265342712402,
"learning_rate": 8.453608247422681e-06,
"loss": 0.5594,
"step": 820
},
{
"epoch": 4.005154639175258,
"grad_norm": 29.062408447265625,
"learning_rate": 8.556701030927836e-06,
"loss": 0.5178,
"step": 830
},
{
"epoch": 4.006185567010309,
"grad_norm": 13.267045021057129,
"learning_rate": 8.65979381443299e-06,
"loss": 0.6529,
"step": 840
},
{
"epoch": 4.007216494845361,
"grad_norm": 31.6133975982666,
"learning_rate": 8.762886597938146e-06,
"loss": 0.4495,
"step": 850
},
{
"epoch": 4.008247422680412,
"grad_norm": 7.6835150718688965,
"learning_rate": 8.865979381443299e-06,
"loss": 0.5844,
"step": 860
},
{
"epoch": 4.009278350515464,
"grad_norm": 3.838587522506714,
"learning_rate": 8.969072164948455e-06,
"loss": 0.6229,
"step": 870
},
{
"epoch": 4.010309278350515,
"grad_norm": 12.973073959350586,
"learning_rate": 9.072164948453609e-06,
"loss": 0.5575,
"step": 880
},
{
"epoch": 4.011340206185567,
"grad_norm": 9.979464530944824,
"learning_rate": 9.175257731958764e-06,
"loss": 0.7956,
"step": 890
},
{
"epoch": 4.0123711340206185,
"grad_norm": 10.023995399475098,
"learning_rate": 9.278350515463918e-06,
"loss": 0.5851,
"step": 900
},
{
"epoch": 4.01340206185567,
"grad_norm": 25.86085319519043,
"learning_rate": 9.381443298969073e-06,
"loss": 0.4864,
"step": 910
},
{
"epoch": 4.014432989690722,
"grad_norm": 5.599601745605469,
"learning_rate": 9.484536082474226e-06,
"loss": 0.5482,
"step": 920
},
{
"epoch": 4.015463917525773,
"grad_norm": 1.694999098777771,
"learning_rate": 9.587628865979383e-06,
"loss": 0.4848,
"step": 930
},
{
"epoch": 4.016494845360825,
"grad_norm": 49.36362075805664,
"learning_rate": 9.690721649484536e-06,
"loss": 0.3432,
"step": 940
},
{
"epoch": 4.017525773195876,
"grad_norm": 62.547489166259766,
"learning_rate": 9.793814432989691e-06,
"loss": 0.9801,
"step": 950
},
{
"epoch": 4.018556701030928,
"grad_norm": 6.7530517578125,
"learning_rate": 9.896907216494846e-06,
"loss": 0.7231,
"step": 960
},
{
"epoch": 4.019587628865979,
"grad_norm": 18.402511596679688,
"learning_rate": 1e-05,
"loss": 0.6032,
"step": 970
},
{
"epoch": 4.020103092783505,
"eval_accuracy": 0.6666666666666666,
"eval_loss": 0.5508340001106262,
"eval_runtime": 13.2243,
"eval_samples_per_second": 3.403,
"eval_steps_per_second": 0.907,
"step": 975
},
{
"epoch": 5.000515463917526,
"grad_norm": 9.008790969848633,
"learning_rate": 9.988545246277205e-06,
"loss": 0.5444,
"step": 980
},
{
"epoch": 5.001546391752577,
"grad_norm": 24.455364227294922,
"learning_rate": 9.977090492554411e-06,
"loss": 0.6265,
"step": 990
},
{
"epoch": 5.002577319587629,
"grad_norm": 2.534010410308838,
"learning_rate": 9.965635738831616e-06,
"loss": 0.4103,
"step": 1000
},
{
"epoch": 5.00360824742268,
"grad_norm": 2.197962999343872,
"learning_rate": 9.95418098510882e-06,
"loss": 0.6683,
"step": 1010
},
{
"epoch": 5.004639175257732,
"grad_norm": 21.681041717529297,
"learning_rate": 9.942726231386026e-06,
"loss": 0.5277,
"step": 1020
},
{
"epoch": 5.0056701030927835,
"grad_norm": 6.0828752517700195,
"learning_rate": 9.931271477663231e-06,
"loss": 0.8429,
"step": 1030
},
{
"epoch": 5.006701030927835,
"grad_norm": 9.508393287658691,
"learning_rate": 9.919816723940437e-06,
"loss": 0.4463,
"step": 1040
},
{
"epoch": 5.007731958762887,
"grad_norm": 8.229544639587402,
"learning_rate": 9.908361970217641e-06,
"loss": 0.4216,
"step": 1050
},
{
"epoch": 5.008762886597938,
"grad_norm": 45.83427429199219,
"learning_rate": 9.896907216494846e-06,
"loss": 0.5766,
"step": 1060
},
{
"epoch": 5.00979381443299,
"grad_norm": 6.862059116363525,
"learning_rate": 9.885452462772052e-06,
"loss": 0.587,
"step": 1070
},
{
"epoch": 5.010824742268041,
"grad_norm": 8.74276065826416,
"learning_rate": 9.873997709049257e-06,
"loss": 0.4442,
"step": 1080
},
{
"epoch": 5.011855670103093,
"grad_norm": 22.86684226989746,
"learning_rate": 9.862542955326461e-06,
"loss": 0.6656,
"step": 1090
},
{
"epoch": 5.012886597938144,
"grad_norm": 6.2574849128723145,
"learning_rate": 9.851088201603667e-06,
"loss": 0.421,
"step": 1100
},
{
"epoch": 5.013917525773196,
"grad_norm": 19.12122344970703,
"learning_rate": 9.839633447880872e-06,
"loss": 0.4395,
"step": 1110
},
{
"epoch": 5.014948453608247,
"grad_norm": 47.755435943603516,
"learning_rate": 9.828178694158076e-06,
"loss": 0.6335,
"step": 1120
},
{
"epoch": 5.015979381443299,
"grad_norm": 11.343196868896484,
"learning_rate": 9.81672394043528e-06,
"loss": 0.4231,
"step": 1130
},
{
"epoch": 5.0170103092783505,
"grad_norm": 15.7100248336792,
"learning_rate": 9.805269186712487e-06,
"loss": 0.4681,
"step": 1140
},
{
"epoch": 5.018041237113402,
"grad_norm": 29.985126495361328,
"learning_rate": 9.793814432989691e-06,
"loss": 0.7692,
"step": 1150
},
{
"epoch": 5.0190721649484535,
"grad_norm": 4.043577194213867,
"learning_rate": 9.782359679266896e-06,
"loss": 0.5417,
"step": 1160
},
{
"epoch": 5.020103092783505,
"grad_norm": 6.033001899719238,
"learning_rate": 9.770904925544102e-06,
"loss": 0.518,
"step": 1170
},
{
"epoch": 5.020103092783505,
"eval_accuracy": 0.4666666666666667,
"eval_loss": 0.8928155899047852,
"eval_runtime": 13.9107,
"eval_samples_per_second": 3.235,
"eval_steps_per_second": 0.863,
"step": 1170
},
{
"epoch": 6.0010309278350515,
"grad_norm": 6.170740604400635,
"learning_rate": 9.759450171821306e-06,
"loss": 0.5075,
"step": 1180
},
{
"epoch": 6.002061855670103,
"grad_norm": 6.046166896820068,
"learning_rate": 9.747995418098512e-06,
"loss": 0.6887,
"step": 1190
},
{
"epoch": 6.003092783505155,
"grad_norm": 8.674880981445312,
"learning_rate": 9.736540664375717e-06,
"loss": 0.5482,
"step": 1200
},
{
"epoch": 6.004123711340206,
"grad_norm": 4.15226411819458,
"learning_rate": 9.725085910652921e-06,
"loss": 0.5811,
"step": 1210
},
{
"epoch": 6.005154639175258,
"grad_norm": 9.141851425170898,
"learning_rate": 9.713631156930127e-06,
"loss": 0.507,
"step": 1220
},
{
"epoch": 6.006185567010309,
"grad_norm": 10.06373119354248,
"learning_rate": 9.702176403207332e-06,
"loss": 0.6606,
"step": 1230
},
{
"epoch": 6.007216494845361,
"grad_norm": 1.9274550676345825,
"learning_rate": 9.690721649484536e-06,
"loss": 0.4037,
"step": 1240
},
{
"epoch": 6.008247422680412,
"grad_norm": 15.267620086669922,
"learning_rate": 9.679266895761742e-06,
"loss": 0.439,
"step": 1250
},
{
"epoch": 6.009278350515464,
"grad_norm": 87.103759765625,
"learning_rate": 9.667812142038947e-06,
"loss": 0.6971,
"step": 1260
},
{
"epoch": 6.010309278350515,
"grad_norm": 17.93890380859375,
"learning_rate": 9.656357388316153e-06,
"loss": 0.6488,
"step": 1270
},
{
"epoch": 6.011340206185567,
"grad_norm": 13.628206253051758,
"learning_rate": 9.644902634593357e-06,
"loss": 0.4316,
"step": 1280
},
{
"epoch": 6.0123711340206185,
"grad_norm": 1.0886458158493042,
"learning_rate": 9.633447880870562e-06,
"loss": 0.4252,
"step": 1290
},
{
"epoch": 6.01340206185567,
"grad_norm": 0.41759127378463745,
"learning_rate": 9.621993127147768e-06,
"loss": 0.4604,
"step": 1300
},
{
"epoch": 6.014432989690722,
"grad_norm": 7.780788421630859,
"learning_rate": 9.610538373424972e-06,
"loss": 0.7416,
"step": 1310
},
{
"epoch": 6.015463917525773,
"grad_norm": 1.952467918395996,
"learning_rate": 9.599083619702177e-06,
"loss": 0.3673,
"step": 1320
},
{
"epoch": 6.016494845360825,
"grad_norm": 2.227653741836548,
"learning_rate": 9.587628865979383e-06,
"loss": 0.5467,
"step": 1330
},
{
"epoch": 6.017525773195876,
"grad_norm": 45.32661819458008,
"learning_rate": 9.576174112256587e-06,
"loss": 0.6066,
"step": 1340
},
{
"epoch": 6.018556701030928,
"grad_norm": 6.04111385345459,
"learning_rate": 9.564719358533792e-06,
"loss": 0.9221,
"step": 1350
},
{
"epoch": 6.019587628865979,
"grad_norm": 19.719083786010742,
"learning_rate": 9.553264604810998e-06,
"loss": 0.4857,
"step": 1360
},
{
"epoch": 6.020103092783505,
"eval_accuracy": 0.6222222222222222,
"eval_loss": 0.5889319777488708,
"eval_runtime": 12.7268,
"eval_samples_per_second": 3.536,
"eval_steps_per_second": 0.943,
"step": 1365
},
{
"epoch": 7.000515463917526,
"grad_norm": 4.581728935241699,
"learning_rate": 9.541809851088203e-06,
"loss": 0.488,
"step": 1370
},
{
"epoch": 7.001546391752577,
"grad_norm": 27.277809143066406,
"learning_rate": 9.530355097365407e-06,
"loss": 0.5966,
"step": 1380
},
{
"epoch": 7.002577319587629,
"grad_norm": 0.2609100341796875,
"learning_rate": 9.518900343642611e-06,
"loss": 0.3383,
"step": 1390
},
{
"epoch": 7.00360824742268,
"grad_norm": 22.71802520751953,
"learning_rate": 9.507445589919818e-06,
"loss": 0.7841,
"step": 1400
},
{
"epoch": 7.004639175257732,
"grad_norm": 16.892574310302734,
"learning_rate": 9.495990836197022e-06,
"loss": 0.4176,
"step": 1410
},
{
"epoch": 7.0056701030927835,
"grad_norm": 39.4376335144043,
"learning_rate": 9.484536082474226e-06,
"loss": 0.3999,
"step": 1420
},
{
"epoch": 7.006701030927835,
"grad_norm": 14.214164733886719,
"learning_rate": 9.473081328751433e-06,
"loss": 0.6387,
"step": 1430
},
{
"epoch": 7.007731958762887,
"grad_norm": 11.498187065124512,
"learning_rate": 9.461626575028637e-06,
"loss": 0.5193,
"step": 1440
},
{
"epoch": 7.008762886597938,
"grad_norm": 8.90798568725586,
"learning_rate": 9.450171821305843e-06,
"loss": 0.4921,
"step": 1450
},
{
"epoch": 7.00979381443299,
"grad_norm": 4.212175369262695,
"learning_rate": 9.438717067583048e-06,
"loss": 0.4248,
"step": 1460
},
{
"epoch": 7.010824742268041,
"grad_norm": 15.754873275756836,
"learning_rate": 9.427262313860252e-06,
"loss": 0.5522,
"step": 1470
},
{
"epoch": 7.011855670103093,
"grad_norm": 20.611268997192383,
"learning_rate": 9.415807560137458e-06,
"loss": 0.5084,
"step": 1480
},
{
"epoch": 7.012886597938144,
"grad_norm": 9.555137634277344,
"learning_rate": 9.404352806414663e-06,
"loss": 0.2632,
"step": 1490
},
{
"epoch": 7.013917525773196,
"grad_norm": 2.367072343826294,
"learning_rate": 9.392898052691867e-06,
"loss": 0.4345,
"step": 1500
},
{
"epoch": 7.014948453608247,
"grad_norm": 3.8319485187530518,
"learning_rate": 9.381443298969073e-06,
"loss": 0.5227,
"step": 1510
},
{
"epoch": 7.015979381443299,
"grad_norm": 7.583319187164307,
"learning_rate": 9.369988545246278e-06,
"loss": 0.3003,
"step": 1520
},
{
"epoch": 7.0170103092783505,
"grad_norm": 7.431451797485352,
"learning_rate": 9.358533791523484e-06,
"loss": 0.6448,
"step": 1530
},
{
"epoch": 7.018041237113402,
"grad_norm": 32.28150939941406,
"learning_rate": 9.347079037800688e-06,
"loss": 0.4669,
"step": 1540
},
{
"epoch": 7.0190721649484535,
"grad_norm": 24.32545280456543,
"learning_rate": 9.335624284077893e-06,
"loss": 0.5635,
"step": 1550
},
{
"epoch": 7.020103092783505,
"grad_norm": 1.7349843978881836,
"learning_rate": 9.324169530355099e-06,
"loss": 0.3634,
"step": 1560
},
{
"epoch": 7.020103092783505,
"eval_accuracy": 0.6444444444444445,
"eval_loss": 0.8522602319717407,
"eval_runtime": 12.7523,
"eval_samples_per_second": 3.529,
"eval_steps_per_second": 0.941,
"step": 1560
},
{
"epoch": 8.001030927835052,
"grad_norm": 27.929956436157227,
"learning_rate": 9.312714776632303e-06,
"loss": 0.8071,
"step": 1570
},
{
"epoch": 8.002061855670103,
"grad_norm": 19.774404525756836,
"learning_rate": 9.301260022909508e-06,
"loss": 0.714,
"step": 1580
},
{
"epoch": 8.003092783505155,
"grad_norm": 5.483359336853027,
"learning_rate": 9.289805269186714e-06,
"loss": 0.4367,
"step": 1590
},
{
"epoch": 8.004123711340206,
"grad_norm": 13.389904022216797,
"learning_rate": 9.278350515463918e-06,
"loss": 0.5099,
"step": 1600
},
{
"epoch": 8.005154639175258,
"grad_norm": 7.315684795379639,
"learning_rate": 9.266895761741125e-06,
"loss": 0.4736,
"step": 1610
},
{
"epoch": 8.00618556701031,
"grad_norm": 14.2125825881958,
"learning_rate": 9.255441008018329e-06,
"loss": 0.392,
"step": 1620
},
{
"epoch": 8.00721649484536,
"grad_norm": 8.564682006835938,
"learning_rate": 9.243986254295533e-06,
"loss": 0.4436,
"step": 1630
},
{
"epoch": 8.008247422680412,
"grad_norm": 27.293010711669922,
"learning_rate": 9.23253150057274e-06,
"loss": 0.4409,
"step": 1640
},
{
"epoch": 8.009278350515464,
"grad_norm": 15.106826782226562,
"learning_rate": 9.221076746849944e-06,
"loss": 0.3801,
"step": 1650
},
{
"epoch": 8.010309278350515,
"grad_norm": 22.984642028808594,
"learning_rate": 9.209621993127148e-06,
"loss": 0.5309,
"step": 1660
},
{
"epoch": 8.011340206185567,
"grad_norm": 40.4260139465332,
"learning_rate": 9.198167239404353e-06,
"loss": 0.6704,
"step": 1670
},
{
"epoch": 8.012371134020619,
"grad_norm": 11.083319664001465,
"learning_rate": 9.186712485681557e-06,
"loss": 0.651,
"step": 1680
},
{
"epoch": 8.01340206185567,
"grad_norm": 6.827057838439941,
"learning_rate": 9.175257731958764e-06,
"loss": 0.7953,
"step": 1690
},
{
"epoch": 8.014432989690722,
"grad_norm": 6.367009162902832,
"learning_rate": 9.163802978235968e-06,
"loss": 0.454,
"step": 1700
},
{
"epoch": 8.015463917525773,
"grad_norm": 23.583059310913086,
"learning_rate": 9.152348224513174e-06,
"loss": 0.609,
"step": 1710
},
{
"epoch": 8.016494845360825,
"grad_norm": 3.92618989944458,
"learning_rate": 9.140893470790379e-06,
"loss": 0.4297,
"step": 1720
},
{
"epoch": 8.017525773195876,
"grad_norm": 4.04769229888916,
"learning_rate": 9.129438717067583e-06,
"loss": 0.3581,
"step": 1730
},
{
"epoch": 8.018556701030928,
"grad_norm": 16.875659942626953,
"learning_rate": 9.117983963344789e-06,
"loss": 0.5501,
"step": 1740
},
{
"epoch": 8.01958762886598,
"grad_norm": 2.816603183746338,
"learning_rate": 9.106529209621994e-06,
"loss": 0.4082,
"step": 1750
},
{
"epoch": 8.020103092783506,
"eval_accuracy": 0.8666666666666667,
"eval_loss": 0.36465370655059814,
"eval_runtime": 12.703,
"eval_samples_per_second": 3.542,
"eval_steps_per_second": 0.945,
"step": 1755
},
{
"epoch": 9.000515463917527,
"grad_norm": 19.659175872802734,
"learning_rate": 9.095074455899198e-06,
"loss": 0.2411,
"step": 1760
},
{
"epoch": 9.001546391752578,
"grad_norm": 9.694845199584961,
"learning_rate": 9.083619702176404e-06,
"loss": 0.5444,
"step": 1770
},
{
"epoch": 9.00257731958763,
"grad_norm": 51.400840759277344,
"learning_rate": 9.072164948453609e-06,
"loss": 0.5556,
"step": 1780
},
{
"epoch": 9.003608247422681,
"grad_norm": 20.920190811157227,
"learning_rate": 9.060710194730815e-06,
"loss": 0.4964,
"step": 1790
},
{
"epoch": 9.004639175257733,
"grad_norm": 25.781005859375,
"learning_rate": 9.04925544100802e-06,
"loss": 0.4506,
"step": 1800
},
{
"epoch": 9.005670103092784,
"grad_norm": 21.402894973754883,
"learning_rate": 9.037800687285224e-06,
"loss": 0.4645,
"step": 1810
},
{
"epoch": 9.006701030927836,
"grad_norm": 6.41744327545166,
"learning_rate": 9.02634593356243e-06,
"loss": 0.2999,
"step": 1820
},
{
"epoch": 9.007731958762887,
"grad_norm": 8.043073654174805,
"learning_rate": 9.014891179839634e-06,
"loss": 0.8446,
"step": 1830
},
{
"epoch": 9.008762886597939,
"grad_norm": 10.563735008239746,
"learning_rate": 9.003436426116839e-06,
"loss": 0.3282,
"step": 1840
},
{
"epoch": 9.00979381443299,
"grad_norm": 5.581331253051758,
"learning_rate": 8.991981672394045e-06,
"loss": 0.398,
"step": 1850
},
{
"epoch": 9.010824742268042,
"grad_norm": 1.6693127155303955,
"learning_rate": 8.98052691867125e-06,
"loss": 0.2961,
"step": 1860
},
{
"epoch": 9.011855670103094,
"grad_norm": 9.04586410522461,
"learning_rate": 8.969072164948455e-06,
"loss": 0.2784,
"step": 1870
},
{
"epoch": 9.012886597938145,
"grad_norm": 15.87678337097168,
"learning_rate": 8.95761741122566e-06,
"loss": 0.4126,
"step": 1880
},
{
"epoch": 9.013917525773197,
"grad_norm": 53.66142272949219,
"learning_rate": 8.946162657502864e-06,
"loss": 0.4191,
"step": 1890
},
{
"epoch": 9.014948453608248,
"grad_norm": 31.80818748474121,
"learning_rate": 8.93470790378007e-06,
"loss": 0.4929,
"step": 1900
},
{
"epoch": 9.0159793814433,
"grad_norm": 12.100385665893555,
"learning_rate": 8.923253150057275e-06,
"loss": 0.7639,
"step": 1910
},
{
"epoch": 9.017010309278351,
"grad_norm": 3.1334633827209473,
"learning_rate": 8.91179839633448e-06,
"loss": 0.5291,
"step": 1920
},
{
"epoch": 9.018041237113403,
"grad_norm": 6.849935531616211,
"learning_rate": 8.900343642611684e-06,
"loss": 0.6711,
"step": 1930
},
{
"epoch": 9.019072164948454,
"grad_norm": 18.349355697631836,
"learning_rate": 8.888888888888888e-06,
"loss": 0.7262,
"step": 1940
},
{
"epoch": 9.020103092783506,
"grad_norm": 41.263458251953125,
"learning_rate": 8.877434135166094e-06,
"loss": 0.4897,
"step": 1950
},
{
"epoch": 9.020103092783506,
"eval_accuracy": 0.7777777777777778,
"eval_loss": 0.5648357272148132,
"eval_runtime": 12.0862,
"eval_samples_per_second": 3.723,
"eval_steps_per_second": 0.993,
"step": 1950
},
{
"epoch": 10.001030927835052,
"grad_norm": 22.894468307495117,
"learning_rate": 8.865979381443299e-06,
"loss": 0.4276,
"step": 1960
},
{
"epoch": 10.002061855670103,
"grad_norm": 23.770139694213867,
"learning_rate": 8.854524627720505e-06,
"loss": 0.5575,
"step": 1970
},
{
"epoch": 10.003092783505155,
"grad_norm": 7.501220703125,
"learning_rate": 8.84306987399771e-06,
"loss": 0.2925,
"step": 1980
},
{
"epoch": 10.004123711340206,
"grad_norm": 16.40544891357422,
"learning_rate": 8.831615120274914e-06,
"loss": 0.4975,
"step": 1990
},
{
"epoch": 10.005154639175258,
"grad_norm": 18.579830169677734,
"learning_rate": 8.82016036655212e-06,
"loss": 0.5642,
"step": 2000
},
{
"epoch": 10.00618556701031,
"grad_norm": 10.547638893127441,
"learning_rate": 8.808705612829325e-06,
"loss": 0.3936,
"step": 2010
},
{
"epoch": 10.00721649484536,
"grad_norm": 26.152042388916016,
"learning_rate": 8.797250859106529e-06,
"loss": 0.4257,
"step": 2020
},
{
"epoch": 10.008247422680412,
"grad_norm": 29.884458541870117,
"learning_rate": 8.785796105383735e-06,
"loss": 0.6164,
"step": 2030
},
{
"epoch": 10.009278350515464,
"grad_norm": 6.939556121826172,
"learning_rate": 8.77434135166094e-06,
"loss": 0.57,
"step": 2040
},
{
"epoch": 10.010309278350515,
"grad_norm": 53.5316162109375,
"learning_rate": 8.762886597938146e-06,
"loss": 0.4892,
"step": 2050
},
{
"epoch": 10.011340206185567,
"grad_norm": 14.673813819885254,
"learning_rate": 8.75143184421535e-06,
"loss": 0.3881,
"step": 2060
},
{
"epoch": 10.012371134020619,
"grad_norm": 13.877957344055176,
"learning_rate": 8.739977090492555e-06,
"loss": 0.3445,
"step": 2070
},
{
"epoch": 10.01340206185567,
"grad_norm": 0.6452627778053284,
"learning_rate": 8.72852233676976e-06,
"loss": 0.3769,
"step": 2080
},
{
"epoch": 10.014432989690722,
"grad_norm": 27.587717056274414,
"learning_rate": 8.717067583046965e-06,
"loss": 0.4407,
"step": 2090
},
{
"epoch": 10.015463917525773,
"grad_norm": 19.309782028198242,
"learning_rate": 8.70561282932417e-06,
"loss": 0.3936,
"step": 2100
},
{
"epoch": 10.016494845360825,
"grad_norm": 13.032252311706543,
"learning_rate": 8.694158075601376e-06,
"loss": 0.3719,
"step": 2110
},
{
"epoch": 10.017525773195876,
"grad_norm": 2.494875431060791,
"learning_rate": 8.68270332187858e-06,
"loss": 0.411,
"step": 2120
},
{
"epoch": 10.018556701030928,
"grad_norm": 46.90223693847656,
"learning_rate": 8.671248568155786e-06,
"loss": 0.554,
"step": 2130
},
{
"epoch": 10.01958762886598,
"grad_norm": 11.451055526733398,
"learning_rate": 8.65979381443299e-06,
"loss": 0.389,
"step": 2140
},
{
"epoch": 10.020103092783506,
"eval_accuracy": 0.7777777777777778,
"eval_loss": 0.5735762715339661,
"eval_runtime": 12.8299,
"eval_samples_per_second": 3.507,
"eval_steps_per_second": 0.935,
"step": 2145
},
{
"epoch": 11.000515463917527,
"grad_norm": 11.298035621643066,
"learning_rate": 8.648339060710195e-06,
"loss": 0.5879,
"step": 2150
},
{
"epoch": 11.001546391752578,
"grad_norm": 19.15152931213379,
"learning_rate": 8.636884306987401e-06,
"loss": 0.4341,
"step": 2160
},
{
"epoch": 11.00257731958763,
"grad_norm": 1.2121660709381104,
"learning_rate": 8.625429553264606e-06,
"loss": 0.2884,
"step": 2170
},
{
"epoch": 11.003608247422681,
"grad_norm": 36.596675872802734,
"learning_rate": 8.61397479954181e-06,
"loss": 0.4636,
"step": 2180
},
{
"epoch": 11.004639175257733,
"grad_norm": 14.555014610290527,
"learning_rate": 8.602520045819016e-06,
"loss": 0.3522,
"step": 2190
},
{
"epoch": 11.005670103092784,
"grad_norm": 77.03166961669922,
"learning_rate": 8.591065292096221e-06,
"loss": 0.3229,
"step": 2200
},
{
"epoch": 11.006701030927836,
"grad_norm": 0.5664463639259338,
"learning_rate": 8.579610538373425e-06,
"loss": 0.5331,
"step": 2210
},
{
"epoch": 11.007731958762887,
"grad_norm": 1.5499430894851685,
"learning_rate": 8.56815578465063e-06,
"loss": 0.4122,
"step": 2220
},
{
"epoch": 11.008762886597939,
"grad_norm": 25.140165328979492,
"learning_rate": 8.556701030927836e-06,
"loss": 0.9055,
"step": 2230
},
{
"epoch": 11.00979381443299,
"grad_norm": 42.95240020751953,
"learning_rate": 8.54524627720504e-06,
"loss": 0.4527,
"step": 2240
},
{
"epoch": 11.010824742268042,
"grad_norm": 35.971248626708984,
"learning_rate": 8.533791523482245e-06,
"loss": 0.5452,
"step": 2250
},
{
"epoch": 11.011855670103094,
"grad_norm": 21.85768699645996,
"learning_rate": 8.522336769759451e-06,
"loss": 0.5303,
"step": 2260
},
{
"epoch": 11.012886597938145,
"grad_norm": 6.7367119789123535,
"learning_rate": 8.510882016036655e-06,
"loss": 0.5654,
"step": 2270
},
{
"epoch": 11.013917525773197,
"grad_norm": 62.03700637817383,
"learning_rate": 8.49942726231386e-06,
"loss": 0.3585,
"step": 2280
},
{
"epoch": 11.014948453608248,
"grad_norm": 79.50271606445312,
"learning_rate": 8.487972508591066e-06,
"loss": 0.5117,
"step": 2290
},
{
"epoch": 11.0159793814433,
"grad_norm": 12.448205947875977,
"learning_rate": 8.47651775486827e-06,
"loss": 0.352,
"step": 2300
},
{
"epoch": 11.017010309278351,
"grad_norm": 5.7825541496276855,
"learning_rate": 8.465063001145477e-06,
"loss": 0.4759,
"step": 2310
},
{
"epoch": 11.018041237113403,
"grad_norm": 64.92884826660156,
"learning_rate": 8.453608247422681e-06,
"loss": 0.4795,
"step": 2320
},
{
"epoch": 11.019072164948454,
"grad_norm": 8.412237167358398,
"learning_rate": 8.442153493699886e-06,
"loss": 0.4374,
"step": 2330
},
{
"epoch": 11.020103092783506,
"grad_norm": 0.19983382523059845,
"learning_rate": 8.430698739977092e-06,
"loss": 0.3753,
"step": 2340
},
{
"epoch": 11.020103092783506,
"eval_accuracy": 0.5777777777777777,
"eval_loss": 1.0849213600158691,
"eval_runtime": 12.7494,
"eval_samples_per_second": 3.53,
"eval_steps_per_second": 0.941,
"step": 2340
},
{
"epoch": 12.001030927835052,
"grad_norm": 0.6972218155860901,
"learning_rate": 8.419243986254296e-06,
"loss": 0.2594,
"step": 2350
},
{
"epoch": 12.002061855670103,
"grad_norm": 27.135608673095703,
"learning_rate": 8.4077892325315e-06,
"loss": 0.3156,
"step": 2360
},
{
"epoch": 12.003092783505155,
"grad_norm": 126.41283416748047,
"learning_rate": 8.396334478808707e-06,
"loss": 0.3705,
"step": 2370
},
{
"epoch": 12.004123711340206,
"grad_norm": 96.77040100097656,
"learning_rate": 8.384879725085911e-06,
"loss": 0.503,
"step": 2380
},
{
"epoch": 12.005154639175258,
"grad_norm": 0.21895720064640045,
"learning_rate": 8.373424971363117e-06,
"loss": 0.5193,
"step": 2390
},
{
"epoch": 12.00618556701031,
"grad_norm": 0.8776753544807434,
"learning_rate": 8.361970217640322e-06,
"loss": 0.3678,
"step": 2400
},
{
"epoch": 12.00721649484536,
"grad_norm": 0.986826479434967,
"learning_rate": 8.350515463917526e-06,
"loss": 0.9035,
"step": 2410
},
{
"epoch": 12.008247422680412,
"grad_norm": 33.315673828125,
"learning_rate": 8.339060710194732e-06,
"loss": 0.5579,
"step": 2420
},
{
"epoch": 12.009278350515464,
"grad_norm": 25.64463996887207,
"learning_rate": 8.327605956471937e-06,
"loss": 0.4433,
"step": 2430
},
{
"epoch": 12.010309278350515,
"grad_norm": 61.11088180541992,
"learning_rate": 8.316151202749141e-06,
"loss": 0.4935,
"step": 2440
},
{
"epoch": 12.011340206185567,
"grad_norm": 0.02137608826160431,
"learning_rate": 8.304696449026347e-06,
"loss": 0.1616,
"step": 2450
},
{
"epoch": 12.012371134020619,
"grad_norm": 54.19511413574219,
"learning_rate": 8.293241695303552e-06,
"loss": 1.0093,
"step": 2460
},
{
"epoch": 12.01340206185567,
"grad_norm": 8.395423889160156,
"learning_rate": 8.281786941580758e-06,
"loss": 0.1185,
"step": 2470
},
{
"epoch": 12.014432989690722,
"grad_norm": 42.02037811279297,
"learning_rate": 8.27033218785796e-06,
"loss": 0.8235,
"step": 2480
},
{
"epoch": 12.015463917525773,
"grad_norm": 22.178823471069336,
"learning_rate": 8.258877434135167e-06,
"loss": 0.5585,
"step": 2490
},
{
"epoch": 12.016494845360825,
"grad_norm": 2.1525092124938965,
"learning_rate": 8.247422680412371e-06,
"loss": 0.7552,
"step": 2500
},
{
"epoch": 12.017525773195876,
"grad_norm": 42.38743209838867,
"learning_rate": 8.235967926689576e-06,
"loss": 0.3942,
"step": 2510
},
{
"epoch": 12.018556701030928,
"grad_norm": 6.113926410675049,
"learning_rate": 8.224513172966782e-06,
"loss": 0.4449,
"step": 2520
},
{
"epoch": 12.01958762886598,
"grad_norm": 1.72330904006958,
"learning_rate": 8.213058419243986e-06,
"loss": 0.3118,
"step": 2530
},
{
"epoch": 12.020103092783506,
"eval_accuracy": 0.6222222222222222,
"eval_loss": 1.0597846508026123,
"eval_runtime": 12.6829,
"eval_samples_per_second": 3.548,
"eval_steps_per_second": 0.946,
"step": 2535
},
{
"epoch": 13.000515463917527,
"grad_norm": 8.192316055297852,
"learning_rate": 8.201603665521193e-06,
"loss": 0.4995,
"step": 2540
},
{
"epoch": 13.001546391752578,
"grad_norm": 45.1771125793457,
"learning_rate": 8.190148911798397e-06,
"loss": 0.3395,
"step": 2550
},
{
"epoch": 13.00257731958763,
"grad_norm": 42.817996978759766,
"learning_rate": 8.178694158075601e-06,
"loss": 0.7111,
"step": 2560
},
{
"epoch": 13.003608247422681,
"grad_norm": 63.306854248046875,
"learning_rate": 8.167239404352808e-06,
"loss": 0.3741,
"step": 2570
},
{
"epoch": 13.004639175257733,
"grad_norm": 14.730040550231934,
"learning_rate": 8.155784650630012e-06,
"loss": 0.4185,
"step": 2580
},
{
"epoch": 13.005670103092784,
"grad_norm": 7.792994499206543,
"learning_rate": 8.144329896907216e-06,
"loss": 0.7574,
"step": 2590
},
{
"epoch": 13.006701030927836,
"grad_norm": 3.4603207111358643,
"learning_rate": 8.132875143184423e-06,
"loss": 0.4004,
"step": 2600
},
{
"epoch": 13.007731958762887,
"grad_norm": 34.50138854980469,
"learning_rate": 8.121420389461627e-06,
"loss": 0.7797,
"step": 2610
},
{
"epoch": 13.008762886597939,
"grad_norm": 30.966228485107422,
"learning_rate": 8.109965635738832e-06,
"loss": 0.308,
"step": 2620
},
{
"epoch": 13.00979381443299,
"grad_norm": 14.006564140319824,
"learning_rate": 8.098510882016038e-06,
"loss": 0.1939,
"step": 2630
},
{
"epoch": 13.010824742268042,
"grad_norm": 31.138307571411133,
"learning_rate": 8.087056128293242e-06,
"loss": 0.308,
"step": 2640
},
{
"epoch": 13.011855670103094,
"grad_norm": 17.796743392944336,
"learning_rate": 8.075601374570448e-06,
"loss": 0.3016,
"step": 2650
},
{
"epoch": 13.012886597938145,
"grad_norm": 34.422054290771484,
"learning_rate": 8.064146620847653e-06,
"loss": 0.4823,
"step": 2660
},
{
"epoch": 13.013917525773197,
"grad_norm": 0.29599234461784363,
"learning_rate": 8.052691867124857e-06,
"loss": 0.1329,
"step": 2670
},
{
"epoch": 13.014948453608248,
"grad_norm": 46.61289978027344,
"learning_rate": 8.041237113402063e-06,
"loss": 0.5176,
"step": 2680
},
{
"epoch": 13.0159793814433,
"grad_norm": 20.459152221679688,
"learning_rate": 8.029782359679268e-06,
"loss": 0.5521,
"step": 2690
},
{
"epoch": 13.017010309278351,
"grad_norm": 48.8476676940918,
"learning_rate": 8.018327605956472e-06,
"loss": 0.3581,
"step": 2700
},
{
"epoch": 13.018041237113403,
"grad_norm": 36.25406265258789,
"learning_rate": 8.006872852233678e-06,
"loss": 0.5387,
"step": 2710
},
{
"epoch": 13.019072164948454,
"grad_norm": 15.75755786895752,
"learning_rate": 7.995418098510883e-06,
"loss": 0.3677,
"step": 2720
},
{
"epoch": 13.020103092783506,
"grad_norm": 0.06314048171043396,
"learning_rate": 7.983963344788089e-06,
"loss": 0.5823,
"step": 2730
},
{
"epoch": 13.020103092783506,
"eval_accuracy": 0.7333333333333333,
"eval_loss": 0.7085598111152649,
"eval_runtime": 14.437,
"eval_samples_per_second": 3.117,
"eval_steps_per_second": 0.831,
"step": 2730
},
{
"epoch": 14.001030927835052,
"grad_norm": 15.936594009399414,
"learning_rate": 7.972508591065293e-06,
"loss": 0.2965,
"step": 2740
},
{
"epoch": 14.002061855670103,
"grad_norm": 34.82296371459961,
"learning_rate": 7.961053837342498e-06,
"loss": 0.3932,
"step": 2750
},
{
"epoch": 14.003092783505155,
"grad_norm": 24.281137466430664,
"learning_rate": 7.949599083619702e-06,
"loss": 0.2176,
"step": 2760
},
{
"epoch": 14.004123711340206,
"grad_norm": 3.835561990737915,
"learning_rate": 7.938144329896907e-06,
"loss": 0.3466,
"step": 2770
},
{
"epoch": 14.005154639175258,
"grad_norm": 59.81629943847656,
"learning_rate": 7.926689576174113e-06,
"loss": 0.4342,
"step": 2780
},
{
"epoch": 14.00618556701031,
"grad_norm": 21.131526947021484,
"learning_rate": 7.915234822451317e-06,
"loss": 0.3176,
"step": 2790
},
{
"epoch": 14.00721649484536,
"grad_norm": 28.3918399810791,
"learning_rate": 7.903780068728523e-06,
"loss": 0.508,
"step": 2800
},
{
"epoch": 14.008247422680412,
"grad_norm": 68.05310821533203,
"learning_rate": 7.892325315005728e-06,
"loss": 0.2753,
"step": 2810
},
{
"epoch": 14.009278350515464,
"grad_norm": 15.491647720336914,
"learning_rate": 7.880870561282932e-06,
"loss": 0.233,
"step": 2820
},
{
"epoch": 14.010309278350515,
"grad_norm": 46.456077575683594,
"learning_rate": 7.869415807560138e-06,
"loss": 0.2713,
"step": 2830
},
{
"epoch": 14.011340206185567,
"grad_norm": 11.542130470275879,
"learning_rate": 7.857961053837343e-06,
"loss": 0.488,
"step": 2840
},
{
"epoch": 14.012371134020619,
"grad_norm": 21.926034927368164,
"learning_rate": 7.846506300114547e-06,
"loss": 1.0477,
"step": 2850
},
{
"epoch": 14.01340206185567,
"grad_norm": 0.11074031889438629,
"learning_rate": 7.835051546391754e-06,
"loss": 0.3838,
"step": 2860
},
{
"epoch": 14.014432989690722,
"grad_norm": 21.328231811523438,
"learning_rate": 7.823596792668958e-06,
"loss": 0.6549,
"step": 2870
},
{
"epoch": 14.015463917525773,
"grad_norm": 45.43595886230469,
"learning_rate": 7.812142038946164e-06,
"loss": 0.5422,
"step": 2880
},
{
"epoch": 14.016494845360825,
"grad_norm": 43.23012161254883,
"learning_rate": 7.800687285223369e-06,
"loss": 0.348,
"step": 2890
},
{
"epoch": 14.017525773195876,
"grad_norm": 97.96592712402344,
"learning_rate": 7.789232531500573e-06,
"loss": 0.3177,
"step": 2900
},
{
"epoch": 14.018556701030928,
"grad_norm": 24.8525390625,
"learning_rate": 7.77777777777778e-06,
"loss": 0.46,
"step": 2910
},
{
"epoch": 14.01958762886598,
"grad_norm": 0.3702309727668762,
"learning_rate": 7.766323024054984e-06,
"loss": 0.2604,
"step": 2920
},
{
"epoch": 14.020103092783506,
"eval_accuracy": 0.6222222222222222,
"eval_loss": 1.4168084859848022,
"eval_runtime": 13.2629,
"eval_samples_per_second": 3.393,
"eval_steps_per_second": 0.905,
"step": 2925
},
{
"epoch": 15.000515463917527,
"grad_norm": 21.372608184814453,
"learning_rate": 7.754868270332188e-06,
"loss": 0.5669,
"step": 2930
},
{
"epoch": 15.001546391752578,
"grad_norm": 2.491281270980835,
"learning_rate": 7.743413516609394e-06,
"loss": 0.5194,
"step": 2940
},
{
"epoch": 15.00257731958763,
"grad_norm": 30.641578674316406,
"learning_rate": 7.731958762886599e-06,
"loss": 0.3343,
"step": 2950
},
{
"epoch": 15.003608247422681,
"grad_norm": 61.93765640258789,
"learning_rate": 7.720504009163803e-06,
"loss": 0.399,
"step": 2960
},
{
"epoch": 15.004639175257733,
"grad_norm": 41.56774139404297,
"learning_rate": 7.70904925544101e-06,
"loss": 0.2883,
"step": 2970
},
{
"epoch": 15.005670103092784,
"grad_norm": 31.026432037353516,
"learning_rate": 7.697594501718214e-06,
"loss": 0.5769,
"step": 2980
},
{
"epoch": 15.006701030927836,
"grad_norm": 32.793243408203125,
"learning_rate": 7.68613974799542e-06,
"loss": 0.5666,
"step": 2990
},
{
"epoch": 15.007731958762887,
"grad_norm": 68.84432220458984,
"learning_rate": 7.674684994272624e-06,
"loss": 0.5853,
"step": 3000
},
{
"epoch": 15.008762886597939,
"grad_norm": 79.7370376586914,
"learning_rate": 7.663230240549829e-06,
"loss": 0.3274,
"step": 3010
},
{
"epoch": 15.00979381443299,
"grad_norm": 0.1271493136882782,
"learning_rate": 7.651775486827033e-06,
"loss": 0.286,
"step": 3020
},
{
"epoch": 15.010824742268042,
"grad_norm": 5.414377212524414,
"learning_rate": 7.640320733104238e-06,
"loss": 0.2541,
"step": 3030
},
{
"epoch": 15.011855670103094,
"grad_norm": 7.974775791168213,
"learning_rate": 7.628865979381444e-06,
"loss": 0.6325,
"step": 3040
},
{
"epoch": 15.012886597938145,
"grad_norm": 2.6058754920959473,
"learning_rate": 7.617411225658648e-06,
"loss": 0.6288,
"step": 3050
},
{
"epoch": 15.013917525773197,
"grad_norm": 5.675416469573975,
"learning_rate": 7.6059564719358535e-06,
"loss": 0.2737,
"step": 3060
},
{
"epoch": 15.014948453608248,
"grad_norm": 21.808191299438477,
"learning_rate": 7.594501718213059e-06,
"loss": 0.4687,
"step": 3070
},
{
"epoch": 15.0159793814433,
"grad_norm": 4.567986011505127,
"learning_rate": 7.583046964490264e-06,
"loss": 0.2163,
"step": 3080
},
{
"epoch": 15.017010309278351,
"grad_norm": 30.94685173034668,
"learning_rate": 7.5715922107674686e-06,
"loss": 0.3175,
"step": 3090
},
{
"epoch": 15.018041237113403,
"grad_norm": 8.627877235412598,
"learning_rate": 7.560137457044674e-06,
"loss": 0.1734,
"step": 3100
},
{
"epoch": 15.019072164948454,
"grad_norm": 91.24742889404297,
"learning_rate": 7.548682703321879e-06,
"loss": 0.3208,
"step": 3110
},
{
"epoch": 15.020103092783506,
"grad_norm": 0.007905744016170502,
"learning_rate": 7.5372279495990845e-06,
"loss": 0.5767,
"step": 3120
},
{
"epoch": 15.020103092783506,
"eval_accuracy": 0.8,
"eval_loss": 0.7965757846832275,
"eval_runtime": 13.3334,
"eval_samples_per_second": 3.375,
"eval_steps_per_second": 0.9,
"step": 3120
},
{
"epoch": 16.001030927835053,
"grad_norm": 4.9155964851379395,
"learning_rate": 7.525773195876289e-06,
"loss": 0.3086,
"step": 3130
},
{
"epoch": 16.002061855670103,
"grad_norm": 41.20609664916992,
"learning_rate": 7.514318442153494e-06,
"loss": 0.7778,
"step": 3140
},
{
"epoch": 16.003092783505156,
"grad_norm": 31.576202392578125,
"learning_rate": 7.5028636884306995e-06,
"loss": 0.0978,
"step": 3150
},
{
"epoch": 16.004123711340206,
"grad_norm": 0.07261794060468674,
"learning_rate": 7.491408934707905e-06,
"loss": 0.2957,
"step": 3160
},
{
"epoch": 16.00515463917526,
"grad_norm": 34.24305725097656,
"learning_rate": 7.479954180985109e-06,
"loss": 0.6108,
"step": 3170
},
{
"epoch": 16.00618556701031,
"grad_norm": 64.02971649169922,
"learning_rate": 7.4684994272623145e-06,
"loss": 0.256,
"step": 3180
},
{
"epoch": 16.007216494845363,
"grad_norm": 31.043434143066406,
"learning_rate": 7.45704467353952e-06,
"loss": 0.8459,
"step": 3190
},
{
"epoch": 16.008247422680412,
"grad_norm": 100.2379379272461,
"learning_rate": 7.445589919816725e-06,
"loss": 0.391,
"step": 3200
},
{
"epoch": 16.009278350515466,
"grad_norm": 34.66457748413086,
"learning_rate": 7.43413516609393e-06,
"loss": 0.7614,
"step": 3210
},
{
"epoch": 16.010309278350515,
"grad_norm": 78.97329711914062,
"learning_rate": 7.422680412371135e-06,
"loss": 0.5367,
"step": 3220
},
{
"epoch": 16.01134020618557,
"grad_norm": 55.71527862548828,
"learning_rate": 7.41122565864834e-06,
"loss": 0.5274,
"step": 3230
},
{
"epoch": 16.01237113402062,
"grad_norm": 10.22823715209961,
"learning_rate": 7.3997709049255455e-06,
"loss": 0.3361,
"step": 3240
},
{
"epoch": 16.013402061855672,
"grad_norm": 2.243086814880371,
"learning_rate": 7.38831615120275e-06,
"loss": 0.7985,
"step": 3250
},
{
"epoch": 16.01443298969072,
"grad_norm": 4.293857574462891,
"learning_rate": 7.376861397479955e-06,
"loss": 0.2727,
"step": 3260
},
{
"epoch": 16.015463917525775,
"grad_norm": 0.2114896923303604,
"learning_rate": 7.3654066437571605e-06,
"loss": 0.2435,
"step": 3270
},
{
"epoch": 16.016494845360825,
"grad_norm": 36.44148635864258,
"learning_rate": 7.353951890034365e-06,
"loss": 0.5246,
"step": 3280
},
{
"epoch": 16.017525773195878,
"grad_norm": 51.74038314819336,
"learning_rate": 7.342497136311569e-06,
"loss": 0.0543,
"step": 3290
},
{
"epoch": 16.018556701030928,
"grad_norm": 50.51845169067383,
"learning_rate": 7.331042382588775e-06,
"loss": 0.7459,
"step": 3300
},
{
"epoch": 16.01958762886598,
"grad_norm": 37.770626068115234,
"learning_rate": 7.319587628865979e-06,
"loss": 0.4844,
"step": 3310
},
{
"epoch": 16.020103092783504,
"eval_accuracy": 0.7333333333333333,
"eval_loss": 1.0488171577453613,
"eval_runtime": 12.4047,
"eval_samples_per_second": 3.628,
"eval_steps_per_second": 0.967,
"step": 3315
},
{
"epoch": 17.000515463917527,
"grad_norm": 97.32837677001953,
"learning_rate": 7.3081328751431845e-06,
"loss": 0.1819,
"step": 3320
},
{
"epoch": 17.001546391752576,
"grad_norm": 4.759186744689941,
"learning_rate": 7.29667812142039e-06,
"loss": 0.2479,
"step": 3330
},
{
"epoch": 17.00257731958763,
"grad_norm": 1.728135347366333,
"learning_rate": 7.285223367697595e-06,
"loss": 0.5371,
"step": 3340
},
{
"epoch": 17.00360824742268,
"grad_norm": 0.7207375168800354,
"learning_rate": 7.2737686139747995e-06,
"loss": 0.2002,
"step": 3350
},
{
"epoch": 17.004639175257733,
"grad_norm": 32.98098373413086,
"learning_rate": 7.262313860252005e-06,
"loss": 0.4171,
"step": 3360
},
{
"epoch": 17.005670103092783,
"grad_norm": 1.2624742984771729,
"learning_rate": 7.25085910652921e-06,
"loss": 0.2694,
"step": 3370
},
{
"epoch": 17.006701030927836,
"grad_norm": 40.10099411010742,
"learning_rate": 7.239404352806415e-06,
"loss": 0.4878,
"step": 3380
},
{
"epoch": 17.007731958762886,
"grad_norm": 0.11596504598855972,
"learning_rate": 7.22794959908362e-06,
"loss": 0.5169,
"step": 3390
},
{
"epoch": 17.00876288659794,
"grad_norm": 52.03158187866211,
"learning_rate": 7.216494845360825e-06,
"loss": 0.6374,
"step": 3400
},
{
"epoch": 17.00979381443299,
"grad_norm": 20.594680786132812,
"learning_rate": 7.2050400916380304e-06,
"loss": 0.8605,
"step": 3410
},
{
"epoch": 17.010824742268042,
"grad_norm": 61.95948791503906,
"learning_rate": 7.193585337915236e-06,
"loss": 0.357,
"step": 3420
},
{
"epoch": 17.011855670103092,
"grad_norm": 56.67446517944336,
"learning_rate": 7.18213058419244e-06,
"loss": 0.3436,
"step": 3430
},
{
"epoch": 17.012886597938145,
"grad_norm": 0.013172958046197891,
"learning_rate": 7.1706758304696455e-06,
"loss": 0.1324,
"step": 3440
},
{
"epoch": 17.013917525773195,
"grad_norm": 31.45111846923828,
"learning_rate": 7.159221076746851e-06,
"loss": 0.1581,
"step": 3450
},
{
"epoch": 17.01494845360825,
"grad_norm": 49.000885009765625,
"learning_rate": 7.147766323024056e-06,
"loss": 0.2419,
"step": 3460
},
{
"epoch": 17.015979381443298,
"grad_norm": 36.576114654541016,
"learning_rate": 7.1363115693012605e-06,
"loss": 0.4095,
"step": 3470
},
{
"epoch": 17.01701030927835,
"grad_norm": 2.826425552368164,
"learning_rate": 7.124856815578466e-06,
"loss": 0.2097,
"step": 3480
},
{
"epoch": 17.0180412371134,
"grad_norm": 43.49801254272461,
"learning_rate": 7.113402061855671e-06,
"loss": 0.2707,
"step": 3490
},
{
"epoch": 17.019072164948454,
"grad_norm": 0.675110399723053,
"learning_rate": 7.101947308132876e-06,
"loss": 0.3188,
"step": 3500
},
{
"epoch": 17.020103092783504,
"grad_norm": 0.0034136411268264055,
"learning_rate": 7.090492554410081e-06,
"loss": 0.0729,
"step": 3510
},
{
"epoch": 17.020103092783504,
"eval_accuracy": 0.7333333333333333,
"eval_loss": 1.0075429677963257,
"eval_runtime": 12.9969,
"eval_samples_per_second": 3.462,
"eval_steps_per_second": 0.923,
"step": 3510
},
{
"epoch": 18.001030927835053,
"grad_norm": 19.52350425720215,
"learning_rate": 7.079037800687286e-06,
"loss": 0.4801,
"step": 3520
},
{
"epoch": 18.002061855670103,
"grad_norm": 0.09289558976888657,
"learning_rate": 7.0675830469644914e-06,
"loss": 0.2281,
"step": 3530
},
{
"epoch": 18.003092783505156,
"grad_norm": 38.042137145996094,
"learning_rate": 7.056128293241697e-06,
"loss": 0.1157,
"step": 3540
},
{
"epoch": 18.004123711340206,
"grad_norm": 2.72111439704895,
"learning_rate": 7.044673539518901e-06,
"loss": 0.3428,
"step": 3550
},
{
"epoch": 18.00515463917526,
"grad_norm": 1.5013760328292847,
"learning_rate": 7.0332187857961065e-06,
"loss": 0.5245,
"step": 3560
},
{
"epoch": 18.00618556701031,
"grad_norm": 123.11444854736328,
"learning_rate": 7.02176403207331e-06,
"loss": 0.6386,
"step": 3570
},
{
"epoch": 18.007216494845363,
"grad_norm": 5.601233959197998,
"learning_rate": 7.010309278350515e-06,
"loss": 0.3835,
"step": 3580
},
{
"epoch": 18.008247422680412,
"grad_norm": 0.00440385052934289,
"learning_rate": 6.998854524627721e-06,
"loss": 0.6067,
"step": 3590
},
{
"epoch": 18.009278350515466,
"grad_norm": 16.199575424194336,
"learning_rate": 6.987399770904926e-06,
"loss": 0.4997,
"step": 3600
},
{
"epoch": 18.010309278350515,
"grad_norm": 65.31697082519531,
"learning_rate": 6.9759450171821304e-06,
"loss": 0.3521,
"step": 3610
},
{
"epoch": 18.01134020618557,
"grad_norm": 9.682928085327148,
"learning_rate": 6.964490263459336e-06,
"loss": 0.3728,
"step": 3620
},
{
"epoch": 18.01237113402062,
"grad_norm": 61.22377014160156,
"learning_rate": 6.953035509736541e-06,
"loss": 0.7717,
"step": 3630
},
{
"epoch": 18.013402061855672,
"grad_norm": 55.61003494262695,
"learning_rate": 6.941580756013746e-06,
"loss": 0.4314,
"step": 3640
},
{
"epoch": 18.01443298969072,
"grad_norm": 9.212549209594727,
"learning_rate": 6.930126002290951e-06,
"loss": 0.5837,
"step": 3650
},
{
"epoch": 18.015463917525775,
"grad_norm": 9.7285737991333,
"learning_rate": 6.918671248568156e-06,
"loss": 0.3758,
"step": 3660
},
{
"epoch": 18.016494845360825,
"grad_norm": 40.76766586303711,
"learning_rate": 6.907216494845361e-06,
"loss": 0.1162,
"step": 3670
},
{
"epoch": 18.017525773195878,
"grad_norm": 50.804962158203125,
"learning_rate": 6.895761741122567e-06,
"loss": 0.0843,
"step": 3680
},
{
"epoch": 18.018556701030928,
"grad_norm": 2.414264440536499,
"learning_rate": 6.884306987399771e-06,
"loss": 0.1919,
"step": 3690
},
{
"epoch": 18.01958762886598,
"grad_norm": 1.428833246231079,
"learning_rate": 6.872852233676976e-06,
"loss": 0.4188,
"step": 3700
},
{
"epoch": 18.020103092783504,
"eval_accuracy": 0.7111111111111111,
"eval_loss": 1.2723888158798218,
"eval_runtime": 12.9654,
"eval_samples_per_second": 3.471,
"eval_steps_per_second": 0.926,
"step": 3705
},
{
"epoch": 19.000515463917527,
"grad_norm": 44.43056869506836,
"learning_rate": 6.861397479954182e-06,
"loss": 0.2822,
"step": 3710
},
{
"epoch": 19.001546391752576,
"grad_norm": 0.05244099348783493,
"learning_rate": 6.849942726231387e-06,
"loss": 0.1549,
"step": 3720
},
{
"epoch": 19.00257731958763,
"grad_norm": 0.09560086578130722,
"learning_rate": 6.8384879725085914e-06,
"loss": 0.1708,
"step": 3730
},
{
"epoch": 19.00360824742268,
"grad_norm": 0.2790253758430481,
"learning_rate": 6.827033218785797e-06,
"loss": 0.0709,
"step": 3740
},
{
"epoch": 19.004639175257733,
"grad_norm": 4.10269021987915,
"learning_rate": 6.815578465063002e-06,
"loss": 0.46,
"step": 3750
},
{
"epoch": 19.005670103092783,
"grad_norm": 22.924274444580078,
"learning_rate": 6.804123711340207e-06,
"loss": 0.4041,
"step": 3760
},
{
"epoch": 19.006701030927836,
"grad_norm": 0.1639637053012848,
"learning_rate": 6.792668957617412e-06,
"loss": 0.3816,
"step": 3770
},
{
"epoch": 19.007731958762886,
"grad_norm": 0.21744728088378906,
"learning_rate": 6.781214203894617e-06,
"loss": 0.1098,
"step": 3780
},
{
"epoch": 19.00876288659794,
"grad_norm": 135.28933715820312,
"learning_rate": 6.769759450171822e-06,
"loss": 0.3937,
"step": 3790
},
{
"epoch": 19.00979381443299,
"grad_norm": 104.67581176757812,
"learning_rate": 6.758304696449028e-06,
"loss": 0.4015,
"step": 3800
},
{
"epoch": 19.010824742268042,
"grad_norm": 68.89933776855469,
"learning_rate": 6.746849942726232e-06,
"loss": 0.2513,
"step": 3810
},
{
"epoch": 19.011855670103092,
"grad_norm": 0.19136613607406616,
"learning_rate": 6.735395189003437e-06,
"loss": 0.1594,
"step": 3820
},
{
"epoch": 19.012886597938145,
"grad_norm": 106.2833023071289,
"learning_rate": 6.723940435280643e-06,
"loss": 0.6457,
"step": 3830
},
{
"epoch": 19.013917525773195,
"grad_norm": 12.311858177185059,
"learning_rate": 6.712485681557846e-06,
"loss": 0.2894,
"step": 3840
},
{
"epoch": 19.01494845360825,
"grad_norm": 10.986001014709473,
"learning_rate": 6.701030927835052e-06,
"loss": 0.6104,
"step": 3850
},
{
"epoch": 19.015979381443298,
"grad_norm": 9.053174018859863,
"learning_rate": 6.689576174112257e-06,
"loss": 0.6795,
"step": 3860
},
{
"epoch": 19.01701030927835,
"grad_norm": 0.6393508911132812,
"learning_rate": 6.678121420389461e-06,
"loss": 0.1746,
"step": 3870
},
{
"epoch": 19.0180412371134,
"grad_norm": 41.1427001953125,
"learning_rate": 6.666666666666667e-06,
"loss": 0.1791,
"step": 3880
},
{
"epoch": 19.019072164948454,
"grad_norm": 0.29769831895828247,
"learning_rate": 6.655211912943872e-06,
"loss": 0.1533,
"step": 3890
},
{
"epoch": 19.020103092783504,
"grad_norm": 0.005904354155063629,
"learning_rate": 6.643757159221077e-06,
"loss": 0.247,
"step": 3900
},
{
"epoch": 19.020103092783504,
"eval_accuracy": 0.7555555555555555,
"eval_loss": 1.3884011507034302,
"eval_runtime": 12.8798,
"eval_samples_per_second": 3.494,
"eval_steps_per_second": 0.932,
"step": 3900
},
{
"epoch": 20.001030927835053,
"grad_norm": 121.39861297607422,
"learning_rate": 6.632302405498282e-06,
"loss": 0.4984,
"step": 3910
},
{
"epoch": 20.002061855670103,
"grad_norm": 80.33428192138672,
"learning_rate": 6.620847651775487e-06,
"loss": 0.4636,
"step": 3920
},
{
"epoch": 20.003092783505156,
"grad_norm": 0.3919704854488373,
"learning_rate": 6.609392898052692e-06,
"loss": 0.3251,
"step": 3930
},
{
"epoch": 20.004123711340206,
"grad_norm": 0.08509159088134766,
"learning_rate": 6.597938144329898e-06,
"loss": 0.1908,
"step": 3940
},
{
"epoch": 20.00515463917526,
"grad_norm": 0.09845045208930969,
"learning_rate": 6.586483390607102e-06,
"loss": 0.4081,
"step": 3950
},
{
"epoch": 20.00618556701031,
"grad_norm": 0.17746928334236145,
"learning_rate": 6.575028636884307e-06,
"loss": 0.1395,
"step": 3960
},
{
"epoch": 20.007216494845363,
"grad_norm": 133.173828125,
"learning_rate": 6.563573883161513e-06,
"loss": 0.4535,
"step": 3970
},
{
"epoch": 20.008247422680412,
"grad_norm": 0.980322003364563,
"learning_rate": 6.552119129438718e-06,
"loss": 0.473,
"step": 3980
},
{
"epoch": 20.009278350515466,
"grad_norm": 21.204669952392578,
"learning_rate": 6.540664375715922e-06,
"loss": 0.2637,
"step": 3990
},
{
"epoch": 20.010309278350515,
"grad_norm": 88.95044708251953,
"learning_rate": 6.529209621993128e-06,
"loss": 0.2103,
"step": 4000
},
{
"epoch": 20.01134020618557,
"grad_norm": 0.22797317802906036,
"learning_rate": 6.517754868270333e-06,
"loss": 0.3344,
"step": 4010
},
{
"epoch": 20.01237113402062,
"grad_norm": 0.12223013490438461,
"learning_rate": 6.506300114547538e-06,
"loss": 0.0848,
"step": 4020
},
{
"epoch": 20.013402061855672,
"grad_norm": 0.11136689782142639,
"learning_rate": 6.494845360824743e-06,
"loss": 0.3754,
"step": 4030
},
{
"epoch": 20.01443298969072,
"grad_norm": 0.18626107275485992,
"learning_rate": 6.483390607101948e-06,
"loss": 0.1341,
"step": 4040
},
{
"epoch": 20.015463917525775,
"grad_norm": 10.783482551574707,
"learning_rate": 6.471935853379153e-06,
"loss": 0.0183,
"step": 4050
},
{
"epoch": 20.016494845360825,
"grad_norm": 0.06843023747205734,
"learning_rate": 6.460481099656359e-06,
"loss": 0.2099,
"step": 4060
},
{
"epoch": 20.017525773195878,
"grad_norm": 0.16018100082874298,
"learning_rate": 6.449026345933563e-06,
"loss": 0.1934,
"step": 4070
},
{
"epoch": 20.018556701030928,
"grad_norm": 0.03921249136328697,
"learning_rate": 6.437571592210768e-06,
"loss": 0.2822,
"step": 4080
},
{
"epoch": 20.01958762886598,
"grad_norm": 73.11908721923828,
"learning_rate": 6.426116838487974e-06,
"loss": 0.8041,
"step": 4090
},
{
"epoch": 20.020103092783504,
"eval_accuracy": 0.7333333333333333,
"eval_loss": 1.1552423238754272,
"eval_runtime": 12.8488,
"eval_samples_per_second": 3.502,
"eval_steps_per_second": 0.934,
"step": 4095
},
{
"epoch": 21.000515463917527,
"grad_norm": 0.8366032242774963,
"learning_rate": 6.414662084765179e-06,
"loss": 0.4461,
"step": 4100
},
{
"epoch": 21.001546391752576,
"grad_norm": 0.06696546822786331,
"learning_rate": 6.4032073310423825e-06,
"loss": 0.3449,
"step": 4110
},
{
"epoch": 21.00257731958763,
"grad_norm": 55.97826385498047,
"learning_rate": 6.391752577319588e-06,
"loss": 0.6324,
"step": 4120
},
{
"epoch": 21.00360824742268,
"grad_norm": 30.455825805664062,
"learning_rate": 6.380297823596793e-06,
"loss": 0.4075,
"step": 4130
},
{
"epoch": 21.004639175257733,
"grad_norm": 0.05050866678357124,
"learning_rate": 6.3688430698739976e-06,
"loss": 0.0636,
"step": 4140
},
{
"epoch": 21.005670103092783,
"grad_norm": 114.8602294921875,
"learning_rate": 6.357388316151203e-06,
"loss": 0.3984,
"step": 4150
},
{
"epoch": 21.006701030927836,
"grad_norm": 8.037965774536133,
"learning_rate": 6.345933562428408e-06,
"loss": 0.1513,
"step": 4160
},
{
"epoch": 21.007731958762886,
"grad_norm": 0.20525996387004852,
"learning_rate": 6.3344788087056135e-06,
"loss": 0.1569,
"step": 4170
},
{
"epoch": 21.00876288659794,
"grad_norm": 0.16769638657569885,
"learning_rate": 6.323024054982818e-06,
"loss": 0.2572,
"step": 4180
},
{
"epoch": 21.00979381443299,
"grad_norm": 12.56114387512207,
"learning_rate": 6.311569301260023e-06,
"loss": 0.1308,
"step": 4190
},
{
"epoch": 21.010824742268042,
"grad_norm": 0.05304228886961937,
"learning_rate": 6.3001145475372285e-06,
"loss": 0.2517,
"step": 4200
},
{
"epoch": 21.011855670103092,
"grad_norm": 0.02033202536404133,
"learning_rate": 6.288659793814433e-06,
"loss": 0.1589,
"step": 4210
},
{
"epoch": 21.012886597938145,
"grad_norm": 40.56743240356445,
"learning_rate": 6.277205040091638e-06,
"loss": 0.1517,
"step": 4220
},
{
"epoch": 21.013917525773195,
"grad_norm": 0.18166792392730713,
"learning_rate": 6.2657502863688435e-06,
"loss": 0.1245,
"step": 4230
},
{
"epoch": 21.01494845360825,
"grad_norm": 168.2699432373047,
"learning_rate": 6.254295532646049e-06,
"loss": 0.751,
"step": 4240
},
{
"epoch": 21.015979381443298,
"grad_norm": 63.15861511230469,
"learning_rate": 6.242840778923253e-06,
"loss": 0.4252,
"step": 4250
},
{
"epoch": 21.01701030927835,
"grad_norm": 0.03196326643228531,
"learning_rate": 6.231386025200459e-06,
"loss": 0.188,
"step": 4260
},
{
"epoch": 21.0180412371134,
"grad_norm": 0.10200751572847366,
"learning_rate": 6.219931271477664e-06,
"loss": 0.44,
"step": 4270
},
{
"epoch": 21.019072164948454,
"grad_norm": 0.9600178003311157,
"learning_rate": 6.208476517754869e-06,
"loss": 0.6086,
"step": 4280
},
{
"epoch": 21.020103092783504,
"grad_norm": 0.15486416220664978,
"learning_rate": 6.197021764032074e-06,
"loss": 0.168,
"step": 4290
},
{
"epoch": 21.020103092783504,
"eval_accuracy": 0.6888888888888889,
"eval_loss": 1.7923972606658936,
"eval_runtime": 13.1906,
"eval_samples_per_second": 3.412,
"eval_steps_per_second": 0.91,
"step": 4290
},
{
"epoch": 22.001030927835053,
"grad_norm": 0.07751865684986115,
"learning_rate": 6.185567010309279e-06,
"loss": 0.187,
"step": 4300
},
{
"epoch": 22.002061855670103,
"grad_norm": 10.4796142578125,
"learning_rate": 6.174112256586484e-06,
"loss": 0.6962,
"step": 4310
},
{
"epoch": 22.003092783505156,
"grad_norm": 92.16351318359375,
"learning_rate": 6.1626575028636895e-06,
"loss": 0.2417,
"step": 4320
},
{
"epoch": 22.004123711340206,
"grad_norm": 45.09387969970703,
"learning_rate": 6.151202749140894e-06,
"loss": 0.3242,
"step": 4330
},
{
"epoch": 22.00515463917526,
"grad_norm": 78.62527465820312,
"learning_rate": 6.139747995418099e-06,
"loss": 0.1895,
"step": 4340
},
{
"epoch": 22.00618556701031,
"grad_norm": 24.385234832763672,
"learning_rate": 6.1282932416953046e-06,
"loss": 0.487,
"step": 4350
},
{
"epoch": 22.007216494845363,
"grad_norm": 0.06612977385520935,
"learning_rate": 6.11683848797251e-06,
"loss": 0.3641,
"step": 4360
},
{
"epoch": 22.008247422680412,
"grad_norm": 37.19966125488281,
"learning_rate": 6.105383734249714e-06,
"loss": 0.4266,
"step": 4370
},
{
"epoch": 22.009278350515466,
"grad_norm": 121.07097625732422,
"learning_rate": 6.09392898052692e-06,
"loss": 0.4789,
"step": 4380
},
{
"epoch": 22.010309278350515,
"grad_norm": 40.00523376464844,
"learning_rate": 6.082474226804124e-06,
"loss": 0.4866,
"step": 4390
},
{
"epoch": 22.01134020618557,
"grad_norm": 0.05597120150923729,
"learning_rate": 6.0710194730813285e-06,
"loss": 0.3815,
"step": 4400
},
{
"epoch": 22.01237113402062,
"grad_norm": 0.8036267757415771,
"learning_rate": 6.059564719358534e-06,
"loss": 0.1899,
"step": 4410
},
{
"epoch": 22.013402061855672,
"grad_norm": 144.4720001220703,
"learning_rate": 6.048109965635739e-06,
"loss": 0.4231,
"step": 4420
},
{
"epoch": 22.01443298969072,
"grad_norm": 0.2990947961807251,
"learning_rate": 6.036655211912944e-06,
"loss": 0.2152,
"step": 4430
},
{
"epoch": 22.015463917525775,
"grad_norm": 0.10453028976917267,
"learning_rate": 6.025200458190149e-06,
"loss": 0.404,
"step": 4440
},
{
"epoch": 22.016494845360825,
"grad_norm": 112.81282043457031,
"learning_rate": 6.013745704467354e-06,
"loss": 0.1327,
"step": 4450
},
{
"epoch": 22.017525773195878,
"grad_norm": 0.112935371696949,
"learning_rate": 6.0022909507445594e-06,
"loss": 0.117,
"step": 4460
},
{
"epoch": 22.018556701030928,
"grad_norm": 4.366937160491943,
"learning_rate": 5.990836197021765e-06,
"loss": 0.4038,
"step": 4470
},
{
"epoch": 22.01958762886598,
"grad_norm": 0.005484442692250013,
"learning_rate": 5.979381443298969e-06,
"loss": 0.2564,
"step": 4480
},
{
"epoch": 22.020103092783504,
"eval_accuracy": 0.7333333333333333,
"eval_loss": 1.5681549310684204,
"eval_runtime": 21.1102,
"eval_samples_per_second": 2.132,
"eval_steps_per_second": 0.568,
"step": 4485
},
{
"epoch": 23.000515463917527,
"grad_norm": 14.104279518127441,
"learning_rate": 5.9679266895761745e-06,
"loss": 0.8201,
"step": 4490
},
{
"epoch": 23.001546391752576,
"grad_norm": 1.4341076612472534,
"learning_rate": 5.95647193585338e-06,
"loss": 0.0827,
"step": 4500
},
{
"epoch": 23.00257731958763,
"grad_norm": 13.500885963439941,
"learning_rate": 5.945017182130585e-06,
"loss": 0.1132,
"step": 4510
},
{
"epoch": 23.00360824742268,
"grad_norm": 0.08413302898406982,
"learning_rate": 5.9335624284077895e-06,
"loss": 0.2336,
"step": 4520
},
{
"epoch": 23.004639175257733,
"grad_norm": 18.12093734741211,
"learning_rate": 5.922107674684995e-06,
"loss": 0.4751,
"step": 4530
},
{
"epoch": 23.005670103092783,
"grad_norm": 1.0863031148910522,
"learning_rate": 5.9106529209622e-06,
"loss": 0.1428,
"step": 4540
},
{
"epoch": 23.006701030927836,
"grad_norm": 0.06023244187235832,
"learning_rate": 5.8991981672394046e-06,
"loss": 0.1603,
"step": 4550
},
{
"epoch": 23.007731958762886,
"grad_norm": 2.2026829719543457,
"learning_rate": 5.88774341351661e-06,
"loss": 0.3565,
"step": 4560
},
{
"epoch": 23.00876288659794,
"grad_norm": 100.63137817382812,
"learning_rate": 5.876288659793815e-06,
"loss": 0.1315,
"step": 4570
},
{
"epoch": 23.00979381443299,
"grad_norm": 399.76123046875,
"learning_rate": 5.8648339060710204e-06,
"loss": 0.3714,
"step": 4580
},
{
"epoch": 23.010824742268042,
"grad_norm": 154.24449157714844,
"learning_rate": 5.853379152348225e-06,
"loss": 0.3266,
"step": 4590
},
{
"epoch": 23.011855670103092,
"grad_norm": 0.03562017157673836,
"learning_rate": 5.84192439862543e-06,
"loss": 0.3065,
"step": 4600
},
{
"epoch": 23.012886597938145,
"grad_norm": 0.08134841173887253,
"learning_rate": 5.8304696449026355e-06,
"loss": 0.3782,
"step": 4610
},
{
"epoch": 23.013917525773195,
"grad_norm": 181.0287628173828,
"learning_rate": 5.819014891179841e-06,
"loss": 0.4084,
"step": 4620
},
{
"epoch": 23.01494845360825,
"grad_norm": 2.42154598236084,
"learning_rate": 5.807560137457045e-06,
"loss": 0.0032,
"step": 4630
},
{
"epoch": 23.015979381443298,
"grad_norm": 123.42518615722656,
"learning_rate": 5.7961053837342505e-06,
"loss": 0.4781,
"step": 4640
},
{
"epoch": 23.01701030927835,
"grad_norm": 3.3009181022644043,
"learning_rate": 5.784650630011456e-06,
"loss": 0.0389,
"step": 4650
},
{
"epoch": 23.0180412371134,
"grad_norm": 0.07052547484636307,
"learning_rate": 5.7731958762886594e-06,
"loss": 0.3134,
"step": 4660
},
{
"epoch": 23.019072164948454,
"grad_norm": 0.1382911652326584,
"learning_rate": 5.761741122565865e-06,
"loss": 0.5257,
"step": 4670
},
{
"epoch": 23.020103092783504,
"grad_norm": 0.008108101785182953,
"learning_rate": 5.75028636884307e-06,
"loss": 0.2034,
"step": 4680
},
{
"epoch": 23.020103092783504,
"eval_accuracy": 0.6888888888888889,
"eval_loss": 1.6061440706253052,
"eval_runtime": 12.9527,
"eval_samples_per_second": 3.474,
"eval_steps_per_second": 0.926,
"step": 4680
},
{
"epoch": 24.001030927835053,
"grad_norm": 0.15025880932807922,
"learning_rate": 5.738831615120275e-06,
"loss": 0.0039,
"step": 4690
},
{
"epoch": 24.002061855670103,
"grad_norm": 112.1488037109375,
"learning_rate": 5.72737686139748e-06,
"loss": 0.1306,
"step": 4700
},
{
"epoch": 24.003092783505156,
"grad_norm": 0.02043057605624199,
"learning_rate": 5.715922107674685e-06,
"loss": 0.091,
"step": 4710
},
{
"epoch": 24.004123711340206,
"grad_norm": 51.4376220703125,
"learning_rate": 5.70446735395189e-06,
"loss": 0.2416,
"step": 4720
},
{
"epoch": 24.00515463917526,
"grad_norm": 47.24472427368164,
"learning_rate": 5.693012600229096e-06,
"loss": 0.4605,
"step": 4730
},
{
"epoch": 24.00618556701031,
"grad_norm": 164.00759887695312,
"learning_rate": 5.6815578465063e-06,
"loss": 0.3363,
"step": 4740
},
{
"epoch": 24.007216494845363,
"grad_norm": 0.009168693795800209,
"learning_rate": 5.670103092783505e-06,
"loss": 0.0162,
"step": 4750
},
{
"epoch": 24.008247422680412,
"grad_norm": 38.39619827270508,
"learning_rate": 5.658648339060711e-06,
"loss": 0.4564,
"step": 4760
},
{
"epoch": 24.009278350515466,
"grad_norm": 1.3691141605377197,
"learning_rate": 5.647193585337916e-06,
"loss": 0.2679,
"step": 4770
},
{
"epoch": 24.010309278350515,
"grad_norm": 269.83489990234375,
"learning_rate": 5.6357388316151204e-06,
"loss": 0.4089,
"step": 4780
},
{
"epoch": 24.01134020618557,
"grad_norm": 0.0756688266992569,
"learning_rate": 5.624284077892326e-06,
"loss": 0.1391,
"step": 4790
},
{
"epoch": 24.01237113402062,
"grad_norm": 0.003854341572150588,
"learning_rate": 5.612829324169531e-06,
"loss": 0.548,
"step": 4800
},
{
"epoch": 24.013402061855672,
"grad_norm": 93.3668212890625,
"learning_rate": 5.601374570446736e-06,
"loss": 0.2312,
"step": 4810
},
{
"epoch": 24.01443298969072,
"grad_norm": 0.05672283470630646,
"learning_rate": 5.589919816723941e-06,
"loss": 0.1191,
"step": 4820
},
{
"epoch": 24.015463917525775,
"grad_norm": 2.6616814136505127,
"learning_rate": 5.578465063001146e-06,
"loss": 0.0088,
"step": 4830
},
{
"epoch": 24.016494845360825,
"grad_norm": 1.235160231590271,
"learning_rate": 5.567010309278351e-06,
"loss": 0.2314,
"step": 4840
},
{
"epoch": 24.017525773195878,
"grad_norm": 14.321471214294434,
"learning_rate": 5.555555555555557e-06,
"loss": 0.35,
"step": 4850
},
{
"epoch": 24.018556701030928,
"grad_norm": 0.02161994017660618,
"learning_rate": 5.544100801832761e-06,
"loss": 0.4611,
"step": 4860
},
{
"epoch": 24.01958762886598,
"grad_norm": 0.03108271211385727,
"learning_rate": 5.532646048109966e-06,
"loss": 0.436,
"step": 4870
},
{
"epoch": 24.020103092783504,
"eval_accuracy": 0.6888888888888889,
"eval_loss": 1.5507913827896118,
"eval_runtime": 12.8715,
"eval_samples_per_second": 3.496,
"eval_steps_per_second": 0.932,
"step": 4875
},
{
"epoch": 25.000515463917527,
"grad_norm": 161.4268035888672,
"learning_rate": 5.521191294387172e-06,
"loss": 0.5077,
"step": 4880
},
{
"epoch": 25.001546391752576,
"grad_norm": 0.3123011291027069,
"learning_rate": 5.509736540664376e-06,
"loss": 0.1336,
"step": 4890
},
{
"epoch": 25.00257731958763,
"grad_norm": 138.39981079101562,
"learning_rate": 5.4982817869415815e-06,
"loss": 0.5236,
"step": 4900
},
{
"epoch": 25.00360824742268,
"grad_norm": 0.1495334655046463,
"learning_rate": 5.486827033218787e-06,
"loss": 0.3321,
"step": 4910
},
{
"epoch": 25.004639175257733,
"grad_norm": 122.72096252441406,
"learning_rate": 5.475372279495992e-06,
"loss": 0.7098,
"step": 4920
},
{
"epoch": 25.005670103092783,
"grad_norm": 13.088062286376953,
"learning_rate": 5.463917525773196e-06,
"loss": 0.3201,
"step": 4930
},
{
"epoch": 25.006701030927836,
"grad_norm": 0.04432636499404907,
"learning_rate": 5.452462772050401e-06,
"loss": 0.1142,
"step": 4940
},
{
"epoch": 25.007731958762886,
"grad_norm": 0.874567985534668,
"learning_rate": 5.441008018327606e-06,
"loss": 0.1487,
"step": 4950
},
{
"epoch": 25.00876288659794,
"grad_norm": 0.19093450903892517,
"learning_rate": 5.429553264604811e-06,
"loss": 0.1065,
"step": 4960
},
{
"epoch": 25.00979381443299,
"grad_norm": 0.11419642716646194,
"learning_rate": 5.418098510882016e-06,
"loss": 0.199,
"step": 4970
},
{
"epoch": 25.010824742268042,
"grad_norm": 4.453356742858887,
"learning_rate": 5.406643757159221e-06,
"loss": 0.0043,
"step": 4980
},
{
"epoch": 25.011855670103092,
"grad_norm": 0.0850525051355362,
"learning_rate": 5.395189003436427e-06,
"loss": 0.0794,
"step": 4990
},
{
"epoch": 25.012886597938145,
"grad_norm": 0.13754314184188843,
"learning_rate": 5.383734249713631e-06,
"loss": 0.2092,
"step": 5000
},
{
"epoch": 25.013917525773195,
"grad_norm": 157.50311279296875,
"learning_rate": 5.372279495990836e-06,
"loss": 0.4186,
"step": 5010
},
{
"epoch": 25.01494845360825,
"grad_norm": 0.975379228591919,
"learning_rate": 5.360824742268042e-06,
"loss": 0.1117,
"step": 5020
},
{
"epoch": 25.015979381443298,
"grad_norm": 0.030048053711652756,
"learning_rate": 5.349369988545247e-06,
"loss": 0.1822,
"step": 5030
},
{
"epoch": 25.01701030927835,
"grad_norm": 0.09299586713314056,
"learning_rate": 5.337915234822451e-06,
"loss": 0.2461,
"step": 5040
},
{
"epoch": 25.0180412371134,
"grad_norm": 0.050019096583127975,
"learning_rate": 5.326460481099657e-06,
"loss": 0.024,
"step": 5050
},
{
"epoch": 25.019072164948454,
"grad_norm": 152.73289489746094,
"learning_rate": 5.315005727376862e-06,
"loss": 0.2815,
"step": 5060
},
{
"epoch": 25.020103092783504,
"grad_norm": 0.0027058201376348734,
"learning_rate": 5.303550973654067e-06,
"loss": 0.379,
"step": 5070
},
{
"epoch": 25.020103092783504,
"eval_accuracy": 0.7111111111111111,
"eval_loss": 1.6197640895843506,
"eval_runtime": 12.8821,
"eval_samples_per_second": 3.493,
"eval_steps_per_second": 0.932,
"step": 5070
},
{
"epoch": 26.001030927835053,
"grad_norm": 0.036444660276174545,
"learning_rate": 5.292096219931272e-06,
"loss": 0.2972,
"step": 5080
},
{
"epoch": 26.002061855670103,
"grad_norm": 8.088865280151367,
"learning_rate": 5.280641466208477e-06,
"loss": 0.0962,
"step": 5090
},
{
"epoch": 26.003092783505156,
"grad_norm": 0.029521504417061806,
"learning_rate": 5.269186712485682e-06,
"loss": 0.0095,
"step": 5100
},
{
"epoch": 26.004123711340206,
"grad_norm": 0.20875123143196106,
"learning_rate": 5.257731958762888e-06,
"loss": 0.4124,
"step": 5110
},
{
"epoch": 26.00515463917526,
"grad_norm": 148.48397827148438,
"learning_rate": 5.246277205040092e-06,
"loss": 0.3919,
"step": 5120
},
{
"epoch": 26.00618556701031,
"grad_norm": 0.03706370294094086,
"learning_rate": 5.234822451317297e-06,
"loss": 0.1514,
"step": 5130
},
{
"epoch": 26.007216494845363,
"grad_norm": 0.0603543259203434,
"learning_rate": 5.223367697594503e-06,
"loss": 0.085,
"step": 5140
},
{
"epoch": 26.008247422680412,
"grad_norm": 268.0271301269531,
"learning_rate": 5.211912943871708e-06,
"loss": 0.1849,
"step": 5150
},
{
"epoch": 26.009278350515466,
"grad_norm": 0.05716723948717117,
"learning_rate": 5.200458190148912e-06,
"loss": 0.0813,
"step": 5160
},
{
"epoch": 26.010309278350515,
"grad_norm": 36.733333587646484,
"learning_rate": 5.189003436426118e-06,
"loss": 0.1493,
"step": 5170
},
{
"epoch": 26.01134020618557,
"grad_norm": 73.86190032958984,
"learning_rate": 5.177548682703323e-06,
"loss": 0.1373,
"step": 5180
},
{
"epoch": 26.01237113402062,
"grad_norm": 0.04079816862940788,
"learning_rate": 5.166093928980528e-06,
"loss": 0.2272,
"step": 5190
},
{
"epoch": 26.013402061855672,
"grad_norm": 0.07667230069637299,
"learning_rate": 5.154639175257732e-06,
"loss": 0.0554,
"step": 5200
},
{
"epoch": 26.01443298969072,
"grad_norm": 0.8772750496864319,
"learning_rate": 5.143184421534937e-06,
"loss": 0.2251,
"step": 5210
},
{
"epoch": 26.015463917525775,
"grad_norm": 0.012073386460542679,
"learning_rate": 5.131729667812142e-06,
"loss": 0.098,
"step": 5220
},
{
"epoch": 26.016494845360825,
"grad_norm": 0.23029600083827972,
"learning_rate": 5.120274914089347e-06,
"loss": 0.1361,
"step": 5230
},
{
"epoch": 26.017525773195878,
"grad_norm": 152.81813049316406,
"learning_rate": 5.108820160366552e-06,
"loss": 0.1956,
"step": 5240
},
{
"epoch": 26.018556701030928,
"grad_norm": 0.01020987518131733,
"learning_rate": 5.0973654066437575e-06,
"loss": 0.3316,
"step": 5250
},
{
"epoch": 26.01958762886598,
"grad_norm": 0.03010565973818302,
"learning_rate": 5.085910652920962e-06,
"loss": 0.0726,
"step": 5260
},
{
"epoch": 26.020103092783504,
"eval_accuracy": 0.6,
"eval_loss": 2.329340696334839,
"eval_runtime": 12.9666,
"eval_samples_per_second": 3.47,
"eval_steps_per_second": 0.925,
"step": 5265
},
{
"epoch": 27.000515463917527,
"grad_norm": 301.4482116699219,
"learning_rate": 5.074455899198167e-06,
"loss": 0.3906,
"step": 5270
},
{
"epoch": 27.001546391752576,
"grad_norm": 0.012246874161064625,
"learning_rate": 5.0630011454753726e-06,
"loss": 0.3566,
"step": 5280
},
{
"epoch": 27.00257731958763,
"grad_norm": 0.036673251539468765,
"learning_rate": 5.051546391752578e-06,
"loss": 0.0058,
"step": 5290
},
{
"epoch": 27.00360824742268,
"grad_norm": 176.9713592529297,
"learning_rate": 5.040091638029782e-06,
"loss": 0.0528,
"step": 5300
},
{
"epoch": 27.004639175257733,
"grad_norm": 110.79297637939453,
"learning_rate": 5.028636884306988e-06,
"loss": 0.1249,
"step": 5310
},
{
"epoch": 27.005670103092783,
"grad_norm": 37.024574279785156,
"learning_rate": 5.017182130584193e-06,
"loss": 0.2464,
"step": 5320
},
{
"epoch": 27.006701030927836,
"grad_norm": 0.11262047290802002,
"learning_rate": 5.005727376861398e-06,
"loss": 0.1313,
"step": 5330
},
{
"epoch": 27.007731958762886,
"grad_norm": 0.6237982511520386,
"learning_rate": 4.994272623138603e-06,
"loss": 0.2988,
"step": 5340
},
{
"epoch": 27.00876288659794,
"grad_norm": 0.011754998005926609,
"learning_rate": 4.982817869415808e-06,
"loss": 0.5804,
"step": 5350
},
{
"epoch": 27.00979381443299,
"grad_norm": 0.027967087924480438,
"learning_rate": 4.971363115693013e-06,
"loss": 0.4306,
"step": 5360
},
{
"epoch": 27.010824742268042,
"grad_norm": 0.041461341083049774,
"learning_rate": 4.9599083619702185e-06,
"loss": 0.0075,
"step": 5370
},
{
"epoch": 27.011855670103092,
"grad_norm": 0.10922248661518097,
"learning_rate": 4.948453608247423e-06,
"loss": 0.5134,
"step": 5380
},
{
"epoch": 27.012886597938145,
"grad_norm": 0.01793413981795311,
"learning_rate": 4.936998854524628e-06,
"loss": 0.4059,
"step": 5390
},
{
"epoch": 27.013917525773195,
"grad_norm": 0.12545382976531982,
"learning_rate": 4.9255441008018336e-06,
"loss": 0.0593,
"step": 5400
},
{
"epoch": 27.01494845360825,
"grad_norm": 0.23186025023460388,
"learning_rate": 4.914089347079038e-06,
"loss": 0.1651,
"step": 5410
},
{
"epoch": 27.015979381443298,
"grad_norm": 24.21805191040039,
"learning_rate": 4.902634593356243e-06,
"loss": 0.5709,
"step": 5420
},
{
"epoch": 27.01701030927835,
"grad_norm": 0.19743561744689941,
"learning_rate": 4.891179839633448e-06,
"loss": 0.2359,
"step": 5430
},
{
"epoch": 27.0180412371134,
"grad_norm": 99.03250122070312,
"learning_rate": 4.879725085910653e-06,
"loss": 0.3897,
"step": 5440
},
{
"epoch": 27.019072164948454,
"grad_norm": 0.35776469111442566,
"learning_rate": 4.868270332187858e-06,
"loss": 0.1425,
"step": 5450
},
{
"epoch": 27.020103092783504,
"grad_norm": 0.001978857209905982,
"learning_rate": 4.856815578465064e-06,
"loss": 0.0099,
"step": 5460
},
{
"epoch": 27.020103092783504,
"eval_accuracy": 0.7333333333333333,
"eval_loss": 1.765753984451294,
"eval_runtime": 12.8308,
"eval_samples_per_second": 3.507,
"eval_steps_per_second": 0.935,
"step": 5460
},
{
"epoch": 28.001030927835053,
"grad_norm": 0.10768745839595795,
"learning_rate": 4.845360824742268e-06,
"loss": 0.2311,
"step": 5470
},
{
"epoch": 28.002061855670103,
"grad_norm": 57.12821960449219,
"learning_rate": 4.833906071019473e-06,
"loss": 0.1598,
"step": 5480
},
{
"epoch": 28.003092783505156,
"grad_norm": 0.11094794422388077,
"learning_rate": 4.822451317296679e-06,
"loss": 0.2435,
"step": 5490
},
{
"epoch": 28.004123711340206,
"grad_norm": 0.04346878081560135,
"learning_rate": 4.810996563573884e-06,
"loss": 0.4259,
"step": 5500
},
{
"epoch": 28.00515463917526,
"grad_norm": 99.23379516601562,
"learning_rate": 4.7995418098510884e-06,
"loss": 0.4067,
"step": 5510
},
{
"epoch": 28.00618556701031,
"grad_norm": 0.011886666528880596,
"learning_rate": 4.788087056128294e-06,
"loss": 0.3366,
"step": 5520
},
{
"epoch": 28.007216494845363,
"grad_norm": 17.010530471801758,
"learning_rate": 4.776632302405499e-06,
"loss": 0.5229,
"step": 5530
},
{
"epoch": 28.008247422680412,
"grad_norm": 84.538818359375,
"learning_rate": 4.7651775486827035e-06,
"loss": 0.2171,
"step": 5540
},
{
"epoch": 28.009278350515466,
"grad_norm": 0.029075944796204567,
"learning_rate": 4.753722794959909e-06,
"loss": 0.2936,
"step": 5550
},
{
"epoch": 28.010309278350515,
"grad_norm": 0.5513918399810791,
"learning_rate": 4.742268041237113e-06,
"loss": 0.1806,
"step": 5560
},
{
"epoch": 28.01134020618557,
"grad_norm": 0.14319252967834473,
"learning_rate": 4.7308132875143185e-06,
"loss": 0.5482,
"step": 5570
},
{
"epoch": 28.01237113402062,
"grad_norm": 0.33188721537590027,
"learning_rate": 4.719358533791524e-06,
"loss": 0.2692,
"step": 5580
},
{
"epoch": 28.013402061855672,
"grad_norm": 0.03212364390492439,
"learning_rate": 4.707903780068729e-06,
"loss": 0.0034,
"step": 5590
},
{
"epoch": 28.01443298969072,
"grad_norm": 0.03373891860246658,
"learning_rate": 4.6964490263459336e-06,
"loss": 0.0255,
"step": 5600
},
{
"epoch": 28.015463917525775,
"grad_norm": 200.98361206054688,
"learning_rate": 4.684994272623139e-06,
"loss": 0.3354,
"step": 5610
},
{
"epoch": 28.016494845360825,
"grad_norm": 41.92877960205078,
"learning_rate": 4.673539518900344e-06,
"loss": 0.0269,
"step": 5620
},
{
"epoch": 28.017525773195878,
"grad_norm": 0.11387676745653152,
"learning_rate": 4.6620847651775494e-06,
"loss": 0.0015,
"step": 5630
},
{
"epoch": 28.018556701030928,
"grad_norm": 0.05513579770922661,
"learning_rate": 4.650630011454754e-06,
"loss": 0.4027,
"step": 5640
},
{
"epoch": 28.01958762886598,
"grad_norm": 0.003919053822755814,
"learning_rate": 4.639175257731959e-06,
"loss": 0.0346,
"step": 5650
},
{
"epoch": 28.020103092783504,
"eval_accuracy": 0.7111111111111111,
"eval_loss": 1.5937350988388062,
"eval_runtime": 11.439,
"eval_samples_per_second": 3.934,
"eval_steps_per_second": 1.049,
"step": 5655
},
{
"epoch": 29.000515463917527,
"grad_norm": 0.547856867313385,
"learning_rate": 4.6277205040091645e-06,
"loss": 0.1152,
"step": 5660
},
{
"epoch": 29.001546391752576,
"grad_norm": 0.010174279101192951,
"learning_rate": 4.61626575028637e-06,
"loss": 0.1109,
"step": 5670
},
{
"epoch": 29.00257731958763,
"grad_norm": 0.08245531469583511,
"learning_rate": 4.604810996563574e-06,
"loss": 0.3028,
"step": 5680
},
{
"epoch": 29.00360824742268,
"grad_norm": 0.2131088227033615,
"learning_rate": 4.593356242840779e-06,
"loss": 0.1538,
"step": 5690
},
{
"epoch": 29.004639175257733,
"grad_norm": 128.2372589111328,
"learning_rate": 4.581901489117984e-06,
"loss": 0.4187,
"step": 5700
},
{
"epoch": 29.005670103092783,
"grad_norm": 3.6024224758148193,
"learning_rate": 4.570446735395189e-06,
"loss": 0.1509,
"step": 5710
},
{
"epoch": 29.006701030927836,
"grad_norm": 6.2508931159973145,
"learning_rate": 4.5589919816723946e-06,
"loss": 0.3448,
"step": 5720
},
{
"epoch": 29.007731958762886,
"grad_norm": 25.978151321411133,
"learning_rate": 4.547537227949599e-06,
"loss": 0.378,
"step": 5730
},
{
"epoch": 29.00876288659794,
"grad_norm": 97.8603286743164,
"learning_rate": 4.536082474226804e-06,
"loss": 0.3905,
"step": 5740
},
{
"epoch": 29.00979381443299,
"grad_norm": 58.110443115234375,
"learning_rate": 4.52462772050401e-06,
"loss": 0.1606,
"step": 5750
},
{
"epoch": 29.010824742268042,
"grad_norm": 0.03289901092648506,
"learning_rate": 4.513172966781215e-06,
"loss": 0.0116,
"step": 5760
},
{
"epoch": 29.011855670103092,
"grad_norm": 0.4851415455341339,
"learning_rate": 4.501718213058419e-06,
"loss": 0.0014,
"step": 5770
},
{
"epoch": 29.012886597938145,
"grad_norm": 0.6099609732627869,
"learning_rate": 4.490263459335625e-06,
"loss": 0.2822,
"step": 5780
},
{
"epoch": 29.013917525773195,
"grad_norm": 0.035203512758016586,
"learning_rate": 4.47880870561283e-06,
"loss": 0.1903,
"step": 5790
},
{
"epoch": 29.01494845360825,
"grad_norm": 0.075258269906044,
"learning_rate": 4.467353951890035e-06,
"loss": 0.2484,
"step": 5800
},
{
"epoch": 29.015979381443298,
"grad_norm": 0.014334428124129772,
"learning_rate": 4.45589919816724e-06,
"loss": 0.2231,
"step": 5810
},
{
"epoch": 29.01701030927835,
"grad_norm": 0.03960711136460304,
"learning_rate": 4.444444444444444e-06,
"loss": 0.203,
"step": 5820
},
{
"epoch": 29.0180412371134,
"grad_norm": 0.082605741918087,
"learning_rate": 4.4329896907216494e-06,
"loss": 0.009,
"step": 5830
},
{
"epoch": 29.019072164948454,
"grad_norm": 1.0183836221694946,
"learning_rate": 4.421534936998855e-06,
"loss": 0.3987,
"step": 5840
},
{
"epoch": 29.020103092783504,
"grad_norm": 0.003887833096086979,
"learning_rate": 4.41008018327606e-06,
"loss": 0.0058,
"step": 5850
},
{
"epoch": 29.020103092783504,
"eval_accuracy": 0.6444444444444445,
"eval_loss": 2.351128578186035,
"eval_runtime": 12.9569,
"eval_samples_per_second": 3.473,
"eval_steps_per_second": 0.926,
"step": 5850
},
{
"epoch": 30.001030927835053,
"grad_norm": 5.273438453674316,
"learning_rate": 4.3986254295532645e-06,
"loss": 0.4555,
"step": 5860
},
{
"epoch": 30.002061855670103,
"grad_norm": 0.054419275373220444,
"learning_rate": 4.38717067583047e-06,
"loss": 0.0564,
"step": 5870
},
{
"epoch": 30.003092783505156,
"grad_norm": 23.50123405456543,
"learning_rate": 4.375715922107675e-06,
"loss": 0.0966,
"step": 5880
},
{
"epoch": 30.004123711340206,
"grad_norm": 196.958251953125,
"learning_rate": 4.36426116838488e-06,
"loss": 0.133,
"step": 5890
},
{
"epoch": 30.00515463917526,
"grad_norm": 52.535770416259766,
"learning_rate": 4.352806414662085e-06,
"loss": 0.2876,
"step": 5900
},
{
"epoch": 30.00618556701031,
"grad_norm": 0.10040858387947083,
"learning_rate": 4.34135166093929e-06,
"loss": 0.2648,
"step": 5910
},
{
"epoch": 30.007216494845363,
"grad_norm": 0.025616025552153587,
"learning_rate": 4.329896907216495e-06,
"loss": 0.3765,
"step": 5920
},
{
"epoch": 30.008247422680412,
"grad_norm": 0.006266108714044094,
"learning_rate": 4.318442153493701e-06,
"loss": 0.1254,
"step": 5930
},
{
"epoch": 30.009278350515466,
"grad_norm": 0.8512455224990845,
"learning_rate": 4.306987399770905e-06,
"loss": 0.0057,
"step": 5940
},
{
"epoch": 30.010309278350515,
"grad_norm": 0.033568281680345535,
"learning_rate": 4.2955326460481105e-06,
"loss": 0.1417,
"step": 5950
},
{
"epoch": 30.01134020618557,
"grad_norm": 30.20100212097168,
"learning_rate": 4.284077892325315e-06,
"loss": 0.0088,
"step": 5960
},
{
"epoch": 30.01237113402062,
"grad_norm": 0.2173480987548828,
"learning_rate": 4.27262313860252e-06,
"loss": 0.1447,
"step": 5970
},
{
"epoch": 30.013402061855672,
"grad_norm": 0.009785875678062439,
"learning_rate": 4.2611683848797255e-06,
"loss": 0.002,
"step": 5980
},
{
"epoch": 30.01443298969072,
"grad_norm": 0.0599403902888298,
"learning_rate": 4.24971363115693e-06,
"loss": 0.0011,
"step": 5990
},
{
"epoch": 30.015463917525775,
"grad_norm": 0.23365233838558197,
"learning_rate": 4.238258877434135e-06,
"loss": 0.2509,
"step": 6000
},
{
"epoch": 30.016494845360825,
"grad_norm": 0.14375662803649902,
"learning_rate": 4.2268041237113405e-06,
"loss": 0.4676,
"step": 6010
},
{
"epoch": 30.017525773195878,
"grad_norm": 0.03053143061697483,
"learning_rate": 4.215349369988546e-06,
"loss": 0.1415,
"step": 6020
},
{
"epoch": 30.018556701030928,
"grad_norm": 0.08184775710105896,
"learning_rate": 4.20389461626575e-06,
"loss": 0.0848,
"step": 6030
},
{
"epoch": 30.01958762886598,
"grad_norm": 0.6414709091186523,
"learning_rate": 4.192439862542956e-06,
"loss": 0.1163,
"step": 6040
},
{
"epoch": 30.020103092783504,
"eval_accuracy": 0.7333333333333333,
"eval_loss": 1.7067806720733643,
"eval_runtime": 13.9465,
"eval_samples_per_second": 3.227,
"eval_steps_per_second": 0.86,
"step": 6045
},
{
"epoch": 31.000515463917527,
"grad_norm": 0.016645895317196846,
"learning_rate": 4.180985108820161e-06,
"loss": 0.4294,
"step": 6050
},
{
"epoch": 31.001546391752576,
"grad_norm": 0.7525324821472168,
"learning_rate": 4.169530355097366e-06,
"loss": 0.4195,
"step": 6060
},
{
"epoch": 31.00257731958763,
"grad_norm": 55.71934127807617,
"learning_rate": 4.158075601374571e-06,
"loss": 0.2149,
"step": 6070
},
{
"epoch": 31.00360824742268,
"grad_norm": 0.006537964567542076,
"learning_rate": 4.146620847651776e-06,
"loss": 0.4038,
"step": 6080
},
{
"epoch": 31.004639175257733,
"grad_norm": 2.2747881412506104,
"learning_rate": 4.13516609392898e-06,
"loss": 0.33,
"step": 6090
},
{
"epoch": 31.005670103092783,
"grad_norm": 0.04077708348631859,
"learning_rate": 4.123711340206186e-06,
"loss": 0.0016,
"step": 6100
},
{
"epoch": 31.006701030927836,
"grad_norm": 2.354952573776245,
"learning_rate": 4.112256586483391e-06,
"loss": 0.4812,
"step": 6110
},
{
"epoch": 31.007731958762886,
"grad_norm": 137.84132385253906,
"learning_rate": 4.100801832760596e-06,
"loss": 0.2663,
"step": 6120
},
{
"epoch": 31.00876288659794,
"grad_norm": 0.071143738925457,
"learning_rate": 4.089347079037801e-06,
"loss": 0.0608,
"step": 6130
},
{
"epoch": 31.00979381443299,
"grad_norm": 0.9687093496322632,
"learning_rate": 4.077892325315006e-06,
"loss": 0.2852,
"step": 6140
},
{
"epoch": 31.010824742268042,
"grad_norm": 0.061937566846609116,
"learning_rate": 4.066437571592211e-06,
"loss": 0.2822,
"step": 6150
},
{
"epoch": 31.011855670103092,
"grad_norm": 51.97724914550781,
"learning_rate": 4.054982817869416e-06,
"loss": 0.4565,
"step": 6160
},
{
"epoch": 31.012886597938145,
"grad_norm": 0.051208335906267166,
"learning_rate": 4.043528064146621e-06,
"loss": 0.0235,
"step": 6170
},
{
"epoch": 31.013917525773195,
"grad_norm": 0.028698451817035675,
"learning_rate": 4.032073310423826e-06,
"loss": 0.0208,
"step": 6180
},
{
"epoch": 31.01494845360825,
"grad_norm": 0.02642163261771202,
"learning_rate": 4.020618556701032e-06,
"loss": 0.0016,
"step": 6190
},
{
"epoch": 31.015979381443298,
"grad_norm": 0.019867481663823128,
"learning_rate": 4.009163802978236e-06,
"loss": 0.3667,
"step": 6200
},
{
"epoch": 31.01701030927835,
"grad_norm": 0.12207566946744919,
"learning_rate": 3.997709049255441e-06,
"loss": 0.0483,
"step": 6210
},
{
"epoch": 31.0180412371134,
"grad_norm": 0.0024308476131409407,
"learning_rate": 3.986254295532647e-06,
"loss": 0.2048,
"step": 6220
},
{
"epoch": 31.019072164948454,
"grad_norm": 0.3703158497810364,
"learning_rate": 3.974799541809851e-06,
"loss": 0.4534,
"step": 6230
},
{
"epoch": 31.020103092783504,
"grad_norm": 0.07688494771718979,
"learning_rate": 3.9633447880870564e-06,
"loss": 0.0962,
"step": 6240
},
{
"epoch": 31.020103092783504,
"eval_accuracy": 0.6888888888888889,
"eval_loss": 1.8766827583312988,
"eval_runtime": 12.7435,
"eval_samples_per_second": 3.531,
"eval_steps_per_second": 0.942,
"step": 6240
},
{
"epoch": 32.00103092783505,
"grad_norm": 3.0947821140289307,
"learning_rate": 3.951890034364262e-06,
"loss": 0.003,
"step": 6250
},
{
"epoch": 32.00206185567011,
"grad_norm": 0.021548155695199966,
"learning_rate": 3.940435280641466e-06,
"loss": 0.2464,
"step": 6260
},
{
"epoch": 32.00309278350515,
"grad_norm": 0.010396094061434269,
"learning_rate": 3.9289805269186715e-06,
"loss": 0.1475,
"step": 6270
},
{
"epoch": 32.004123711340206,
"grad_norm": 0.26135310530662537,
"learning_rate": 3.917525773195877e-06,
"loss": 0.0456,
"step": 6280
},
{
"epoch": 32.00515463917526,
"grad_norm": 0.04791492596268654,
"learning_rate": 3.906071019473082e-06,
"loss": 0.0836,
"step": 6290
},
{
"epoch": 32.00618556701031,
"grad_norm": 0.00865848921239376,
"learning_rate": 3.8946162657502865e-06,
"loss": 0.0589,
"step": 6300
},
{
"epoch": 32.00721649484536,
"grad_norm": 0.04501219838857651,
"learning_rate": 3.883161512027492e-06,
"loss": 0.1143,
"step": 6310
},
{
"epoch": 32.00824742268041,
"grad_norm": 0.049783822149038315,
"learning_rate": 3.871706758304697e-06,
"loss": 0.1852,
"step": 6320
},
{
"epoch": 32.009278350515466,
"grad_norm": 0.031185345724225044,
"learning_rate": 3.8602520045819016e-06,
"loss": 0.3198,
"step": 6330
},
{
"epoch": 32.01030927835052,
"grad_norm": 139.4656219482422,
"learning_rate": 3.848797250859107e-06,
"loss": 0.0602,
"step": 6340
},
{
"epoch": 32.011340206185565,
"grad_norm": 155.99598693847656,
"learning_rate": 3.837342497136312e-06,
"loss": 0.133,
"step": 6350
},
{
"epoch": 32.01237113402062,
"grad_norm": 0.06643953174352646,
"learning_rate": 3.825887743413517e-06,
"loss": 0.2172,
"step": 6360
},
{
"epoch": 32.01340206185567,
"grad_norm": 149.49154663085938,
"learning_rate": 3.814432989690722e-06,
"loss": 0.2295,
"step": 6370
},
{
"epoch": 32.014432989690725,
"grad_norm": 0.12383408844470978,
"learning_rate": 3.8029782359679268e-06,
"loss": 0.0736,
"step": 6380
},
{
"epoch": 32.01546391752577,
"grad_norm": 46.82889938354492,
"learning_rate": 3.791523482245132e-06,
"loss": 0.2792,
"step": 6390
},
{
"epoch": 32.016494845360825,
"grad_norm": 0.019881825894117355,
"learning_rate": 3.780068728522337e-06,
"loss": 0.1028,
"step": 6400
},
{
"epoch": 32.01752577319588,
"grad_norm": 53.818206787109375,
"learning_rate": 3.7686139747995422e-06,
"loss": 0.5382,
"step": 6410
},
{
"epoch": 32.01855670103093,
"grad_norm": 10.200157165527344,
"learning_rate": 3.757159221076747e-06,
"loss": 0.0028,
"step": 6420
},
{
"epoch": 32.01958762886598,
"grad_norm": 0.9711626768112183,
"learning_rate": 3.7457044673539524e-06,
"loss": 0.2826,
"step": 6430
},
{
"epoch": 32.02010309278351,
"eval_accuracy": 0.7111111111111111,
"eval_loss": 2.165703773498535,
"eval_runtime": 14.5293,
"eval_samples_per_second": 3.097,
"eval_steps_per_second": 0.826,
"step": 6435
},
{
"epoch": 33.00051546391752,
"grad_norm": 0.02483387477695942,
"learning_rate": 3.7342497136311573e-06,
"loss": 0.0014,
"step": 6440
},
{
"epoch": 33.001546391752576,
"grad_norm": 65.55487060546875,
"learning_rate": 3.7227949599083626e-06,
"loss": 0.1579,
"step": 6450
},
{
"epoch": 33.00257731958763,
"grad_norm": 0.09028110653162003,
"learning_rate": 3.7113402061855674e-06,
"loss": 0.2578,
"step": 6460
},
{
"epoch": 33.00360824742268,
"grad_norm": 0.028494760394096375,
"learning_rate": 3.6998854524627727e-06,
"loss": 0.0175,
"step": 6470
},
{
"epoch": 33.00463917525773,
"grad_norm": 0.015268037095665932,
"learning_rate": 3.6884306987399776e-06,
"loss": 0.1281,
"step": 6480
},
{
"epoch": 33.00567010309278,
"grad_norm": 0.022972600534558296,
"learning_rate": 3.6769759450171825e-06,
"loss": 0.2622,
"step": 6490
},
{
"epoch": 33.006701030927836,
"grad_norm": 0.022758277133107185,
"learning_rate": 3.6655211912943874e-06,
"loss": 0.0093,
"step": 6500
},
{
"epoch": 33.00773195876289,
"grad_norm": 8.829773902893066,
"learning_rate": 3.6540664375715922e-06,
"loss": 0.597,
"step": 6510
},
{
"epoch": 33.008762886597935,
"grad_norm": 0.006442221347242594,
"learning_rate": 3.6426116838487975e-06,
"loss": 0.1217,
"step": 6520
},
{
"epoch": 33.00979381443299,
"grad_norm": 0.15540894865989685,
"learning_rate": 3.6311569301260024e-06,
"loss": 0.2546,
"step": 6530
},
{
"epoch": 33.01082474226804,
"grad_norm": 1.448900580406189,
"learning_rate": 3.6197021764032077e-06,
"loss": 0.0054,
"step": 6540
},
{
"epoch": 33.011855670103095,
"grad_norm": 0.1175546869635582,
"learning_rate": 3.6082474226804126e-06,
"loss": 0.0229,
"step": 6550
},
{
"epoch": 33.01288659793814,
"grad_norm": 0.04827328771352768,
"learning_rate": 3.596792668957618e-06,
"loss": 0.2278,
"step": 6560
},
{
"epoch": 33.013917525773195,
"grad_norm": 0.011951892636716366,
"learning_rate": 3.5853379152348227e-06,
"loss": 0.1539,
"step": 6570
},
{
"epoch": 33.01494845360825,
"grad_norm": 0.05575776845216751,
"learning_rate": 3.573883161512028e-06,
"loss": 0.3902,
"step": 6580
},
{
"epoch": 33.0159793814433,
"grad_norm": 43.80656051635742,
"learning_rate": 3.562428407789233e-06,
"loss": 0.011,
"step": 6590
},
{
"epoch": 33.01701030927835,
"grad_norm": 1.3099844455718994,
"learning_rate": 3.550973654066438e-06,
"loss": 0.1933,
"step": 6600
},
{
"epoch": 33.0180412371134,
"grad_norm": 0.03854267671704292,
"learning_rate": 3.539518900343643e-06,
"loss": 0.0416,
"step": 6610
},
{
"epoch": 33.019072164948454,
"grad_norm": 0.01258633378893137,
"learning_rate": 3.5280641466208484e-06,
"loss": 0.2978,
"step": 6620
},
{
"epoch": 33.02010309278351,
"grad_norm": 0.0009200758067891002,
"learning_rate": 3.5166093928980532e-06,
"loss": 0.1249,
"step": 6630
},
{
"epoch": 33.02010309278351,
"eval_accuracy": 0.7333333333333333,
"eval_loss": 1.7385255098342896,
"eval_runtime": 12.3632,
"eval_samples_per_second": 3.64,
"eval_steps_per_second": 0.971,
"step": 6630
},
{
"epoch": 34.00103092783505,
"grad_norm": 0.1539161652326584,
"learning_rate": 3.5051546391752577e-06,
"loss": 0.1841,
"step": 6640
},
{
"epoch": 34.00206185567011,
"grad_norm": 0.02649785578250885,
"learning_rate": 3.493699885452463e-06,
"loss": 0.2675,
"step": 6650
},
{
"epoch": 34.00309278350515,
"grad_norm": 0.025071190670132637,
"learning_rate": 3.482245131729668e-06,
"loss": 0.0023,
"step": 6660
},
{
"epoch": 34.004123711340206,
"grad_norm": 0.019204530864953995,
"learning_rate": 3.470790378006873e-06,
"loss": 0.4407,
"step": 6670
},
{
"epoch": 34.00515463917526,
"grad_norm": 0.02421189844608307,
"learning_rate": 3.459335624284078e-06,
"loss": 0.0031,
"step": 6680
},
{
"epoch": 34.00618556701031,
"grad_norm": 0.02116827294230461,
"learning_rate": 3.4478808705612833e-06,
"loss": 0.2766,
"step": 6690
},
{
"epoch": 34.00721649484536,
"grad_norm": 0.0051218606531620026,
"learning_rate": 3.436426116838488e-06,
"loss": 0.6209,
"step": 6700
},
{
"epoch": 34.00824742268041,
"grad_norm": 0.015079047530889511,
"learning_rate": 3.4249713631156935e-06,
"loss": 0.0015,
"step": 6710
},
{
"epoch": 34.009278350515466,
"grad_norm": 1.001089096069336,
"learning_rate": 3.4135166093928984e-06,
"loss": 0.1645,
"step": 6720
},
{
"epoch": 34.01030927835052,
"grad_norm": 0.0209248848259449,
"learning_rate": 3.4020618556701037e-06,
"loss": 0.0015,
"step": 6730
},
{
"epoch": 34.011340206185565,
"grad_norm": 0.010426868684589863,
"learning_rate": 3.3906071019473085e-06,
"loss": 0.0104,
"step": 6740
},
{
"epoch": 34.01237113402062,
"grad_norm": 0.06406555324792862,
"learning_rate": 3.379152348224514e-06,
"loss": 0.0415,
"step": 6750
},
{
"epoch": 34.01340206185567,
"grad_norm": 222.54000854492188,
"learning_rate": 3.3676975945017187e-06,
"loss": 0.2297,
"step": 6760
},
{
"epoch": 34.014432989690725,
"grad_norm": 0.2206907570362091,
"learning_rate": 3.356242840778923e-06,
"loss": 0.0033,
"step": 6770
},
{
"epoch": 34.01546391752577,
"grad_norm": 0.0022467290982604027,
"learning_rate": 3.3447880870561285e-06,
"loss": 0.0908,
"step": 6780
},
{
"epoch": 34.016494845360825,
"grad_norm": 281.9080505371094,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.2918,
"step": 6790
},
{
"epoch": 34.01752577319588,
"grad_norm": 0.13616985082626343,
"learning_rate": 3.3218785796105386e-06,
"loss": 0.1153,
"step": 6800
},
{
"epoch": 34.01855670103093,
"grad_norm": 1.4510610103607178,
"learning_rate": 3.3104238258877435e-06,
"loss": 0.0018,
"step": 6810
},
{
"epoch": 34.01958762886598,
"grad_norm": 0.008665881119668484,
"learning_rate": 3.298969072164949e-06,
"loss": 0.2191,
"step": 6820
},
{
"epoch": 34.02010309278351,
"eval_accuracy": 0.7111111111111111,
"eval_loss": 2.178891181945801,
"eval_runtime": 12.7411,
"eval_samples_per_second": 3.532,
"eval_steps_per_second": 0.942,
"step": 6825
},
{
"epoch": 35.00051546391752,
"grad_norm": 0.05293448269367218,
"learning_rate": 3.2875143184421537e-06,
"loss": 0.1271,
"step": 6830
},
{
"epoch": 35.001546391752576,
"grad_norm": 0.0611007995903492,
"learning_rate": 3.276059564719359e-06,
"loss": 0.1393,
"step": 6840
},
{
"epoch": 35.00257731958763,
"grad_norm": 0.024918990209698677,
"learning_rate": 3.264604810996564e-06,
"loss": 0.1459,
"step": 6850
},
{
"epoch": 35.00360824742268,
"grad_norm": 0.28028416633605957,
"learning_rate": 3.253150057273769e-06,
"loss": 0.1449,
"step": 6860
},
{
"epoch": 35.00463917525773,
"grad_norm": 0.0019702534191310406,
"learning_rate": 3.241695303550974e-06,
"loss": 0.0093,
"step": 6870
},
{
"epoch": 35.00567010309278,
"grad_norm": 7.490848064422607,
"learning_rate": 3.2302405498281793e-06,
"loss": 0.122,
"step": 6880
},
{
"epoch": 35.006701030927836,
"grad_norm": 0.013970672152936459,
"learning_rate": 3.218785796105384e-06,
"loss": 0.0232,
"step": 6890
},
{
"epoch": 35.00773195876289,
"grad_norm": 0.031982772052288055,
"learning_rate": 3.2073310423825895e-06,
"loss": 0.086,
"step": 6900
},
{
"epoch": 35.008762886597935,
"grad_norm": 0.010435467585921288,
"learning_rate": 3.195876288659794e-06,
"loss": 0.3166,
"step": 6910
},
{
"epoch": 35.00979381443299,
"grad_norm": 86.65909576416016,
"learning_rate": 3.1844215349369988e-06,
"loss": 0.3312,
"step": 6920
},
{
"epoch": 35.01082474226804,
"grad_norm": 122.04566955566406,
"learning_rate": 3.172966781214204e-06,
"loss": 0.2556,
"step": 6930
},
{
"epoch": 35.011855670103095,
"grad_norm": 0.12888219952583313,
"learning_rate": 3.161512027491409e-06,
"loss": 0.3073,
"step": 6940
},
{
"epoch": 35.01288659793814,
"grad_norm": 0.025842690840363503,
"learning_rate": 3.1500572737686143e-06,
"loss": 0.1798,
"step": 6950
},
{
"epoch": 35.013917525773195,
"grad_norm": 0.011454744264483452,
"learning_rate": 3.138602520045819e-06,
"loss": 0.0072,
"step": 6960
},
{
"epoch": 35.01494845360825,
"grad_norm": 167.35546875,
"learning_rate": 3.1271477663230244e-06,
"loss": 0.071,
"step": 6970
},
{
"epoch": 35.0159793814433,
"grad_norm": 0.024680141359567642,
"learning_rate": 3.1156930126002293e-06,
"loss": 0.4104,
"step": 6980
},
{
"epoch": 35.01701030927835,
"grad_norm": 0.008994595147669315,
"learning_rate": 3.1042382588774346e-06,
"loss": 0.1758,
"step": 6990
},
{
"epoch": 35.0180412371134,
"grad_norm": 46.581077575683594,
"learning_rate": 3.0927835051546395e-06,
"loss": 0.1648,
"step": 7000
},
{
"epoch": 35.019072164948454,
"grad_norm": 199.21475219726562,
"learning_rate": 3.0813287514318448e-06,
"loss": 0.264,
"step": 7010
},
{
"epoch": 35.02010309278351,
"grad_norm": 0.001545836334116757,
"learning_rate": 3.0698739977090496e-06,
"loss": 0.0958,
"step": 7020
},
{
"epoch": 35.02010309278351,
"eval_accuracy": 0.6444444444444445,
"eval_loss": 2.472182035446167,
"eval_runtime": 12.7933,
"eval_samples_per_second": 3.517,
"eval_steps_per_second": 0.938,
"step": 7020
},
{
"epoch": 36.00103092783505,
"grad_norm": 0.045767877250909805,
"learning_rate": 3.058419243986255e-06,
"loss": 0.2266,
"step": 7030
},
{
"epoch": 36.00206185567011,
"grad_norm": 0.014738555997610092,
"learning_rate": 3.04696449026346e-06,
"loss": 0.0054,
"step": 7040
},
{
"epoch": 36.00309278350515,
"grad_norm": 0.00678382720798254,
"learning_rate": 3.0355097365406643e-06,
"loss": 0.3692,
"step": 7050
},
{
"epoch": 36.004123711340206,
"grad_norm": 0.00942118652164936,
"learning_rate": 3.0240549828178695e-06,
"loss": 0.0014,
"step": 7060
},
{
"epoch": 36.00515463917526,
"grad_norm": 0.024564573541283607,
"learning_rate": 3.0126002290950744e-06,
"loss": 0.1198,
"step": 7070
},
{
"epoch": 36.00618556701031,
"grad_norm": 271.1005859375,
"learning_rate": 3.0011454753722797e-06,
"loss": 0.2814,
"step": 7080
},
{
"epoch": 36.00721649484536,
"grad_norm": 0.043836403638124466,
"learning_rate": 2.9896907216494846e-06,
"loss": 0.0023,
"step": 7090
},
{
"epoch": 36.00824742268041,
"grad_norm": 0.008552830666303635,
"learning_rate": 2.97823596792669e-06,
"loss": 0.0138,
"step": 7100
},
{
"epoch": 36.009278350515466,
"grad_norm": 0.2735336124897003,
"learning_rate": 2.9667812142038948e-06,
"loss": 0.0027,
"step": 7110
},
{
"epoch": 36.01030927835052,
"grad_norm": 0.0645265206694603,
"learning_rate": 2.9553264604811e-06,
"loss": 0.1183,
"step": 7120
},
{
"epoch": 36.011340206185565,
"grad_norm": 0.035903312265872955,
"learning_rate": 2.943871706758305e-06,
"loss": 0.014,
"step": 7130
},
{
"epoch": 36.01237113402062,
"grad_norm": 4.987162113189697,
"learning_rate": 2.9324169530355102e-06,
"loss": 0.1061,
"step": 7140
},
{
"epoch": 36.01340206185567,
"grad_norm": 0.45441123843193054,
"learning_rate": 2.920962199312715e-06,
"loss": 0.0013,
"step": 7150
},
{
"epoch": 36.014432989690725,
"grad_norm": 1.8816462755203247,
"learning_rate": 2.9095074455899204e-06,
"loss": 0.0026,
"step": 7160
},
{
"epoch": 36.01546391752577,
"grad_norm": 0.005839156918227673,
"learning_rate": 2.8980526918671253e-06,
"loss": 0.0038,
"step": 7170
},
{
"epoch": 36.016494845360825,
"grad_norm": 0.5239657163619995,
"learning_rate": 2.8865979381443297e-06,
"loss": 0.0005,
"step": 7180
},
{
"epoch": 36.01752577319588,
"grad_norm": 60.700618743896484,
"learning_rate": 2.875143184421535e-06,
"loss": 0.1786,
"step": 7190
},
{
"epoch": 36.01855670103093,
"grad_norm": 0.0037163153756409883,
"learning_rate": 2.86368843069874e-06,
"loss": 0.4697,
"step": 7200
},
{
"epoch": 36.01958762886598,
"grad_norm": 0.007669605780392885,
"learning_rate": 2.852233676975945e-06,
"loss": 0.0006,
"step": 7210
},
{
"epoch": 36.02010309278351,
"eval_accuracy": 0.7111111111111111,
"eval_loss": 1.917656421661377,
"eval_runtime": 12.6565,
"eval_samples_per_second": 3.555,
"eval_steps_per_second": 0.948,
"step": 7215
},
{
"epoch": 37.00051546391752,
"grad_norm": 0.019232606515288353,
"learning_rate": 2.84077892325315e-06,
"loss": 0.1498,
"step": 7220
},
{
"epoch": 37.001546391752576,
"grad_norm": 0.006618044804781675,
"learning_rate": 2.8293241695303553e-06,
"loss": 0.0006,
"step": 7230
},
{
"epoch": 37.00257731958763,
"grad_norm": 32.33464813232422,
"learning_rate": 2.8178694158075602e-06,
"loss": 0.1678,
"step": 7240
},
{
"epoch": 37.00360824742268,
"grad_norm": 0.02818481996655464,
"learning_rate": 2.8064146620847655e-06,
"loss": 0.2838,
"step": 7250
},
{
"epoch": 37.00463917525773,
"grad_norm": 136.29965209960938,
"learning_rate": 2.7949599083619704e-06,
"loss": 0.0919,
"step": 7260
},
{
"epoch": 37.00567010309278,
"grad_norm": 0.004848357755690813,
"learning_rate": 2.7835051546391757e-06,
"loss": 0.1393,
"step": 7270
},
{
"epoch": 37.006701030927836,
"grad_norm": 91.26074981689453,
"learning_rate": 2.7720504009163806e-06,
"loss": 0.293,
"step": 7280
},
{
"epoch": 37.00773195876289,
"grad_norm": 0.008552628569304943,
"learning_rate": 2.760595647193586e-06,
"loss": 0.0011,
"step": 7290
},
{
"epoch": 37.008762886597935,
"grad_norm": 0.011269760318100452,
"learning_rate": 2.7491408934707907e-06,
"loss": 0.0008,
"step": 7300
},
{
"epoch": 37.00979381443299,
"grad_norm": 0.07644051313400269,
"learning_rate": 2.737686139747996e-06,
"loss": 0.2102,
"step": 7310
},
{
"epoch": 37.01082474226804,
"grad_norm": 0.0022292486391961575,
"learning_rate": 2.7262313860252005e-06,
"loss": 0.1402,
"step": 7320
},
{
"epoch": 37.011855670103095,
"grad_norm": 7.599360466003418,
"learning_rate": 2.7147766323024053e-06,
"loss": 0.0044,
"step": 7330
},
{
"epoch": 37.01288659793814,
"grad_norm": 220.58099365234375,
"learning_rate": 2.7033218785796106e-06,
"loss": 0.4214,
"step": 7340
},
{
"epoch": 37.013917525773195,
"grad_norm": 83.0787353515625,
"learning_rate": 2.6918671248568155e-06,
"loss": 0.0265,
"step": 7350
},
{
"epoch": 37.01494845360825,
"grad_norm": 0.015676092356443405,
"learning_rate": 2.680412371134021e-06,
"loss": 0.0018,
"step": 7360
},
{
"epoch": 37.0159793814433,
"grad_norm": 0.008032367564737797,
"learning_rate": 2.6689576174112257e-06,
"loss": 0.0151,
"step": 7370
},
{
"epoch": 37.01701030927835,
"grad_norm": 0.36304762959480286,
"learning_rate": 2.657502863688431e-06,
"loss": 0.1789,
"step": 7380
},
{
"epoch": 37.0180412371134,
"grad_norm": 0.010707261972129345,
"learning_rate": 2.646048109965636e-06,
"loss": 0.0672,
"step": 7390
},
{
"epoch": 37.019072164948454,
"grad_norm": 110.2496566772461,
"learning_rate": 2.634593356242841e-06,
"loss": 0.664,
"step": 7400
},
{
"epoch": 37.02010309278351,
"grad_norm": 0.13198626041412354,
"learning_rate": 2.623138602520046e-06,
"loss": 0.0036,
"step": 7410
},
{
"epoch": 37.02010309278351,
"eval_accuracy": 0.6888888888888889,
"eval_loss": 1.9590771198272705,
"eval_runtime": 14.573,
"eval_samples_per_second": 3.088,
"eval_steps_per_second": 0.823,
"step": 7410
},
{
"epoch": 38.00103092783505,
"grad_norm": 64.50994110107422,
"learning_rate": 2.6116838487972513e-06,
"loss": 0.3479,
"step": 7420
},
{
"epoch": 38.00206185567011,
"grad_norm": 139.82247924804688,
"learning_rate": 2.600229095074456e-06,
"loss": 0.2922,
"step": 7430
},
{
"epoch": 38.00309278350515,
"grad_norm": 0.011880365200340748,
"learning_rate": 2.5887743413516615e-06,
"loss": 0.0738,
"step": 7440
},
{
"epoch": 38.004123711340206,
"grad_norm": 0.0019393068505451083,
"learning_rate": 2.577319587628866e-06,
"loss": 0.0053,
"step": 7450
},
{
"epoch": 38.00515463917526,
"grad_norm": 0.007837596349418163,
"learning_rate": 2.565864833906071e-06,
"loss": 0.1809,
"step": 7460
},
{
"epoch": 38.00618556701031,
"grad_norm": 0.051794491708278656,
"learning_rate": 2.554410080183276e-06,
"loss": 0.1372,
"step": 7470
},
{
"epoch": 38.00721649484536,
"grad_norm": 0.11213437467813492,
"learning_rate": 2.542955326460481e-06,
"loss": 0.0021,
"step": 7480
},
{
"epoch": 38.00824742268041,
"grad_norm": 0.008571717888116837,
"learning_rate": 2.5315005727376863e-06,
"loss": 0.2099,
"step": 7490
},
{
"epoch": 38.009278350515466,
"grad_norm": 0.021522628143429756,
"learning_rate": 2.520045819014891e-06,
"loss": 0.214,
"step": 7500
},
{
"epoch": 38.01030927835052,
"grad_norm": 0.24400968849658966,
"learning_rate": 2.5085910652920964e-06,
"loss": 0.0008,
"step": 7510
},
{
"epoch": 38.011340206185565,
"grad_norm": 0.008205863647162914,
"learning_rate": 2.4971363115693013e-06,
"loss": 0.1451,
"step": 7520
},
{
"epoch": 38.01237113402062,
"grad_norm": 0.0307331420481205,
"learning_rate": 2.4856815578465066e-06,
"loss": 0.1482,
"step": 7530
},
{
"epoch": 38.01340206185567,
"grad_norm": 0.06798528879880905,
"learning_rate": 2.4742268041237115e-06,
"loss": 0.0751,
"step": 7540
},
{
"epoch": 38.014432989690725,
"grad_norm": 0.0034336706157773733,
"learning_rate": 2.4627720504009168e-06,
"loss": 0.0006,
"step": 7550
},
{
"epoch": 38.01546391752577,
"grad_norm": 0.14595551788806915,
"learning_rate": 2.4513172966781217e-06,
"loss": 0.0006,
"step": 7560
},
{
"epoch": 38.016494845360825,
"grad_norm": 0.00959043949842453,
"learning_rate": 2.4398625429553265e-06,
"loss": 0.0011,
"step": 7570
},
{
"epoch": 38.01752577319588,
"grad_norm": 77.62090301513672,
"learning_rate": 2.428407789232532e-06,
"loss": 0.5068,
"step": 7580
},
{
"epoch": 38.01855670103093,
"grad_norm": 0.020476188510656357,
"learning_rate": 2.4169530355097367e-06,
"loss": 0.1332,
"step": 7590
},
{
"epoch": 38.01958762886598,
"grad_norm": 0.06416749209165573,
"learning_rate": 2.405498281786942e-06,
"loss": 0.0009,
"step": 7600
},
{
"epoch": 38.02010309278351,
"eval_accuracy": 0.6222222222222222,
"eval_loss": 2.399278402328491,
"eval_runtime": 12.9659,
"eval_samples_per_second": 3.471,
"eval_steps_per_second": 0.926,
"step": 7605
},
{
"epoch": 39.00051546391752,
"grad_norm": 5.2809295654296875,
"learning_rate": 2.394043528064147e-06,
"loss": 0.0652,
"step": 7610
},
{
"epoch": 39.001546391752576,
"grad_norm": 0.008623295463621616,
"learning_rate": 2.3825887743413517e-06,
"loss": 0.1178,
"step": 7620
},
{
"epoch": 39.00257731958763,
"grad_norm": 0.03899654373526573,
"learning_rate": 2.3711340206185566e-06,
"loss": 0.0365,
"step": 7630
},
{
"epoch": 39.00360824742268,
"grad_norm": 0.01230724435299635,
"learning_rate": 2.359679266895762e-06,
"loss": 0.1324,
"step": 7640
},
{
"epoch": 39.00463917525773,
"grad_norm": 0.5138551592826843,
"learning_rate": 2.3482245131729668e-06,
"loss": 0.0991,
"step": 7650
},
{
"epoch": 39.00567010309278,
"grad_norm": 0.004986596293747425,
"learning_rate": 2.336769759450172e-06,
"loss": 0.002,
"step": 7660
},
{
"epoch": 39.006701030927836,
"grad_norm": 0.04998844861984253,
"learning_rate": 2.325315005727377e-06,
"loss": 0.0007,
"step": 7670
},
{
"epoch": 39.00773195876289,
"grad_norm": 0.0053374143317341805,
"learning_rate": 2.3138602520045822e-06,
"loss": 0.0531,
"step": 7680
},
{
"epoch": 39.008762886597935,
"grad_norm": 326.64031982421875,
"learning_rate": 2.302405498281787e-06,
"loss": 0.307,
"step": 7690
},
{
"epoch": 39.00979381443299,
"grad_norm": 0.18461164832115173,
"learning_rate": 2.290950744558992e-06,
"loss": 0.0038,
"step": 7700
},
{
"epoch": 39.01082474226804,
"grad_norm": 0.01692032255232334,
"learning_rate": 2.2794959908361973e-06,
"loss": 0.001,
"step": 7710
},
{
"epoch": 39.011855670103095,
"grad_norm": 0.012526489794254303,
"learning_rate": 2.268041237113402e-06,
"loss": 0.3061,
"step": 7720
},
{
"epoch": 39.01288659793814,
"grad_norm": 0.009198295883834362,
"learning_rate": 2.2565864833906075e-06,
"loss": 0.1932,
"step": 7730
},
{
"epoch": 39.013917525773195,
"grad_norm": 0.1381886750459671,
"learning_rate": 2.2451317296678123e-06,
"loss": 0.0878,
"step": 7740
},
{
"epoch": 39.01494845360825,
"grad_norm": 0.03150659054517746,
"learning_rate": 2.2336769759450176e-06,
"loss": 0.1654,
"step": 7750
},
{
"epoch": 39.0159793814433,
"grad_norm": 0.011908908374607563,
"learning_rate": 2.222222222222222e-06,
"loss": 0.0004,
"step": 7760
},
{
"epoch": 39.01701030927835,
"grad_norm": 16.51814842224121,
"learning_rate": 2.2107674684994274e-06,
"loss": 0.0024,
"step": 7770
},
{
"epoch": 39.0180412371134,
"grad_norm": 0.05224655196070671,
"learning_rate": 2.1993127147766322e-06,
"loss": 0.0003,
"step": 7780
},
{
"epoch": 39.019072164948454,
"grad_norm": 0.10637877881526947,
"learning_rate": 2.1878579610538375e-06,
"loss": 0.0116,
"step": 7790
},
{
"epoch": 39.02010309278351,
"grad_norm": 0.08694580942392349,
"learning_rate": 2.1764032073310424e-06,
"loss": 0.0005,
"step": 7800
},
{
"epoch": 39.02010309278351,
"eval_accuracy": 0.7777777777777778,
"eval_loss": 1.7377581596374512,
"eval_runtime": 12.9146,
"eval_samples_per_second": 3.484,
"eval_steps_per_second": 0.929,
"step": 7800
},
{
"epoch": 40.00103092783505,
"grad_norm": 0.0032964874990284443,
"learning_rate": 2.1649484536082477e-06,
"loss": 0.057,
"step": 7810
},
{
"epoch": 40.00206185567011,
"grad_norm": 0.01317548006772995,
"learning_rate": 2.1534936998854526e-06,
"loss": 0.0008,
"step": 7820
},
{
"epoch": 40.00309278350515,
"grad_norm": 0.017642924562096596,
"learning_rate": 2.1420389461626575e-06,
"loss": 0.0005,
"step": 7830
},
{
"epoch": 40.004123711340206,
"grad_norm": 0.012752017937600613,
"learning_rate": 2.1305841924398628e-06,
"loss": 0.0004,
"step": 7840
},
{
"epoch": 40.00515463917526,
"grad_norm": 0.06961321830749512,
"learning_rate": 2.1191294387170676e-06,
"loss": 0.0003,
"step": 7850
},
{
"epoch": 40.00618556701031,
"grad_norm": 0.09094371646642685,
"learning_rate": 2.107674684994273e-06,
"loss": 0.1611,
"step": 7860
},
{
"epoch": 40.00721649484536,
"grad_norm": 0.008556496351957321,
"learning_rate": 2.096219931271478e-06,
"loss": 0.0007,
"step": 7870
},
{
"epoch": 40.00824742268041,
"grad_norm": 309.84698486328125,
"learning_rate": 2.084765177548683e-06,
"loss": 0.2056,
"step": 7880
},
{
"epoch": 40.009278350515466,
"grad_norm": 0.03041210025548935,
"learning_rate": 2.073310423825888e-06,
"loss": 0.0025,
"step": 7890
},
{
"epoch": 40.01030927835052,
"grad_norm": 82.68245697021484,
"learning_rate": 2.061855670103093e-06,
"loss": 0.026,
"step": 7900
},
{
"epoch": 40.011340206185565,
"grad_norm": 0.02870158664882183,
"learning_rate": 2.050400916380298e-06,
"loss": 0.0072,
"step": 7910
},
{
"epoch": 40.01237113402062,
"grad_norm": 0.02945534512400627,
"learning_rate": 2.038946162657503e-06,
"loss": 0.1894,
"step": 7920
},
{
"epoch": 40.01340206185567,
"grad_norm": 0.007657837588340044,
"learning_rate": 2.027491408934708e-06,
"loss": 0.0024,
"step": 7930
},
{
"epoch": 40.014432989690725,
"grad_norm": 0.05909576267004013,
"learning_rate": 2.016036655211913e-06,
"loss": 0.1906,
"step": 7940
},
{
"epoch": 40.01546391752577,
"grad_norm": 0.5790134072303772,
"learning_rate": 2.004581901489118e-06,
"loss": 0.0005,
"step": 7950
},
{
"epoch": 40.016494845360825,
"grad_norm": 0.0051000709645450115,
"learning_rate": 1.9931271477663233e-06,
"loss": 0.1721,
"step": 7960
},
{
"epoch": 40.01752577319588,
"grad_norm": 0.011354904621839523,
"learning_rate": 1.9816723940435282e-06,
"loss": 0.0046,
"step": 7970
},
{
"epoch": 40.01855670103093,
"grad_norm": 327.5389099121094,
"learning_rate": 1.970217640320733e-06,
"loss": 0.4001,
"step": 7980
},
{
"epoch": 40.01958762886598,
"grad_norm": 0.006849356461316347,
"learning_rate": 1.9587628865979384e-06,
"loss": 0.0014,
"step": 7990
},
{
"epoch": 40.02010309278351,
"eval_accuracy": 0.6888888888888889,
"eval_loss": 2.4453516006469727,
"eval_runtime": 13.0464,
"eval_samples_per_second": 3.449,
"eval_steps_per_second": 0.92,
"step": 7995
},
{
"epoch": 41.00051546391752,
"grad_norm": 0.013001556508243084,
"learning_rate": 1.9473081328751433e-06,
"loss": 0.2103,
"step": 8000
},
{
"epoch": 41.001546391752576,
"grad_norm": 107.24781799316406,
"learning_rate": 1.9358533791523486e-06,
"loss": 0.3106,
"step": 8010
},
{
"epoch": 41.00257731958763,
"grad_norm": 0.008363613858819008,
"learning_rate": 1.9243986254295534e-06,
"loss": 0.028,
"step": 8020
},
{
"epoch": 41.00360824742268,
"grad_norm": 0.04029637202620506,
"learning_rate": 1.9129438717067583e-06,
"loss": 0.0264,
"step": 8030
},
{
"epoch": 41.00463917525773,
"grad_norm": 0.06103358045220375,
"learning_rate": 1.9014891179839634e-06,
"loss": 0.0334,
"step": 8040
},
{
"epoch": 41.00567010309278,
"grad_norm": 0.00733905378729105,
"learning_rate": 1.8900343642611685e-06,
"loss": 0.0014,
"step": 8050
},
{
"epoch": 41.006701030927836,
"grad_norm": 338.4005432128906,
"learning_rate": 1.8785796105383736e-06,
"loss": 0.0489,
"step": 8060
},
{
"epoch": 41.00773195876289,
"grad_norm": 0.006053756456822157,
"learning_rate": 1.8671248568155786e-06,
"loss": 0.1368,
"step": 8070
},
{
"epoch": 41.008762886597935,
"grad_norm": 0.0014053594786673784,
"learning_rate": 1.8556701030927837e-06,
"loss": 0.1025,
"step": 8080
},
{
"epoch": 41.00979381443299,
"grad_norm": 0.0035459971986711025,
"learning_rate": 1.8442153493699888e-06,
"loss": 0.0006,
"step": 8090
},
{
"epoch": 41.01082474226804,
"grad_norm": 384.0328369140625,
"learning_rate": 1.8327605956471937e-06,
"loss": 0.3021,
"step": 8100
},
{
"epoch": 41.011855670103095,
"grad_norm": 0.5014728903770447,
"learning_rate": 1.8213058419243988e-06,
"loss": 0.1137,
"step": 8110
},
{
"epoch": 41.01288659793814,
"grad_norm": 0.0056852176785469055,
"learning_rate": 1.8098510882016038e-06,
"loss": 0.0004,
"step": 8120
},
{
"epoch": 41.013917525773195,
"grad_norm": 0.013269063085317612,
"learning_rate": 1.798396334478809e-06,
"loss": 0.0002,
"step": 8130
},
{
"epoch": 41.01494845360825,
"grad_norm": 56.177669525146484,
"learning_rate": 1.786941580756014e-06,
"loss": 0.1576,
"step": 8140
},
{
"epoch": 41.0159793814433,
"grad_norm": 0.43153733015060425,
"learning_rate": 1.775486827033219e-06,
"loss": 0.1699,
"step": 8150
},
{
"epoch": 41.01701030927835,
"grad_norm": 0.010211652144789696,
"learning_rate": 1.7640320733104242e-06,
"loss": 0.0006,
"step": 8160
},
{
"epoch": 41.0180412371134,
"grad_norm": 0.004664810374379158,
"learning_rate": 1.7525773195876288e-06,
"loss": 0.087,
"step": 8170
},
{
"epoch": 41.019072164948454,
"grad_norm": 124.0995864868164,
"learning_rate": 1.741122565864834e-06,
"loss": 0.1361,
"step": 8180
},
{
"epoch": 41.02010309278351,
"grad_norm": 0.04815516620874405,
"learning_rate": 1.729667812142039e-06,
"loss": 0.1203,
"step": 8190
},
{
"epoch": 41.02010309278351,
"eval_accuracy": 0.7333333333333333,
"eval_loss": 2.113752841949463,
"eval_runtime": 12.9032,
"eval_samples_per_second": 3.488,
"eval_steps_per_second": 0.93,
"step": 8190
},
{
"epoch": 42.00103092783505,
"grad_norm": 0.0071434988640248775,
"learning_rate": 1.718213058419244e-06,
"loss": 0.0003,
"step": 8200
},
{
"epoch": 42.00206185567011,
"grad_norm": 0.020918577909469604,
"learning_rate": 1.7067583046964492e-06,
"loss": 0.0011,
"step": 8210
},
{
"epoch": 42.00309278350515,
"grad_norm": 0.03154708817601204,
"learning_rate": 1.6953035509736543e-06,
"loss": 0.0815,
"step": 8220
},
{
"epoch": 42.004123711340206,
"grad_norm": 0.013127969577908516,
"learning_rate": 1.6838487972508594e-06,
"loss": 0.0354,
"step": 8230
},
{
"epoch": 42.00515463917526,
"grad_norm": 0.11450503766536713,
"learning_rate": 1.6723940435280642e-06,
"loss": 0.6204,
"step": 8240
},
{
"epoch": 42.00618556701031,
"grad_norm": 0.007757487706840038,
"learning_rate": 1.6609392898052693e-06,
"loss": 0.1229,
"step": 8250
},
{
"epoch": 42.00721649484536,
"grad_norm": 0.015443161129951477,
"learning_rate": 1.6494845360824744e-06,
"loss": 0.0002,
"step": 8260
},
{
"epoch": 42.00824742268041,
"grad_norm": 0.002430438296869397,
"learning_rate": 1.6380297823596795e-06,
"loss": 0.0002,
"step": 8270
},
{
"epoch": 42.009278350515466,
"grad_norm": 0.10335277765989304,
"learning_rate": 1.6265750286368846e-06,
"loss": 0.0185,
"step": 8280
},
{
"epoch": 42.01030927835052,
"grad_norm": 0.002046901499852538,
"learning_rate": 1.6151202749140896e-06,
"loss": 0.0508,
"step": 8290
},
{
"epoch": 42.011340206185565,
"grad_norm": 0.005286916624754667,
"learning_rate": 1.6036655211912947e-06,
"loss": 0.0702,
"step": 8300
},
{
"epoch": 42.01237113402062,
"grad_norm": 0.007195398211479187,
"learning_rate": 1.5922107674684994e-06,
"loss": 0.3414,
"step": 8310
},
{
"epoch": 42.01340206185567,
"grad_norm": 0.002511281054466963,
"learning_rate": 1.5807560137457045e-06,
"loss": 0.2097,
"step": 8320
},
{
"epoch": 42.014432989690725,
"grad_norm": 201.3845977783203,
"learning_rate": 1.5693012600229096e-06,
"loss": 0.2369,
"step": 8330
},
{
"epoch": 42.01546391752577,
"grad_norm": 0.04441210627555847,
"learning_rate": 1.5578465063001146e-06,
"loss": 0.0554,
"step": 8340
},
{
"epoch": 42.016494845360825,
"grad_norm": 12.391925811767578,
"learning_rate": 1.5463917525773197e-06,
"loss": 0.0019,
"step": 8350
},
{
"epoch": 42.01752577319588,
"grad_norm": 0.1355491429567337,
"learning_rate": 1.5349369988545248e-06,
"loss": 0.0007,
"step": 8360
},
{
"epoch": 42.01855670103093,
"grad_norm": 0.0035951670724898577,
"learning_rate": 1.52348224513173e-06,
"loss": 0.1423,
"step": 8370
},
{
"epoch": 42.01958762886598,
"grad_norm": 0.007604878395795822,
"learning_rate": 1.5120274914089348e-06,
"loss": 0.0138,
"step": 8380
},
{
"epoch": 42.02010309278351,
"eval_accuracy": 0.7333333333333333,
"eval_loss": 2.1768667697906494,
"eval_runtime": 12.9363,
"eval_samples_per_second": 3.479,
"eval_steps_per_second": 0.928,
"step": 8385
},
{
"epoch": 43.00051546391752,
"grad_norm": 0.026959659531712532,
"learning_rate": 1.5005727376861399e-06,
"loss": 0.0004,
"step": 8390
},
{
"epoch": 43.001546391752576,
"grad_norm": 0.09655001759529114,
"learning_rate": 1.489117983963345e-06,
"loss": 0.2277,
"step": 8400
},
{
"epoch": 43.00257731958763,
"grad_norm": 1.289469838142395,
"learning_rate": 1.47766323024055e-06,
"loss": 0.1235,
"step": 8410
},
{
"epoch": 43.00360824742268,
"grad_norm": 0.0012255634646862745,
"learning_rate": 1.4662084765177551e-06,
"loss": 0.0079,
"step": 8420
},
{
"epoch": 43.00463917525773,
"grad_norm": 0.0007930409628897905,
"learning_rate": 1.4547537227949602e-06,
"loss": 0.0002,
"step": 8430
},
{
"epoch": 43.00567010309278,
"grad_norm": 0.024416925385594368,
"learning_rate": 1.4432989690721649e-06,
"loss": 0.3317,
"step": 8440
},
{
"epoch": 43.006701030927836,
"grad_norm": 0.0007845965446904302,
"learning_rate": 1.43184421534937e-06,
"loss": 0.0002,
"step": 8450
},
{
"epoch": 43.00773195876289,
"grad_norm": 0.001673469552770257,
"learning_rate": 1.420389461626575e-06,
"loss": 0.0028,
"step": 8460
},
{
"epoch": 43.008762886597935,
"grad_norm": 0.00685878423973918,
"learning_rate": 1.4089347079037801e-06,
"loss": 0.0008,
"step": 8470
},
{
"epoch": 43.00979381443299,
"grad_norm": 0.0035130823962390423,
"learning_rate": 1.3974799541809852e-06,
"loss": 0.0007,
"step": 8480
},
{
"epoch": 43.01082474226804,
"grad_norm": 198.6305694580078,
"learning_rate": 1.3860252004581903e-06,
"loss": 0.1222,
"step": 8490
},
{
"epoch": 43.011855670103095,
"grad_norm": 0.0031229022424668074,
"learning_rate": 1.3745704467353954e-06,
"loss": 0.0002,
"step": 8500
},
{
"epoch": 43.01288659793814,
"grad_norm": 0.004219813738018274,
"learning_rate": 1.3631156930126002e-06,
"loss": 0.0001,
"step": 8510
},
{
"epoch": 43.013917525773195,
"grad_norm": 0.0015190464910119772,
"learning_rate": 1.3516609392898053e-06,
"loss": 0.014,
"step": 8520
},
{
"epoch": 43.01494845360825,
"grad_norm": 0.003646484576165676,
"learning_rate": 1.3402061855670104e-06,
"loss": 0.1198,
"step": 8530
},
{
"epoch": 43.0159793814433,
"grad_norm": 0.0039468565955758095,
"learning_rate": 1.3287514318442155e-06,
"loss": 0.0002,
"step": 8540
},
{
"epoch": 43.01701030927835,
"grad_norm": 0.0018218251643702388,
"learning_rate": 1.3172966781214206e-06,
"loss": 0.0013,
"step": 8550
},
{
"epoch": 43.0180412371134,
"grad_norm": 104.12612915039062,
"learning_rate": 1.3058419243986257e-06,
"loss": 0.2044,
"step": 8560
},
{
"epoch": 43.019072164948454,
"grad_norm": 0.0026801032945513725,
"learning_rate": 1.2943871706758307e-06,
"loss": 0.0003,
"step": 8570
},
{
"epoch": 43.02010309278351,
"grad_norm": 0.0005498857935890555,
"learning_rate": 1.2829324169530354e-06,
"loss": 0.3569,
"step": 8580
},
{
"epoch": 43.02010309278351,
"eval_accuracy": 0.6222222222222222,
"eval_loss": 2.6946287155151367,
"eval_runtime": 13.5653,
"eval_samples_per_second": 3.317,
"eval_steps_per_second": 0.885,
"step": 8580
},
{
"epoch": 44.00103092783505,
"grad_norm": 0.0034866807982325554,
"learning_rate": 1.2714776632302405e-06,
"loss": 0.0771,
"step": 8590
},
{
"epoch": 44.00206185567011,
"grad_norm": 0.0050008767284452915,
"learning_rate": 1.2600229095074456e-06,
"loss": 0.1379,
"step": 8600
},
{
"epoch": 44.00309278350515,
"grad_norm": 0.003171220887452364,
"learning_rate": 1.2485681557846507e-06,
"loss": 0.0002,
"step": 8610
},
{
"epoch": 44.004123711340206,
"grad_norm": 0.00555449491366744,
"learning_rate": 1.2371134020618557e-06,
"loss": 0.2303,
"step": 8620
},
{
"epoch": 44.00515463917526,
"grad_norm": 0.00758061558008194,
"learning_rate": 1.2256586483390608e-06,
"loss": 0.0019,
"step": 8630
},
{
"epoch": 44.00618556701031,
"grad_norm": 35.78764724731445,
"learning_rate": 1.214203894616266e-06,
"loss": 0.2258,
"step": 8640
},
{
"epoch": 44.00721649484536,
"grad_norm": 0.002628314308822155,
"learning_rate": 1.202749140893471e-06,
"loss": 0.1071,
"step": 8650
},
{
"epoch": 44.00824742268041,
"grad_norm": 0.0065179308876395226,
"learning_rate": 1.1912943871706759e-06,
"loss": 0.0002,
"step": 8660
},
{
"epoch": 44.009278350515466,
"grad_norm": 0.00975864939391613,
"learning_rate": 1.179839633447881e-06,
"loss": 0.0002,
"step": 8670
},
{
"epoch": 44.01030927835052,
"grad_norm": 0.008498983457684517,
"learning_rate": 1.168384879725086e-06,
"loss": 0.0009,
"step": 8680
},
{
"epoch": 44.011340206185565,
"grad_norm": 0.0022127425763756037,
"learning_rate": 1.1569301260022911e-06,
"loss": 0.0182,
"step": 8690
},
{
"epoch": 44.01237113402062,
"grad_norm": 0.015520663000643253,
"learning_rate": 1.145475372279496e-06,
"loss": 0.1718,
"step": 8700
},
{
"epoch": 44.01340206185567,
"grad_norm": 0.00714871846139431,
"learning_rate": 1.134020618556701e-06,
"loss": 0.0001,
"step": 8710
},
{
"epoch": 44.014432989690725,
"grad_norm": 98.3997573852539,
"learning_rate": 1.1225658648339062e-06,
"loss": 0.3241,
"step": 8720
},
{
"epoch": 44.01546391752577,
"grad_norm": 0.004079570062458515,
"learning_rate": 1.111111111111111e-06,
"loss": 0.0002,
"step": 8730
},
{
"epoch": 44.016494845360825,
"grad_norm": 0.002131837885826826,
"learning_rate": 1.0996563573883161e-06,
"loss": 0.0002,
"step": 8740
},
{
"epoch": 44.01752577319588,
"grad_norm": 0.027215346693992615,
"learning_rate": 1.0882016036655212e-06,
"loss": 0.388,
"step": 8750
},
{
"epoch": 44.01855670103093,
"grad_norm": 0.025429489091038704,
"learning_rate": 1.0767468499427263e-06,
"loss": 0.1207,
"step": 8760
},
{
"epoch": 44.01958762886598,
"grad_norm": 0.5874564051628113,
"learning_rate": 1.0652920962199314e-06,
"loss": 0.0002,
"step": 8770
},
{
"epoch": 44.02010309278351,
"eval_accuracy": 0.7111111111111111,
"eval_loss": 2.156606435775757,
"eval_runtime": 13.2361,
"eval_samples_per_second": 3.4,
"eval_steps_per_second": 0.907,
"step": 8775
},
{
"epoch": 45.00051546391752,
"grad_norm": 0.01159296091645956,
"learning_rate": 1.0538373424971365e-06,
"loss": 0.0002,
"step": 8780
},
{
"epoch": 45.001546391752576,
"grad_norm": 0.002699700416997075,
"learning_rate": 1.0423825887743415e-06,
"loss": 0.1022,
"step": 8790
},
{
"epoch": 45.00257731958763,
"grad_norm": 0.002690671943128109,
"learning_rate": 1.0309278350515464e-06,
"loss": 0.0003,
"step": 8800
},
{
"epoch": 45.00360824742268,
"grad_norm": 0.011441020295023918,
"learning_rate": 1.0194730813287515e-06,
"loss": 0.0027,
"step": 8810
},
{
"epoch": 45.00463917525773,
"grad_norm": 0.3645267188549042,
"learning_rate": 1.0080183276059566e-06,
"loss": 0.0002,
"step": 8820
},
{
"epoch": 45.00567010309278,
"grad_norm": 0.014001118950545788,
"learning_rate": 9.965635738831617e-07,
"loss": 0.0002,
"step": 8830
},
{
"epoch": 45.006701030927836,
"grad_norm": 0.0017673630500212312,
"learning_rate": 9.851088201603665e-07,
"loss": 0.0006,
"step": 8840
},
{
"epoch": 45.00773195876289,
"grad_norm": 44.564510345458984,
"learning_rate": 9.736540664375716e-07,
"loss": 0.4445,
"step": 8850
},
{
"epoch": 45.008762886597935,
"grad_norm": 0.09444551169872284,
"learning_rate": 9.621993127147767e-07,
"loss": 0.0002,
"step": 8860
},
{
"epoch": 45.00979381443299,
"grad_norm": 0.001571907545439899,
"learning_rate": 9.507445589919817e-07,
"loss": 0.2272,
"step": 8870
},
{
"epoch": 45.01082474226804,
"grad_norm": 117.25080108642578,
"learning_rate": 9.392898052691868e-07,
"loss": 0.0253,
"step": 8880
},
{
"epoch": 45.011855670103095,
"grad_norm": 0.02719367854297161,
"learning_rate": 9.278350515463919e-07,
"loss": 0.2072,
"step": 8890
},
{
"epoch": 45.01288659793814,
"grad_norm": 0.011369436047971249,
"learning_rate": 9.163802978235968e-07,
"loss": 0.0002,
"step": 8900
},
{
"epoch": 45.013917525773195,
"grad_norm": 0.009269513189792633,
"learning_rate": 9.049255441008019e-07,
"loss": 0.0184,
"step": 8910
},
{
"epoch": 45.01494845360825,
"grad_norm": 0.003293866291642189,
"learning_rate": 8.93470790378007e-07,
"loss": 0.0084,
"step": 8920
},
{
"epoch": 45.0159793814433,
"grad_norm": 0.012929639779031277,
"learning_rate": 8.820160366552121e-07,
"loss": 0.0005,
"step": 8930
},
{
"epoch": 45.01701030927835,
"grad_norm": 0.012119736522436142,
"learning_rate": 8.70561282932417e-07,
"loss": 0.2072,
"step": 8940
},
{
"epoch": 45.0180412371134,
"grad_norm": 0.1019548624753952,
"learning_rate": 8.59106529209622e-07,
"loss": 0.0002,
"step": 8950
},
{
"epoch": 45.019072164948454,
"grad_norm": 0.005744592752307653,
"learning_rate": 8.476517754868271e-07,
"loss": 0.1167,
"step": 8960
},
{
"epoch": 45.02010309278351,
"grad_norm": 0.08731329441070557,
"learning_rate": 8.361970217640321e-07,
"loss": 0.0924,
"step": 8970
},
{
"epoch": 45.02010309278351,
"eval_accuracy": 0.6666666666666666,
"eval_loss": 2.4635937213897705,
"eval_runtime": 12.8852,
"eval_samples_per_second": 3.492,
"eval_steps_per_second": 0.931,
"step": 8970
},
{
"epoch": 46.00103092783505,
"grad_norm": 0.006049647461622953,
"learning_rate": 8.247422680412372e-07,
"loss": 0.3291,
"step": 8980
},
{
"epoch": 46.00206185567011,
"grad_norm": 206.76785278320312,
"learning_rate": 8.132875143184423e-07,
"loss": 0.1158,
"step": 8990
},
{
"epoch": 46.00309278350515,
"grad_norm": 0.002335500903427601,
"learning_rate": 8.018327605956474e-07,
"loss": 0.0002,
"step": 9000
},
{
"epoch": 46.004123711340206,
"grad_norm": 0.02907603606581688,
"learning_rate": 7.903780068728522e-07,
"loss": 0.0002,
"step": 9010
},
{
"epoch": 46.00515463917526,
"grad_norm": 0.004002581350505352,
"learning_rate": 7.789232531500573e-07,
"loss": 0.3073,
"step": 9020
},
{
"epoch": 46.00618556701031,
"grad_norm": 0.005749888252466917,
"learning_rate": 7.674684994272624e-07,
"loss": 0.3535,
"step": 9030
},
{
"epoch": 46.00721649484536,
"grad_norm": 0.008823657408356667,
"learning_rate": 7.560137457044674e-07,
"loss": 0.0001,
"step": 9040
},
{
"epoch": 46.00824742268041,
"grad_norm": 0.005698314867913723,
"learning_rate": 7.445589919816725e-07,
"loss": 0.0002,
"step": 9050
},
{
"epoch": 46.009278350515466,
"grad_norm": 161.3025665283203,
"learning_rate": 7.331042382588776e-07,
"loss": 0.1556,
"step": 9060
},
{
"epoch": 46.01030927835052,
"grad_norm": 0.010940884239971638,
"learning_rate": 7.216494845360824e-07,
"loss": 0.0938,
"step": 9070
},
{
"epoch": 46.011340206185565,
"grad_norm": 0.004464665427803993,
"learning_rate": 7.101947308132875e-07,
"loss": 0.0002,
"step": 9080
},
{
"epoch": 46.01237113402062,
"grad_norm": 0.0026364498771727085,
"learning_rate": 6.987399770904926e-07,
"loss": 0.0002,
"step": 9090
},
{
"epoch": 46.01340206185567,
"grad_norm": 0.009777194820344448,
"learning_rate": 6.872852233676977e-07,
"loss": 0.0002,
"step": 9100
},
{
"epoch": 46.014432989690725,
"grad_norm": 0.008579901419579983,
"learning_rate": 6.758304696449027e-07,
"loss": 0.0001,
"step": 9110
},
{
"epoch": 46.01546391752577,
"grad_norm": 0.6798976063728333,
"learning_rate": 6.643757159221077e-07,
"loss": 0.015,
"step": 9120
},
{
"epoch": 46.016494845360825,
"grad_norm": 0.032176949083805084,
"learning_rate": 6.529209621993128e-07,
"loss": 0.0002,
"step": 9130
},
{
"epoch": 46.01752577319588,
"grad_norm": 0.010497340932488441,
"learning_rate": 6.414662084765177e-07,
"loss": 0.0002,
"step": 9140
},
{
"epoch": 46.01855670103093,
"grad_norm": 0.022227320820093155,
"learning_rate": 6.300114547537228e-07,
"loss": 0.0006,
"step": 9150
},
{
"epoch": 46.01958762886598,
"grad_norm": 0.02829979918897152,
"learning_rate": 6.185567010309279e-07,
"loss": 0.0004,
"step": 9160
},
{
"epoch": 46.02010309278351,
"eval_accuracy": 0.7333333333333333,
"eval_loss": 2.1075925827026367,
"eval_runtime": 12.8105,
"eval_samples_per_second": 3.513,
"eval_steps_per_second": 0.937,
"step": 9165
},
{
"epoch": 47.00051546391752,
"grad_norm": 0.004454383160918951,
"learning_rate": 6.07101947308133e-07,
"loss": 0.0002,
"step": 9170
},
{
"epoch": 47.001546391752576,
"grad_norm": 0.007718723267316818,
"learning_rate": 5.956471935853379e-07,
"loss": 0.0004,
"step": 9180
},
{
"epoch": 47.00257731958763,
"grad_norm": 0.0052038333378732204,
"learning_rate": 5.84192439862543e-07,
"loss": 0.1885,
"step": 9190
},
{
"epoch": 47.00360824742268,
"grad_norm": 150.0712127685547,
"learning_rate": 5.72737686139748e-07,
"loss": 0.0726,
"step": 9200
},
{
"epoch": 47.00463917525773,
"grad_norm": 0.002846953459084034,
"learning_rate": 5.612829324169531e-07,
"loss": 0.0002,
"step": 9210
},
{
"epoch": 47.00567010309278,
"grad_norm": 0.0032731653191149235,
"learning_rate": 5.498281786941581e-07,
"loss": 0.0014,
"step": 9220
},
{
"epoch": 47.006701030927836,
"grad_norm": 0.04309961199760437,
"learning_rate": 5.383734249713631e-07,
"loss": 0.0002,
"step": 9230
},
{
"epoch": 47.00773195876289,
"grad_norm": 0.01399976946413517,
"learning_rate": 5.269186712485682e-07,
"loss": 0.0002,
"step": 9240
},
{
"epoch": 47.008762886597935,
"grad_norm": 0.007444604765623808,
"learning_rate": 5.154639175257732e-07,
"loss": 0.0002,
"step": 9250
},
{
"epoch": 47.00979381443299,
"grad_norm": 0.0034925006330013275,
"learning_rate": 5.040091638029783e-07,
"loss": 0.04,
"step": 9260
},
{
"epoch": 47.01082474226804,
"grad_norm": 0.0037354633677750826,
"learning_rate": 4.925544100801833e-07,
"loss": 0.0002,
"step": 9270
},
{
"epoch": 47.011855670103095,
"grad_norm": 0.0180855430662632,
"learning_rate": 4.810996563573884e-07,
"loss": 0.0004,
"step": 9280
},
{
"epoch": 47.01288659793814,
"grad_norm": 0.007617347873747349,
"learning_rate": 4.696449026345934e-07,
"loss": 0.183,
"step": 9290
},
{
"epoch": 47.013917525773195,
"grad_norm": 0.011123509146273136,
"learning_rate": 4.581901489117984e-07,
"loss": 0.0002,
"step": 9300
},
{
"epoch": 47.01494845360825,
"grad_norm": 0.019915536046028137,
"learning_rate": 4.467353951890035e-07,
"loss": 0.0002,
"step": 9310
},
{
"epoch": 47.0159793814433,
"grad_norm": 0.003200843231752515,
"learning_rate": 4.352806414662085e-07,
"loss": 0.0004,
"step": 9320
},
{
"epoch": 47.01701030927835,
"grad_norm": 0.018746715039014816,
"learning_rate": 4.2382588774341357e-07,
"loss": 0.1612,
"step": 9330
},
{
"epoch": 47.0180412371134,
"grad_norm": 0.004534402396529913,
"learning_rate": 4.123711340206186e-07,
"loss": 0.1429,
"step": 9340
},
{
"epoch": 47.019072164948454,
"grad_norm": 0.014469140209257603,
"learning_rate": 4.009163802978237e-07,
"loss": 0.0001,
"step": 9350
},
{
"epoch": 47.02010309278351,
"grad_norm": 0.0009225418325513601,
"learning_rate": 3.8946162657502866e-07,
"loss": 0.0645,
"step": 9360
},
{
"epoch": 47.02010309278351,
"eval_accuracy": 0.7111111111111111,
"eval_loss": 2.1502678394317627,
"eval_runtime": 12.792,
"eval_samples_per_second": 3.518,
"eval_steps_per_second": 0.938,
"step": 9360
},
{
"epoch": 48.00103092783505,
"grad_norm": 0.008684216067194939,
"learning_rate": 3.780068728522337e-07,
"loss": 0.0002,
"step": 9370
},
{
"epoch": 48.00206185567011,
"grad_norm": 0.002957036718726158,
"learning_rate": 3.665521191294388e-07,
"loss": 0.0059,
"step": 9380
},
{
"epoch": 48.00309278350515,
"grad_norm": 0.0021803213749080896,
"learning_rate": 3.5509736540664376e-07,
"loss": 0.0002,
"step": 9390
},
{
"epoch": 48.004123711340206,
"grad_norm": 0.00486848596483469,
"learning_rate": 3.4364261168384884e-07,
"loss": 0.0001,
"step": 9400
},
{
"epoch": 48.00515463917526,
"grad_norm": 0.0031498922035098076,
"learning_rate": 3.3218785796105387e-07,
"loss": 0.1926,
"step": 9410
},
{
"epoch": 48.00618556701031,
"grad_norm": 0.004292685072869062,
"learning_rate": 3.2073310423825885e-07,
"loss": 0.0026,
"step": 9420
},
{
"epoch": 48.00721649484536,
"grad_norm": 0.003267282620072365,
"learning_rate": 3.0927835051546394e-07,
"loss": 0.0006,
"step": 9430
},
{
"epoch": 48.00824742268041,
"grad_norm": 0.022121211513876915,
"learning_rate": 2.9782359679266897e-07,
"loss": 0.0001,
"step": 9440
},
{
"epoch": 48.009278350515466,
"grad_norm": 375.67889404296875,
"learning_rate": 2.86368843069874e-07,
"loss": 0.0801,
"step": 9450
},
{
"epoch": 48.01030927835052,
"grad_norm": 23.7979736328125,
"learning_rate": 2.7491408934707903e-07,
"loss": 0.0009,
"step": 9460
},
{
"epoch": 48.011340206185565,
"grad_norm": 0.0010315030813217163,
"learning_rate": 2.634593356242841e-07,
"loss": 0.0671,
"step": 9470
},
{
"epoch": 48.01237113402062,
"grad_norm": 0.01132703386247158,
"learning_rate": 2.5200458190148915e-07,
"loss": 0.0056,
"step": 9480
},
{
"epoch": 48.01340206185567,
"grad_norm": 0.0034062955528497696,
"learning_rate": 2.405498281786942e-07,
"loss": 0.0002,
"step": 9490
},
{
"epoch": 48.014432989690725,
"grad_norm": 0.23974573612213135,
"learning_rate": 2.290950744558992e-07,
"loss": 0.0002,
"step": 9500
},
{
"epoch": 48.01546391752577,
"grad_norm": 0.006117957644164562,
"learning_rate": 2.1764032073310424e-07,
"loss": 0.3933,
"step": 9510
},
{
"epoch": 48.016494845360825,
"grad_norm": 0.23375919461250305,
"learning_rate": 2.061855670103093e-07,
"loss": 0.0328,
"step": 9520
},
{
"epoch": 48.01752577319588,
"grad_norm": 0.003966511692851782,
"learning_rate": 1.9473081328751433e-07,
"loss": 0.0002,
"step": 9530
},
{
"epoch": 48.01855670103093,
"grad_norm": 0.015851015225052834,
"learning_rate": 1.832760595647194e-07,
"loss": 0.0002,
"step": 9540
},
{
"epoch": 48.01958762886598,
"grad_norm": 0.07893943041563034,
"learning_rate": 1.7182130584192442e-07,
"loss": 0.1121,
"step": 9550
},
{
"epoch": 48.02010309278351,
"eval_accuracy": 0.7111111111111111,
"eval_loss": 2.2610628604888916,
"eval_runtime": 12.1107,
"eval_samples_per_second": 3.716,
"eval_steps_per_second": 0.991,
"step": 9555
},
{
"epoch": 49.00051546391752,
"grad_norm": 85.26998901367188,
"learning_rate": 1.6036655211912943e-07,
"loss": 0.1761,
"step": 9560
},
{
"epoch": 49.001546391752576,
"grad_norm": 0.005270041059702635,
"learning_rate": 1.4891179839633448e-07,
"loss": 0.1176,
"step": 9570
},
{
"epoch": 49.00257731958763,
"grad_norm": 0.004510932136327028,
"learning_rate": 1.3745704467353952e-07,
"loss": 0.0033,
"step": 9580
},
{
"epoch": 49.00360824742268,
"grad_norm": 0.005600025877356529,
"learning_rate": 1.2600229095074457e-07,
"loss": 0.2291,
"step": 9590
},
{
"epoch": 49.00463917525773,
"grad_norm": 0.03189009428024292,
"learning_rate": 1.145475372279496e-07,
"loss": 0.0002,
"step": 9600
},
{
"epoch": 49.00567010309278,
"grad_norm": 0.06988941878080368,
"learning_rate": 1.0309278350515465e-07,
"loss": 0.0002,
"step": 9610
},
{
"epoch": 49.006701030927836,
"grad_norm": 0.0760478675365448,
"learning_rate": 9.16380297823597e-08,
"loss": 0.0005,
"step": 9620
},
{
"epoch": 49.00773195876289,
"grad_norm": 113.97074890136719,
"learning_rate": 8.018327605956471e-08,
"loss": 0.1573,
"step": 9630
},
{
"epoch": 49.008762886597935,
"grad_norm": 0.008412045426666737,
"learning_rate": 6.872852233676976e-08,
"loss": 0.2028,
"step": 9640
},
{
"epoch": 49.00979381443299,
"grad_norm": 0.014376115053892136,
"learning_rate": 5.72737686139748e-08,
"loss": 0.0003,
"step": 9650
},
{
"epoch": 49.01082474226804,
"grad_norm": 0.02352799102663994,
"learning_rate": 4.581901489117985e-08,
"loss": 0.1283,
"step": 9660
},
{
"epoch": 49.011855670103095,
"grad_norm": 0.0021438777912408113,
"learning_rate": 3.436426116838488e-08,
"loss": 0.1895,
"step": 9670
},
{
"epoch": 49.01288659793814,
"grad_norm": 44.23455810546875,
"learning_rate": 2.2909507445589924e-08,
"loss": 0.1931,
"step": 9680
},
{
"epoch": 49.013917525773195,
"grad_norm": 0.00817310530692339,
"learning_rate": 1.1454753722794962e-08,
"loss": 0.0002,
"step": 9690
},
{
"epoch": 49.01494845360825,
"grad_norm": 0.005044811405241489,
"learning_rate": 0.0,
"loss": 0.1268,
"step": 9700
},
{
"epoch": 49.01494845360825,
"eval_accuracy": 0.7111111111111111,
"eval_loss": 2.172813653945923,
"eval_runtime": 13.8199,
"eval_samples_per_second": 3.256,
"eval_steps_per_second": 0.868,
"step": 9700
},
{
"epoch": 49.01494845360825,
"step": 9700,
"total_flos": 1.6972634016516597e+20,
"train_loss": 0.2964441666583838,
"train_runtime": 22323.5867,
"train_samples_per_second": 1.738,
"train_steps_per_second": 0.435
},
{
"epoch": 49.01494845360825,
"eval_accuracy": 0.8666666666666667,
"eval_loss": 0.36465370655059814,
"eval_runtime": 12.1222,
"eval_samples_per_second": 3.712,
"eval_steps_per_second": 0.99,
"step": 9700
},
{
"epoch": 49.01494845360825,
"eval_accuracy": 0.8666666666666667,
"eval_loss": 0.36465373635292053,
"eval_runtime": 12.3017,
"eval_samples_per_second": 3.658,
"eval_steps_per_second": 0.975,
"step": 9700
}
],
"logging_steps": 10,
"max_steps": 9700,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.6972634016516597e+20,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}