{ "best_metric": 0.9046849757673667, "best_model_checkpoint": "videomae-base-finetuned-kinetics-allkisa-crop-background-0312-clip_duration-abnormal12_resize/checkpoint-7400", "epoch": 34.01, "eval_steps": 500, "global_step": 10360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.3783783783783784e-05, "grad_norm": 4.763236045837402, "learning_rate": 5e-05, "loss": 0.6425, "step": 1 }, { "epoch": 6.756756756756757e-05, "grad_norm": 4.194863319396973, "learning_rate": 5e-05, "loss": 0.5603, "step": 2 }, { "epoch": 0.00010135135135135135, "grad_norm": 5.783139705657959, "learning_rate": 5e-05, "loss": 0.7197, "step": 3 }, { "epoch": 0.00013513513513513514, "grad_norm": 5.4392409324646, "learning_rate": 5e-05, "loss": 0.6476, "step": 4 }, { "epoch": 0.00016891891891891893, "grad_norm": 9.009134292602539, "learning_rate": 5e-05, "loss": 1.0875, "step": 5 }, { "epoch": 0.0002027027027027027, "grad_norm": 3.11857008934021, "learning_rate": 5e-05, "loss": 0.3892, "step": 6 }, { "epoch": 0.00023648648648648648, "grad_norm": 4.530641078948975, "learning_rate": 5e-05, "loss": 0.6057, "step": 7 }, { "epoch": 0.0002702702702702703, "grad_norm": 4.267138957977295, "learning_rate": 5e-05, "loss": 0.5187, "step": 8 }, { "epoch": 0.00030405405405405404, "grad_norm": 3.6243436336517334, "learning_rate": 5e-05, "loss": 0.4131, "step": 9 }, { "epoch": 0.00033783783783783786, "grad_norm": 4.500889301300049, "learning_rate": 5e-05, "loss": 0.4265, "step": 10 }, { "epoch": 0.0003716216216216216, "grad_norm": 4.295251846313477, "learning_rate": 5e-05, "loss": 0.4357, "step": 11 }, { "epoch": 0.0004054054054054054, "grad_norm": 3.916989326477051, "learning_rate": 5e-05, "loss": 0.3063, "step": 12 }, { "epoch": 0.0004391891891891892, "grad_norm": 5.446530818939209, "learning_rate": 5e-05, "loss": 0.5793, "step": 13 }, { "epoch": 0.00047297297297297297, "grad_norm": 4.745922565460205, "learning_rate": 5e-05, "loss": 0.3873, "step": 14 }, { "epoch": 0.0005067567567567568, "grad_norm": 4.204296588897705, "learning_rate": 5e-05, "loss": 0.2179, "step": 15 }, { "epoch": 0.0005405405405405405, "grad_norm": 5.245059013366699, "learning_rate": 5e-05, "loss": 0.3441, "step": 16 }, { "epoch": 0.0005743243243243243, "grad_norm": 7.508062362670898, "learning_rate": 5e-05, "loss": 0.488, "step": 17 }, { "epoch": 0.0006081081081081081, "grad_norm": 4.002020359039307, "learning_rate": 5e-05, "loss": 0.1819, "step": 18 }, { "epoch": 0.0006418918918918918, "grad_norm": 6.757015705108643, "learning_rate": 5e-05, "loss": 0.2382, "step": 19 }, { "epoch": 0.0006756756756756757, "grad_norm": 9.879010200500488, "learning_rate": 5e-05, "loss": 0.3372, "step": 20 }, { "epoch": 0.0007094594594594595, "grad_norm": 9.476975440979004, "learning_rate": 5e-05, "loss": 0.2715, "step": 21 }, { "epoch": 0.0007432432432432432, "grad_norm": 13.50439167022705, "learning_rate": 5e-05, "loss": 0.5017, "step": 22 }, { "epoch": 0.000777027027027027, "grad_norm": 12.207003593444824, "learning_rate": 5e-05, "loss": 0.6072, "step": 23 }, { "epoch": 0.0008108108108108108, "grad_norm": 6.401312351226807, "learning_rate": 5e-05, "loss": 0.1774, "step": 24 }, { "epoch": 0.0008445945945945946, "grad_norm": 10.416728973388672, "learning_rate": 5e-05, "loss": 0.6115, "step": 25 }, { "epoch": 0.0008783783783783784, "grad_norm": 7.397419452667236, "learning_rate": 5e-05, "loss": 0.236, "step": 26 }, { "epoch": 0.0009121621621621622, "grad_norm": 5.543112754821777, "learning_rate": 5e-05, "loss": 0.1509, "step": 27 }, { "epoch": 0.0009459459459459459, "grad_norm": 7.257582187652588, "learning_rate": 5e-05, "loss": 0.2529, "step": 28 }, { "epoch": 0.0009797297297297297, "grad_norm": 10.196565628051758, "learning_rate": 5e-05, "loss": 0.3017, "step": 29 }, { "epoch": 0.0010135135135135136, "grad_norm": 15.370634078979492, "learning_rate": 5e-05, "loss": 0.6027, "step": 30 }, { "epoch": 0.0010472972972972972, "grad_norm": 8.478128433227539, "learning_rate": 5e-05, "loss": 0.2596, "step": 31 }, { "epoch": 0.001081081081081081, "grad_norm": 3.798360586166382, "learning_rate": 5e-05, "loss": 0.1002, "step": 32 }, { "epoch": 0.001114864864864865, "grad_norm": 12.28720760345459, "learning_rate": 5e-05, "loss": 0.3595, "step": 33 }, { "epoch": 0.0011486486486486486, "grad_norm": 9.372284889221191, "learning_rate": 5e-05, "loss": 0.2527, "step": 34 }, { "epoch": 0.0011824324324324325, "grad_norm": 17.158458709716797, "learning_rate": 5e-05, "loss": 0.5238, "step": 35 }, { "epoch": 0.0012162162162162162, "grad_norm": 6.00411319732666, "learning_rate": 5e-05, "loss": 0.1457, "step": 36 }, { "epoch": 0.00125, "grad_norm": 8.184800148010254, "learning_rate": 5e-05, "loss": 0.3217, "step": 37 }, { "epoch": 0.0012837837837837837, "grad_norm": 10.248456954956055, "learning_rate": 5e-05, "loss": 0.6788, "step": 38 }, { "epoch": 0.0013175675675675676, "grad_norm": 3.016446352005005, "learning_rate": 5e-05, "loss": 0.0746, "step": 39 }, { "epoch": 0.0013513513513513514, "grad_norm": 6.633119106292725, "learning_rate": 5e-05, "loss": 0.4746, "step": 40 }, { "epoch": 0.001385135135135135, "grad_norm": 6.5008673667907715, "learning_rate": 5e-05, "loss": 0.1766, "step": 41 }, { "epoch": 0.001418918918918919, "grad_norm": 8.202096939086914, "learning_rate": 5e-05, "loss": 0.2134, "step": 42 }, { "epoch": 0.0014527027027027026, "grad_norm": 3.1673755645751953, "learning_rate": 5e-05, "loss": 0.1102, "step": 43 }, { "epoch": 0.0014864864864864865, "grad_norm": 2.490569591522217, "learning_rate": 5e-05, "loss": 0.0782, "step": 44 }, { "epoch": 0.0015202702702702704, "grad_norm": 3.0965981483459473, "learning_rate": 5e-05, "loss": 0.0896, "step": 45 }, { "epoch": 0.001554054054054054, "grad_norm": 10.9614896774292, "learning_rate": 5e-05, "loss": 0.5846, "step": 46 }, { "epoch": 0.0015878378378378379, "grad_norm": 7.325829982757568, "learning_rate": 5e-05, "loss": 0.2046, "step": 47 }, { "epoch": 0.0016216216216216215, "grad_norm": 5.967557430267334, "learning_rate": 5e-05, "loss": 0.2328, "step": 48 }, { "epoch": 0.0016554054054054054, "grad_norm": 6.542816638946533, "learning_rate": 5e-05, "loss": 0.2183, "step": 49 }, { "epoch": 0.0016891891891891893, "grad_norm": 0.9632725715637207, "learning_rate": 5e-05, "loss": 0.0269, "step": 50 }, { "epoch": 0.001722972972972973, "grad_norm": 9.050546646118164, "learning_rate": 5e-05, "loss": 0.4285, "step": 51 }, { "epoch": 0.0017567567567567568, "grad_norm": 10.815817832946777, "learning_rate": 5e-05, "loss": 0.3479, "step": 52 }, { "epoch": 0.0017905405405405405, "grad_norm": 0.9851428866386414, "learning_rate": 5e-05, "loss": 0.0222, "step": 53 }, { "epoch": 0.0018243243243243243, "grad_norm": 6.754751205444336, "learning_rate": 5e-05, "loss": 0.2897, "step": 54 }, { "epoch": 0.0018581081081081082, "grad_norm": 13.51555061340332, "learning_rate": 5e-05, "loss": 0.3749, "step": 55 }, { "epoch": 0.0018918918918918919, "grad_norm": 14.491055488586426, "learning_rate": 5e-05, "loss": 0.4434, "step": 56 }, { "epoch": 0.0019256756756756757, "grad_norm": 6.928836345672607, "learning_rate": 5e-05, "loss": 0.1073, "step": 57 }, { "epoch": 0.0019594594594594594, "grad_norm": 9.567647933959961, "learning_rate": 5e-05, "loss": 0.6727, "step": 58 }, { "epoch": 0.0019932432432432433, "grad_norm": 10.009958267211914, "learning_rate": 5e-05, "loss": 0.1941, "step": 59 }, { "epoch": 0.002027027027027027, "grad_norm": 19.726070404052734, "learning_rate": 5e-05, "loss": 1.0051, "step": 60 }, { "epoch": 0.002060810810810811, "grad_norm": 3.1145119667053223, "learning_rate": 5e-05, "loss": 0.0669, "step": 61 }, { "epoch": 0.0020945945945945945, "grad_norm": 0.6208921074867249, "learning_rate": 5e-05, "loss": 0.0165, "step": 62 }, { "epoch": 0.0021283783783783783, "grad_norm": 9.88986873626709, "learning_rate": 5e-05, "loss": 0.2113, "step": 63 }, { "epoch": 0.002162162162162162, "grad_norm": 12.574295997619629, "learning_rate": 5e-05, "loss": 0.4255, "step": 64 }, { "epoch": 0.002195945945945946, "grad_norm": 9.298095703125, "learning_rate": 5e-05, "loss": 0.1449, "step": 65 }, { "epoch": 0.00222972972972973, "grad_norm": 9.003008842468262, "learning_rate": 5e-05, "loss": 0.3114, "step": 66 }, { "epoch": 0.0022635135135135134, "grad_norm": 13.028115272521973, "learning_rate": 5e-05, "loss": 0.4932, "step": 67 }, { "epoch": 0.0022972972972972973, "grad_norm": 15.751253128051758, "learning_rate": 5e-05, "loss": 0.7906, "step": 68 }, { "epoch": 0.002331081081081081, "grad_norm": 12.320690155029297, "learning_rate": 5e-05, "loss": 0.2073, "step": 69 }, { "epoch": 0.002364864864864865, "grad_norm": 0.6392372846603394, "learning_rate": 5e-05, "loss": 0.015, "step": 70 }, { "epoch": 0.0023986486486486484, "grad_norm": 2.259880304336548, "learning_rate": 5e-05, "loss": 0.0596, "step": 71 }, { "epoch": 0.0024324324324324323, "grad_norm": 6.386998176574707, "learning_rate": 5e-05, "loss": 0.129, "step": 72 }, { "epoch": 0.002466216216216216, "grad_norm": 9.23928165435791, "learning_rate": 5e-05, "loss": 0.4356, "step": 73 }, { "epoch": 0.0025, "grad_norm": 1.4919358491897583, "learning_rate": 5e-05, "loss": 0.0273, "step": 74 }, { "epoch": 0.002533783783783784, "grad_norm": 13.650653839111328, "learning_rate": 5e-05, "loss": 0.5401, "step": 75 }, { "epoch": 0.0025675675675675674, "grad_norm": 19.601682662963867, "learning_rate": 5e-05, "loss": 0.673, "step": 76 }, { "epoch": 0.0026013513513513512, "grad_norm": 2.496951103210449, "learning_rate": 5e-05, "loss": 0.0786, "step": 77 }, { "epoch": 0.002635135135135135, "grad_norm": 4.387405872344971, "learning_rate": 5e-05, "loss": 0.0961, "step": 78 }, { "epoch": 0.002668918918918919, "grad_norm": 9.528008460998535, "learning_rate": 5e-05, "loss": 0.3396, "step": 79 }, { "epoch": 0.002702702702702703, "grad_norm": 5.167794704437256, "learning_rate": 5e-05, "loss": 0.1125, "step": 80 }, { "epoch": 0.0027364864864864863, "grad_norm": 10.104130744934082, "learning_rate": 5e-05, "loss": 0.2652, "step": 81 }, { "epoch": 0.00277027027027027, "grad_norm": 4.212308883666992, "learning_rate": 5e-05, "loss": 0.0994, "step": 82 }, { "epoch": 0.002804054054054054, "grad_norm": 7.248816013336182, "learning_rate": 5e-05, "loss": 0.1899, "step": 83 }, { "epoch": 0.002837837837837838, "grad_norm": 12.298823356628418, "learning_rate": 5e-05, "loss": 0.1978, "step": 84 }, { "epoch": 0.0028716216216216218, "grad_norm": 10.017976760864258, "learning_rate": 5e-05, "loss": 0.4099, "step": 85 }, { "epoch": 0.0029054054054054052, "grad_norm": 5.063788890838623, "learning_rate": 5e-05, "loss": 0.1438, "step": 86 }, { "epoch": 0.002939189189189189, "grad_norm": 9.208039283752441, "learning_rate": 5e-05, "loss": 0.1848, "step": 87 }, { "epoch": 0.002972972972972973, "grad_norm": 8.877870559692383, "learning_rate": 5e-05, "loss": 0.3702, "step": 88 }, { "epoch": 0.003006756756756757, "grad_norm": 9.124505043029785, "learning_rate": 5e-05, "loss": 0.1816, "step": 89 }, { "epoch": 0.0030405405405405407, "grad_norm": 15.875794410705566, "learning_rate": 5e-05, "loss": 0.5291, "step": 90 }, { "epoch": 0.003074324324324324, "grad_norm": 11.007501602172852, "learning_rate": 5e-05, "loss": 0.2495, "step": 91 }, { "epoch": 0.003108108108108108, "grad_norm": 0.9116981029510498, "learning_rate": 5e-05, "loss": 0.025, "step": 92 }, { "epoch": 0.003141891891891892, "grad_norm": 14.658970832824707, "learning_rate": 5e-05, "loss": 0.5394, "step": 93 }, { "epoch": 0.0031756756756756758, "grad_norm": 7.185973644256592, "learning_rate": 5e-05, "loss": 0.0985, "step": 94 }, { "epoch": 0.0032094594594594596, "grad_norm": 5.652172088623047, "learning_rate": 5e-05, "loss": 0.2548, "step": 95 }, { "epoch": 0.003243243243243243, "grad_norm": 14.831002235412598, "learning_rate": 5e-05, "loss": 1.3982, "step": 96 }, { "epoch": 0.003277027027027027, "grad_norm": 1.8606945276260376, "learning_rate": 5e-05, "loss": 0.0475, "step": 97 }, { "epoch": 0.003310810810810811, "grad_norm": 8.411602973937988, "learning_rate": 5e-05, "loss": 0.3721, "step": 98 }, { "epoch": 0.0033445945945945947, "grad_norm": 0.4436762034893036, "learning_rate": 5e-05, "loss": 0.0125, "step": 99 }, { "epoch": 0.0033783783783783786, "grad_norm": 0.2939775586128235, "learning_rate": 5e-05, "loss": 0.0101, "step": 100 }, { "epoch": 0.003412162162162162, "grad_norm": 13.165569305419922, "learning_rate": 5e-05, "loss": 0.4946, "step": 101 }, { "epoch": 0.003445945945945946, "grad_norm": 9.640936851501465, "learning_rate": 5e-05, "loss": 0.1826, "step": 102 }, { "epoch": 0.0034797297297297297, "grad_norm": 3.12628173828125, "learning_rate": 5e-05, "loss": 0.0335, "step": 103 }, { "epoch": 0.0035135135135135136, "grad_norm": 12.363953590393066, "learning_rate": 5e-05, "loss": 0.2901, "step": 104 }, { "epoch": 0.0035472972972972975, "grad_norm": 1.5966756343841553, "learning_rate": 5e-05, "loss": 0.0428, "step": 105 }, { "epoch": 0.003581081081081081, "grad_norm": 12.981180191040039, "learning_rate": 5e-05, "loss": 0.3118, "step": 106 }, { "epoch": 0.003614864864864865, "grad_norm": 10.5849027633667, "learning_rate": 5e-05, "loss": 0.3011, "step": 107 }, { "epoch": 0.0036486486486486487, "grad_norm": 8.855254173278809, "learning_rate": 5e-05, "loss": 0.12, "step": 108 }, { "epoch": 0.0036824324324324326, "grad_norm": 8.160152435302734, "learning_rate": 5e-05, "loss": 0.208, "step": 109 }, { "epoch": 0.0037162162162162164, "grad_norm": 9.278417587280273, "learning_rate": 5e-05, "loss": 0.2201, "step": 110 }, { "epoch": 0.00375, "grad_norm": 4.260694980621338, "learning_rate": 5e-05, "loss": 0.0472, "step": 111 }, { "epoch": 0.0037837837837837837, "grad_norm": 9.956145286560059, "learning_rate": 5e-05, "loss": 0.0871, "step": 112 }, { "epoch": 0.0038175675675675676, "grad_norm": 11.623459815979004, "learning_rate": 5e-05, "loss": 0.5752, "step": 113 }, { "epoch": 0.0038513513513513515, "grad_norm": 10.76927375793457, "learning_rate": 5e-05, "loss": 0.3879, "step": 114 }, { "epoch": 0.0038851351351351354, "grad_norm": 1.2409955263137817, "learning_rate": 5e-05, "loss": 0.0262, "step": 115 }, { "epoch": 0.003918918918918919, "grad_norm": 12.803996086120605, "learning_rate": 5e-05, "loss": 0.1608, "step": 116 }, { "epoch": 0.003952702702702703, "grad_norm": 19.291501998901367, "learning_rate": 5e-05, "loss": 0.2488, "step": 117 }, { "epoch": 0.0039864864864864865, "grad_norm": 32.60211181640625, "learning_rate": 5e-05, "loss": 0.7544, "step": 118 }, { "epoch": 0.00402027027027027, "grad_norm": 5.430222988128662, "learning_rate": 5e-05, "loss": 0.1316, "step": 119 }, { "epoch": 0.004054054054054054, "grad_norm": 0.06419495493173599, "learning_rate": 5e-05, "loss": 0.0026, "step": 120 }, { "epoch": 0.004087837837837838, "grad_norm": 1.3549314737319946, "learning_rate": 5e-05, "loss": 0.0201, "step": 121 }, { "epoch": 0.004121621621621622, "grad_norm": 3.716251850128174, "learning_rate": 5e-05, "loss": 0.0605, "step": 122 }, { "epoch": 0.0041554054054054055, "grad_norm": 6.568942546844482, "learning_rate": 5e-05, "loss": 0.7528, "step": 123 }, { "epoch": 0.004189189189189189, "grad_norm": 0.9323207139968872, "learning_rate": 5e-05, "loss": 0.0286, "step": 124 }, { "epoch": 0.004222972972972973, "grad_norm": 16.43741798400879, "learning_rate": 5e-05, "loss": 0.5986, "step": 125 }, { "epoch": 0.004256756756756757, "grad_norm": 16.898113250732422, "learning_rate": 5e-05, "loss": 0.7168, "step": 126 }, { "epoch": 0.004290540540540541, "grad_norm": 5.594399452209473, "learning_rate": 5e-05, "loss": 0.1296, "step": 127 }, { "epoch": 0.004324324324324324, "grad_norm": 11.1876859664917, "learning_rate": 5e-05, "loss": 0.5451, "step": 128 }, { "epoch": 0.004358108108108108, "grad_norm": 0.9194720387458801, "learning_rate": 5e-05, "loss": 0.0287, "step": 129 }, { "epoch": 0.004391891891891892, "grad_norm": 3.178480863571167, "learning_rate": 5e-05, "loss": 0.0461, "step": 130 }, { "epoch": 0.004425675675675676, "grad_norm": 19.704635620117188, "learning_rate": 5e-05, "loss": 0.9871, "step": 131 }, { "epoch": 0.00445945945945946, "grad_norm": 11.074614524841309, "learning_rate": 5e-05, "loss": 0.1098, "step": 132 }, { "epoch": 0.004493243243243243, "grad_norm": 31.713953018188477, "learning_rate": 5e-05, "loss": 1.6494, "step": 133 }, { "epoch": 0.004527027027027027, "grad_norm": 11.841605186462402, "learning_rate": 5e-05, "loss": 0.9526, "step": 134 }, { "epoch": 0.004560810810810811, "grad_norm": 21.800161361694336, "learning_rate": 5e-05, "loss": 1.8909, "step": 135 }, { "epoch": 0.0045945945945945945, "grad_norm": 3.281144618988037, "learning_rate": 5e-05, "loss": 0.0636, "step": 136 }, { "epoch": 0.004628378378378379, "grad_norm": 17.102052688598633, "learning_rate": 5e-05, "loss": 1.4067, "step": 137 }, { "epoch": 0.004662162162162162, "grad_norm": 2.564472198486328, "learning_rate": 5e-05, "loss": 0.0535, "step": 138 }, { "epoch": 0.004695945945945946, "grad_norm": 12.661782264709473, "learning_rate": 5e-05, "loss": 0.4088, "step": 139 }, { "epoch": 0.00472972972972973, "grad_norm": 1.6475439071655273, "learning_rate": 5e-05, "loss": 0.0469, "step": 140 }, { "epoch": 0.004763513513513513, "grad_norm": 7.36920166015625, "learning_rate": 5e-05, "loss": 0.3305, "step": 141 }, { "epoch": 0.004797297297297297, "grad_norm": 14.303715705871582, "learning_rate": 5e-05, "loss": 0.7321, "step": 142 }, { "epoch": 0.004831081081081081, "grad_norm": 5.139068603515625, "learning_rate": 5e-05, "loss": 0.2276, "step": 143 }, { "epoch": 0.004864864864864865, "grad_norm": 6.128992080688477, "learning_rate": 5e-05, "loss": 0.2267, "step": 144 }, { "epoch": 0.004898648648648649, "grad_norm": 4.673349857330322, "learning_rate": 5e-05, "loss": 0.1292, "step": 145 }, { "epoch": 0.004932432432432432, "grad_norm": 4.553489685058594, "learning_rate": 5e-05, "loss": 0.1234, "step": 146 }, { "epoch": 0.004966216216216216, "grad_norm": 1.0003315210342407, "learning_rate": 5e-05, "loss": 0.0317, "step": 147 }, { "epoch": 0.005, "grad_norm": 4.401022911071777, "learning_rate": 5e-05, "loss": 0.0845, "step": 148 }, { "epoch": 0.0050337837837837835, "grad_norm": 1.5413405895233154, "learning_rate": 5e-05, "loss": 0.0361, "step": 149 }, { "epoch": 0.005067567567567568, "grad_norm": 7.482183933258057, "learning_rate": 5e-05, "loss": 0.121, "step": 150 }, { "epoch": 0.005101351351351351, "grad_norm": 1.3016899824142456, "learning_rate": 5e-05, "loss": 0.0366, "step": 151 }, { "epoch": 0.005135135135135135, "grad_norm": 1.8677703142166138, "learning_rate": 5e-05, "loss": 0.0264, "step": 152 }, { "epoch": 0.005168918918918919, "grad_norm": 7.681979179382324, "learning_rate": 5e-05, "loss": 0.1999, "step": 153 }, { "epoch": 0.0052027027027027025, "grad_norm": 8.36997127532959, "learning_rate": 5e-05, "loss": 0.1923, "step": 154 }, { "epoch": 0.005236486486486487, "grad_norm": 5.257721900939941, "learning_rate": 5e-05, "loss": 0.0671, "step": 155 }, { "epoch": 0.00527027027027027, "grad_norm": 13.50872802734375, "learning_rate": 5e-05, "loss": 0.6002, "step": 156 }, { "epoch": 0.005304054054054054, "grad_norm": 4.623517036437988, "learning_rate": 5e-05, "loss": 0.0621, "step": 157 }, { "epoch": 0.005337837837837838, "grad_norm": 5.824158191680908, "learning_rate": 5e-05, "loss": 0.2245, "step": 158 }, { "epoch": 0.005371621621621621, "grad_norm": 17.016101837158203, "learning_rate": 5e-05, "loss": 0.7312, "step": 159 }, { "epoch": 0.005405405405405406, "grad_norm": 12.914795875549316, "learning_rate": 5e-05, "loss": 0.2896, "step": 160 }, { "epoch": 0.005439189189189189, "grad_norm": 7.9256415367126465, "learning_rate": 5e-05, "loss": 0.6795, "step": 161 }, { "epoch": 0.005472972972972973, "grad_norm": 35.83378601074219, "learning_rate": 5e-05, "loss": 0.7137, "step": 162 }, { "epoch": 0.005506756756756757, "grad_norm": 15.954791069030762, "learning_rate": 5e-05, "loss": 1.2856, "step": 163 }, { "epoch": 0.00554054054054054, "grad_norm": 13.875743865966797, "learning_rate": 5e-05, "loss": 0.4009, "step": 164 }, { "epoch": 0.005574324324324325, "grad_norm": 8.01961612701416, "learning_rate": 5e-05, "loss": 0.1228, "step": 165 }, { "epoch": 0.005608108108108108, "grad_norm": 0.7665528655052185, "learning_rate": 5e-05, "loss": 0.0202, "step": 166 }, { "epoch": 0.0056418918918918915, "grad_norm": 14.736701965332031, "learning_rate": 5e-05, "loss": 0.2119, "step": 167 }, { "epoch": 0.005675675675675676, "grad_norm": 15.743979454040527, "learning_rate": 5e-05, "loss": 0.5651, "step": 168 }, { "epoch": 0.005709459459459459, "grad_norm": 1.3885583877563477, "learning_rate": 5e-05, "loss": 0.019, "step": 169 }, { "epoch": 0.0057432432432432436, "grad_norm": 10.659895896911621, "learning_rate": 5e-05, "loss": 0.2607, "step": 170 }, { "epoch": 0.005777027027027027, "grad_norm": 12.450589179992676, "learning_rate": 5e-05, "loss": 0.5743, "step": 171 }, { "epoch": 0.0058108108108108104, "grad_norm": 8.090046882629395, "learning_rate": 5e-05, "loss": 0.1621, "step": 172 }, { "epoch": 0.005844594594594595, "grad_norm": 5.976766109466553, "learning_rate": 5e-05, "loss": 0.587, "step": 173 }, { "epoch": 0.005878378378378378, "grad_norm": 11.011799812316895, "learning_rate": 5e-05, "loss": 0.6869, "step": 174 }, { "epoch": 0.0059121621621621625, "grad_norm": 9.813197135925293, "learning_rate": 5e-05, "loss": 0.3879, "step": 175 }, { "epoch": 0.005945945945945946, "grad_norm": 13.810066223144531, "learning_rate": 5e-05, "loss": 0.6541, "step": 176 }, { "epoch": 0.005979729729729729, "grad_norm": 12.085053443908691, "learning_rate": 5e-05, "loss": 0.7893, "step": 177 }, { "epoch": 0.006013513513513514, "grad_norm": 2.4295542240142822, "learning_rate": 5e-05, "loss": 0.0754, "step": 178 }, { "epoch": 0.006047297297297297, "grad_norm": 6.477272987365723, "learning_rate": 5e-05, "loss": 0.2537, "step": 179 }, { "epoch": 0.006081081081081081, "grad_norm": 12.256583213806152, "learning_rate": 5e-05, "loss": 0.252, "step": 180 }, { "epoch": 0.006114864864864865, "grad_norm": 6.5329694747924805, "learning_rate": 5e-05, "loss": 0.6965, "step": 181 }, { "epoch": 0.006148648648648648, "grad_norm": 4.10219669342041, "learning_rate": 5e-05, "loss": 0.1444, "step": 182 }, { "epoch": 0.006182432432432433, "grad_norm": 5.930855751037598, "learning_rate": 5e-05, "loss": 0.3203, "step": 183 }, { "epoch": 0.006216216216216216, "grad_norm": 11.282504081726074, "learning_rate": 5e-05, "loss": 0.4255, "step": 184 }, { "epoch": 0.00625, "grad_norm": 2.4597086906433105, "learning_rate": 5e-05, "loss": 0.1103, "step": 185 }, { "epoch": 0.006283783783783784, "grad_norm": 4.22900915145874, "learning_rate": 5e-05, "loss": 0.1251, "step": 186 }, { "epoch": 0.006317567567567567, "grad_norm": 9.874602317810059, "learning_rate": 5e-05, "loss": 0.6236, "step": 187 }, { "epoch": 0.0063513513513513515, "grad_norm": 8.954041481018066, "learning_rate": 5e-05, "loss": 0.154, "step": 188 }, { "epoch": 0.006385135135135135, "grad_norm": 4.380626678466797, "learning_rate": 5e-05, "loss": 0.1791, "step": 189 }, { "epoch": 0.006418918918918919, "grad_norm": 6.422996997833252, "learning_rate": 5e-05, "loss": 0.4046, "step": 190 }, { "epoch": 0.006452702702702703, "grad_norm": 3.185004472732544, "learning_rate": 5e-05, "loss": 0.0933, "step": 191 }, { "epoch": 0.006486486486486486, "grad_norm": 3.9040873050689697, "learning_rate": 5e-05, "loss": 0.0681, "step": 192 }, { "epoch": 0.0065202702702702705, "grad_norm": 3.1061630249023438, "learning_rate": 5e-05, "loss": 0.1552, "step": 193 }, { "epoch": 0.006554054054054054, "grad_norm": 3.9487271308898926, "learning_rate": 5e-05, "loss": 0.1146, "step": 194 }, { "epoch": 0.006587837837837838, "grad_norm": 3.5376527309417725, "learning_rate": 5e-05, "loss": 0.1161, "step": 195 }, { "epoch": 0.006621621621621622, "grad_norm": 0.5218279957771301, "learning_rate": 5e-05, "loss": 0.019, "step": 196 }, { "epoch": 0.006655405405405405, "grad_norm": 10.97942066192627, "learning_rate": 5e-05, "loss": 0.5318, "step": 197 }, { "epoch": 0.006689189189189189, "grad_norm": 5.198629856109619, "learning_rate": 5e-05, "loss": 0.4316, "step": 198 }, { "epoch": 0.006722972972972973, "grad_norm": 7.750682353973389, "learning_rate": 5e-05, "loss": 0.2402, "step": 199 }, { "epoch": 0.006756756756756757, "grad_norm": 4.16708517074585, "learning_rate": 5e-05, "loss": 0.0763, "step": 200 }, { "epoch": 0.006790540540540541, "grad_norm": 7.139863014221191, "learning_rate": 5e-05, "loss": 0.2838, "step": 201 }, { "epoch": 0.006824324324324324, "grad_norm": 6.08656644821167, "learning_rate": 5e-05, "loss": 0.0925, "step": 202 }, { "epoch": 0.006858108108108108, "grad_norm": 0.9385524988174438, "learning_rate": 5e-05, "loss": 0.0329, "step": 203 }, { "epoch": 0.006891891891891892, "grad_norm": 11.962742805480957, "learning_rate": 5e-05, "loss": 0.2909, "step": 204 }, { "epoch": 0.006925675675675676, "grad_norm": 0.7744629383087158, "learning_rate": 5e-05, "loss": 0.0109, "step": 205 }, { "epoch": 0.0069594594594594595, "grad_norm": 8.929247856140137, "learning_rate": 5e-05, "loss": 0.3792, "step": 206 }, { "epoch": 0.006993243243243243, "grad_norm": 19.4526309967041, "learning_rate": 5e-05, "loss": 0.2584, "step": 207 }, { "epoch": 0.007027027027027027, "grad_norm": 9.467694282531738, "learning_rate": 5e-05, "loss": 0.2229, "step": 208 }, { "epoch": 0.007060810810810811, "grad_norm": 6.788573265075684, "learning_rate": 5e-05, "loss": 0.0904, "step": 209 }, { "epoch": 0.007094594594594595, "grad_norm": 6.753182888031006, "learning_rate": 5e-05, "loss": 0.152, "step": 210 }, { "epoch": 0.007128378378378378, "grad_norm": 8.888411521911621, "learning_rate": 5e-05, "loss": 0.3759, "step": 211 }, { "epoch": 0.007162162162162162, "grad_norm": 0.2932502031326294, "learning_rate": 5e-05, "loss": 0.0092, "step": 212 }, { "epoch": 0.007195945945945946, "grad_norm": 17.78807830810547, "learning_rate": 5e-05, "loss": 0.1795, "step": 213 }, { "epoch": 0.00722972972972973, "grad_norm": 13.436080932617188, "learning_rate": 5e-05, "loss": 0.1147, "step": 214 }, { "epoch": 0.007263513513513514, "grad_norm": 7.880235195159912, "learning_rate": 5e-05, "loss": 0.1579, "step": 215 }, { "epoch": 0.007297297297297297, "grad_norm": 17.115325927734375, "learning_rate": 5e-05, "loss": 0.4126, "step": 216 }, { "epoch": 0.007331081081081081, "grad_norm": 3.3122177124023438, "learning_rate": 5e-05, "loss": 0.0891, "step": 217 }, { "epoch": 0.007364864864864865, "grad_norm": 0.5805332660675049, "learning_rate": 5e-05, "loss": 0.0115, "step": 218 }, { "epoch": 0.0073986486486486485, "grad_norm": 4.6908860206604, "learning_rate": 5e-05, "loss": 0.0926, "step": 219 }, { "epoch": 0.007432432432432433, "grad_norm": 13.796879768371582, "learning_rate": 5e-05, "loss": 0.1754, "step": 220 }, { "epoch": 0.007466216216216216, "grad_norm": 0.1696520298719406, "learning_rate": 5e-05, "loss": 0.0063, "step": 221 }, { "epoch": 0.0075, "grad_norm": 13.797794342041016, "learning_rate": 5e-05, "loss": 0.3261, "step": 222 }, { "epoch": 0.007533783783783784, "grad_norm": 28.988758087158203, "learning_rate": 5e-05, "loss": 0.9203, "step": 223 }, { "epoch": 0.0075675675675675675, "grad_norm": 16.19241714477539, "learning_rate": 5e-05, "loss": 0.4733, "step": 224 }, { "epoch": 0.007601351351351352, "grad_norm": 5.228217601776123, "learning_rate": 5e-05, "loss": 0.3341, "step": 225 }, { "epoch": 0.007635135135135135, "grad_norm": 0.2570604979991913, "learning_rate": 5e-05, "loss": 0.0057, "step": 226 }, { "epoch": 0.007668918918918919, "grad_norm": 0.48140496015548706, "learning_rate": 5e-05, "loss": 0.0133, "step": 227 }, { "epoch": 0.007702702702702703, "grad_norm": 0.3710658848285675, "learning_rate": 5e-05, "loss": 0.0086, "step": 228 }, { "epoch": 0.007736486486486486, "grad_norm": 12.625741004943848, "learning_rate": 5e-05, "loss": 0.1514, "step": 229 }, { "epoch": 0.007770270270270271, "grad_norm": 10.521201133728027, "learning_rate": 5e-05, "loss": 0.2229, "step": 230 }, { "epoch": 0.007804054054054054, "grad_norm": 8.65296745300293, "learning_rate": 5e-05, "loss": 0.073, "step": 231 }, { "epoch": 0.007837837837837838, "grad_norm": 2.189868688583374, "learning_rate": 5e-05, "loss": 0.0216, "step": 232 }, { "epoch": 0.007871621621621621, "grad_norm": 22.979652404785156, "learning_rate": 5e-05, "loss": 1.9054, "step": 233 }, { "epoch": 0.007905405405405406, "grad_norm": 0.6220009922981262, "learning_rate": 5e-05, "loss": 0.0083, "step": 234 }, { "epoch": 0.00793918918918919, "grad_norm": 6.66795539855957, "learning_rate": 5e-05, "loss": 0.2577, "step": 235 }, { "epoch": 0.007972972972972973, "grad_norm": 4.719351291656494, "learning_rate": 5e-05, "loss": 0.0456, "step": 236 }, { "epoch": 0.008006756756756757, "grad_norm": 0.23176579177379608, "learning_rate": 5e-05, "loss": 0.0033, "step": 237 }, { "epoch": 0.00804054054054054, "grad_norm": 0.5002822875976562, "learning_rate": 5e-05, "loss": 0.0107, "step": 238 }, { "epoch": 0.008074324324324325, "grad_norm": 9.683141708374023, "learning_rate": 5e-05, "loss": 0.4281, "step": 239 }, { "epoch": 0.008108108108108109, "grad_norm": 14.80207633972168, "learning_rate": 5e-05, "loss": 0.9918, "step": 240 }, { "epoch": 0.008141891891891892, "grad_norm": 12.974120140075684, "learning_rate": 5e-05, "loss": 0.5601, "step": 241 }, { "epoch": 0.008175675675675675, "grad_norm": 19.574323654174805, "learning_rate": 5e-05, "loss": 0.6351, "step": 242 }, { "epoch": 0.008209459459459459, "grad_norm": 6.075523853302002, "learning_rate": 5e-05, "loss": 0.0456, "step": 243 }, { "epoch": 0.008243243243243244, "grad_norm": 3.3280274868011475, "learning_rate": 5e-05, "loss": 0.0291, "step": 244 }, { "epoch": 0.008277027027027027, "grad_norm": 32.493106842041016, "learning_rate": 5e-05, "loss": 0.7927, "step": 245 }, { "epoch": 0.008310810810810811, "grad_norm": 8.484050750732422, "learning_rate": 5e-05, "loss": 0.7258, "step": 246 }, { "epoch": 0.008344594594594594, "grad_norm": 11.673108100891113, "learning_rate": 5e-05, "loss": 0.1784, "step": 247 }, { "epoch": 0.008378378378378378, "grad_norm": 1.7333955764770508, "learning_rate": 5e-05, "loss": 0.0211, "step": 248 }, { "epoch": 0.008412162162162163, "grad_norm": 0.37162017822265625, "learning_rate": 5e-05, "loss": 0.0082, "step": 249 }, { "epoch": 0.008445945945945946, "grad_norm": 3.513526439666748, "learning_rate": 5e-05, "loss": 0.0349, "step": 250 }, { "epoch": 0.00847972972972973, "grad_norm": 11.075325012207031, "learning_rate": 5e-05, "loss": 0.1317, "step": 251 }, { "epoch": 0.008513513513513513, "grad_norm": 9.876625061035156, "learning_rate": 5e-05, "loss": 0.2764, "step": 252 }, { "epoch": 0.008547297297297297, "grad_norm": 17.873689651489258, "learning_rate": 5e-05, "loss": 0.183, "step": 253 }, { "epoch": 0.008581081081081082, "grad_norm": 8.111893653869629, "learning_rate": 5e-05, "loss": 0.1107, "step": 254 }, { "epoch": 0.008614864864864865, "grad_norm": 2.9738593101501465, "learning_rate": 5e-05, "loss": 0.071, "step": 255 }, { "epoch": 0.008648648648648649, "grad_norm": 0.7590795159339905, "learning_rate": 5e-05, "loss": 0.0191, "step": 256 }, { "epoch": 0.008682432432432432, "grad_norm": 4.163105487823486, "learning_rate": 5e-05, "loss": 0.0729, "step": 257 }, { "epoch": 0.008716216216216216, "grad_norm": 5.65493106842041, "learning_rate": 5e-05, "loss": 0.5772, "step": 258 }, { "epoch": 0.00875, "grad_norm": 12.280457496643066, "learning_rate": 5e-05, "loss": 0.7176, "step": 259 }, { "epoch": 0.008783783783783784, "grad_norm": 15.146199226379395, "learning_rate": 5e-05, "loss": 0.4297, "step": 260 }, { "epoch": 0.008817567567567568, "grad_norm": 15.107786178588867, "learning_rate": 5e-05, "loss": 0.2966, "step": 261 }, { "epoch": 0.008851351351351351, "grad_norm": 5.287258625030518, "learning_rate": 5e-05, "loss": 0.0882, "step": 262 }, { "epoch": 0.008885135135135135, "grad_norm": 6.877081394195557, "learning_rate": 5e-05, "loss": 0.7178, "step": 263 }, { "epoch": 0.00891891891891892, "grad_norm": 5.946399211883545, "learning_rate": 5e-05, "loss": 0.1126, "step": 264 }, { "epoch": 0.008952702702702703, "grad_norm": 11.256361961364746, "learning_rate": 5e-05, "loss": 0.6506, "step": 265 }, { "epoch": 0.008986486486486487, "grad_norm": 2.2141542434692383, "learning_rate": 5e-05, "loss": 0.0535, "step": 266 }, { "epoch": 0.00902027027027027, "grad_norm": 4.087910175323486, "learning_rate": 5e-05, "loss": 0.0907, "step": 267 }, { "epoch": 0.009054054054054054, "grad_norm": 1.1392817497253418, "learning_rate": 5e-05, "loss": 0.0444, "step": 268 }, { "epoch": 0.009087837837837839, "grad_norm": 26.30414390563965, "learning_rate": 5e-05, "loss": 0.3962, "step": 269 }, { "epoch": 0.009121621621621622, "grad_norm": 19.0109920501709, "learning_rate": 5e-05, "loss": 0.2111, "step": 270 }, { "epoch": 0.009155405405405406, "grad_norm": 0.7101518511772156, "learning_rate": 5e-05, "loss": 0.0134, "step": 271 }, { "epoch": 0.009189189189189189, "grad_norm": 17.524700164794922, "learning_rate": 5e-05, "loss": 0.8859, "step": 272 }, { "epoch": 0.009222972972972972, "grad_norm": 2.3233284950256348, "learning_rate": 5e-05, "loss": 0.0475, "step": 273 }, { "epoch": 0.009256756756756758, "grad_norm": 5.183705806732178, "learning_rate": 5e-05, "loss": 0.0886, "step": 274 }, { "epoch": 0.009290540540540541, "grad_norm": 1.8726388216018677, "learning_rate": 5e-05, "loss": 0.0363, "step": 275 }, { "epoch": 0.009324324324324324, "grad_norm": 16.649808883666992, "learning_rate": 5e-05, "loss": 0.6962, "step": 276 }, { "epoch": 0.009358108108108108, "grad_norm": 0.8635942339897156, "learning_rate": 5e-05, "loss": 0.0149, "step": 277 }, { "epoch": 0.009391891891891891, "grad_norm": 3.08219051361084, "learning_rate": 5e-05, "loss": 0.0233, "step": 278 }, { "epoch": 0.009425675675675675, "grad_norm": 0.46617695689201355, "learning_rate": 5e-05, "loss": 0.0055, "step": 279 }, { "epoch": 0.00945945945945946, "grad_norm": 15.277531623840332, "learning_rate": 5e-05, "loss": 0.2817, "step": 280 }, { "epoch": 0.009493243243243243, "grad_norm": 3.4520070552825928, "learning_rate": 5e-05, "loss": 0.028, "step": 281 }, { "epoch": 0.009527027027027027, "grad_norm": 0.18085399270057678, "learning_rate": 5e-05, "loss": 0.0053, "step": 282 }, { "epoch": 0.00956081081081081, "grad_norm": 25.212858200073242, "learning_rate": 5e-05, "loss": 1.3199, "step": 283 }, { "epoch": 0.009594594594594594, "grad_norm": 13.989949226379395, "learning_rate": 5e-05, "loss": 0.5384, "step": 284 }, { "epoch": 0.009628378378378379, "grad_norm": 13.841154098510742, "learning_rate": 5e-05, "loss": 0.2348, "step": 285 }, { "epoch": 0.009662162162162162, "grad_norm": 14.450275421142578, "learning_rate": 5e-05, "loss": 0.1298, "step": 286 }, { "epoch": 0.009695945945945946, "grad_norm": 30.845762252807617, "learning_rate": 5e-05, "loss": 0.9195, "step": 287 }, { "epoch": 0.00972972972972973, "grad_norm": 6.86149263381958, "learning_rate": 5e-05, "loss": 0.319, "step": 288 }, { "epoch": 0.009763513513513513, "grad_norm": 3.3457558155059814, "learning_rate": 5e-05, "loss": 0.035, "step": 289 }, { "epoch": 0.009797297297297298, "grad_norm": 8.232183456420898, "learning_rate": 5e-05, "loss": 0.8445, "step": 290 }, { "epoch": 0.009831081081081081, "grad_norm": 6.311878204345703, "learning_rate": 5e-05, "loss": 0.0588, "step": 291 }, { "epoch": 0.009864864864864865, "grad_norm": 12.563526153564453, "learning_rate": 5e-05, "loss": 0.588, "step": 292 }, { "epoch": 0.009898648648648648, "grad_norm": 9.714451789855957, "learning_rate": 5e-05, "loss": 0.0671, "step": 293 }, { "epoch": 0.009932432432432432, "grad_norm": 1.4672858715057373, "learning_rate": 5e-05, "loss": 0.0088, "step": 294 }, { "epoch": 0.009966216216216217, "grad_norm": 20.795063018798828, "learning_rate": 5e-05, "loss": 0.5613, "step": 295 }, { "epoch": 0.01, "grad_norm": 7.719156265258789, "learning_rate": 5e-05, "loss": 0.6616, "step": 296 }, { "epoch": 0.01, "eval_accuracy": 0.8675282714054927, "eval_loss": 0.37857499718666077, "eval_runtime": 31.8107, "eval_samples_per_second": 19.459, "eval_steps_per_second": 2.452, "step": 296 }, { "epoch": 1.0000337837837838, "grad_norm": 18.327472686767578, "learning_rate": 5e-05, "loss": 0.5517, "step": 297 }, { "epoch": 1.0000675675675677, "grad_norm": 6.432889461517334, "learning_rate": 5e-05, "loss": 0.5078, "step": 298 }, { "epoch": 1.0001013513513513, "grad_norm": 1.31559419631958, "learning_rate": 5e-05, "loss": 0.0184, "step": 299 }, { "epoch": 1.000135135135135, "grad_norm": 8.401947975158691, "learning_rate": 5e-05, "loss": 0.149, "step": 300 }, { "epoch": 1.000168918918919, "grad_norm": 7.655717372894287, "learning_rate": 5e-05, "loss": 0.6018, "step": 301 }, { "epoch": 1.0002027027027027, "grad_norm": 19.668670654296875, "learning_rate": 5e-05, "loss": 0.518, "step": 302 }, { "epoch": 1.0002364864864866, "grad_norm": 12.430267333984375, "learning_rate": 5e-05, "loss": 0.1862, "step": 303 }, { "epoch": 1.0002702702702704, "grad_norm": 6.628232002258301, "learning_rate": 5e-05, "loss": 0.1057, "step": 304 }, { "epoch": 1.000304054054054, "grad_norm": 13.767115592956543, "learning_rate": 5e-05, "loss": 0.2966, "step": 305 }, { "epoch": 1.0003378378378378, "grad_norm": 4.167873382568359, "learning_rate": 5e-05, "loss": 0.0571, "step": 306 }, { "epoch": 1.0003716216216216, "grad_norm": 17.503559112548828, "learning_rate": 5e-05, "loss": 0.2403, "step": 307 }, { "epoch": 1.0004054054054055, "grad_norm": 11.585195541381836, "learning_rate": 5e-05, "loss": 0.2981, "step": 308 }, { "epoch": 1.0004391891891893, "grad_norm": 13.36983871459961, "learning_rate": 5e-05, "loss": 0.3139, "step": 309 }, { "epoch": 1.0004729729729729, "grad_norm": 3.457028388977051, "learning_rate": 5e-05, "loss": 0.073, "step": 310 }, { "epoch": 1.0005067567567567, "grad_norm": 13.70763874053955, "learning_rate": 5e-05, "loss": 0.6268, "step": 311 }, { "epoch": 1.0005405405405405, "grad_norm": 5.714291095733643, "learning_rate": 5e-05, "loss": 0.2851, "step": 312 }, { "epoch": 1.0005743243243244, "grad_norm": 0.2859991490840912, "learning_rate": 5e-05, "loss": 0.0105, "step": 313 }, { "epoch": 1.0006081081081082, "grad_norm": 8.91828441619873, "learning_rate": 5e-05, "loss": 0.2616, "step": 314 }, { "epoch": 1.0006418918918918, "grad_norm": 17.790393829345703, "learning_rate": 5e-05, "loss": 0.4998, "step": 315 }, { "epoch": 1.0006756756756756, "grad_norm": 7.6049485206604, "learning_rate": 5e-05, "loss": 0.3582, "step": 316 }, { "epoch": 1.0007094594594594, "grad_norm": 7.554125785827637, "learning_rate": 5e-05, "loss": 0.0961, "step": 317 }, { "epoch": 1.0007432432432433, "grad_norm": 8.535198211669922, "learning_rate": 5e-05, "loss": 0.3952, "step": 318 }, { "epoch": 1.000777027027027, "grad_norm": 3.648113250732422, "learning_rate": 5e-05, "loss": 0.0732, "step": 319 }, { "epoch": 1.000810810810811, "grad_norm": 0.4640820026397705, "learning_rate": 5e-05, "loss": 0.0178, "step": 320 }, { "epoch": 1.0008445945945945, "grad_norm": 16.946779251098633, "learning_rate": 5e-05, "loss": 0.7526, "step": 321 }, { "epoch": 1.0008783783783783, "grad_norm": 13.98729419708252, "learning_rate": 5e-05, "loss": 0.2732, "step": 322 }, { "epoch": 1.0009121621621622, "grad_norm": 14.336233139038086, "learning_rate": 5e-05, "loss": 0.5331, "step": 323 }, { "epoch": 1.000945945945946, "grad_norm": 7.27912712097168, "learning_rate": 5e-05, "loss": 0.4787, "step": 324 }, { "epoch": 1.0009797297297298, "grad_norm": 2.9718868732452393, "learning_rate": 5e-05, "loss": 0.116, "step": 325 }, { "epoch": 1.0010135135135134, "grad_norm": 9.396027565002441, "learning_rate": 5e-05, "loss": 0.2398, "step": 326 }, { "epoch": 1.0010472972972972, "grad_norm": 8.37507152557373, "learning_rate": 5e-05, "loss": 0.2975, "step": 327 }, { "epoch": 1.001081081081081, "grad_norm": 4.86107063293457, "learning_rate": 5e-05, "loss": 0.1046, "step": 328 }, { "epoch": 1.001114864864865, "grad_norm": 0.7864731550216675, "learning_rate": 5e-05, "loss": 0.0248, "step": 329 }, { "epoch": 1.0011486486486487, "grad_norm": 12.566457748413086, "learning_rate": 5e-05, "loss": 0.8422, "step": 330 }, { "epoch": 1.0011824324324323, "grad_norm": 5.410187244415283, "learning_rate": 5e-05, "loss": 0.0762, "step": 331 }, { "epoch": 1.0012162162162161, "grad_norm": 10.021848678588867, "learning_rate": 5e-05, "loss": 0.2536, "step": 332 }, { "epoch": 1.00125, "grad_norm": 27.86660385131836, "learning_rate": 5e-05, "loss": 0.3057, "step": 333 }, { "epoch": 1.0012837837837838, "grad_norm": 3.302428722381592, "learning_rate": 5e-05, "loss": 0.0627, "step": 334 }, { "epoch": 1.0013175675675676, "grad_norm": 0.8194648623466492, "learning_rate": 5e-05, "loss": 0.0183, "step": 335 }, { "epoch": 1.0013513513513514, "grad_norm": 8.396317481994629, "learning_rate": 5e-05, "loss": 0.3797, "step": 336 }, { "epoch": 1.001385135135135, "grad_norm": 11.712349891662598, "learning_rate": 5e-05, "loss": 0.348, "step": 337 }, { "epoch": 1.0014189189189189, "grad_norm": 17.441410064697266, "learning_rate": 5e-05, "loss": 0.3385, "step": 338 }, { "epoch": 1.0014527027027027, "grad_norm": 0.10936759412288666, "learning_rate": 5e-05, "loss": 0.004, "step": 339 }, { "epoch": 1.0014864864864865, "grad_norm": 1.2607685327529907, "learning_rate": 5e-05, "loss": 0.0339, "step": 340 }, { "epoch": 1.0015202702702704, "grad_norm": 21.867067337036133, "learning_rate": 5e-05, "loss": 0.6567, "step": 341 }, { "epoch": 1.001554054054054, "grad_norm": 14.54991340637207, "learning_rate": 5e-05, "loss": 0.5191, "step": 342 }, { "epoch": 1.0015878378378378, "grad_norm": 3.979036331176758, "learning_rate": 5e-05, "loss": 0.0292, "step": 343 }, { "epoch": 1.0016216216216216, "grad_norm": 8.805299758911133, "learning_rate": 5e-05, "loss": 0.9629, "step": 344 }, { "epoch": 1.0016554054054054, "grad_norm": 15.057844161987305, "learning_rate": 5e-05, "loss": 0.4803, "step": 345 }, { "epoch": 1.0016891891891893, "grad_norm": 8.259688377380371, "learning_rate": 5e-05, "loss": 0.248, "step": 346 }, { "epoch": 1.001722972972973, "grad_norm": 1.3958911895751953, "learning_rate": 5e-05, "loss": 0.0269, "step": 347 }, { "epoch": 1.0017567567567567, "grad_norm": 7.4806976318359375, "learning_rate": 5e-05, "loss": 0.0683, "step": 348 }, { "epoch": 1.0017905405405405, "grad_norm": 3.0627031326293945, "learning_rate": 5e-05, "loss": 0.0507, "step": 349 }, { "epoch": 1.0018243243243243, "grad_norm": 2.3993871212005615, "learning_rate": 5e-05, "loss": 0.0291, "step": 350 }, { "epoch": 1.0018581081081082, "grad_norm": 24.035802841186523, "learning_rate": 5e-05, "loss": 0.2314, "step": 351 }, { "epoch": 1.001891891891892, "grad_norm": 7.744791030883789, "learning_rate": 5e-05, "loss": 0.5247, "step": 352 }, { "epoch": 1.0019256756756756, "grad_norm": 3.788958787918091, "learning_rate": 5e-05, "loss": 0.0593, "step": 353 }, { "epoch": 1.0019594594594594, "grad_norm": 13.826367378234863, "learning_rate": 5e-05, "loss": 0.6746, "step": 354 }, { "epoch": 1.0019932432432432, "grad_norm": 4.496058464050293, "learning_rate": 5e-05, "loss": 0.0782, "step": 355 }, { "epoch": 1.002027027027027, "grad_norm": 4.760463237762451, "learning_rate": 5e-05, "loss": 0.0604, "step": 356 }, { "epoch": 1.0020608108108109, "grad_norm": 12.097336769104004, "learning_rate": 5e-05, "loss": 0.188, "step": 357 }, { "epoch": 1.0020945945945945, "grad_norm": 12.065207481384277, "learning_rate": 5e-05, "loss": 0.3648, "step": 358 }, { "epoch": 1.0021283783783783, "grad_norm": 12.444439888000488, "learning_rate": 5e-05, "loss": 0.4816, "step": 359 }, { "epoch": 1.0021621621621621, "grad_norm": 13.131026268005371, "learning_rate": 5e-05, "loss": 0.5732, "step": 360 }, { "epoch": 1.002195945945946, "grad_norm": 5.755440711975098, "learning_rate": 5e-05, "loss": 0.3998, "step": 361 }, { "epoch": 1.0022297297297298, "grad_norm": 9.06641674041748, "learning_rate": 5e-05, "loss": 0.1442, "step": 362 }, { "epoch": 1.0022635135135136, "grad_norm": 8.99311637878418, "learning_rate": 5e-05, "loss": 0.2008, "step": 363 }, { "epoch": 1.0022972972972972, "grad_norm": 18.004989624023438, "learning_rate": 5e-05, "loss": 0.2628, "step": 364 }, { "epoch": 1.002331081081081, "grad_norm": 1.3715308904647827, "learning_rate": 5e-05, "loss": 0.0313, "step": 365 }, { "epoch": 1.0023648648648649, "grad_norm": 2.1455271244049072, "learning_rate": 5e-05, "loss": 0.0443, "step": 366 }, { "epoch": 1.0023986486486487, "grad_norm": 17.028606414794922, "learning_rate": 5e-05, "loss": 0.3664, "step": 367 }, { "epoch": 1.0024324324324325, "grad_norm": 3.441626787185669, "learning_rate": 5e-05, "loss": 0.0674, "step": 368 }, { "epoch": 1.0024662162162161, "grad_norm": 0.302746444940567, "learning_rate": 5e-05, "loss": 0.0069, "step": 369 }, { "epoch": 1.0025, "grad_norm": 11.221211433410645, "learning_rate": 5e-05, "loss": 0.5526, "step": 370 }, { "epoch": 1.0025337837837838, "grad_norm": 23.603960037231445, "learning_rate": 5e-05, "loss": 1.0146, "step": 371 }, { "epoch": 1.0025675675675676, "grad_norm": 16.4934139251709, "learning_rate": 5e-05, "loss": 0.8502, "step": 372 }, { "epoch": 1.0026013513513514, "grad_norm": 14.612640380859375, "learning_rate": 5e-05, "loss": 0.2941, "step": 373 }, { "epoch": 1.002635135135135, "grad_norm": 8.082893371582031, "learning_rate": 5e-05, "loss": 0.1026, "step": 374 }, { "epoch": 1.0026689189189189, "grad_norm": 12.49261474609375, "learning_rate": 5e-05, "loss": 0.4184, "step": 375 }, { "epoch": 1.0027027027027027, "grad_norm": 16.094497680664062, "learning_rate": 5e-05, "loss": 0.6026, "step": 376 }, { "epoch": 1.0027364864864865, "grad_norm": 11.86057186126709, "learning_rate": 5e-05, "loss": 0.2524, "step": 377 }, { "epoch": 1.0027702702702703, "grad_norm": 0.13020193576812744, "learning_rate": 5e-05, "loss": 0.0037, "step": 378 }, { "epoch": 1.0028040540540542, "grad_norm": 10.565208435058594, "learning_rate": 5e-05, "loss": 0.1257, "step": 379 }, { "epoch": 1.0028378378378378, "grad_norm": 0.5038501620292664, "learning_rate": 5e-05, "loss": 0.0149, "step": 380 }, { "epoch": 1.0028716216216216, "grad_norm": 0.18564702570438385, "learning_rate": 5e-05, "loss": 0.0064, "step": 381 }, { "epoch": 1.0029054054054054, "grad_norm": 16.555551528930664, "learning_rate": 5e-05, "loss": 0.8726, "step": 382 }, { "epoch": 1.0029391891891892, "grad_norm": 0.8833307027816772, "learning_rate": 5e-05, "loss": 0.0203, "step": 383 }, { "epoch": 1.002972972972973, "grad_norm": 16.634878158569336, "learning_rate": 5e-05, "loss": 0.2744, "step": 384 }, { "epoch": 1.0030067567567567, "grad_norm": 2.602725028991699, "learning_rate": 5e-05, "loss": 0.0913, "step": 385 }, { "epoch": 1.0030405405405405, "grad_norm": 3.443964958190918, "learning_rate": 5e-05, "loss": 0.0633, "step": 386 }, { "epoch": 1.0030743243243243, "grad_norm": 21.191104888916016, "learning_rate": 5e-05, "loss": 0.2558, "step": 387 }, { "epoch": 1.0031081081081081, "grad_norm": 14.173974990844727, "learning_rate": 5e-05, "loss": 0.168, "step": 388 }, { "epoch": 1.003141891891892, "grad_norm": 0.7141666412353516, "learning_rate": 5e-05, "loss": 0.0102, "step": 389 }, { "epoch": 1.0031756756756758, "grad_norm": 13.12467098236084, "learning_rate": 5e-05, "loss": 0.7597, "step": 390 }, { "epoch": 1.0032094594594594, "grad_norm": 7.112369060516357, "learning_rate": 5e-05, "loss": 0.5797, "step": 391 }, { "epoch": 1.0032432432432432, "grad_norm": 18.1895809173584, "learning_rate": 5e-05, "loss": 0.3638, "step": 392 }, { "epoch": 1.003277027027027, "grad_norm": 7.483143329620361, "learning_rate": 5e-05, "loss": 0.0669, "step": 393 }, { "epoch": 1.0033108108108109, "grad_norm": 11.431897163391113, "learning_rate": 5e-05, "loss": 0.3808, "step": 394 }, { "epoch": 1.0033445945945947, "grad_norm": 0.27210137248039246, "learning_rate": 5e-05, "loss": 0.0071, "step": 395 }, { "epoch": 1.0033783783783783, "grad_norm": 0.6917670965194702, "learning_rate": 5e-05, "loss": 0.0132, "step": 396 }, { "epoch": 1.0034121621621621, "grad_norm": 7.449429035186768, "learning_rate": 5e-05, "loss": 0.4509, "step": 397 }, { "epoch": 1.003445945945946, "grad_norm": 6.268540382385254, "learning_rate": 5e-05, "loss": 0.1724, "step": 398 }, { "epoch": 1.0034797297297298, "grad_norm": 4.7975873947143555, "learning_rate": 5e-05, "loss": 0.2571, "step": 399 }, { "epoch": 1.0035135135135136, "grad_norm": 1.188530445098877, "learning_rate": 5e-05, "loss": 0.0289, "step": 400 }, { "epoch": 1.0035472972972972, "grad_norm": 8.60204792022705, "learning_rate": 5e-05, "loss": 0.3898, "step": 401 }, { "epoch": 1.003581081081081, "grad_norm": 8.075508117675781, "learning_rate": 5e-05, "loss": 0.1536, "step": 402 }, { "epoch": 1.0036148648648648, "grad_norm": 22.42634391784668, "learning_rate": 5e-05, "loss": 0.5792, "step": 403 }, { "epoch": 1.0036486486486487, "grad_norm": 5.058320045471191, "learning_rate": 5e-05, "loss": 0.0916, "step": 404 }, { "epoch": 1.0036824324324325, "grad_norm": 1.860862374305725, "learning_rate": 5e-05, "loss": 0.019, "step": 405 }, { "epoch": 1.0037162162162163, "grad_norm": 2.2431678771972656, "learning_rate": 5e-05, "loss": 0.0926, "step": 406 }, { "epoch": 1.00375, "grad_norm": 0.6028563380241394, "learning_rate": 5e-05, "loss": 0.0136, "step": 407 }, { "epoch": 1.0037837837837837, "grad_norm": 0.33664995431900024, "learning_rate": 5e-05, "loss": 0.0053, "step": 408 }, { "epoch": 1.0038175675675676, "grad_norm": 7.633973598480225, "learning_rate": 5e-05, "loss": 0.1036, "step": 409 }, { "epoch": 1.0038513513513514, "grad_norm": 5.844037055969238, "learning_rate": 5e-05, "loss": 0.6224, "step": 410 }, { "epoch": 1.0038851351351352, "grad_norm": 2.9611566066741943, "learning_rate": 5e-05, "loss": 0.0488, "step": 411 }, { "epoch": 1.0039189189189188, "grad_norm": 6.786110877990723, "learning_rate": 5e-05, "loss": 0.2527, "step": 412 }, { "epoch": 1.0039527027027026, "grad_norm": 27.718809127807617, "learning_rate": 5e-05, "loss": 0.9159, "step": 413 }, { "epoch": 1.0039864864864865, "grad_norm": 13.792681694030762, "learning_rate": 5e-05, "loss": 0.3695, "step": 414 }, { "epoch": 1.0040202702702703, "grad_norm": 0.5800839066505432, "learning_rate": 5e-05, "loss": 0.0201, "step": 415 }, { "epoch": 1.0040540540540541, "grad_norm": 1.3893951177597046, "learning_rate": 5e-05, "loss": 0.014, "step": 416 }, { "epoch": 1.004087837837838, "grad_norm": 2.76973032951355, "learning_rate": 5e-05, "loss": 0.0528, "step": 417 }, { "epoch": 1.0041216216216216, "grad_norm": 3.1937482357025146, "learning_rate": 5e-05, "loss": 0.035, "step": 418 }, { "epoch": 1.0041554054054054, "grad_norm": 2.23164963722229, "learning_rate": 5e-05, "loss": 0.044, "step": 419 }, { "epoch": 1.0041891891891892, "grad_norm": 6.379853248596191, "learning_rate": 5e-05, "loss": 0.0865, "step": 420 }, { "epoch": 1.004222972972973, "grad_norm": 0.1908046305179596, "learning_rate": 5e-05, "loss": 0.0047, "step": 421 }, { "epoch": 1.0042567567567569, "grad_norm": 17.389925003051758, "learning_rate": 5e-05, "loss": 1.0536, "step": 422 }, { "epoch": 1.0042905405405405, "grad_norm": 1.8542147874832153, "learning_rate": 5e-05, "loss": 0.116, "step": 423 }, { "epoch": 1.0043243243243243, "grad_norm": 5.493863582611084, "learning_rate": 5e-05, "loss": 0.0564, "step": 424 }, { "epoch": 1.004358108108108, "grad_norm": 1.5889478921890259, "learning_rate": 5e-05, "loss": 0.0811, "step": 425 }, { "epoch": 1.004391891891892, "grad_norm": 2.8227972984313965, "learning_rate": 5e-05, "loss": 0.0258, "step": 426 }, { "epoch": 1.0044256756756758, "grad_norm": 19.6601619720459, "learning_rate": 5e-05, "loss": 0.1394, "step": 427 }, { "epoch": 1.0044594594594594, "grad_norm": 27.281490325927734, "learning_rate": 5e-05, "loss": 0.986, "step": 428 }, { "epoch": 1.0044932432432432, "grad_norm": 0.17607228457927704, "learning_rate": 5e-05, "loss": 0.0031, "step": 429 }, { "epoch": 1.004527027027027, "grad_norm": 0.19340692460536957, "learning_rate": 5e-05, "loss": 0.0056, "step": 430 }, { "epoch": 1.0045608108108108, "grad_norm": 0.8305065631866455, "learning_rate": 5e-05, "loss": 0.0226, "step": 431 }, { "epoch": 1.0045945945945947, "grad_norm": 1.1161766052246094, "learning_rate": 5e-05, "loss": 0.0113, "step": 432 }, { "epoch": 1.0046283783783785, "grad_norm": 13.447271347045898, "learning_rate": 5e-05, "loss": 0.6301, "step": 433 }, { "epoch": 1.004662162162162, "grad_norm": 27.042556762695312, "learning_rate": 5e-05, "loss": 0.2801, "step": 434 }, { "epoch": 1.004695945945946, "grad_norm": 0.12866641581058502, "learning_rate": 5e-05, "loss": 0.0036, "step": 435 }, { "epoch": 1.0047297297297297, "grad_norm": 20.956798553466797, "learning_rate": 5e-05, "loss": 0.3232, "step": 436 }, { "epoch": 1.0047635135135136, "grad_norm": 11.226750373840332, "learning_rate": 5e-05, "loss": 0.5686, "step": 437 }, { "epoch": 1.0047972972972974, "grad_norm": 25.272539138793945, "learning_rate": 5e-05, "loss": 0.211, "step": 438 }, { "epoch": 1.004831081081081, "grad_norm": 5.673468589782715, "learning_rate": 5e-05, "loss": 0.0393, "step": 439 }, { "epoch": 1.0048648648648648, "grad_norm": 23.110286712646484, "learning_rate": 5e-05, "loss": 0.3386, "step": 440 }, { "epoch": 1.0048986486486486, "grad_norm": 18.13290786743164, "learning_rate": 5e-05, "loss": 0.6168, "step": 441 }, { "epoch": 1.0049324324324325, "grad_norm": 0.8007836937904358, "learning_rate": 5e-05, "loss": 0.0085, "step": 442 }, { "epoch": 1.0049662162162163, "grad_norm": 6.662994384765625, "learning_rate": 5e-05, "loss": 0.4568, "step": 443 }, { "epoch": 1.005, "grad_norm": 0.4117790460586548, "learning_rate": 5e-05, "loss": 0.0064, "step": 444 }, { "epoch": 1.0050337837837837, "grad_norm": 1.2620313167572021, "learning_rate": 5e-05, "loss": 0.0264, "step": 445 }, { "epoch": 1.0050675675675675, "grad_norm": 2.3059260845184326, "learning_rate": 5e-05, "loss": 0.0251, "step": 446 }, { "epoch": 1.0051013513513514, "grad_norm": 0.8815981149673462, "learning_rate": 5e-05, "loss": 0.0134, "step": 447 }, { "epoch": 1.0051351351351352, "grad_norm": 0.6048823595046997, "learning_rate": 5e-05, "loss": 0.0084, "step": 448 }, { "epoch": 1.005168918918919, "grad_norm": 15.097657203674316, "learning_rate": 5e-05, "loss": 0.2074, "step": 449 }, { "epoch": 1.0052027027027026, "grad_norm": 19.34199333190918, "learning_rate": 5e-05, "loss": 0.5239, "step": 450 }, { "epoch": 1.0052364864864864, "grad_norm": 12.855676651000977, "learning_rate": 5e-05, "loss": 0.0951, "step": 451 }, { "epoch": 1.0052702702702703, "grad_norm": 24.72435760498047, "learning_rate": 5e-05, "loss": 0.6273, "step": 452 }, { "epoch": 1.005304054054054, "grad_norm": 6.356321811676025, "learning_rate": 5e-05, "loss": 0.0539, "step": 453 }, { "epoch": 1.005337837837838, "grad_norm": 18.436992645263672, "learning_rate": 5e-05, "loss": 0.4558, "step": 454 }, { "epoch": 1.0053716216216215, "grad_norm": 13.5146484375, "learning_rate": 5e-05, "loss": 0.5658, "step": 455 }, { "epoch": 1.0054054054054054, "grad_norm": 19.235816955566406, "learning_rate": 5e-05, "loss": 0.5879, "step": 456 }, { "epoch": 1.0054391891891892, "grad_norm": 0.8986599445343018, "learning_rate": 5e-05, "loss": 0.0163, "step": 457 }, { "epoch": 1.005472972972973, "grad_norm": 11.05648422241211, "learning_rate": 5e-05, "loss": 0.0572, "step": 458 }, { "epoch": 1.0055067567567568, "grad_norm": 11.556026458740234, "learning_rate": 5e-05, "loss": 0.7487, "step": 459 }, { "epoch": 1.0055405405405407, "grad_norm": 17.96647071838379, "learning_rate": 5e-05, "loss": 1.1097, "step": 460 }, { "epoch": 1.0055743243243243, "grad_norm": 0.06688236445188522, "learning_rate": 5e-05, "loss": 0.0019, "step": 461 }, { "epoch": 1.005608108108108, "grad_norm": 6.92596960067749, "learning_rate": 5e-05, "loss": 0.458, "step": 462 }, { "epoch": 1.005641891891892, "grad_norm": 12.730301856994629, "learning_rate": 5e-05, "loss": 0.2058, "step": 463 }, { "epoch": 1.0056756756756757, "grad_norm": 26.110960006713867, "learning_rate": 5e-05, "loss": 1.0414, "step": 464 }, { "epoch": 1.0057094594594596, "grad_norm": 12.9933443069458, "learning_rate": 5e-05, "loss": 0.079, "step": 465 }, { "epoch": 1.0057432432432432, "grad_norm": 16.851139068603516, "learning_rate": 5e-05, "loss": 1.4008, "step": 466 }, { "epoch": 1.005777027027027, "grad_norm": 22.936119079589844, "learning_rate": 5e-05, "loss": 0.8684, "step": 467 }, { "epoch": 1.0058108108108108, "grad_norm": 14.791213989257812, "learning_rate": 5e-05, "loss": 0.2997, "step": 468 }, { "epoch": 1.0058445945945946, "grad_norm": 1.9468564987182617, "learning_rate": 5e-05, "loss": 0.0181, "step": 469 }, { "epoch": 1.0058783783783785, "grad_norm": 0.8771504759788513, "learning_rate": 5e-05, "loss": 0.0136, "step": 470 }, { "epoch": 1.005912162162162, "grad_norm": 10.788262367248535, "learning_rate": 5e-05, "loss": 0.5259, "step": 471 }, { "epoch": 1.0059459459459459, "grad_norm": 8.074678421020508, "learning_rate": 5e-05, "loss": 0.0739, "step": 472 }, { "epoch": 1.0059797297297297, "grad_norm": 2.316026210784912, "learning_rate": 5e-05, "loss": 0.0344, "step": 473 }, { "epoch": 1.0060135135135135, "grad_norm": 6.714468955993652, "learning_rate": 5e-05, "loss": 0.5837, "step": 474 }, { "epoch": 1.0060472972972974, "grad_norm": 5.094871997833252, "learning_rate": 5e-05, "loss": 0.4539, "step": 475 }, { "epoch": 1.0060810810810812, "grad_norm": 0.47784921526908875, "learning_rate": 5e-05, "loss": 0.0081, "step": 476 }, { "epoch": 1.0061148648648648, "grad_norm": 0.23361125588417053, "learning_rate": 5e-05, "loss": 0.0078, "step": 477 }, { "epoch": 1.0061486486486486, "grad_norm": 0.625454843044281, "learning_rate": 5e-05, "loss": 0.0204, "step": 478 }, { "epoch": 1.0061824324324324, "grad_norm": 0.7126203179359436, "learning_rate": 5e-05, "loss": 0.0114, "step": 479 }, { "epoch": 1.0062162162162163, "grad_norm": 18.320707321166992, "learning_rate": 5e-05, "loss": 0.7681, "step": 480 }, { "epoch": 1.00625, "grad_norm": 14.842591285705566, "learning_rate": 5e-05, "loss": 0.4789, "step": 481 }, { "epoch": 1.0062837837837837, "grad_norm": 0.4803825318813324, "learning_rate": 5e-05, "loss": 0.0107, "step": 482 }, { "epoch": 1.0063175675675675, "grad_norm": 18.825176239013672, "learning_rate": 5e-05, "loss": 0.3109, "step": 483 }, { "epoch": 1.0063513513513513, "grad_norm": 0.394735723733902, "learning_rate": 5e-05, "loss": 0.0112, "step": 484 }, { "epoch": 1.0063851351351352, "grad_norm": 2.7166361808776855, "learning_rate": 5e-05, "loss": 0.0893, "step": 485 }, { "epoch": 1.006418918918919, "grad_norm": 0.15689988434314728, "learning_rate": 5e-05, "loss": 0.0045, "step": 486 }, { "epoch": 1.0064527027027026, "grad_norm": 4.807775974273682, "learning_rate": 5e-05, "loss": 0.0411, "step": 487 }, { "epoch": 1.0064864864864864, "grad_norm": 0.2250020056962967, "learning_rate": 5e-05, "loss": 0.0062, "step": 488 }, { "epoch": 1.0065202702702702, "grad_norm": 2.8341028690338135, "learning_rate": 5e-05, "loss": 0.0357, "step": 489 }, { "epoch": 1.006554054054054, "grad_norm": 0.34862834215164185, "learning_rate": 5e-05, "loss": 0.0093, "step": 490 }, { "epoch": 1.006587837837838, "grad_norm": 18.279422760009766, "learning_rate": 5e-05, "loss": 0.5004, "step": 491 }, { "epoch": 1.0066216216216217, "grad_norm": 17.59016227722168, "learning_rate": 5e-05, "loss": 0.9431, "step": 492 }, { "epoch": 1.0066554054054053, "grad_norm": 21.145376205444336, "learning_rate": 5e-05, "loss": 0.1939, "step": 493 }, { "epoch": 1.0066891891891891, "grad_norm": 49.85786437988281, "learning_rate": 5e-05, "loss": 0.373, "step": 494 }, { "epoch": 1.006722972972973, "grad_norm": 0.9427379369735718, "learning_rate": 5e-05, "loss": 0.0125, "step": 495 }, { "epoch": 1.0067567567567568, "grad_norm": 0.40142783522605896, "learning_rate": 5e-05, "loss": 0.0076, "step": 496 }, { "epoch": 1.0067905405405406, "grad_norm": 0.5435702204704285, "learning_rate": 5e-05, "loss": 0.0183, "step": 497 }, { "epoch": 1.0068243243243242, "grad_norm": 0.1016978845000267, "learning_rate": 5e-05, "loss": 0.0033, "step": 498 }, { "epoch": 1.006858108108108, "grad_norm": 2.4379613399505615, "learning_rate": 5e-05, "loss": 0.0484, "step": 499 }, { "epoch": 1.0068918918918919, "grad_norm": 14.560696601867676, "learning_rate": 5e-05, "loss": 0.5999, "step": 500 }, { "epoch": 1.0069256756756757, "grad_norm": 0.6569235920906067, "learning_rate": 5e-05, "loss": 0.0106, "step": 501 }, { "epoch": 1.0069594594594595, "grad_norm": 13.20250129699707, "learning_rate": 5e-05, "loss": 0.7724, "step": 502 }, { "epoch": 1.0069932432432434, "grad_norm": 18.74055290222168, "learning_rate": 5e-05, "loss": 0.2205, "step": 503 }, { "epoch": 1.007027027027027, "grad_norm": 14.699377059936523, "learning_rate": 5e-05, "loss": 0.0985, "step": 504 }, { "epoch": 1.0070608108108108, "grad_norm": 3.7536110877990723, "learning_rate": 5e-05, "loss": 0.0868, "step": 505 }, { "epoch": 1.0070945945945946, "grad_norm": 27.04636001586914, "learning_rate": 5e-05, "loss": 0.4275, "step": 506 }, { "epoch": 1.0071283783783784, "grad_norm": 0.026021039113402367, "learning_rate": 5e-05, "loss": 0.001, "step": 507 }, { "epoch": 1.0071621621621623, "grad_norm": 4.974664211273193, "learning_rate": 5e-05, "loss": 0.8766, "step": 508 }, { "epoch": 1.0071959459459459, "grad_norm": 22.48744010925293, "learning_rate": 5e-05, "loss": 0.5138, "step": 509 }, { "epoch": 1.0072297297297297, "grad_norm": 9.228111267089844, "learning_rate": 5e-05, "loss": 0.0779, "step": 510 }, { "epoch": 1.0072635135135135, "grad_norm": 14.408559799194336, "learning_rate": 5e-05, "loss": 0.7478, "step": 511 }, { "epoch": 1.0072972972972973, "grad_norm": 13.239943504333496, "learning_rate": 5e-05, "loss": 0.627, "step": 512 }, { "epoch": 1.0073310810810812, "grad_norm": 8.991691589355469, "learning_rate": 5e-05, "loss": 0.0806, "step": 513 }, { "epoch": 1.0073648648648648, "grad_norm": 11.21560001373291, "learning_rate": 5e-05, "loss": 0.2033, "step": 514 }, { "epoch": 1.0073986486486486, "grad_norm": 2.4655187129974365, "learning_rate": 5e-05, "loss": 0.0303, "step": 515 }, { "epoch": 1.0074324324324324, "grad_norm": 0.07624492794275284, "learning_rate": 5e-05, "loss": 0.0024, "step": 516 }, { "epoch": 1.0074662162162162, "grad_norm": 0.21909958124160767, "learning_rate": 5e-05, "loss": 0.0033, "step": 517 }, { "epoch": 1.0075, "grad_norm": 17.651935577392578, "learning_rate": 5e-05, "loss": 0.3289, "step": 518 }, { "epoch": 1.0075337837837839, "grad_norm": 12.858100891113281, "learning_rate": 5e-05, "loss": 1.1317, "step": 519 }, { "epoch": 1.0075675675675675, "grad_norm": 22.35584831237793, "learning_rate": 5e-05, "loss": 0.4195, "step": 520 }, { "epoch": 1.0076013513513513, "grad_norm": 35.08623504638672, "learning_rate": 5e-05, "loss": 0.3291, "step": 521 }, { "epoch": 1.0076351351351351, "grad_norm": 16.126707077026367, "learning_rate": 5e-05, "loss": 0.1679, "step": 522 }, { "epoch": 1.007668918918919, "grad_norm": 7.525093078613281, "learning_rate": 5e-05, "loss": 0.1054, "step": 523 }, { "epoch": 1.0077027027027028, "grad_norm": 26.457571029663086, "learning_rate": 5e-05, "loss": 1.3099, "step": 524 }, { "epoch": 1.0077364864864864, "grad_norm": 18.098081588745117, "learning_rate": 5e-05, "loss": 0.1139, "step": 525 }, { "epoch": 1.0077702702702702, "grad_norm": 6.637272834777832, "learning_rate": 5e-05, "loss": 0.0827, "step": 526 }, { "epoch": 1.007804054054054, "grad_norm": 0.5930923819541931, "learning_rate": 5e-05, "loss": 0.0064, "step": 527 }, { "epoch": 1.0078378378378379, "grad_norm": 8.328889846801758, "learning_rate": 5e-05, "loss": 0.3023, "step": 528 }, { "epoch": 1.0078716216216217, "grad_norm": 15.521971702575684, "learning_rate": 5e-05, "loss": 0.2342, "step": 529 }, { "epoch": 1.0079054054054053, "grad_norm": 0.7867887616157532, "learning_rate": 5e-05, "loss": 0.0208, "step": 530 }, { "epoch": 1.0079391891891891, "grad_norm": 2.8354814052581787, "learning_rate": 5e-05, "loss": 0.0383, "step": 531 }, { "epoch": 1.007972972972973, "grad_norm": 4.655562400817871, "learning_rate": 5e-05, "loss": 0.178, "step": 532 }, { "epoch": 1.0080067567567568, "grad_norm": 0.7221949696540833, "learning_rate": 5e-05, "loss": 0.0145, "step": 533 }, { "epoch": 1.0080405405405406, "grad_norm": 6.090303421020508, "learning_rate": 5e-05, "loss": 0.5499, "step": 534 }, { "epoch": 1.0080743243243244, "grad_norm": 6.000332355499268, "learning_rate": 5e-05, "loss": 0.1265, "step": 535 }, { "epoch": 1.008108108108108, "grad_norm": 7.573974609375, "learning_rate": 5e-05, "loss": 0.1997, "step": 536 }, { "epoch": 1.0081418918918919, "grad_norm": 13.366283416748047, "learning_rate": 5e-05, "loss": 0.4903, "step": 537 }, { "epoch": 1.0081756756756757, "grad_norm": 14.288167953491211, "learning_rate": 5e-05, "loss": 0.4547, "step": 538 }, { "epoch": 1.0082094594594595, "grad_norm": 2.8240864276885986, "learning_rate": 5e-05, "loss": 0.1961, "step": 539 }, { "epoch": 1.0082432432432433, "grad_norm": 9.565617561340332, "learning_rate": 5e-05, "loss": 0.1077, "step": 540 }, { "epoch": 1.008277027027027, "grad_norm": 0.9327738881111145, "learning_rate": 5e-05, "loss": 0.0162, "step": 541 }, { "epoch": 1.0083108108108108, "grad_norm": 10.941550254821777, "learning_rate": 5e-05, "loss": 0.4643, "step": 542 }, { "epoch": 1.0083445945945946, "grad_norm": 12.703067779541016, "learning_rate": 5e-05, "loss": 0.2872, "step": 543 }, { "epoch": 1.0083783783783784, "grad_norm": 18.975589752197266, "learning_rate": 5e-05, "loss": 0.2028, "step": 544 }, { "epoch": 1.0084121621621622, "grad_norm": 2.183638095855713, "learning_rate": 5e-05, "loss": 0.037, "step": 545 }, { "epoch": 1.008445945945946, "grad_norm": 5.640847682952881, "learning_rate": 5e-05, "loss": 0.4052, "step": 546 }, { "epoch": 1.0084797297297297, "grad_norm": 1.65294349193573, "learning_rate": 5e-05, "loss": 0.0289, "step": 547 }, { "epoch": 1.0085135135135135, "grad_norm": 0.9098564386367798, "learning_rate": 5e-05, "loss": 0.0147, "step": 548 }, { "epoch": 1.0085472972972973, "grad_norm": 20.303058624267578, "learning_rate": 5e-05, "loss": 0.7287, "step": 549 }, { "epoch": 1.0085810810810811, "grad_norm": 15.547004699707031, "learning_rate": 5e-05, "loss": 0.4203, "step": 550 }, { "epoch": 1.008614864864865, "grad_norm": 15.711519241333008, "learning_rate": 5e-05, "loss": 0.2, "step": 551 }, { "epoch": 1.0086486486486486, "grad_norm": 11.038968086242676, "learning_rate": 5e-05, "loss": 0.2, "step": 552 }, { "epoch": 1.0086824324324324, "grad_norm": 18.564149856567383, "learning_rate": 5e-05, "loss": 0.4277, "step": 553 }, { "epoch": 1.0087162162162162, "grad_norm": 0.33024367690086365, "learning_rate": 5e-05, "loss": 0.0086, "step": 554 }, { "epoch": 1.00875, "grad_norm": 5.299546718597412, "learning_rate": 5e-05, "loss": 0.1037, "step": 555 }, { "epoch": 1.0087837837837839, "grad_norm": 26.15976905822754, "learning_rate": 5e-05, "loss": 0.4568, "step": 556 }, { "epoch": 1.0088175675675675, "grad_norm": 0.24036015570163727, "learning_rate": 5e-05, "loss": 0.0057, "step": 557 }, { "epoch": 1.0088513513513513, "grad_norm": 0.4247841238975525, "learning_rate": 5e-05, "loss": 0.011, "step": 558 }, { "epoch": 1.0088851351351351, "grad_norm": 12.9185209274292, "learning_rate": 5e-05, "loss": 1.1054, "step": 559 }, { "epoch": 1.008918918918919, "grad_norm": 7.084460735321045, "learning_rate": 5e-05, "loss": 0.0931, "step": 560 }, { "epoch": 1.0089527027027028, "grad_norm": 7.820375442504883, "learning_rate": 5e-05, "loss": 0.4459, "step": 561 }, { "epoch": 1.0089864864864866, "grad_norm": 8.767210960388184, "learning_rate": 5e-05, "loss": 0.1595, "step": 562 }, { "epoch": 1.0090202702702702, "grad_norm": 2.9089388847351074, "learning_rate": 5e-05, "loss": 0.0369, "step": 563 }, { "epoch": 1.009054054054054, "grad_norm": 17.575977325439453, "learning_rate": 5e-05, "loss": 0.6459, "step": 564 }, { "epoch": 1.0090878378378378, "grad_norm": 8.06907844543457, "learning_rate": 5e-05, "loss": 0.8413, "step": 565 }, { "epoch": 1.0091216216216217, "grad_norm": 29.109691619873047, "learning_rate": 5e-05, "loss": 1.7091, "step": 566 }, { "epoch": 1.0091554054054055, "grad_norm": 18.276657104492188, "learning_rate": 5e-05, "loss": 0.3168, "step": 567 }, { "epoch": 1.009189189189189, "grad_norm": 1.4907488822937012, "learning_rate": 5e-05, "loss": 0.0987, "step": 568 }, { "epoch": 1.009222972972973, "grad_norm": 13.068373680114746, "learning_rate": 5e-05, "loss": 1.021, "step": 569 }, { "epoch": 1.0092567567567567, "grad_norm": 4.992436408996582, "learning_rate": 5e-05, "loss": 0.587, "step": 570 }, { "epoch": 1.0092905405405406, "grad_norm": 17.924610137939453, "learning_rate": 5e-05, "loss": 0.4058, "step": 571 }, { "epoch": 1.0093243243243244, "grad_norm": 16.979324340820312, "learning_rate": 5e-05, "loss": 0.3436, "step": 572 }, { "epoch": 1.009358108108108, "grad_norm": 20.045909881591797, "learning_rate": 5e-05, "loss": 0.5142, "step": 573 }, { "epoch": 1.0093918918918918, "grad_norm": 5.041548728942871, "learning_rate": 5e-05, "loss": 0.1368, "step": 574 }, { "epoch": 1.0094256756756756, "grad_norm": 2.325284481048584, "learning_rate": 5e-05, "loss": 0.0251, "step": 575 }, { "epoch": 1.0094594594594595, "grad_norm": 1.0103129148483276, "learning_rate": 5e-05, "loss": 0.0612, "step": 576 }, { "epoch": 1.0094932432432433, "grad_norm": 5.360422611236572, "learning_rate": 5e-05, "loss": 0.0685, "step": 577 }, { "epoch": 1.0095270270270271, "grad_norm": 0.6188304424285889, "learning_rate": 5e-05, "loss": 0.0243, "step": 578 }, { "epoch": 1.0095608108108107, "grad_norm": 1.3710975646972656, "learning_rate": 5e-05, "loss": 0.017, "step": 579 }, { "epoch": 1.0095945945945946, "grad_norm": 3.9109179973602295, "learning_rate": 5e-05, "loss": 0.0808, "step": 580 }, { "epoch": 1.0096283783783784, "grad_norm": 5.855372905731201, "learning_rate": 5e-05, "loss": 0.0525, "step": 581 }, { "epoch": 1.0096621621621622, "grad_norm": 4.313199043273926, "learning_rate": 5e-05, "loss": 0.0816, "step": 582 }, { "epoch": 1.009695945945946, "grad_norm": 10.352798461914062, "learning_rate": 5e-05, "loss": 0.5772, "step": 583 }, { "epoch": 1.0097297297297296, "grad_norm": 3.6601293087005615, "learning_rate": 5e-05, "loss": 0.0819, "step": 584 }, { "epoch": 1.0097635135135135, "grad_norm": 4.847390174865723, "learning_rate": 5e-05, "loss": 0.0526, "step": 585 }, { "epoch": 1.0097972972972973, "grad_norm": 0.5596976280212402, "learning_rate": 5e-05, "loss": 0.0077, "step": 586 }, { "epoch": 1.009831081081081, "grad_norm": 4.13965368270874, "learning_rate": 5e-05, "loss": 0.2875, "step": 587 }, { "epoch": 1.009864864864865, "grad_norm": 2.917419195175171, "learning_rate": 5e-05, "loss": 0.0307, "step": 588 }, { "epoch": 1.0098986486486488, "grad_norm": 11.552399635314941, "learning_rate": 5e-05, "loss": 0.609, "step": 589 }, { "epoch": 1.0099324324324324, "grad_norm": 12.326403617858887, "learning_rate": 5e-05, "loss": 0.4343, "step": 590 }, { "epoch": 1.0099662162162162, "grad_norm": 0.4884990155696869, "learning_rate": 5e-05, "loss": 0.0175, "step": 591 }, { "epoch": 1.01, "grad_norm": 14.445115089416504, "learning_rate": 5e-05, "loss": 0.1816, "step": 592 }, { "epoch": 1.01, "eval_accuracy": 0.8352180936995154, "eval_loss": 0.5763300061225891, "eval_runtime": 32.4281, "eval_samples_per_second": 19.088, "eval_steps_per_second": 2.405, "step": 592 }, { "epoch": 2.000033783783784, "grad_norm": 1.6408225297927856, "learning_rate": 5e-05, "loss": 0.0281, "step": 593 }, { "epoch": 2.0000675675675677, "grad_norm": 0.18686772882938385, "learning_rate": 5e-05, "loss": 0.0066, "step": 594 }, { "epoch": 2.0001013513513515, "grad_norm": 0.24844412505626678, "learning_rate": 5e-05, "loss": 0.007, "step": 595 }, { "epoch": 2.0001351351351353, "grad_norm": 1.734616994857788, "learning_rate": 5e-05, "loss": 0.015, "step": 596 }, { "epoch": 2.000168918918919, "grad_norm": 7.127835273742676, "learning_rate": 5e-05, "loss": 0.5083, "step": 597 }, { "epoch": 2.0002027027027025, "grad_norm": 18.441770553588867, "learning_rate": 5e-05, "loss": 0.3898, "step": 598 }, { "epoch": 2.0002364864864863, "grad_norm": 3.981456995010376, "learning_rate": 5e-05, "loss": 0.3724, "step": 599 }, { "epoch": 2.00027027027027, "grad_norm": 17.97144317626953, "learning_rate": 5e-05, "loss": 0.6476, "step": 600 }, { "epoch": 2.000304054054054, "grad_norm": 9.672410011291504, "learning_rate": 5e-05, "loss": 0.2849, "step": 601 }, { "epoch": 2.000337837837838, "grad_norm": 5.801889896392822, "learning_rate": 5e-05, "loss": 0.0461, "step": 602 }, { "epoch": 2.0003716216216216, "grad_norm": 18.799739837646484, "learning_rate": 5e-05, "loss": 0.494, "step": 603 }, { "epoch": 2.0004054054054055, "grad_norm": 11.183006286621094, "learning_rate": 5e-05, "loss": 0.2769, "step": 604 }, { "epoch": 2.0004391891891893, "grad_norm": 0.13861815631389618, "learning_rate": 5e-05, "loss": 0.004, "step": 605 }, { "epoch": 2.000472972972973, "grad_norm": 18.12513542175293, "learning_rate": 5e-05, "loss": 0.2019, "step": 606 }, { "epoch": 2.000506756756757, "grad_norm": 4.364808082580566, "learning_rate": 5e-05, "loss": 0.0602, "step": 607 }, { "epoch": 2.0005405405405408, "grad_norm": 0.024479884654283524, "learning_rate": 5e-05, "loss": 0.001, "step": 608 }, { "epoch": 2.000574324324324, "grad_norm": 8.128011703491211, "learning_rate": 5e-05, "loss": 0.1029, "step": 609 }, { "epoch": 2.000608108108108, "grad_norm": 0.3229069113731384, "learning_rate": 5e-05, "loss": 0.006, "step": 610 }, { "epoch": 2.000641891891892, "grad_norm": 10.65942668914795, "learning_rate": 5e-05, "loss": 0.1154, "step": 611 }, { "epoch": 2.0006756756756756, "grad_norm": 6.047785758972168, "learning_rate": 5e-05, "loss": 0.4788, "step": 612 }, { "epoch": 2.0007094594594594, "grad_norm": 11.937636375427246, "learning_rate": 5e-05, "loss": 0.8369, "step": 613 }, { "epoch": 2.0007432432432433, "grad_norm": 0.4906492233276367, "learning_rate": 5e-05, "loss": 0.0144, "step": 614 }, { "epoch": 2.000777027027027, "grad_norm": 8.004657745361328, "learning_rate": 5e-05, "loss": 0.7514, "step": 615 }, { "epoch": 2.000810810810811, "grad_norm": 3.1301825046539307, "learning_rate": 5e-05, "loss": 0.1865, "step": 616 }, { "epoch": 2.0008445945945947, "grad_norm": 1.0101200342178345, "learning_rate": 5e-05, "loss": 0.0257, "step": 617 }, { "epoch": 2.0008783783783786, "grad_norm": 0.22082148492336273, "learning_rate": 5e-05, "loss": 0.0073, "step": 618 }, { "epoch": 2.000912162162162, "grad_norm": 2.2533493041992188, "learning_rate": 5e-05, "loss": 0.0302, "step": 619 }, { "epoch": 2.0009459459459458, "grad_norm": 4.4628472328186035, "learning_rate": 5e-05, "loss": 0.0581, "step": 620 }, { "epoch": 2.0009797297297296, "grad_norm": 5.222517967224121, "learning_rate": 5e-05, "loss": 0.0703, "step": 621 }, { "epoch": 2.0010135135135134, "grad_norm": 0.6302618980407715, "learning_rate": 5e-05, "loss": 0.0104, "step": 622 }, { "epoch": 2.0010472972972972, "grad_norm": 1.3460947275161743, "learning_rate": 5e-05, "loss": 0.0899, "step": 623 }, { "epoch": 2.001081081081081, "grad_norm": 12.827310562133789, "learning_rate": 5e-05, "loss": 0.0663, "step": 624 }, { "epoch": 2.001114864864865, "grad_norm": 19.297225952148438, "learning_rate": 5e-05, "loss": 0.2555, "step": 625 }, { "epoch": 2.0011486486486487, "grad_norm": 6.1312174797058105, "learning_rate": 5e-05, "loss": 0.0327, "step": 626 }, { "epoch": 2.0011824324324325, "grad_norm": 18.13218879699707, "learning_rate": 5e-05, "loss": 0.4526, "step": 627 }, { "epoch": 2.0012162162162164, "grad_norm": 5.429779052734375, "learning_rate": 5e-05, "loss": 0.0649, "step": 628 }, { "epoch": 2.00125, "grad_norm": 0.6388295292854309, "learning_rate": 5e-05, "loss": 0.0089, "step": 629 }, { "epoch": 2.0012837837837836, "grad_norm": 20.040184020996094, "learning_rate": 5e-05, "loss": 0.3756, "step": 630 }, { "epoch": 2.0013175675675674, "grad_norm": 0.6235172152519226, "learning_rate": 5e-05, "loss": 0.0043, "step": 631 }, { "epoch": 2.0013513513513512, "grad_norm": 9.310227394104004, "learning_rate": 5e-05, "loss": 0.1028, "step": 632 }, { "epoch": 2.001385135135135, "grad_norm": 0.7184036374092102, "learning_rate": 5e-05, "loss": 0.0173, "step": 633 }, { "epoch": 2.001418918918919, "grad_norm": 19.00543212890625, "learning_rate": 5e-05, "loss": 0.4865, "step": 634 }, { "epoch": 2.0014527027027027, "grad_norm": 13.422286987304688, "learning_rate": 5e-05, "loss": 0.1809, "step": 635 }, { "epoch": 2.0014864864864865, "grad_norm": 3.593313694000244, "learning_rate": 5e-05, "loss": 0.0525, "step": 636 }, { "epoch": 2.0015202702702704, "grad_norm": 28.485280990600586, "learning_rate": 5e-05, "loss": 0.6191, "step": 637 }, { "epoch": 2.001554054054054, "grad_norm": 6.519321441650391, "learning_rate": 5e-05, "loss": 0.1023, "step": 638 }, { "epoch": 2.001587837837838, "grad_norm": 53.74370574951172, "learning_rate": 5e-05, "loss": 0.4157, "step": 639 }, { "epoch": 2.001621621621622, "grad_norm": 15.220372200012207, "learning_rate": 5e-05, "loss": 0.5932, "step": 640 }, { "epoch": 2.001655405405405, "grad_norm": 0.3119572103023529, "learning_rate": 5e-05, "loss": 0.0066, "step": 641 }, { "epoch": 2.001689189189189, "grad_norm": 0.5084048509597778, "learning_rate": 5e-05, "loss": 0.0036, "step": 642 }, { "epoch": 2.001722972972973, "grad_norm": 19.339595794677734, "learning_rate": 5e-05, "loss": 0.3544, "step": 643 }, { "epoch": 2.0017567567567567, "grad_norm": 5.827753067016602, "learning_rate": 5e-05, "loss": 0.5267, "step": 644 }, { "epoch": 2.0017905405405405, "grad_norm": 0.35697904229164124, "learning_rate": 5e-05, "loss": 0.0062, "step": 645 }, { "epoch": 2.0018243243243243, "grad_norm": 25.691560745239258, "learning_rate": 5e-05, "loss": 0.8311, "step": 646 }, { "epoch": 2.001858108108108, "grad_norm": 14.15230941772461, "learning_rate": 5e-05, "loss": 0.1107, "step": 647 }, { "epoch": 2.001891891891892, "grad_norm": 0.11155035346746445, "learning_rate": 5e-05, "loss": 0.0026, "step": 648 }, { "epoch": 2.001925675675676, "grad_norm": 8.575791358947754, "learning_rate": 5e-05, "loss": 0.0615, "step": 649 }, { "epoch": 2.0019594594594596, "grad_norm": 0.5462492108345032, "learning_rate": 5e-05, "loss": 0.0076, "step": 650 }, { "epoch": 2.0019932432432435, "grad_norm": 5.204411506652832, "learning_rate": 5e-05, "loss": 0.8559, "step": 651 }, { "epoch": 2.002027027027027, "grad_norm": 30.931453704833984, "learning_rate": 5e-05, "loss": 0.4807, "step": 652 }, { "epoch": 2.0020608108108107, "grad_norm": 8.126422882080078, "learning_rate": 5e-05, "loss": 0.1028, "step": 653 }, { "epoch": 2.0020945945945945, "grad_norm": 13.073334693908691, "learning_rate": 5e-05, "loss": 0.1747, "step": 654 }, { "epoch": 2.0021283783783783, "grad_norm": 10.018195152282715, "learning_rate": 5e-05, "loss": 0.1137, "step": 655 }, { "epoch": 2.002162162162162, "grad_norm": 5.345040798187256, "learning_rate": 5e-05, "loss": 0.4415, "step": 656 }, { "epoch": 2.002195945945946, "grad_norm": 15.743557929992676, "learning_rate": 5e-05, "loss": 0.3758, "step": 657 }, { "epoch": 2.00222972972973, "grad_norm": 12.861006736755371, "learning_rate": 5e-05, "loss": 0.2469, "step": 658 }, { "epoch": 2.0022635135135136, "grad_norm": 10.381454467773438, "learning_rate": 5e-05, "loss": 0.377, "step": 659 }, { "epoch": 2.0022972972972974, "grad_norm": 1.456392765045166, "learning_rate": 5e-05, "loss": 0.0409, "step": 660 }, { "epoch": 2.0023310810810813, "grad_norm": 2.475480079650879, "learning_rate": 5e-05, "loss": 0.0278, "step": 661 }, { "epoch": 2.0023648648648646, "grad_norm": 4.209902286529541, "learning_rate": 5e-05, "loss": 0.0604, "step": 662 }, { "epoch": 2.0023986486486485, "grad_norm": 8.533900260925293, "learning_rate": 5e-05, "loss": 0.0643, "step": 663 }, { "epoch": 2.0024324324324323, "grad_norm": 3.957548141479492, "learning_rate": 5e-05, "loss": 0.0501, "step": 664 }, { "epoch": 2.002466216216216, "grad_norm": 20.281906127929688, "learning_rate": 5e-05, "loss": 0.275, "step": 665 }, { "epoch": 2.0025, "grad_norm": 10.219264030456543, "learning_rate": 5e-05, "loss": 0.4375, "step": 666 }, { "epoch": 2.0025337837837838, "grad_norm": 3.2229044437408447, "learning_rate": 5e-05, "loss": 0.0378, "step": 667 }, { "epoch": 2.0025675675675676, "grad_norm": 0.12749309837818146, "learning_rate": 5e-05, "loss": 0.0048, "step": 668 }, { "epoch": 2.0026013513513514, "grad_norm": 3.574615716934204, "learning_rate": 5e-05, "loss": 0.1316, "step": 669 }, { "epoch": 2.0026351351351352, "grad_norm": 0.207112118601799, "learning_rate": 5e-05, "loss": 0.0056, "step": 670 }, { "epoch": 2.002668918918919, "grad_norm": 6.0265302658081055, "learning_rate": 5e-05, "loss": 0.3958, "step": 671 }, { "epoch": 2.002702702702703, "grad_norm": 0.6582640409469604, "learning_rate": 5e-05, "loss": 0.0147, "step": 672 }, { "epoch": 2.0027364864864863, "grad_norm": 14.565114974975586, "learning_rate": 5e-05, "loss": 0.2333, "step": 673 }, { "epoch": 2.00277027027027, "grad_norm": 12.61484432220459, "learning_rate": 5e-05, "loss": 0.3695, "step": 674 }, { "epoch": 2.002804054054054, "grad_norm": 10.567675590515137, "learning_rate": 5e-05, "loss": 0.1066, "step": 675 }, { "epoch": 2.0028378378378378, "grad_norm": 24.322612762451172, "learning_rate": 5e-05, "loss": 0.601, "step": 676 }, { "epoch": 2.0028716216216216, "grad_norm": 1.2166508436203003, "learning_rate": 5e-05, "loss": 0.0191, "step": 677 }, { "epoch": 2.0029054054054054, "grad_norm": 8.581063270568848, "learning_rate": 5e-05, "loss": 0.1589, "step": 678 }, { "epoch": 2.0029391891891892, "grad_norm": 5.035099983215332, "learning_rate": 5e-05, "loss": 0.0692, "step": 679 }, { "epoch": 2.002972972972973, "grad_norm": 0.6668307781219482, "learning_rate": 5e-05, "loss": 0.0167, "step": 680 }, { "epoch": 2.003006756756757, "grad_norm": 0.3121359646320343, "learning_rate": 5e-05, "loss": 0.0064, "step": 681 }, { "epoch": 2.0030405405405407, "grad_norm": 2.0451107025146484, "learning_rate": 5e-05, "loss": 0.0323, "step": 682 }, { "epoch": 2.0030743243243245, "grad_norm": 15.583172798156738, "learning_rate": 5e-05, "loss": 0.2834, "step": 683 }, { "epoch": 2.003108108108108, "grad_norm": 1.731439471244812, "learning_rate": 5e-05, "loss": 0.0269, "step": 684 }, { "epoch": 2.0031418918918917, "grad_norm": 0.6250378489494324, "learning_rate": 5e-05, "loss": 0.022, "step": 685 }, { "epoch": 2.0031756756756756, "grad_norm": 8.53270435333252, "learning_rate": 5e-05, "loss": 0.5233, "step": 686 }, { "epoch": 2.0032094594594594, "grad_norm": 12.740571022033691, "learning_rate": 5e-05, "loss": 1.2912, "step": 687 }, { "epoch": 2.003243243243243, "grad_norm": 8.435683250427246, "learning_rate": 5e-05, "loss": 0.4945, "step": 688 }, { "epoch": 2.003277027027027, "grad_norm": 7.359000205993652, "learning_rate": 5e-05, "loss": 0.0851, "step": 689 }, { "epoch": 2.003310810810811, "grad_norm": 9.104817390441895, "learning_rate": 5e-05, "loss": 0.0972, "step": 690 }, { "epoch": 2.0033445945945947, "grad_norm": 15.365738868713379, "learning_rate": 5e-05, "loss": 0.2199, "step": 691 }, { "epoch": 2.0033783783783785, "grad_norm": 6.1362786293029785, "learning_rate": 5e-05, "loss": 0.101, "step": 692 }, { "epoch": 2.0034121621621623, "grad_norm": 9.178117752075195, "learning_rate": 5e-05, "loss": 0.1406, "step": 693 }, { "epoch": 2.003445945945946, "grad_norm": 15.521064758300781, "learning_rate": 5e-05, "loss": 0.2738, "step": 694 }, { "epoch": 2.0034797297297295, "grad_norm": 17.017473220825195, "learning_rate": 5e-05, "loss": 0.1735, "step": 695 }, { "epoch": 2.0035135135135134, "grad_norm": 10.618877410888672, "learning_rate": 5e-05, "loss": 0.1742, "step": 696 }, { "epoch": 2.003547297297297, "grad_norm": 5.176126003265381, "learning_rate": 5e-05, "loss": 0.0644, "step": 697 }, { "epoch": 2.003581081081081, "grad_norm": 13.922181129455566, "learning_rate": 5e-05, "loss": 0.2006, "step": 698 }, { "epoch": 2.003614864864865, "grad_norm": 3.279304027557373, "learning_rate": 5e-05, "loss": 0.0368, "step": 699 }, { "epoch": 2.0036486486486487, "grad_norm": 15.44192123413086, "learning_rate": 5e-05, "loss": 0.2296, "step": 700 }, { "epoch": 2.0036824324324325, "grad_norm": 14.308425903320312, "learning_rate": 5e-05, "loss": 0.2551, "step": 701 }, { "epoch": 2.0037162162162163, "grad_norm": 0.9985201954841614, "learning_rate": 5e-05, "loss": 0.0088, "step": 702 }, { "epoch": 2.00375, "grad_norm": 0.05455027520656586, "learning_rate": 5e-05, "loss": 0.0014, "step": 703 }, { "epoch": 2.003783783783784, "grad_norm": 12.522743225097656, "learning_rate": 5e-05, "loss": 0.2811, "step": 704 }, { "epoch": 2.0038175675675673, "grad_norm": 2.946341037750244, "learning_rate": 5e-05, "loss": 0.0635, "step": 705 }, { "epoch": 2.003851351351351, "grad_norm": 17.307544708251953, "learning_rate": 5e-05, "loss": 0.1941, "step": 706 }, { "epoch": 2.003885135135135, "grad_norm": 1.7037454843521118, "learning_rate": 5e-05, "loss": 0.0231, "step": 707 }, { "epoch": 2.003918918918919, "grad_norm": 11.047527313232422, "learning_rate": 5e-05, "loss": 0.5197, "step": 708 }, { "epoch": 2.0039527027027026, "grad_norm": 3.916853427886963, "learning_rate": 5e-05, "loss": 0.0827, "step": 709 }, { "epoch": 2.0039864864864865, "grad_norm": 8.820326805114746, "learning_rate": 5e-05, "loss": 0.9141, "step": 710 }, { "epoch": 2.0040202702702703, "grad_norm": 2.509366512298584, "learning_rate": 5e-05, "loss": 0.0144, "step": 711 }, { "epoch": 2.004054054054054, "grad_norm": 10.872312545776367, "learning_rate": 5e-05, "loss": 0.1073, "step": 712 }, { "epoch": 2.004087837837838, "grad_norm": 0.29015952348709106, "learning_rate": 5e-05, "loss": 0.0058, "step": 713 }, { "epoch": 2.0041216216216218, "grad_norm": 10.971266746520996, "learning_rate": 5e-05, "loss": 0.4134, "step": 714 }, { "epoch": 2.0041554054054056, "grad_norm": 12.0504150390625, "learning_rate": 5e-05, "loss": 0.1694, "step": 715 }, { "epoch": 2.004189189189189, "grad_norm": 25.424549102783203, "learning_rate": 5e-05, "loss": 0.2162, "step": 716 }, { "epoch": 2.004222972972973, "grad_norm": 15.98388957977295, "learning_rate": 5e-05, "loss": 0.2869, "step": 717 }, { "epoch": 2.0042567567567566, "grad_norm": 4.728592872619629, "learning_rate": 5e-05, "loss": 0.1983, "step": 718 }, { "epoch": 2.0042905405405405, "grad_norm": 0.891954243183136, "learning_rate": 5e-05, "loss": 0.012, "step": 719 }, { "epoch": 2.0043243243243243, "grad_norm": 6.3707733154296875, "learning_rate": 5e-05, "loss": 0.4082, "step": 720 }, { "epoch": 2.004358108108108, "grad_norm": 10.546293258666992, "learning_rate": 5e-05, "loss": 0.5127, "step": 721 }, { "epoch": 2.004391891891892, "grad_norm": 5.142731189727783, "learning_rate": 5e-05, "loss": 0.2646, "step": 722 }, { "epoch": 2.0044256756756758, "grad_norm": 2.236341953277588, "learning_rate": 5e-05, "loss": 0.0596, "step": 723 }, { "epoch": 2.0044594594594596, "grad_norm": 4.746918678283691, "learning_rate": 5e-05, "loss": 0.0811, "step": 724 }, { "epoch": 2.0044932432432434, "grad_norm": 27.258625030517578, "learning_rate": 5e-05, "loss": 0.4365, "step": 725 }, { "epoch": 2.0045270270270272, "grad_norm": 4.549984455108643, "learning_rate": 5e-05, "loss": 0.0758, "step": 726 }, { "epoch": 2.0045608108108106, "grad_norm": 6.499630928039551, "learning_rate": 5e-05, "loss": 0.8878, "step": 727 }, { "epoch": 2.0045945945945944, "grad_norm": 0.8366812467575073, "learning_rate": 5e-05, "loss": 0.0089, "step": 728 }, { "epoch": 2.0046283783783783, "grad_norm": 11.555953979492188, "learning_rate": 5e-05, "loss": 0.2525, "step": 729 }, { "epoch": 2.004662162162162, "grad_norm": 6.321455478668213, "learning_rate": 5e-05, "loss": 0.1672, "step": 730 }, { "epoch": 2.004695945945946, "grad_norm": 3.020236015319824, "learning_rate": 5e-05, "loss": 0.0508, "step": 731 }, { "epoch": 2.0047297297297297, "grad_norm": 0.44787395000457764, "learning_rate": 5e-05, "loss": 0.0063, "step": 732 }, { "epoch": 2.0047635135135136, "grad_norm": 7.429876804351807, "learning_rate": 5e-05, "loss": 0.4222, "step": 733 }, { "epoch": 2.0047972972972974, "grad_norm": 7.993992328643799, "learning_rate": 5e-05, "loss": 0.2636, "step": 734 }, { "epoch": 2.004831081081081, "grad_norm": 4.1561598777771, "learning_rate": 5e-05, "loss": 0.0551, "step": 735 }, { "epoch": 2.004864864864865, "grad_norm": 2.4099106788635254, "learning_rate": 5e-05, "loss": 0.021, "step": 736 }, { "epoch": 2.004898648648649, "grad_norm": 5.306085109710693, "learning_rate": 5e-05, "loss": 0.0844, "step": 737 }, { "epoch": 2.0049324324324322, "grad_norm": 16.65404510498047, "learning_rate": 5e-05, "loss": 0.4828, "step": 738 }, { "epoch": 2.004966216216216, "grad_norm": 5.443899154663086, "learning_rate": 5e-05, "loss": 0.0764, "step": 739 }, { "epoch": 2.005, "grad_norm": 2.9917752742767334, "learning_rate": 5e-05, "loss": 0.0781, "step": 740 }, { "epoch": 2.0050337837837837, "grad_norm": 2.1791532039642334, "learning_rate": 5e-05, "loss": 0.0562, "step": 741 }, { "epoch": 2.0050675675675675, "grad_norm": 2.047088146209717, "learning_rate": 5e-05, "loss": 0.0839, "step": 742 }, { "epoch": 2.0051013513513514, "grad_norm": 13.465705871582031, "learning_rate": 5e-05, "loss": 0.3035, "step": 743 }, { "epoch": 2.005135135135135, "grad_norm": 8.175116539001465, "learning_rate": 5e-05, "loss": 0.4207, "step": 744 }, { "epoch": 2.005168918918919, "grad_norm": 0.9195747375488281, "learning_rate": 5e-05, "loss": 0.012, "step": 745 }, { "epoch": 2.005202702702703, "grad_norm": 0.9674017429351807, "learning_rate": 5e-05, "loss": 0.0216, "step": 746 }, { "epoch": 2.0052364864864867, "grad_norm": 1.4502861499786377, "learning_rate": 5e-05, "loss": 0.0149, "step": 747 }, { "epoch": 2.00527027027027, "grad_norm": 1.079982876777649, "learning_rate": 5e-05, "loss": 0.0246, "step": 748 }, { "epoch": 2.005304054054054, "grad_norm": 1.6559898853302002, "learning_rate": 5e-05, "loss": 0.0214, "step": 749 }, { "epoch": 2.0053378378378377, "grad_norm": 5.536227226257324, "learning_rate": 5e-05, "loss": 0.0475, "step": 750 }, { "epoch": 2.0053716216216215, "grad_norm": 21.003347396850586, "learning_rate": 5e-05, "loss": 0.5418, "step": 751 }, { "epoch": 2.0054054054054054, "grad_norm": 0.416413813829422, "learning_rate": 5e-05, "loss": 0.0066, "step": 752 }, { "epoch": 2.005439189189189, "grad_norm": 0.20377112925052643, "learning_rate": 5e-05, "loss": 0.0045, "step": 753 }, { "epoch": 2.005472972972973, "grad_norm": 0.0717206820845604, "learning_rate": 5e-05, "loss": 0.0014, "step": 754 }, { "epoch": 2.005506756756757, "grad_norm": 0.06735185533761978, "learning_rate": 5e-05, "loss": 0.002, "step": 755 }, { "epoch": 2.0055405405405407, "grad_norm": 17.50029945373535, "learning_rate": 5e-05, "loss": 0.6779, "step": 756 }, { "epoch": 2.0055743243243245, "grad_norm": 10.097397804260254, "learning_rate": 5e-05, "loss": 0.0702, "step": 757 }, { "epoch": 2.0056081081081083, "grad_norm": 0.3654506206512451, "learning_rate": 5e-05, "loss": 0.0041, "step": 758 }, { "epoch": 2.0056418918918917, "grad_norm": 0.9026389122009277, "learning_rate": 5e-05, "loss": 0.0192, "step": 759 }, { "epoch": 2.0056756756756755, "grad_norm": 0.37910380959510803, "learning_rate": 5e-05, "loss": 0.0038, "step": 760 }, { "epoch": 2.0057094594594593, "grad_norm": 15.836853981018066, "learning_rate": 5e-05, "loss": 0.1755, "step": 761 }, { "epoch": 2.005743243243243, "grad_norm": 2.302426338195801, "learning_rate": 5e-05, "loss": 0.0143, "step": 762 }, { "epoch": 2.005777027027027, "grad_norm": 0.13337555527687073, "learning_rate": 5e-05, "loss": 0.0035, "step": 763 }, { "epoch": 2.005810810810811, "grad_norm": 1.4235565662384033, "learning_rate": 5e-05, "loss": 0.0068, "step": 764 }, { "epoch": 2.0058445945945946, "grad_norm": 28.388431549072266, "learning_rate": 5e-05, "loss": 0.7267, "step": 765 }, { "epoch": 2.0058783783783785, "grad_norm": 10.539917945861816, "learning_rate": 5e-05, "loss": 0.6078, "step": 766 }, { "epoch": 2.0059121621621623, "grad_norm": 0.0583023875951767, "learning_rate": 5e-05, "loss": 0.0009, "step": 767 }, { "epoch": 2.005945945945946, "grad_norm": 16.127830505371094, "learning_rate": 5e-05, "loss": 0.6339, "step": 768 }, { "epoch": 2.00597972972973, "grad_norm": 2.352376937866211, "learning_rate": 5e-05, "loss": 0.0083, "step": 769 }, { "epoch": 2.0060135135135133, "grad_norm": 4.0362138748168945, "learning_rate": 5e-05, "loss": 0.0751, "step": 770 }, { "epoch": 2.006047297297297, "grad_norm": 28.491371154785156, "learning_rate": 5e-05, "loss": 1.2245, "step": 771 }, { "epoch": 2.006081081081081, "grad_norm": 15.208232879638672, "learning_rate": 5e-05, "loss": 0.6908, "step": 772 }, { "epoch": 2.006114864864865, "grad_norm": 30.496973037719727, "learning_rate": 5e-05, "loss": 0.392, "step": 773 }, { "epoch": 2.0061486486486486, "grad_norm": 23.55420684814453, "learning_rate": 5e-05, "loss": 1.0256, "step": 774 }, { "epoch": 2.0061824324324324, "grad_norm": 19.260324478149414, "learning_rate": 5e-05, "loss": 0.3937, "step": 775 }, { "epoch": 2.0062162162162163, "grad_norm": 13.769672393798828, "learning_rate": 5e-05, "loss": 0.0694, "step": 776 }, { "epoch": 2.00625, "grad_norm": 21.370405197143555, "learning_rate": 5e-05, "loss": 0.4566, "step": 777 }, { "epoch": 2.006283783783784, "grad_norm": 22.896018981933594, "learning_rate": 5e-05, "loss": 0.4286, "step": 778 }, { "epoch": 2.0063175675675677, "grad_norm": 0.13361196219921112, "learning_rate": 5e-05, "loss": 0.0014, "step": 779 }, { "epoch": 2.0063513513513516, "grad_norm": 20.47823715209961, "learning_rate": 5e-05, "loss": 0.1768, "step": 780 }, { "epoch": 2.006385135135135, "grad_norm": 0.18047749996185303, "learning_rate": 5e-05, "loss": 0.0037, "step": 781 }, { "epoch": 2.0064189189189188, "grad_norm": 9.70415210723877, "learning_rate": 5e-05, "loss": 0.6323, "step": 782 }, { "epoch": 2.0064527027027026, "grad_norm": 0.8581860065460205, "learning_rate": 5e-05, "loss": 0.0089, "step": 783 }, { "epoch": 2.0064864864864864, "grad_norm": 10.250097274780273, "learning_rate": 5e-05, "loss": 0.0693, "step": 784 }, { "epoch": 2.0065202702702702, "grad_norm": 15.713114738464355, "learning_rate": 5e-05, "loss": 0.2922, "step": 785 }, { "epoch": 2.006554054054054, "grad_norm": 11.190007209777832, "learning_rate": 5e-05, "loss": 0.3499, "step": 786 }, { "epoch": 2.006587837837838, "grad_norm": 8.961989402770996, "learning_rate": 5e-05, "loss": 0.0671, "step": 787 }, { "epoch": 2.0066216216216217, "grad_norm": 0.2240617722272873, "learning_rate": 5e-05, "loss": 0.0067, "step": 788 }, { "epoch": 2.0066554054054055, "grad_norm": 4.607660293579102, "learning_rate": 5e-05, "loss": 0.6, "step": 789 }, { "epoch": 2.0066891891891894, "grad_norm": 0.1919843852519989, "learning_rate": 5e-05, "loss": 0.0044, "step": 790 }, { "epoch": 2.0067229729729728, "grad_norm": 7.235976696014404, "learning_rate": 5e-05, "loss": 0.0542, "step": 791 }, { "epoch": 2.0067567567567566, "grad_norm": 11.046886444091797, "learning_rate": 5e-05, "loss": 0.1181, "step": 792 }, { "epoch": 2.0067905405405404, "grad_norm": 0.15327107906341553, "learning_rate": 5e-05, "loss": 0.0042, "step": 793 }, { "epoch": 2.0068243243243242, "grad_norm": 3.715346097946167, "learning_rate": 5e-05, "loss": 0.0613, "step": 794 }, { "epoch": 2.006858108108108, "grad_norm": 3.9025111198425293, "learning_rate": 5e-05, "loss": 0.4002, "step": 795 }, { "epoch": 2.006891891891892, "grad_norm": 7.6578369140625, "learning_rate": 5e-05, "loss": 0.1108, "step": 796 }, { "epoch": 2.0069256756756757, "grad_norm": 8.75988483428955, "learning_rate": 5e-05, "loss": 0.1161, "step": 797 }, { "epoch": 2.0069594594594595, "grad_norm": 5.367680072784424, "learning_rate": 5e-05, "loss": 0.1028, "step": 798 }, { "epoch": 2.0069932432432434, "grad_norm": 15.87852668762207, "learning_rate": 5e-05, "loss": 0.3865, "step": 799 }, { "epoch": 2.007027027027027, "grad_norm": 10.981097221374512, "learning_rate": 5e-05, "loss": 0.192, "step": 800 }, { "epoch": 2.007060810810811, "grad_norm": 10.640697479248047, "learning_rate": 5e-05, "loss": 0.1094, "step": 801 }, { "epoch": 2.0070945945945944, "grad_norm": 6.128524303436279, "learning_rate": 5e-05, "loss": 0.1122, "step": 802 }, { "epoch": 2.007128378378378, "grad_norm": 11.058035850524902, "learning_rate": 5e-05, "loss": 0.9617, "step": 803 }, { "epoch": 2.007162162162162, "grad_norm": 9.49351978302002, "learning_rate": 5e-05, "loss": 0.1994, "step": 804 }, { "epoch": 2.007195945945946, "grad_norm": 0.750651478767395, "learning_rate": 5e-05, "loss": 0.0132, "step": 805 }, { "epoch": 2.0072297297297297, "grad_norm": 20.704086303710938, "learning_rate": 5e-05, "loss": 0.2335, "step": 806 }, { "epoch": 2.0072635135135135, "grad_norm": 7.222199440002441, "learning_rate": 5e-05, "loss": 0.1297, "step": 807 }, { "epoch": 2.0072972972972973, "grad_norm": 14.77496337890625, "learning_rate": 5e-05, "loss": 0.1446, "step": 808 }, { "epoch": 2.007331081081081, "grad_norm": 13.242955207824707, "learning_rate": 5e-05, "loss": 0.5492, "step": 809 }, { "epoch": 2.007364864864865, "grad_norm": 0.17577087879180908, "learning_rate": 5e-05, "loss": 0.0031, "step": 810 }, { "epoch": 2.007398648648649, "grad_norm": 9.559738159179688, "learning_rate": 5e-05, "loss": 0.0966, "step": 811 }, { "epoch": 2.0074324324324326, "grad_norm": 4.161880016326904, "learning_rate": 5e-05, "loss": 0.0763, "step": 812 }, { "epoch": 2.007466216216216, "grad_norm": 0.848692774772644, "learning_rate": 5e-05, "loss": 0.0142, "step": 813 }, { "epoch": 2.0075, "grad_norm": 25.39118194580078, "learning_rate": 5e-05, "loss": 0.5031, "step": 814 }, { "epoch": 2.0075337837837837, "grad_norm": 16.789302825927734, "learning_rate": 5e-05, "loss": 0.2751, "step": 815 }, { "epoch": 2.0075675675675675, "grad_norm": 28.729061126708984, "learning_rate": 5e-05, "loss": 1.0549, "step": 816 }, { "epoch": 2.0076013513513513, "grad_norm": 8.848963737487793, "learning_rate": 5e-05, "loss": 0.1538, "step": 817 }, { "epoch": 2.007635135135135, "grad_norm": 4.259616374969482, "learning_rate": 5e-05, "loss": 0.1799, "step": 818 }, { "epoch": 2.007668918918919, "grad_norm": 7.40332555770874, "learning_rate": 5e-05, "loss": 0.3559, "step": 819 }, { "epoch": 2.007702702702703, "grad_norm": 14.301207542419434, "learning_rate": 5e-05, "loss": 0.0943, "step": 820 }, { "epoch": 2.0077364864864866, "grad_norm": 7.569019794464111, "learning_rate": 5e-05, "loss": 0.0807, "step": 821 }, { "epoch": 2.0077702702702704, "grad_norm": 2.997365951538086, "learning_rate": 5e-05, "loss": 0.0759, "step": 822 }, { "epoch": 2.0078040540540543, "grad_norm": 5.482513904571533, "learning_rate": 5e-05, "loss": 0.1234, "step": 823 }, { "epoch": 2.0078378378378376, "grad_norm": 12.563411712646484, "learning_rate": 5e-05, "loss": 0.2449, "step": 824 }, { "epoch": 2.0078716216216215, "grad_norm": 3.198190689086914, "learning_rate": 5e-05, "loss": 0.1269, "step": 825 }, { "epoch": 2.0079054054054053, "grad_norm": 0.03561531752347946, "learning_rate": 5e-05, "loss": 0.001, "step": 826 }, { "epoch": 2.007939189189189, "grad_norm": 15.93771743774414, "learning_rate": 5e-05, "loss": 0.383, "step": 827 }, { "epoch": 2.007972972972973, "grad_norm": 0.6273855566978455, "learning_rate": 5e-05, "loss": 0.021, "step": 828 }, { "epoch": 2.0080067567567568, "grad_norm": 3.7142348289489746, "learning_rate": 5e-05, "loss": 0.0629, "step": 829 }, { "epoch": 2.0080405405405406, "grad_norm": 0.5627942085266113, "learning_rate": 5e-05, "loss": 0.0162, "step": 830 }, { "epoch": 2.0080743243243244, "grad_norm": 17.550336837768555, "learning_rate": 5e-05, "loss": 0.2046, "step": 831 }, { "epoch": 2.0081081081081082, "grad_norm": 13.815311431884766, "learning_rate": 5e-05, "loss": 0.1273, "step": 832 }, { "epoch": 2.008141891891892, "grad_norm": 17.948362350463867, "learning_rate": 5e-05, "loss": 0.5938, "step": 833 }, { "epoch": 2.008175675675676, "grad_norm": 8.965827941894531, "learning_rate": 5e-05, "loss": 0.1384, "step": 834 }, { "epoch": 2.0082094594594593, "grad_norm": 12.837055206298828, "learning_rate": 5e-05, "loss": 0.1248, "step": 835 }, { "epoch": 2.008243243243243, "grad_norm": 2.7275054454803467, "learning_rate": 5e-05, "loss": 0.0519, "step": 836 }, { "epoch": 2.008277027027027, "grad_norm": 0.8533601760864258, "learning_rate": 5e-05, "loss": 0.022, "step": 837 }, { "epoch": 2.0083108108108108, "grad_norm": 15.898599624633789, "learning_rate": 5e-05, "loss": 0.1293, "step": 838 }, { "epoch": 2.0083445945945946, "grad_norm": 0.5491054654121399, "learning_rate": 5e-05, "loss": 0.0084, "step": 839 }, { "epoch": 2.0083783783783784, "grad_norm": 8.19331169128418, "learning_rate": 5e-05, "loss": 0.3972, "step": 840 }, { "epoch": 2.0084121621621622, "grad_norm": 11.43541431427002, "learning_rate": 5e-05, "loss": 0.1563, "step": 841 }, { "epoch": 2.008445945945946, "grad_norm": 4.702891826629639, "learning_rate": 5e-05, "loss": 0.3211, "step": 842 }, { "epoch": 2.00847972972973, "grad_norm": 11.162337303161621, "learning_rate": 5e-05, "loss": 0.249, "step": 843 }, { "epoch": 2.0085135135135137, "grad_norm": 13.529760360717773, "learning_rate": 5e-05, "loss": 0.2539, "step": 844 }, { "epoch": 2.008547297297297, "grad_norm": 0.2837526202201843, "learning_rate": 5e-05, "loss": 0.0061, "step": 845 }, { "epoch": 2.008581081081081, "grad_norm": 1.6852984428405762, "learning_rate": 5e-05, "loss": 0.0546, "step": 846 }, { "epoch": 2.0086148648648647, "grad_norm": 4.26300573348999, "learning_rate": 5e-05, "loss": 0.0352, "step": 847 }, { "epoch": 2.0086486486486486, "grad_norm": 0.27118656039237976, "learning_rate": 5e-05, "loss": 0.0064, "step": 848 }, { "epoch": 2.0086824324324324, "grad_norm": 1.256234884262085, "learning_rate": 5e-05, "loss": 0.0173, "step": 849 }, { "epoch": 2.008716216216216, "grad_norm": 1.1710515022277832, "learning_rate": 5e-05, "loss": 0.0104, "step": 850 }, { "epoch": 2.00875, "grad_norm": 13.263849258422852, "learning_rate": 5e-05, "loss": 0.2097, "step": 851 }, { "epoch": 2.008783783783784, "grad_norm": 0.2321625053882599, "learning_rate": 5e-05, "loss": 0.004, "step": 852 }, { "epoch": 2.0088175675675677, "grad_norm": 0.8488490581512451, "learning_rate": 5e-05, "loss": 0.0074, "step": 853 }, { "epoch": 2.0088513513513515, "grad_norm": 5.249314308166504, "learning_rate": 5e-05, "loss": 0.4065, "step": 854 }, { "epoch": 2.0088851351351353, "grad_norm": 16.53843116760254, "learning_rate": 5e-05, "loss": 0.2049, "step": 855 }, { "epoch": 2.0089189189189187, "grad_norm": 0.6615910530090332, "learning_rate": 5e-05, "loss": 0.0194, "step": 856 }, { "epoch": 2.0089527027027025, "grad_norm": 15.398473739624023, "learning_rate": 5e-05, "loss": 0.3903, "step": 857 }, { "epoch": 2.0089864864864864, "grad_norm": 0.060222819447517395, "learning_rate": 5e-05, "loss": 0.0016, "step": 858 }, { "epoch": 2.00902027027027, "grad_norm": 0.53265380859375, "learning_rate": 5e-05, "loss": 0.006, "step": 859 }, { "epoch": 2.009054054054054, "grad_norm": 9.331954002380371, "learning_rate": 5e-05, "loss": 0.0476, "step": 860 }, { "epoch": 2.009087837837838, "grad_norm": 3.561265230178833, "learning_rate": 5e-05, "loss": 0.0203, "step": 861 }, { "epoch": 2.0091216216216217, "grad_norm": 4.905521869659424, "learning_rate": 5e-05, "loss": 0.1426, "step": 862 }, { "epoch": 2.0091554054054055, "grad_norm": 0.28475287556648254, "learning_rate": 5e-05, "loss": 0.0055, "step": 863 }, { "epoch": 2.0091891891891893, "grad_norm": 0.14086779952049255, "learning_rate": 5e-05, "loss": 0.0035, "step": 864 }, { "epoch": 2.009222972972973, "grad_norm": 1.7512469291687012, "learning_rate": 5e-05, "loss": 0.0203, "step": 865 }, { "epoch": 2.009256756756757, "grad_norm": 3.581559419631958, "learning_rate": 5e-05, "loss": 0.0321, "step": 866 }, { "epoch": 2.0092905405405403, "grad_norm": 24.53478240966797, "learning_rate": 5e-05, "loss": 1.2121, "step": 867 }, { "epoch": 2.009324324324324, "grad_norm": 20.205154418945312, "learning_rate": 5e-05, "loss": 0.4851, "step": 868 }, { "epoch": 2.009358108108108, "grad_norm": 30.155258178710938, "learning_rate": 5e-05, "loss": 0.4363, "step": 869 }, { "epoch": 2.009391891891892, "grad_norm": 16.956764221191406, "learning_rate": 5e-05, "loss": 0.5896, "step": 870 }, { "epoch": 2.0094256756756756, "grad_norm": 12.636265754699707, "learning_rate": 5e-05, "loss": 0.1481, "step": 871 }, { "epoch": 2.0094594594594595, "grad_norm": 3.7417151927948, "learning_rate": 5e-05, "loss": 0.0369, "step": 872 }, { "epoch": 2.0094932432432433, "grad_norm": 0.24807900190353394, "learning_rate": 5e-05, "loss": 0.0036, "step": 873 }, { "epoch": 2.009527027027027, "grad_norm": 0.06580958515405655, "learning_rate": 5e-05, "loss": 0.0015, "step": 874 }, { "epoch": 2.009560810810811, "grad_norm": 2.792377471923828, "learning_rate": 5e-05, "loss": 0.0158, "step": 875 }, { "epoch": 2.0095945945945948, "grad_norm": 18.044504165649414, "learning_rate": 5e-05, "loss": 0.0521, "step": 876 }, { "epoch": 2.0096283783783786, "grad_norm": 0.05109013617038727, "learning_rate": 5e-05, "loss": 0.0009, "step": 877 }, { "epoch": 2.009662162162162, "grad_norm": 1.535478115081787, "learning_rate": 5e-05, "loss": 0.0092, "step": 878 }, { "epoch": 2.009695945945946, "grad_norm": 20.93328094482422, "learning_rate": 5e-05, "loss": 0.2536, "step": 879 }, { "epoch": 2.0097297297297296, "grad_norm": 4.30779504776001, "learning_rate": 5e-05, "loss": 0.0127, "step": 880 }, { "epoch": 2.0097635135135135, "grad_norm": 2.7973365783691406, "learning_rate": 5e-05, "loss": 0.0098, "step": 881 }, { "epoch": 2.0097972972972973, "grad_norm": 17.700448989868164, "learning_rate": 5e-05, "loss": 0.5757, "step": 882 }, { "epoch": 2.009831081081081, "grad_norm": 12.191022872924805, "learning_rate": 5e-05, "loss": 0.0878, "step": 883 }, { "epoch": 2.009864864864865, "grad_norm": 27.55369758605957, "learning_rate": 5e-05, "loss": 1.3547, "step": 884 }, { "epoch": 2.0098986486486488, "grad_norm": 17.82823371887207, "learning_rate": 5e-05, "loss": 1.2201, "step": 885 }, { "epoch": 2.0099324324324326, "grad_norm": 0.5615248084068298, "learning_rate": 5e-05, "loss": 0.0033, "step": 886 }, { "epoch": 2.0099662162162164, "grad_norm": 13.004617691040039, "learning_rate": 5e-05, "loss": 0.1897, "step": 887 }, { "epoch": 2.01, "grad_norm": 2.9903197288513184, "learning_rate": 5e-05, "loss": 0.0266, "step": 888 }, { "epoch": 2.01, "eval_accuracy": 0.8077544426494345, "eval_loss": 0.6627947092056274, "eval_runtime": 31.8946, "eval_samples_per_second": 19.408, "eval_steps_per_second": 2.446, "step": 888 }, { "epoch": 3.000033783783784, "grad_norm": 0.1234632283449173, "learning_rate": 5e-05, "loss": 0.0028, "step": 889 }, { "epoch": 3.0000675675675677, "grad_norm": 0.03161322697997093, "learning_rate": 5e-05, "loss": 0.001, "step": 890 }, { "epoch": 3.0001013513513515, "grad_norm": 0.16628232598304749, "learning_rate": 5e-05, "loss": 0.0029, "step": 891 }, { "epoch": 3.0001351351351353, "grad_norm": 10.507821083068848, "learning_rate": 5e-05, "loss": 0.0794, "step": 892 }, { "epoch": 3.000168918918919, "grad_norm": 0.4537968933582306, "learning_rate": 5e-05, "loss": 0.0079, "step": 893 }, { "epoch": 3.0002027027027025, "grad_norm": 25.010643005371094, "learning_rate": 5e-05, "loss": 0.1389, "step": 894 }, { "epoch": 3.0002364864864863, "grad_norm": 22.1783447265625, "learning_rate": 5e-05, "loss": 1.0634, "step": 895 }, { "epoch": 3.00027027027027, "grad_norm": 0.03995116055011749, "learning_rate": 5e-05, "loss": 0.0008, "step": 896 }, { "epoch": 3.000304054054054, "grad_norm": 0.317483127117157, "learning_rate": 5e-05, "loss": 0.0035, "step": 897 }, { "epoch": 3.000337837837838, "grad_norm": 14.338205337524414, "learning_rate": 5e-05, "loss": 0.4601, "step": 898 }, { "epoch": 3.0003716216216216, "grad_norm": 0.28166669607162476, "learning_rate": 5e-05, "loss": 0.0054, "step": 899 }, { "epoch": 3.0004054054054055, "grad_norm": 1.1412079334259033, "learning_rate": 5e-05, "loss": 0.0113, "step": 900 }, { "epoch": 3.0004391891891893, "grad_norm": 19.21966552734375, "learning_rate": 5e-05, "loss": 0.7354, "step": 901 }, { "epoch": 3.000472972972973, "grad_norm": 5.898601531982422, "learning_rate": 5e-05, "loss": 0.0235, "step": 902 }, { "epoch": 3.000506756756757, "grad_norm": 17.754764556884766, "learning_rate": 5e-05, "loss": 0.3656, "step": 903 }, { "epoch": 3.0005405405405408, "grad_norm": 20.077394485473633, "learning_rate": 5e-05, "loss": 0.1103, "step": 904 }, { "epoch": 3.000574324324324, "grad_norm": 0.05068984255194664, "learning_rate": 5e-05, "loss": 0.0013, "step": 905 }, { "epoch": 3.000608108108108, "grad_norm": 2.7937419414520264, "learning_rate": 5e-05, "loss": 0.0624, "step": 906 }, { "epoch": 3.000641891891892, "grad_norm": 3.3674354553222656, "learning_rate": 5e-05, "loss": 0.0249, "step": 907 }, { "epoch": 3.0006756756756756, "grad_norm": 2.988079309463501, "learning_rate": 5e-05, "loss": 0.0106, "step": 908 }, { "epoch": 3.0007094594594594, "grad_norm": 1.5372494459152222, "learning_rate": 5e-05, "loss": 0.0164, "step": 909 }, { "epoch": 3.0007432432432433, "grad_norm": 28.20690155029297, "learning_rate": 5e-05, "loss": 0.2661, "step": 910 }, { "epoch": 3.000777027027027, "grad_norm": 0.026171863079071045, "learning_rate": 5e-05, "loss": 0.0008, "step": 911 }, { "epoch": 3.000810810810811, "grad_norm": 0.35423117876052856, "learning_rate": 5e-05, "loss": 0.0037, "step": 912 }, { "epoch": 3.0008445945945947, "grad_norm": 14.664070129394531, "learning_rate": 5e-05, "loss": 0.1257, "step": 913 }, { "epoch": 3.0008783783783786, "grad_norm": 0.02055477723479271, "learning_rate": 5e-05, "loss": 0.0006, "step": 914 }, { "epoch": 3.000912162162162, "grad_norm": 0.11834486573934555, "learning_rate": 5e-05, "loss": 0.0027, "step": 915 }, { "epoch": 3.0009459459459458, "grad_norm": 5.72115421295166, "learning_rate": 5e-05, "loss": 0.0248, "step": 916 }, { "epoch": 3.0009797297297296, "grad_norm": 35.25407791137695, "learning_rate": 5e-05, "loss": 0.4712, "step": 917 }, { "epoch": 3.0010135135135134, "grad_norm": 33.28700637817383, "learning_rate": 5e-05, "loss": 1.3971, "step": 918 }, { "epoch": 3.0010472972972972, "grad_norm": 0.7601388096809387, "learning_rate": 5e-05, "loss": 0.0059, "step": 919 }, { "epoch": 3.001081081081081, "grad_norm": 2.499063014984131, "learning_rate": 5e-05, "loss": 0.0148, "step": 920 }, { "epoch": 3.001114864864865, "grad_norm": 16.483381271362305, "learning_rate": 5e-05, "loss": 0.4758, "step": 921 }, { "epoch": 3.0011486486486487, "grad_norm": 1.715020775794983, "learning_rate": 5e-05, "loss": 0.0069, "step": 922 }, { "epoch": 3.0011824324324325, "grad_norm": 0.5278586745262146, "learning_rate": 5e-05, "loss": 0.003, "step": 923 }, { "epoch": 3.0012162162162164, "grad_norm": 16.699615478515625, "learning_rate": 5e-05, "loss": 0.7401, "step": 924 }, { "epoch": 3.00125, "grad_norm": 0.3599952161312103, "learning_rate": 5e-05, "loss": 0.0051, "step": 925 }, { "epoch": 3.0012837837837836, "grad_norm": 16.810134887695312, "learning_rate": 5e-05, "loss": 0.5887, "step": 926 }, { "epoch": 3.0013175675675674, "grad_norm": 18.13884925842285, "learning_rate": 5e-05, "loss": 0.5131, "step": 927 }, { "epoch": 3.0013513513513512, "grad_norm": 1.9587231874465942, "learning_rate": 5e-05, "loss": 0.0104, "step": 928 }, { "epoch": 3.001385135135135, "grad_norm": 1.5165091753005981, "learning_rate": 5e-05, "loss": 0.0327, "step": 929 }, { "epoch": 3.001418918918919, "grad_norm": 29.306095123291016, "learning_rate": 5e-05, "loss": 0.5493, "step": 930 }, { "epoch": 3.0014527027027027, "grad_norm": 7.2319111824035645, "learning_rate": 5e-05, "loss": 0.0473, "step": 931 }, { "epoch": 3.0014864864864865, "grad_norm": 0.10572871565818787, "learning_rate": 5e-05, "loss": 0.0014, "step": 932 }, { "epoch": 3.0015202702702704, "grad_norm": 24.425268173217773, "learning_rate": 5e-05, "loss": 0.8536, "step": 933 }, { "epoch": 3.001554054054054, "grad_norm": 13.49934196472168, "learning_rate": 5e-05, "loss": 0.0615, "step": 934 }, { "epoch": 3.001587837837838, "grad_norm": 14.703864097595215, "learning_rate": 5e-05, "loss": 0.0966, "step": 935 }, { "epoch": 3.001621621621622, "grad_norm": 9.369180679321289, "learning_rate": 5e-05, "loss": 0.5221, "step": 936 }, { "epoch": 3.001655405405405, "grad_norm": 0.24460987746715546, "learning_rate": 5e-05, "loss": 0.0023, "step": 937 }, { "epoch": 3.001689189189189, "grad_norm": 0.12574730813503265, "learning_rate": 5e-05, "loss": 0.0028, "step": 938 }, { "epoch": 3.001722972972973, "grad_norm": 25.133018493652344, "learning_rate": 5e-05, "loss": 0.6879, "step": 939 }, { "epoch": 3.0017567567567567, "grad_norm": 0.00989561341702938, "learning_rate": 5e-05, "loss": 0.0003, "step": 940 }, { "epoch": 3.0017905405405405, "grad_norm": 0.058104872703552246, "learning_rate": 5e-05, "loss": 0.0013, "step": 941 }, { "epoch": 3.0018243243243243, "grad_norm": 0.1243259459733963, "learning_rate": 5e-05, "loss": 0.0018, "step": 942 }, { "epoch": 3.001858108108108, "grad_norm": 13.211277961730957, "learning_rate": 5e-05, "loss": 0.813, "step": 943 }, { "epoch": 3.001891891891892, "grad_norm": 0.24815790355205536, "learning_rate": 5e-05, "loss": 0.0054, "step": 944 }, { "epoch": 3.001925675675676, "grad_norm": 22.18710708618164, "learning_rate": 5e-05, "loss": 0.3754, "step": 945 }, { "epoch": 3.0019594594594596, "grad_norm": 9.158702850341797, "learning_rate": 5e-05, "loss": 0.1443, "step": 946 }, { "epoch": 3.0019932432432435, "grad_norm": 21.923181533813477, "learning_rate": 5e-05, "loss": 0.419, "step": 947 }, { "epoch": 3.002027027027027, "grad_norm": 22.715024948120117, "learning_rate": 5e-05, "loss": 0.5952, "step": 948 }, { "epoch": 3.0020608108108107, "grad_norm": 15.935602188110352, "learning_rate": 5e-05, "loss": 1.0102, "step": 949 }, { "epoch": 3.0020945945945945, "grad_norm": 0.05303044617176056, "learning_rate": 5e-05, "loss": 0.0013, "step": 950 }, { "epoch": 3.0021283783783783, "grad_norm": 25.37066078186035, "learning_rate": 5e-05, "loss": 0.8658, "step": 951 }, { "epoch": 3.002162162162162, "grad_norm": 1.5248066186904907, "learning_rate": 5e-05, "loss": 0.0323, "step": 952 }, { "epoch": 3.002195945945946, "grad_norm": 6.313404560089111, "learning_rate": 5e-05, "loss": 0.4316, "step": 953 }, { "epoch": 3.00222972972973, "grad_norm": 22.34250259399414, "learning_rate": 5e-05, "loss": 0.2336, "step": 954 }, { "epoch": 3.0022635135135136, "grad_norm": 4.8152384757995605, "learning_rate": 5e-05, "loss": 0.3058, "step": 955 }, { "epoch": 3.0022972972972974, "grad_norm": 3.211003065109253, "learning_rate": 5e-05, "loss": 0.0272, "step": 956 }, { "epoch": 3.0023310810810813, "grad_norm": 0.46654751896858215, "learning_rate": 5e-05, "loss": 0.0059, "step": 957 }, { "epoch": 3.0023648648648646, "grad_norm": 0.008958316408097744, "learning_rate": 5e-05, "loss": 0.0004, "step": 958 }, { "epoch": 3.0023986486486485, "grad_norm": 0.21460947394371033, "learning_rate": 5e-05, "loss": 0.0046, "step": 959 }, { "epoch": 3.0024324324324323, "grad_norm": 8.90234088897705, "learning_rate": 5e-05, "loss": 0.0867, "step": 960 }, { "epoch": 3.002466216216216, "grad_norm": 5.568995952606201, "learning_rate": 5e-05, "loss": 0.0801, "step": 961 }, { "epoch": 3.0025, "grad_norm": 0.37455514073371887, "learning_rate": 5e-05, "loss": 0.0075, "step": 962 }, { "epoch": 3.0025337837837838, "grad_norm": 0.09371397644281387, "learning_rate": 5e-05, "loss": 0.0024, "step": 963 }, { "epoch": 3.0025675675675676, "grad_norm": 5.799345970153809, "learning_rate": 5e-05, "loss": 0.0299, "step": 964 }, { "epoch": 3.0026013513513514, "grad_norm": 4.0173821449279785, "learning_rate": 5e-05, "loss": 0.0226, "step": 965 }, { "epoch": 3.0026351351351352, "grad_norm": 42.49039077758789, "learning_rate": 5e-05, "loss": 0.2262, "step": 966 }, { "epoch": 3.002668918918919, "grad_norm": 10.618077278137207, "learning_rate": 5e-05, "loss": 0.3897, "step": 967 }, { "epoch": 3.002702702702703, "grad_norm": 14.564884185791016, "learning_rate": 5e-05, "loss": 0.2312, "step": 968 }, { "epoch": 3.0027364864864863, "grad_norm": 7.3299713134765625, "learning_rate": 5e-05, "loss": 0.399, "step": 969 }, { "epoch": 3.00277027027027, "grad_norm": 0.1717269867658615, "learning_rate": 5e-05, "loss": 0.0032, "step": 970 }, { "epoch": 3.002804054054054, "grad_norm": 2.167264223098755, "learning_rate": 5e-05, "loss": 0.0231, "step": 971 }, { "epoch": 3.0028378378378378, "grad_norm": 0.8798015117645264, "learning_rate": 5e-05, "loss": 0.0159, "step": 972 }, { "epoch": 3.0028716216216216, "grad_norm": 6.4924187660217285, "learning_rate": 5e-05, "loss": 0.0746, "step": 973 }, { "epoch": 3.0029054054054054, "grad_norm": 0.056636545807123184, "learning_rate": 5e-05, "loss": 0.0013, "step": 974 }, { "epoch": 3.0029391891891892, "grad_norm": 32.594303131103516, "learning_rate": 5e-05, "loss": 1.069, "step": 975 }, { "epoch": 3.002972972972973, "grad_norm": 1.4219871759414673, "learning_rate": 5e-05, "loss": 0.0132, "step": 976 }, { "epoch": 3.003006756756757, "grad_norm": 17.899120330810547, "learning_rate": 5e-05, "loss": 0.4405, "step": 977 }, { "epoch": 3.0030405405405407, "grad_norm": 19.642301559448242, "learning_rate": 5e-05, "loss": 0.687, "step": 978 }, { "epoch": 3.0030743243243245, "grad_norm": 0.06267721205949783, "learning_rate": 5e-05, "loss": 0.0013, "step": 979 }, { "epoch": 3.003108108108108, "grad_norm": 3.313708782196045, "learning_rate": 5e-05, "loss": 0.0429, "step": 980 }, { "epoch": 3.0031418918918917, "grad_norm": 1.4256505966186523, "learning_rate": 5e-05, "loss": 0.0071, "step": 981 }, { "epoch": 3.0031756756756756, "grad_norm": 21.852420806884766, "learning_rate": 5e-05, "loss": 0.3874, "step": 982 }, { "epoch": 3.0032094594594594, "grad_norm": 15.734886169433594, "learning_rate": 5e-05, "loss": 0.182, "step": 983 }, { "epoch": 3.003243243243243, "grad_norm": 18.812702178955078, "learning_rate": 5e-05, "loss": 0.1821, "step": 984 }, { "epoch": 3.003277027027027, "grad_norm": 2.211137056350708, "learning_rate": 5e-05, "loss": 0.0536, "step": 985 }, { "epoch": 3.003310810810811, "grad_norm": 22.405399322509766, "learning_rate": 5e-05, "loss": 0.3847, "step": 986 }, { "epoch": 3.0033445945945947, "grad_norm": 6.187371730804443, "learning_rate": 5e-05, "loss": 0.3555, "step": 987 }, { "epoch": 3.0033783783783785, "grad_norm": 3.0154247283935547, "learning_rate": 5e-05, "loss": 0.0306, "step": 988 }, { "epoch": 3.0034121621621623, "grad_norm": 7.680883407592773, "learning_rate": 5e-05, "loss": 0.7567, "step": 989 }, { "epoch": 3.003445945945946, "grad_norm": 1.3917415142059326, "learning_rate": 5e-05, "loss": 0.0092, "step": 990 }, { "epoch": 3.0034797297297295, "grad_norm": 14.677221298217773, "learning_rate": 5e-05, "loss": 0.21, "step": 991 }, { "epoch": 3.0035135135135134, "grad_norm": 2.8095569610595703, "learning_rate": 5e-05, "loss": 0.0371, "step": 992 }, { "epoch": 3.003547297297297, "grad_norm": 0.167382150888443, "learning_rate": 5e-05, "loss": 0.0038, "step": 993 }, { "epoch": 3.003581081081081, "grad_norm": 20.52214813232422, "learning_rate": 5e-05, "loss": 0.4671, "step": 994 }, { "epoch": 3.003614864864865, "grad_norm": 3.7305915355682373, "learning_rate": 5e-05, "loss": 0.0329, "step": 995 }, { "epoch": 3.0036486486486487, "grad_norm": 16.056377410888672, "learning_rate": 5e-05, "loss": 0.255, "step": 996 }, { "epoch": 3.0036824324324325, "grad_norm": 13.268664360046387, "learning_rate": 5e-05, "loss": 0.5139, "step": 997 }, { "epoch": 3.0037162162162163, "grad_norm": 18.920175552368164, "learning_rate": 5e-05, "loss": 0.1529, "step": 998 }, { "epoch": 3.00375, "grad_norm": 6.868951320648193, "learning_rate": 5e-05, "loss": 0.6905, "step": 999 }, { "epoch": 3.003783783783784, "grad_norm": 11.1655912399292, "learning_rate": 5e-05, "loss": 0.4636, "step": 1000 }, { "epoch": 3.0038175675675673, "grad_norm": 0.47663432359695435, "learning_rate": 5e-05, "loss": 0.0106, "step": 1001 }, { "epoch": 3.003851351351351, "grad_norm": 0.12810879945755005, "learning_rate": 5e-05, "loss": 0.0039, "step": 1002 }, { "epoch": 3.003885135135135, "grad_norm": 4.668596267700195, "learning_rate": 5e-05, "loss": 0.2212, "step": 1003 }, { "epoch": 3.003918918918919, "grad_norm": 23.623817443847656, "learning_rate": 5e-05, "loss": 0.7092, "step": 1004 }, { "epoch": 3.0039527027027026, "grad_norm": 4.564948558807373, "learning_rate": 5e-05, "loss": 0.0493, "step": 1005 }, { "epoch": 3.0039864864864865, "grad_norm": 1.6943174600601196, "learning_rate": 5e-05, "loss": 0.0315, "step": 1006 }, { "epoch": 3.0040202702702703, "grad_norm": 4.706667900085449, "learning_rate": 5e-05, "loss": 0.1766, "step": 1007 }, { "epoch": 3.004054054054054, "grad_norm": 4.504525184631348, "learning_rate": 5e-05, "loss": 0.3093, "step": 1008 }, { "epoch": 3.004087837837838, "grad_norm": 5.865261554718018, "learning_rate": 5e-05, "loss": 0.4991, "step": 1009 }, { "epoch": 3.0041216216216218, "grad_norm": 11.185822486877441, "learning_rate": 5e-05, "loss": 0.1189, "step": 1010 }, { "epoch": 3.0041554054054056, "grad_norm": 0.3543480336666107, "learning_rate": 5e-05, "loss": 0.0098, "step": 1011 }, { "epoch": 3.004189189189189, "grad_norm": 5.0621771812438965, "learning_rate": 5e-05, "loss": 0.1943, "step": 1012 }, { "epoch": 3.004222972972973, "grad_norm": 5.3835978507995605, "learning_rate": 5e-05, "loss": 0.0977, "step": 1013 }, { "epoch": 3.0042567567567566, "grad_norm": 1.0338064432144165, "learning_rate": 5e-05, "loss": 0.0129, "step": 1014 }, { "epoch": 3.0042905405405405, "grad_norm": 17.29814338684082, "learning_rate": 5e-05, "loss": 0.7925, "step": 1015 }, { "epoch": 3.0043243243243243, "grad_norm": 0.7845548391342163, "learning_rate": 5e-05, "loss": 0.0154, "step": 1016 }, { "epoch": 3.004358108108108, "grad_norm": 1.0175706148147583, "learning_rate": 5e-05, "loss": 0.0346, "step": 1017 }, { "epoch": 3.004391891891892, "grad_norm": 2.164522409439087, "learning_rate": 5e-05, "loss": 0.0297, "step": 1018 }, { "epoch": 3.0044256756756758, "grad_norm": 1.3690813779830933, "learning_rate": 5e-05, "loss": 0.016, "step": 1019 }, { "epoch": 3.0044594594594596, "grad_norm": 1.9200350046157837, "learning_rate": 5e-05, "loss": 0.0191, "step": 1020 }, { "epoch": 3.0044932432432434, "grad_norm": 5.304471492767334, "learning_rate": 5e-05, "loss": 0.0644, "step": 1021 }, { "epoch": 3.0045270270270272, "grad_norm": 5.6564040184021, "learning_rate": 5e-05, "loss": 0.0312, "step": 1022 }, { "epoch": 3.0045608108108106, "grad_norm": 14.615310668945312, "learning_rate": 5e-05, "loss": 0.3606, "step": 1023 }, { "epoch": 3.0045945945945944, "grad_norm": 6.613729476928711, "learning_rate": 5e-05, "loss": 0.053, "step": 1024 }, { "epoch": 3.0046283783783783, "grad_norm": 0.8022513389587402, "learning_rate": 5e-05, "loss": 0.0061, "step": 1025 }, { "epoch": 3.004662162162162, "grad_norm": 5.5113325119018555, "learning_rate": 5e-05, "loss": 0.0522, "step": 1026 }, { "epoch": 3.004695945945946, "grad_norm": 0.06207512319087982, "learning_rate": 5e-05, "loss": 0.0019, "step": 1027 }, { "epoch": 3.0047297297297297, "grad_norm": 0.204232320189476, "learning_rate": 5e-05, "loss": 0.0041, "step": 1028 }, { "epoch": 3.0047635135135136, "grad_norm": 2.926769971847534, "learning_rate": 5e-05, "loss": 0.0158, "step": 1029 }, { "epoch": 3.0047972972972974, "grad_norm": 0.6995229721069336, "learning_rate": 5e-05, "loss": 0.0083, "step": 1030 }, { "epoch": 3.004831081081081, "grad_norm": 21.75241470336914, "learning_rate": 5e-05, "loss": 0.3647, "step": 1031 }, { "epoch": 3.004864864864865, "grad_norm": 5.888047695159912, "learning_rate": 5e-05, "loss": 0.0393, "step": 1032 }, { "epoch": 3.004898648648649, "grad_norm": 2.098966360092163, "learning_rate": 5e-05, "loss": 0.0157, "step": 1033 }, { "epoch": 3.0049324324324322, "grad_norm": 19.90398406982422, "learning_rate": 5e-05, "loss": 0.7381, "step": 1034 }, { "epoch": 3.004966216216216, "grad_norm": 0.27858608961105347, "learning_rate": 5e-05, "loss": 0.004, "step": 1035 }, { "epoch": 3.005, "grad_norm": 24.43282127380371, "learning_rate": 5e-05, "loss": 0.4758, "step": 1036 }, { "epoch": 3.0050337837837837, "grad_norm": 26.12330436706543, "learning_rate": 5e-05, "loss": 0.1543, "step": 1037 }, { "epoch": 3.0050675675675675, "grad_norm": 28.456382751464844, "learning_rate": 5e-05, "loss": 0.1888, "step": 1038 }, { "epoch": 3.0051013513513514, "grad_norm": 32.14822006225586, "learning_rate": 5e-05, "loss": 1.6445, "step": 1039 }, { "epoch": 3.005135135135135, "grad_norm": 14.174095153808594, "learning_rate": 5e-05, "loss": 0.5609, "step": 1040 }, { "epoch": 3.005168918918919, "grad_norm": 22.262975692749023, "learning_rate": 5e-05, "loss": 0.3395, "step": 1041 }, { "epoch": 3.005202702702703, "grad_norm": 0.7745525240898132, "learning_rate": 5e-05, "loss": 0.0159, "step": 1042 }, { "epoch": 3.0052364864864867, "grad_norm": 18.45747184753418, "learning_rate": 5e-05, "loss": 0.1278, "step": 1043 }, { "epoch": 3.00527027027027, "grad_norm": 22.828889846801758, "learning_rate": 5e-05, "loss": 0.2297, "step": 1044 }, { "epoch": 3.005304054054054, "grad_norm": 0.7299669981002808, "learning_rate": 5e-05, "loss": 0.0075, "step": 1045 }, { "epoch": 3.0053378378378377, "grad_norm": 34.990936279296875, "learning_rate": 5e-05, "loss": 1.0591, "step": 1046 }, { "epoch": 3.0053716216216215, "grad_norm": 0.3517588973045349, "learning_rate": 5e-05, "loss": 0.0044, "step": 1047 }, { "epoch": 3.0054054054054054, "grad_norm": 0.08313743770122528, "learning_rate": 5e-05, "loss": 0.0021, "step": 1048 }, { "epoch": 3.005439189189189, "grad_norm": 0.10727918148040771, "learning_rate": 5e-05, "loss": 0.0023, "step": 1049 }, { "epoch": 3.005472972972973, "grad_norm": 5.88434362411499, "learning_rate": 5e-05, "loss": 0.8521, "step": 1050 }, { "epoch": 3.005506756756757, "grad_norm": 12.346208572387695, "learning_rate": 5e-05, "loss": 0.5069, "step": 1051 }, { "epoch": 3.0055405405405407, "grad_norm": 24.085222244262695, "learning_rate": 5e-05, "loss": 0.4169, "step": 1052 }, { "epoch": 3.0055743243243245, "grad_norm": 15.278740882873535, "learning_rate": 5e-05, "loss": 0.1686, "step": 1053 }, { "epoch": 3.0056081081081083, "grad_norm": 13.585868835449219, "learning_rate": 5e-05, "loss": 0.0972, "step": 1054 }, { "epoch": 3.0056418918918917, "grad_norm": 19.706275939941406, "learning_rate": 5e-05, "loss": 0.3228, "step": 1055 }, { "epoch": 3.0056756756756755, "grad_norm": 0.6616368889808655, "learning_rate": 5e-05, "loss": 0.0078, "step": 1056 }, { "epoch": 3.0057094594594593, "grad_norm": 2.3522067070007324, "learning_rate": 5e-05, "loss": 0.0416, "step": 1057 }, { "epoch": 3.005743243243243, "grad_norm": 20.470327377319336, "learning_rate": 5e-05, "loss": 0.272, "step": 1058 }, { "epoch": 3.005777027027027, "grad_norm": 13.324955940246582, "learning_rate": 5e-05, "loss": 0.0881, "step": 1059 }, { "epoch": 3.005810810810811, "grad_norm": 20.57856559753418, "learning_rate": 5e-05, "loss": 0.2907, "step": 1060 }, { "epoch": 3.0058445945945946, "grad_norm": 10.81495475769043, "learning_rate": 5e-05, "loss": 0.8392, "step": 1061 }, { "epoch": 3.0058783783783785, "grad_norm": 18.08121681213379, "learning_rate": 5e-05, "loss": 0.3362, "step": 1062 }, { "epoch": 3.0059121621621623, "grad_norm": 0.2742729187011719, "learning_rate": 5e-05, "loss": 0.0071, "step": 1063 }, { "epoch": 3.005945945945946, "grad_norm": 0.30963799357414246, "learning_rate": 5e-05, "loss": 0.0036, "step": 1064 }, { "epoch": 3.00597972972973, "grad_norm": 1.373503565788269, "learning_rate": 5e-05, "loss": 0.0172, "step": 1065 }, { "epoch": 3.0060135135135133, "grad_norm": 2.739230155944824, "learning_rate": 5e-05, "loss": 0.0099, "step": 1066 }, { "epoch": 3.006047297297297, "grad_norm": 7.345090866088867, "learning_rate": 5e-05, "loss": 0.0654, "step": 1067 }, { "epoch": 3.006081081081081, "grad_norm": 0.2721858024597168, "learning_rate": 5e-05, "loss": 0.0054, "step": 1068 }, { "epoch": 3.006114864864865, "grad_norm": 3.6114919185638428, "learning_rate": 5e-05, "loss": 0.1865, "step": 1069 }, { "epoch": 3.0061486486486486, "grad_norm": 6.382689476013184, "learning_rate": 5e-05, "loss": 0.0508, "step": 1070 }, { "epoch": 3.0061824324324324, "grad_norm": 16.6743221282959, "learning_rate": 5e-05, "loss": 0.4574, "step": 1071 }, { "epoch": 3.0062162162162163, "grad_norm": 2.707900047302246, "learning_rate": 5e-05, "loss": 0.0133, "step": 1072 }, { "epoch": 3.00625, "grad_norm": 6.729382514953613, "learning_rate": 5e-05, "loss": 0.7332, "step": 1073 }, { "epoch": 3.006283783783784, "grad_norm": 3.6896331310272217, "learning_rate": 5e-05, "loss": 0.0282, "step": 1074 }, { "epoch": 3.0063175675675677, "grad_norm": 3.749047040939331, "learning_rate": 5e-05, "loss": 0.0581, "step": 1075 }, { "epoch": 3.0063513513513516, "grad_norm": 4.474020481109619, "learning_rate": 5e-05, "loss": 0.0361, "step": 1076 }, { "epoch": 3.006385135135135, "grad_norm": 17.462247848510742, "learning_rate": 5e-05, "loss": 0.2106, "step": 1077 }, { "epoch": 3.0064189189189188, "grad_norm": 0.023770583793520927, "learning_rate": 5e-05, "loss": 0.0009, "step": 1078 }, { "epoch": 3.0064527027027026, "grad_norm": 1.791603684425354, "learning_rate": 5e-05, "loss": 0.0191, "step": 1079 }, { "epoch": 3.0064864864864864, "grad_norm": 24.989822387695312, "learning_rate": 5e-05, "loss": 0.2288, "step": 1080 }, { "epoch": 3.0065202702702702, "grad_norm": 3.795117139816284, "learning_rate": 5e-05, "loss": 0.0242, "step": 1081 }, { "epoch": 3.006554054054054, "grad_norm": 11.183342933654785, "learning_rate": 5e-05, "loss": 0.197, "step": 1082 }, { "epoch": 3.006587837837838, "grad_norm": 0.9044035077095032, "learning_rate": 5e-05, "loss": 0.0574, "step": 1083 }, { "epoch": 3.0066216216216217, "grad_norm": 0.02325511910021305, "learning_rate": 5e-05, "loss": 0.0007, "step": 1084 }, { "epoch": 3.0066554054054055, "grad_norm": 13.648694038391113, "learning_rate": 5e-05, "loss": 0.6872, "step": 1085 }, { "epoch": 3.0066891891891894, "grad_norm": 5.211246967315674, "learning_rate": 5e-05, "loss": 0.0421, "step": 1086 }, { "epoch": 3.0067229729729728, "grad_norm": 0.87028968334198, "learning_rate": 5e-05, "loss": 0.0213, "step": 1087 }, { "epoch": 3.0067567567567566, "grad_norm": 1.892443299293518, "learning_rate": 5e-05, "loss": 0.0177, "step": 1088 }, { "epoch": 3.0067905405405404, "grad_norm": 3.5757408142089844, "learning_rate": 5e-05, "loss": 0.0321, "step": 1089 }, { "epoch": 3.0068243243243242, "grad_norm": 17.50002670288086, "learning_rate": 5e-05, "loss": 0.1997, "step": 1090 }, { "epoch": 3.006858108108108, "grad_norm": 18.04364585876465, "learning_rate": 5e-05, "loss": 0.2902, "step": 1091 }, { "epoch": 3.006891891891892, "grad_norm": 17.782840728759766, "learning_rate": 5e-05, "loss": 1.0544, "step": 1092 }, { "epoch": 3.0069256756756757, "grad_norm": 23.3546085357666, "learning_rate": 5e-05, "loss": 0.3925, "step": 1093 }, { "epoch": 3.0069594594594595, "grad_norm": 1.6948331594467163, "learning_rate": 5e-05, "loss": 0.0225, "step": 1094 }, { "epoch": 3.0069932432432434, "grad_norm": 0.09219735860824585, "learning_rate": 5e-05, "loss": 0.0025, "step": 1095 }, { "epoch": 3.007027027027027, "grad_norm": 0.34374740719795227, "learning_rate": 5e-05, "loss": 0.0065, "step": 1096 }, { "epoch": 3.007060810810811, "grad_norm": 16.414461135864258, "learning_rate": 5e-05, "loss": 0.5024, "step": 1097 }, { "epoch": 3.0070945945945944, "grad_norm": 26.44287109375, "learning_rate": 5e-05, "loss": 0.6993, "step": 1098 }, { "epoch": 3.007128378378378, "grad_norm": 0.19216401875019073, "learning_rate": 5e-05, "loss": 0.0042, "step": 1099 }, { "epoch": 3.007162162162162, "grad_norm": 22.51660919189453, "learning_rate": 5e-05, "loss": 0.7237, "step": 1100 }, { "epoch": 3.007195945945946, "grad_norm": 14.810504913330078, "learning_rate": 5e-05, "loss": 0.5467, "step": 1101 }, { "epoch": 3.0072297297297297, "grad_norm": 0.5033894181251526, "learning_rate": 5e-05, "loss": 0.0066, "step": 1102 }, { "epoch": 3.0072635135135135, "grad_norm": 0.4944498836994171, "learning_rate": 5e-05, "loss": 0.0097, "step": 1103 }, { "epoch": 3.0072972972972973, "grad_norm": 18.268848419189453, "learning_rate": 5e-05, "loss": 0.6173, "step": 1104 }, { "epoch": 3.007331081081081, "grad_norm": 2.798845052719116, "learning_rate": 5e-05, "loss": 0.0328, "step": 1105 }, { "epoch": 3.007364864864865, "grad_norm": 1.1047919988632202, "learning_rate": 5e-05, "loss": 0.0124, "step": 1106 }, { "epoch": 3.007398648648649, "grad_norm": 0.6788066625595093, "learning_rate": 5e-05, "loss": 0.0122, "step": 1107 }, { "epoch": 3.0074324324324326, "grad_norm": 0.09637828171253204, "learning_rate": 5e-05, "loss": 0.002, "step": 1108 }, { "epoch": 3.007466216216216, "grad_norm": 0.27290940284729004, "learning_rate": 5e-05, "loss": 0.0035, "step": 1109 }, { "epoch": 3.0075, "grad_norm": 24.403411865234375, "learning_rate": 5e-05, "loss": 0.3528, "step": 1110 }, { "epoch": 3.0075337837837837, "grad_norm": 16.00898551940918, "learning_rate": 5e-05, "loss": 0.1081, "step": 1111 }, { "epoch": 3.0075675675675675, "grad_norm": 24.5959415435791, "learning_rate": 5e-05, "loss": 0.4887, "step": 1112 }, { "epoch": 3.0076013513513513, "grad_norm": 35.9322395324707, "learning_rate": 5e-05, "loss": 1.053, "step": 1113 }, { "epoch": 3.007635135135135, "grad_norm": 4.310102939605713, "learning_rate": 5e-05, "loss": 0.037, "step": 1114 }, { "epoch": 3.007668918918919, "grad_norm": 16.185070037841797, "learning_rate": 5e-05, "loss": 0.4684, "step": 1115 }, { "epoch": 3.007702702702703, "grad_norm": 19.61172103881836, "learning_rate": 5e-05, "loss": 0.2237, "step": 1116 }, { "epoch": 3.0077364864864866, "grad_norm": 0.06594177335500717, "learning_rate": 5e-05, "loss": 0.0018, "step": 1117 }, { "epoch": 3.0077702702702704, "grad_norm": 14.439477920532227, "learning_rate": 5e-05, "loss": 1.3404, "step": 1118 }, { "epoch": 3.0078040540540543, "grad_norm": 10.976378440856934, "learning_rate": 5e-05, "loss": 0.1807, "step": 1119 }, { "epoch": 3.0078378378378376, "grad_norm": 7.725071907043457, "learning_rate": 5e-05, "loss": 0.3681, "step": 1120 }, { "epoch": 3.0078716216216215, "grad_norm": 14.076944351196289, "learning_rate": 5e-05, "loss": 0.2377, "step": 1121 }, { "epoch": 3.0079054054054053, "grad_norm": 7.920755863189697, "learning_rate": 5e-05, "loss": 0.6345, "step": 1122 }, { "epoch": 3.007939189189189, "grad_norm": 0.5835193991661072, "learning_rate": 5e-05, "loss": 0.0076, "step": 1123 }, { "epoch": 3.007972972972973, "grad_norm": 26.84272003173828, "learning_rate": 5e-05, "loss": 0.5509, "step": 1124 }, { "epoch": 3.0080067567567568, "grad_norm": 10.906031608581543, "learning_rate": 5e-05, "loss": 0.1916, "step": 1125 }, { "epoch": 3.0080405405405406, "grad_norm": 14.526939392089844, "learning_rate": 5e-05, "loss": 0.3762, "step": 1126 }, { "epoch": 3.0080743243243244, "grad_norm": 2.0668656826019287, "learning_rate": 5e-05, "loss": 0.0332, "step": 1127 }, { "epoch": 3.0081081081081082, "grad_norm": 15.620635032653809, "learning_rate": 5e-05, "loss": 0.3093, "step": 1128 }, { "epoch": 3.008141891891892, "grad_norm": 22.76863670349121, "learning_rate": 5e-05, "loss": 0.2589, "step": 1129 }, { "epoch": 3.008175675675676, "grad_norm": 0.6740882396697998, "learning_rate": 5e-05, "loss": 0.017, "step": 1130 }, { "epoch": 3.0082094594594593, "grad_norm": 1.038204550743103, "learning_rate": 5e-05, "loss": 0.0187, "step": 1131 }, { "epoch": 3.008243243243243, "grad_norm": 13.70112133026123, "learning_rate": 5e-05, "loss": 0.6692, "step": 1132 }, { "epoch": 3.008277027027027, "grad_norm": 20.529640197753906, "learning_rate": 5e-05, "loss": 0.7396, "step": 1133 }, { "epoch": 3.0083108108108108, "grad_norm": 2.282074451446533, "learning_rate": 5e-05, "loss": 0.0327, "step": 1134 }, { "epoch": 3.0083445945945946, "grad_norm": 6.904858589172363, "learning_rate": 5e-05, "loss": 0.2209, "step": 1135 }, { "epoch": 3.0083783783783784, "grad_norm": 1.131390929222107, "learning_rate": 5e-05, "loss": 0.015, "step": 1136 }, { "epoch": 3.0084121621621622, "grad_norm": 0.15460199117660522, "learning_rate": 5e-05, "loss": 0.0024, "step": 1137 }, { "epoch": 3.008445945945946, "grad_norm": 21.40874671936035, "learning_rate": 5e-05, "loss": 0.9292, "step": 1138 }, { "epoch": 3.00847972972973, "grad_norm": 11.970168113708496, "learning_rate": 5e-05, "loss": 0.1749, "step": 1139 }, { "epoch": 3.0085135135135137, "grad_norm": 0.23016200959682465, "learning_rate": 5e-05, "loss": 0.0062, "step": 1140 }, { "epoch": 3.008547297297297, "grad_norm": 7.7079997062683105, "learning_rate": 5e-05, "loss": 0.1674, "step": 1141 }, { "epoch": 3.008581081081081, "grad_norm": 4.883083343505859, "learning_rate": 5e-05, "loss": 0.1725, "step": 1142 }, { "epoch": 3.0086148648648647, "grad_norm": 0.7496493458747864, "learning_rate": 5e-05, "loss": 0.0123, "step": 1143 }, { "epoch": 3.0086486486486486, "grad_norm": 14.910581588745117, "learning_rate": 5e-05, "loss": 0.2034, "step": 1144 }, { "epoch": 3.0086824324324324, "grad_norm": 0.19367234408855438, "learning_rate": 5e-05, "loss": 0.0074, "step": 1145 }, { "epoch": 3.008716216216216, "grad_norm": 8.253870010375977, "learning_rate": 5e-05, "loss": 0.4727, "step": 1146 }, { "epoch": 3.00875, "grad_norm": 22.93012046813965, "learning_rate": 5e-05, "loss": 0.2465, "step": 1147 }, { "epoch": 3.008783783783784, "grad_norm": 9.781954765319824, "learning_rate": 5e-05, "loss": 0.3732, "step": 1148 }, { "epoch": 3.0088175675675677, "grad_norm": 0.77884840965271, "learning_rate": 5e-05, "loss": 0.0089, "step": 1149 }, { "epoch": 3.0088513513513515, "grad_norm": 12.51172161102295, "learning_rate": 5e-05, "loss": 0.7454, "step": 1150 }, { "epoch": 3.0088851351351353, "grad_norm": 0.05721567943692207, "learning_rate": 5e-05, "loss": 0.0018, "step": 1151 }, { "epoch": 3.0089189189189187, "grad_norm": 15.800390243530273, "learning_rate": 5e-05, "loss": 0.2947, "step": 1152 }, { "epoch": 3.0089527027027025, "grad_norm": 19.95978546142578, "learning_rate": 5e-05, "loss": 0.8125, "step": 1153 }, { "epoch": 3.0089864864864864, "grad_norm": 11.866628646850586, "learning_rate": 5e-05, "loss": 0.4375, "step": 1154 }, { "epoch": 3.00902027027027, "grad_norm": 9.12220287322998, "learning_rate": 5e-05, "loss": 0.1446, "step": 1155 }, { "epoch": 3.009054054054054, "grad_norm": 0.10365631431341171, "learning_rate": 5e-05, "loss": 0.0025, "step": 1156 }, { "epoch": 3.009087837837838, "grad_norm": 6.957522869110107, "learning_rate": 5e-05, "loss": 0.0336, "step": 1157 }, { "epoch": 3.0091216216216217, "grad_norm": 0.5888859629631042, "learning_rate": 5e-05, "loss": 0.0081, "step": 1158 }, { "epoch": 3.0091554054054055, "grad_norm": 0.07927615195512772, "learning_rate": 5e-05, "loss": 0.0017, "step": 1159 }, { "epoch": 3.0091891891891893, "grad_norm": 10.471328735351562, "learning_rate": 5e-05, "loss": 0.2014, "step": 1160 }, { "epoch": 3.009222972972973, "grad_norm": 11.569526672363281, "learning_rate": 5e-05, "loss": 0.1391, "step": 1161 }, { "epoch": 3.009256756756757, "grad_norm": 8.104941368103027, "learning_rate": 5e-05, "loss": 0.1503, "step": 1162 }, { "epoch": 3.0092905405405403, "grad_norm": 7.241353511810303, "learning_rate": 5e-05, "loss": 0.2719, "step": 1163 }, { "epoch": 3.009324324324324, "grad_norm": 0.25939464569091797, "learning_rate": 5e-05, "loss": 0.0083, "step": 1164 }, { "epoch": 3.009358108108108, "grad_norm": 15.60102367401123, "learning_rate": 5e-05, "loss": 0.1346, "step": 1165 }, { "epoch": 3.009391891891892, "grad_norm": 0.09784505516290665, "learning_rate": 5e-05, "loss": 0.0021, "step": 1166 }, { "epoch": 3.0094256756756756, "grad_norm": 10.224125862121582, "learning_rate": 5e-05, "loss": 0.1744, "step": 1167 }, { "epoch": 3.0094594594594595, "grad_norm": 12.367109298706055, "learning_rate": 5e-05, "loss": 0.3203, "step": 1168 }, { "epoch": 3.0094932432432433, "grad_norm": 24.182144165039062, "learning_rate": 5e-05, "loss": 0.3133, "step": 1169 }, { "epoch": 3.009527027027027, "grad_norm": 19.220293045043945, "learning_rate": 5e-05, "loss": 0.204, "step": 1170 }, { "epoch": 3.009560810810811, "grad_norm": 7.516851425170898, "learning_rate": 5e-05, "loss": 0.6566, "step": 1171 }, { "epoch": 3.0095945945945948, "grad_norm": 0.06045526638627052, "learning_rate": 5e-05, "loss": 0.0021, "step": 1172 }, { "epoch": 3.0096283783783786, "grad_norm": 11.399614334106445, "learning_rate": 5e-05, "loss": 0.1014, "step": 1173 }, { "epoch": 3.009662162162162, "grad_norm": 6.606675624847412, "learning_rate": 5e-05, "loss": 0.8174, "step": 1174 }, { "epoch": 3.009695945945946, "grad_norm": 18.588603973388672, "learning_rate": 5e-05, "loss": 0.5924, "step": 1175 }, { "epoch": 3.0097297297297296, "grad_norm": 6.222175598144531, "learning_rate": 5e-05, "loss": 0.3766, "step": 1176 }, { "epoch": 3.0097635135135135, "grad_norm": 16.432567596435547, "learning_rate": 5e-05, "loss": 0.2946, "step": 1177 }, { "epoch": 3.0097972972972973, "grad_norm": 12.525311470031738, "learning_rate": 5e-05, "loss": 0.2559, "step": 1178 }, { "epoch": 3.009831081081081, "grad_norm": 1.5805771350860596, "learning_rate": 5e-05, "loss": 0.0379, "step": 1179 }, { "epoch": 3.009864864864865, "grad_norm": 10.234554290771484, "learning_rate": 5e-05, "loss": 0.3575, "step": 1180 }, { "epoch": 3.0098986486486488, "grad_norm": 19.565271377563477, "learning_rate": 5e-05, "loss": 0.2428, "step": 1181 }, { "epoch": 3.0099324324324326, "grad_norm": 14.638829231262207, "learning_rate": 5e-05, "loss": 0.6235, "step": 1182 }, { "epoch": 3.0099662162162164, "grad_norm": 6.367255687713623, "learning_rate": 5e-05, "loss": 0.101, "step": 1183 }, { "epoch": 3.01, "grad_norm": 6.355800151824951, "learning_rate": 5e-05, "loss": 0.4168, "step": 1184 }, { "epoch": 3.01, "eval_accuracy": 0.8562197092084006, "eval_loss": 0.4009462893009186, "eval_runtime": 31.7502, "eval_samples_per_second": 19.496, "eval_steps_per_second": 2.457, "step": 1184 }, { "epoch": 4.000033783783784, "grad_norm": 0.5351633429527283, "learning_rate": 5e-05, "loss": 0.0173, "step": 1185 }, { "epoch": 4.000067567567568, "grad_norm": 0.6870751976966858, "learning_rate": 5e-05, "loss": 0.0076, "step": 1186 }, { "epoch": 4.0001013513513515, "grad_norm": 16.201950073242188, "learning_rate": 5e-05, "loss": 0.4162, "step": 1187 }, { "epoch": 4.000135135135135, "grad_norm": 3.056945323944092, "learning_rate": 5e-05, "loss": 0.0401, "step": 1188 }, { "epoch": 4.000168918918919, "grad_norm": 9.967169761657715, "learning_rate": 5e-05, "loss": 0.0885, "step": 1189 }, { "epoch": 4.000202702702703, "grad_norm": 10.0690336227417, "learning_rate": 5e-05, "loss": 0.3934, "step": 1190 }, { "epoch": 4.000236486486487, "grad_norm": 12.042024612426758, "learning_rate": 5e-05, "loss": 0.3127, "step": 1191 }, { "epoch": 4.000270270270271, "grad_norm": 3.0418426990509033, "learning_rate": 5e-05, "loss": 0.0513, "step": 1192 }, { "epoch": 4.000304054054054, "grad_norm": 7.742201328277588, "learning_rate": 5e-05, "loss": 0.0531, "step": 1193 }, { "epoch": 4.000337837837838, "grad_norm": 0.1972622126340866, "learning_rate": 5e-05, "loss": 0.0067, "step": 1194 }, { "epoch": 4.000371621621621, "grad_norm": 6.444258213043213, "learning_rate": 5e-05, "loss": 0.0363, "step": 1195 }, { "epoch": 4.000405405405405, "grad_norm": 2.1509571075439453, "learning_rate": 5e-05, "loss": 0.0632, "step": 1196 }, { "epoch": 4.000439189189189, "grad_norm": 4.842563629150391, "learning_rate": 5e-05, "loss": 0.0714, "step": 1197 }, { "epoch": 4.000472972972973, "grad_norm": 15.45928955078125, "learning_rate": 5e-05, "loss": 0.7591, "step": 1198 }, { "epoch": 4.0005067567567565, "grad_norm": 0.08861789852380753, "learning_rate": 5e-05, "loss": 0.0025, "step": 1199 }, { "epoch": 4.00054054054054, "grad_norm": 1.327793836593628, "learning_rate": 5e-05, "loss": 0.0337, "step": 1200 }, { "epoch": 4.000574324324324, "grad_norm": 10.13684368133545, "learning_rate": 5e-05, "loss": 0.1326, "step": 1201 }, { "epoch": 4.000608108108108, "grad_norm": 4.547879695892334, "learning_rate": 5e-05, "loss": 0.0442, "step": 1202 }, { "epoch": 4.000641891891892, "grad_norm": 0.42309311032295227, "learning_rate": 5e-05, "loss": 0.0108, "step": 1203 }, { "epoch": 4.000675675675676, "grad_norm": 2.744396686553955, "learning_rate": 5e-05, "loss": 0.0854, "step": 1204 }, { "epoch": 4.000709459459459, "grad_norm": 2.6955645084381104, "learning_rate": 5e-05, "loss": 0.0508, "step": 1205 }, { "epoch": 4.000743243243243, "grad_norm": 0.39690133929252625, "learning_rate": 5e-05, "loss": 0.0049, "step": 1206 }, { "epoch": 4.000777027027027, "grad_norm": 0.16466641426086426, "learning_rate": 5e-05, "loss": 0.0046, "step": 1207 }, { "epoch": 4.000810810810811, "grad_norm": 2.980419635772705, "learning_rate": 5e-05, "loss": 0.0214, "step": 1208 }, { "epoch": 4.000844594594595, "grad_norm": 6.6831183433532715, "learning_rate": 5e-05, "loss": 0.1847, "step": 1209 }, { "epoch": 4.000878378378379, "grad_norm": 5.273904323577881, "learning_rate": 5e-05, "loss": 0.0631, "step": 1210 }, { "epoch": 4.000912162162162, "grad_norm": 0.050481393933296204, "learning_rate": 5e-05, "loss": 0.0013, "step": 1211 }, { "epoch": 4.000945945945946, "grad_norm": 4.148603439331055, "learning_rate": 5e-05, "loss": 0.1199, "step": 1212 }, { "epoch": 4.00097972972973, "grad_norm": 5.638693809509277, "learning_rate": 5e-05, "loss": 0.0335, "step": 1213 }, { "epoch": 4.001013513513514, "grad_norm": 9.198224067687988, "learning_rate": 5e-05, "loss": 0.077, "step": 1214 }, { "epoch": 4.001047297297298, "grad_norm": 0.9798126220703125, "learning_rate": 5e-05, "loss": 0.0124, "step": 1215 }, { "epoch": 4.0010810810810815, "grad_norm": 28.557910919189453, "learning_rate": 5e-05, "loss": 0.3771, "step": 1216 }, { "epoch": 4.0011148648648645, "grad_norm": 24.406667709350586, "learning_rate": 5e-05, "loss": 0.9248, "step": 1217 }, { "epoch": 4.001148648648648, "grad_norm": 13.855785369873047, "learning_rate": 5e-05, "loss": 0.7421, "step": 1218 }, { "epoch": 4.001182432432432, "grad_norm": 14.961307525634766, "learning_rate": 5e-05, "loss": 1.0758, "step": 1219 }, { "epoch": 4.001216216216216, "grad_norm": 18.018611907958984, "learning_rate": 5e-05, "loss": 0.3223, "step": 1220 }, { "epoch": 4.00125, "grad_norm": 4.653634071350098, "learning_rate": 5e-05, "loss": 0.0293, "step": 1221 }, { "epoch": 4.001283783783784, "grad_norm": 15.581366539001465, "learning_rate": 5e-05, "loss": 0.0416, "step": 1222 }, { "epoch": 4.001317567567567, "grad_norm": 15.335142135620117, "learning_rate": 5e-05, "loss": 0.3366, "step": 1223 }, { "epoch": 4.001351351351351, "grad_norm": 9.871565818786621, "learning_rate": 5e-05, "loss": 0.8459, "step": 1224 }, { "epoch": 4.001385135135135, "grad_norm": 13.84634780883789, "learning_rate": 5e-05, "loss": 0.5835, "step": 1225 }, { "epoch": 4.001418918918919, "grad_norm": 6.246882915496826, "learning_rate": 5e-05, "loss": 0.5732, "step": 1226 }, { "epoch": 4.001452702702703, "grad_norm": 13.314071655273438, "learning_rate": 5e-05, "loss": 0.0965, "step": 1227 }, { "epoch": 4.0014864864864865, "grad_norm": 0.2969866693019867, "learning_rate": 5e-05, "loss": 0.0105, "step": 1228 }, { "epoch": 4.00152027027027, "grad_norm": 2.3769125938415527, "learning_rate": 5e-05, "loss": 0.0345, "step": 1229 }, { "epoch": 4.001554054054054, "grad_norm": 1.0960595607757568, "learning_rate": 5e-05, "loss": 0.0138, "step": 1230 }, { "epoch": 4.001587837837838, "grad_norm": 14.619505882263184, "learning_rate": 5e-05, "loss": 0.2291, "step": 1231 }, { "epoch": 4.001621621621622, "grad_norm": 2.204406261444092, "learning_rate": 5e-05, "loss": 0.0274, "step": 1232 }, { "epoch": 4.001655405405406, "grad_norm": 3.4415595531463623, "learning_rate": 5e-05, "loss": 0.025, "step": 1233 }, { "epoch": 4.0016891891891895, "grad_norm": 0.8196115493774414, "learning_rate": 5e-05, "loss": 0.0126, "step": 1234 }, { "epoch": 4.001722972972973, "grad_norm": 6.32401704788208, "learning_rate": 5e-05, "loss": 0.2147, "step": 1235 }, { "epoch": 4.001756756756757, "grad_norm": 0.514305591583252, "learning_rate": 5e-05, "loss": 0.0101, "step": 1236 }, { "epoch": 4.001790540540541, "grad_norm": 13.239570617675781, "learning_rate": 5e-05, "loss": 0.2659, "step": 1237 }, { "epoch": 4.001824324324324, "grad_norm": 5.368082046508789, "learning_rate": 5e-05, "loss": 0.0637, "step": 1238 }, { "epoch": 4.001858108108108, "grad_norm": 0.08090070635080338, "learning_rate": 5e-05, "loss": 0.0019, "step": 1239 }, { "epoch": 4.0018918918918915, "grad_norm": 5.149963855743408, "learning_rate": 5e-05, "loss": 0.1222, "step": 1240 }, { "epoch": 4.001925675675675, "grad_norm": 3.9890549182891846, "learning_rate": 5e-05, "loss": 0.031, "step": 1241 }, { "epoch": 4.001959459459459, "grad_norm": 2.2571773529052734, "learning_rate": 5e-05, "loss": 0.0229, "step": 1242 }, { "epoch": 4.001993243243243, "grad_norm": 1.0517438650131226, "learning_rate": 5e-05, "loss": 0.0138, "step": 1243 }, { "epoch": 4.002027027027027, "grad_norm": 6.752010822296143, "learning_rate": 5e-05, "loss": 0.2222, "step": 1244 }, { "epoch": 4.002060810810811, "grad_norm": 9.617339134216309, "learning_rate": 5e-05, "loss": 0.0897, "step": 1245 }, { "epoch": 4.0020945945945945, "grad_norm": 10.552943229675293, "learning_rate": 5e-05, "loss": 0.2499, "step": 1246 }, { "epoch": 4.002128378378378, "grad_norm": 9.044352531433105, "learning_rate": 5e-05, "loss": 0.7877, "step": 1247 }, { "epoch": 4.002162162162162, "grad_norm": 23.569318771362305, "learning_rate": 5e-05, "loss": 0.505, "step": 1248 }, { "epoch": 4.002195945945946, "grad_norm": 1.3923778533935547, "learning_rate": 5e-05, "loss": 0.0257, "step": 1249 }, { "epoch": 4.00222972972973, "grad_norm": 1.0024542808532715, "learning_rate": 5e-05, "loss": 0.0217, "step": 1250 }, { "epoch": 4.002263513513514, "grad_norm": 0.22522957623004913, "learning_rate": 5e-05, "loss": 0.0084, "step": 1251 }, { "epoch": 4.002297297297297, "grad_norm": 0.36244526505470276, "learning_rate": 5e-05, "loss": 0.0035, "step": 1252 }, { "epoch": 4.002331081081081, "grad_norm": 1.719434380531311, "learning_rate": 5e-05, "loss": 0.0158, "step": 1253 }, { "epoch": 4.002364864864865, "grad_norm": 11.626745223999023, "learning_rate": 5e-05, "loss": 0.1015, "step": 1254 }, { "epoch": 4.002398648648649, "grad_norm": 14.961271286010742, "learning_rate": 5e-05, "loss": 0.1312, "step": 1255 }, { "epoch": 4.002432432432433, "grad_norm": 31.001689910888672, "learning_rate": 5e-05, "loss": 0.2173, "step": 1256 }, { "epoch": 4.002466216216217, "grad_norm": 0.1725267916917801, "learning_rate": 5e-05, "loss": 0.0043, "step": 1257 }, { "epoch": 4.0025, "grad_norm": 13.750009536743164, "learning_rate": 5e-05, "loss": 0.561, "step": 1258 }, { "epoch": 4.002533783783784, "grad_norm": 2.4723706245422363, "learning_rate": 5e-05, "loss": 0.0096, "step": 1259 }, { "epoch": 4.002567567567567, "grad_norm": 0.12350428104400635, "learning_rate": 5e-05, "loss": 0.0036, "step": 1260 }, { "epoch": 4.002601351351351, "grad_norm": 2.0372908115386963, "learning_rate": 5e-05, "loss": 0.0152, "step": 1261 }, { "epoch": 4.002635135135135, "grad_norm": 2.475153923034668, "learning_rate": 5e-05, "loss": 0.0169, "step": 1262 }, { "epoch": 4.002668918918919, "grad_norm": 23.30116844177246, "learning_rate": 5e-05, "loss": 0.2219, "step": 1263 }, { "epoch": 4.0027027027027025, "grad_norm": 0.28644683957099915, "learning_rate": 5e-05, "loss": 0.006, "step": 1264 }, { "epoch": 4.002736486486486, "grad_norm": 13.647180557250977, "learning_rate": 5e-05, "loss": 1.2944, "step": 1265 }, { "epoch": 4.00277027027027, "grad_norm": 7.536153793334961, "learning_rate": 5e-05, "loss": 0.0567, "step": 1266 }, { "epoch": 4.002804054054054, "grad_norm": 6.682562351226807, "learning_rate": 5e-05, "loss": 0.6269, "step": 1267 }, { "epoch": 4.002837837837838, "grad_norm": 0.16732585430145264, "learning_rate": 5e-05, "loss": 0.0026, "step": 1268 }, { "epoch": 4.002871621621622, "grad_norm": 20.42455291748047, "learning_rate": 5e-05, "loss": 0.131, "step": 1269 }, { "epoch": 4.002905405405405, "grad_norm": 0.7551508545875549, "learning_rate": 5e-05, "loss": 0.0041, "step": 1270 }, { "epoch": 4.002939189189189, "grad_norm": 0.04466058313846588, "learning_rate": 5e-05, "loss": 0.0007, "step": 1271 }, { "epoch": 4.002972972972973, "grad_norm": 16.109630584716797, "learning_rate": 5e-05, "loss": 0.6027, "step": 1272 }, { "epoch": 4.003006756756757, "grad_norm": 8.224939346313477, "learning_rate": 5e-05, "loss": 0.0383, "step": 1273 }, { "epoch": 4.003040540540541, "grad_norm": 0.03669837862253189, "learning_rate": 5e-05, "loss": 0.0007, "step": 1274 }, { "epoch": 4.0030743243243245, "grad_norm": 7.3443522453308105, "learning_rate": 5e-05, "loss": 0.2362, "step": 1275 }, { "epoch": 4.003108108108108, "grad_norm": 0.03623752295970917, "learning_rate": 5e-05, "loss": 0.0012, "step": 1276 }, { "epoch": 4.003141891891892, "grad_norm": 0.1978302001953125, "learning_rate": 5e-05, "loss": 0.0019, "step": 1277 }, { "epoch": 4.003175675675676, "grad_norm": 18.52131462097168, "learning_rate": 5e-05, "loss": 1.136, "step": 1278 }, { "epoch": 4.00320945945946, "grad_norm": 0.5074062347412109, "learning_rate": 5e-05, "loss": 0.0082, "step": 1279 }, { "epoch": 4.003243243243244, "grad_norm": 2.9588708877563477, "learning_rate": 5e-05, "loss": 0.2993, "step": 1280 }, { "epoch": 4.003277027027027, "grad_norm": 12.185016632080078, "learning_rate": 5e-05, "loss": 0.4348, "step": 1281 }, { "epoch": 4.00331081081081, "grad_norm": 8.419539451599121, "learning_rate": 5e-05, "loss": 0.9942, "step": 1282 }, { "epoch": 4.003344594594594, "grad_norm": 0.06053459271788597, "learning_rate": 5e-05, "loss": 0.0011, "step": 1283 }, { "epoch": 4.003378378378378, "grad_norm": 28.301292419433594, "learning_rate": 5e-05, "loss": 0.4529, "step": 1284 }, { "epoch": 4.003412162162162, "grad_norm": 3.5895965099334717, "learning_rate": 5e-05, "loss": 0.0776, "step": 1285 }, { "epoch": 4.003445945945946, "grad_norm": 10.333102226257324, "learning_rate": 5e-05, "loss": 0.1333, "step": 1286 }, { "epoch": 4.0034797297297295, "grad_norm": 0.4788123071193695, "learning_rate": 5e-05, "loss": 0.0061, "step": 1287 }, { "epoch": 4.003513513513513, "grad_norm": 2.1415176391601562, "learning_rate": 5e-05, "loss": 0.0944, "step": 1288 }, { "epoch": 4.003547297297297, "grad_norm": 0.04277995601296425, "learning_rate": 5e-05, "loss": 0.0014, "step": 1289 }, { "epoch": 4.003581081081081, "grad_norm": 5.611273288726807, "learning_rate": 5e-05, "loss": 0.0557, "step": 1290 }, { "epoch": 4.003614864864865, "grad_norm": 0.5671752095222473, "learning_rate": 5e-05, "loss": 0.0072, "step": 1291 }, { "epoch": 4.003648648648649, "grad_norm": 8.020956039428711, "learning_rate": 5e-05, "loss": 0.0583, "step": 1292 }, { "epoch": 4.0036824324324325, "grad_norm": 4.840924263000488, "learning_rate": 5e-05, "loss": 0.1447, "step": 1293 }, { "epoch": 4.003716216216216, "grad_norm": 26.636924743652344, "learning_rate": 5e-05, "loss": 0.7508, "step": 1294 }, { "epoch": 4.00375, "grad_norm": 5.855466842651367, "learning_rate": 5e-05, "loss": 0.3374, "step": 1295 }, { "epoch": 4.003783783783784, "grad_norm": 0.3892776072025299, "learning_rate": 5e-05, "loss": 0.006, "step": 1296 }, { "epoch": 4.003817567567568, "grad_norm": 0.7954713106155396, "learning_rate": 5e-05, "loss": 0.0094, "step": 1297 }, { "epoch": 4.003851351351352, "grad_norm": 1.6521955728530884, "learning_rate": 5e-05, "loss": 0.0188, "step": 1298 }, { "epoch": 4.003885135135135, "grad_norm": 0.9886875152587891, "learning_rate": 5e-05, "loss": 0.0347, "step": 1299 }, { "epoch": 4.003918918918919, "grad_norm": 0.1742178499698639, "learning_rate": 5e-05, "loss": 0.0052, "step": 1300 }, { "epoch": 4.003952702702703, "grad_norm": 0.5352350473403931, "learning_rate": 5e-05, "loss": 0.0172, "step": 1301 }, { "epoch": 4.003986486486487, "grad_norm": 9.838845252990723, "learning_rate": 5e-05, "loss": 0.7073, "step": 1302 }, { "epoch": 4.00402027027027, "grad_norm": 11.352029800415039, "learning_rate": 5e-05, "loss": 0.6391, "step": 1303 }, { "epoch": 4.004054054054054, "grad_norm": 2.0211758613586426, "learning_rate": 5e-05, "loss": 0.0182, "step": 1304 }, { "epoch": 4.0040878378378375, "grad_norm": 0.22896267473697662, "learning_rate": 5e-05, "loss": 0.0038, "step": 1305 }, { "epoch": 4.004121621621621, "grad_norm": 9.150251388549805, "learning_rate": 5e-05, "loss": 0.9989, "step": 1306 }, { "epoch": 4.004155405405405, "grad_norm": 3.5962119102478027, "learning_rate": 5e-05, "loss": 0.0301, "step": 1307 }, { "epoch": 4.004189189189189, "grad_norm": 0.41086432337760925, "learning_rate": 5e-05, "loss": 0.0048, "step": 1308 }, { "epoch": 4.004222972972973, "grad_norm": 0.7335929870605469, "learning_rate": 5e-05, "loss": 0.0152, "step": 1309 }, { "epoch": 4.004256756756757, "grad_norm": 0.07011237740516663, "learning_rate": 5e-05, "loss": 0.0017, "step": 1310 }, { "epoch": 4.0042905405405405, "grad_norm": 21.260251998901367, "learning_rate": 5e-05, "loss": 0.2205, "step": 1311 }, { "epoch": 4.004324324324324, "grad_norm": 39.63029861450195, "learning_rate": 5e-05, "loss": 0.2532, "step": 1312 }, { "epoch": 4.004358108108108, "grad_norm": 8.226527214050293, "learning_rate": 5e-05, "loss": 0.0807, "step": 1313 }, { "epoch": 4.004391891891892, "grad_norm": 12.787647247314453, "learning_rate": 5e-05, "loss": 0.717, "step": 1314 }, { "epoch": 4.004425675675676, "grad_norm": 15.820696830749512, "learning_rate": 5e-05, "loss": 0.5899, "step": 1315 }, { "epoch": 4.00445945945946, "grad_norm": 1.0548150539398193, "learning_rate": 5e-05, "loss": 0.0088, "step": 1316 }, { "epoch": 4.004493243243243, "grad_norm": 0.20863951742649078, "learning_rate": 5e-05, "loss": 0.0052, "step": 1317 }, { "epoch": 4.004527027027027, "grad_norm": 0.31451061367988586, "learning_rate": 5e-05, "loss": 0.0033, "step": 1318 }, { "epoch": 4.004560810810811, "grad_norm": 0.03955497965216637, "learning_rate": 5e-05, "loss": 0.0015, "step": 1319 }, { "epoch": 4.004594594594595, "grad_norm": 19.66366958618164, "learning_rate": 5e-05, "loss": 0.7348, "step": 1320 }, { "epoch": 4.004628378378379, "grad_norm": 0.8225744962692261, "learning_rate": 5e-05, "loss": 0.0478, "step": 1321 }, { "epoch": 4.0046621621621625, "grad_norm": 2.4575514793395996, "learning_rate": 5e-05, "loss": 0.036, "step": 1322 }, { "epoch": 4.004695945945946, "grad_norm": 21.44823455810547, "learning_rate": 5e-05, "loss": 0.4341, "step": 1323 }, { "epoch": 4.004729729729729, "grad_norm": 15.119087219238281, "learning_rate": 5e-05, "loss": 0.7831, "step": 1324 }, { "epoch": 4.004763513513513, "grad_norm": 1.6910759210586548, "learning_rate": 5e-05, "loss": 0.0056, "step": 1325 }, { "epoch": 4.004797297297297, "grad_norm": 0.6117855906486511, "learning_rate": 5e-05, "loss": 0.0037, "step": 1326 }, { "epoch": 4.004831081081081, "grad_norm": 0.3528434634208679, "learning_rate": 5e-05, "loss": 0.0052, "step": 1327 }, { "epoch": 4.004864864864865, "grad_norm": 0.6182921528816223, "learning_rate": 5e-05, "loss": 0.0161, "step": 1328 }, { "epoch": 4.004898648648648, "grad_norm": 21.530879974365234, "learning_rate": 5e-05, "loss": 1.2006, "step": 1329 }, { "epoch": 4.004932432432432, "grad_norm": 30.080381393432617, "learning_rate": 5e-05, "loss": 0.5093, "step": 1330 }, { "epoch": 4.004966216216216, "grad_norm": 15.96956729888916, "learning_rate": 5e-05, "loss": 0.8482, "step": 1331 }, { "epoch": 4.005, "grad_norm": 0.3129767179489136, "learning_rate": 5e-05, "loss": 0.0086, "step": 1332 }, { "epoch": 4.005033783783784, "grad_norm": 15.469742774963379, "learning_rate": 5e-05, "loss": 0.0792, "step": 1333 }, { "epoch": 4.0050675675675675, "grad_norm": 0.15708912909030914, "learning_rate": 5e-05, "loss": 0.003, "step": 1334 }, { "epoch": 4.005101351351351, "grad_norm": 0.05987883359193802, "learning_rate": 5e-05, "loss": 0.0018, "step": 1335 }, { "epoch": 4.005135135135135, "grad_norm": 0.055743392556905746, "learning_rate": 5e-05, "loss": 0.0017, "step": 1336 }, { "epoch": 4.005168918918919, "grad_norm": 0.7784022688865662, "learning_rate": 5e-05, "loss": 0.0148, "step": 1337 }, { "epoch": 4.005202702702703, "grad_norm": 2.6809933185577393, "learning_rate": 5e-05, "loss": 0.2519, "step": 1338 }, { "epoch": 4.005236486486487, "grad_norm": 7.908838272094727, "learning_rate": 5e-05, "loss": 0.7368, "step": 1339 }, { "epoch": 4.0052702702702705, "grad_norm": 8.10580062866211, "learning_rate": 5e-05, "loss": 0.1626, "step": 1340 }, { "epoch": 4.005304054054054, "grad_norm": 5.291030406951904, "learning_rate": 5e-05, "loss": 0.0515, "step": 1341 }, { "epoch": 4.005337837837838, "grad_norm": 23.33680534362793, "learning_rate": 5e-05, "loss": 0.4494, "step": 1342 }, { "epoch": 4.005371621621622, "grad_norm": 23.490291595458984, "learning_rate": 5e-05, "loss": 0.3439, "step": 1343 }, { "epoch": 4.005405405405406, "grad_norm": 3.248295545578003, "learning_rate": 5e-05, "loss": 0.0196, "step": 1344 }, { "epoch": 4.00543918918919, "grad_norm": 0.11120904982089996, "learning_rate": 5e-05, "loss": 0.0024, "step": 1345 }, { "epoch": 4.005472972972973, "grad_norm": 0.022347426041960716, "learning_rate": 5e-05, "loss": 0.001, "step": 1346 }, { "epoch": 4.005506756756756, "grad_norm": 0.08791688829660416, "learning_rate": 5e-05, "loss": 0.0016, "step": 1347 }, { "epoch": 4.00554054054054, "grad_norm": 1.7400422096252441, "learning_rate": 5e-05, "loss": 0.0319, "step": 1348 }, { "epoch": 4.005574324324324, "grad_norm": 23.831880569458008, "learning_rate": 5e-05, "loss": 0.3963, "step": 1349 }, { "epoch": 4.005608108108108, "grad_norm": 0.054754819720983505, "learning_rate": 5e-05, "loss": 0.002, "step": 1350 }, { "epoch": 4.005641891891892, "grad_norm": 9.678888320922852, "learning_rate": 5e-05, "loss": 0.4464, "step": 1351 }, { "epoch": 4.0056756756756755, "grad_norm": 13.312692642211914, "learning_rate": 5e-05, "loss": 0.3224, "step": 1352 }, { "epoch": 4.005709459459459, "grad_norm": 11.023360252380371, "learning_rate": 5e-05, "loss": 0.7819, "step": 1353 }, { "epoch": 4.005743243243243, "grad_norm": 0.04428889974951744, "learning_rate": 5e-05, "loss": 0.0018, "step": 1354 }, { "epoch": 4.005777027027027, "grad_norm": 14.32711124420166, "learning_rate": 5e-05, "loss": 0.1064, "step": 1355 }, { "epoch": 4.005810810810811, "grad_norm": 19.643436431884766, "learning_rate": 5e-05, "loss": 0.7212, "step": 1356 }, { "epoch": 4.005844594594595, "grad_norm": 11.712814331054688, "learning_rate": 5e-05, "loss": 0.1245, "step": 1357 }, { "epoch": 4.0058783783783785, "grad_norm": 1.9492387771606445, "learning_rate": 5e-05, "loss": 0.0225, "step": 1358 }, { "epoch": 4.005912162162162, "grad_norm": 6.961433410644531, "learning_rate": 5e-05, "loss": 0.0865, "step": 1359 }, { "epoch": 4.005945945945946, "grad_norm": 0.35946744680404663, "learning_rate": 5e-05, "loss": 0.0108, "step": 1360 }, { "epoch": 4.00597972972973, "grad_norm": 1.3855509757995605, "learning_rate": 5e-05, "loss": 0.1134, "step": 1361 }, { "epoch": 4.006013513513514, "grad_norm": 2.407505750656128, "learning_rate": 5e-05, "loss": 0.0211, "step": 1362 }, { "epoch": 4.006047297297298, "grad_norm": 1.1618024110794067, "learning_rate": 5e-05, "loss": 0.009, "step": 1363 }, { "epoch": 4.006081081081081, "grad_norm": 5.925068378448486, "learning_rate": 5e-05, "loss": 0.7775, "step": 1364 }, { "epoch": 4.006114864864865, "grad_norm": 9.921957969665527, "learning_rate": 5e-05, "loss": 0.1032, "step": 1365 }, { "epoch": 4.006148648648649, "grad_norm": 11.295624732971191, "learning_rate": 5e-05, "loss": 0.6263, "step": 1366 }, { "epoch": 4.006182432432432, "grad_norm": 4.017728805541992, "learning_rate": 5e-05, "loss": 0.1076, "step": 1367 }, { "epoch": 4.006216216216216, "grad_norm": 2.5714640617370605, "learning_rate": 5e-05, "loss": 0.0188, "step": 1368 }, { "epoch": 4.00625, "grad_norm": 0.1047092005610466, "learning_rate": 5e-05, "loss": 0.0022, "step": 1369 }, { "epoch": 4.0062837837837835, "grad_norm": 34.165550231933594, "learning_rate": 5e-05, "loss": 0.4591, "step": 1370 }, { "epoch": 4.006317567567567, "grad_norm": 4.547745704650879, "learning_rate": 5e-05, "loss": 0.0236, "step": 1371 }, { "epoch": 4.006351351351351, "grad_norm": 7.9083404541015625, "learning_rate": 5e-05, "loss": 0.2802, "step": 1372 }, { "epoch": 4.006385135135135, "grad_norm": 27.55382537841797, "learning_rate": 5e-05, "loss": 0.3488, "step": 1373 }, { "epoch": 4.006418918918919, "grad_norm": 0.05263963341712952, "learning_rate": 5e-05, "loss": 0.0014, "step": 1374 }, { "epoch": 4.006452702702703, "grad_norm": 22.800678253173828, "learning_rate": 5e-05, "loss": 0.2323, "step": 1375 }, { "epoch": 4.006486486486486, "grad_norm": 4.663492202758789, "learning_rate": 5e-05, "loss": 0.8264, "step": 1376 }, { "epoch": 4.00652027027027, "grad_norm": 1.796640396118164, "learning_rate": 5e-05, "loss": 0.0202, "step": 1377 }, { "epoch": 4.006554054054054, "grad_norm": 1.0879889726638794, "learning_rate": 5e-05, "loss": 0.0388, "step": 1378 }, { "epoch": 4.006587837837838, "grad_norm": 6.773138523101807, "learning_rate": 5e-05, "loss": 0.0955, "step": 1379 }, { "epoch": 4.006621621621622, "grad_norm": 17.447729110717773, "learning_rate": 5e-05, "loss": 0.6859, "step": 1380 }, { "epoch": 4.0066554054054055, "grad_norm": 8.451388359069824, "learning_rate": 5e-05, "loss": 0.2388, "step": 1381 }, { "epoch": 4.006689189189189, "grad_norm": 20.301761627197266, "learning_rate": 5e-05, "loss": 1.0703, "step": 1382 }, { "epoch": 4.006722972972973, "grad_norm": 1.4285979270935059, "learning_rate": 5e-05, "loss": 0.0328, "step": 1383 }, { "epoch": 4.006756756756757, "grad_norm": 17.29395294189453, "learning_rate": 5e-05, "loss": 0.3246, "step": 1384 }, { "epoch": 4.006790540540541, "grad_norm": 8.261160850524902, "learning_rate": 5e-05, "loss": 0.068, "step": 1385 }, { "epoch": 4.006824324324325, "grad_norm": 20.429903030395508, "learning_rate": 5e-05, "loss": 0.3161, "step": 1386 }, { "epoch": 4.0068581081081085, "grad_norm": 9.58387279510498, "learning_rate": 5e-05, "loss": 0.1689, "step": 1387 }, { "epoch": 4.006891891891892, "grad_norm": 14.05483341217041, "learning_rate": 5e-05, "loss": 0.3668, "step": 1388 }, { "epoch": 4.006925675675675, "grad_norm": 8.132226943969727, "learning_rate": 5e-05, "loss": 0.1531, "step": 1389 }, { "epoch": 4.006959459459459, "grad_norm": 14.16901683807373, "learning_rate": 5e-05, "loss": 0.4293, "step": 1390 }, { "epoch": 4.006993243243243, "grad_norm": 10.900826454162598, "learning_rate": 5e-05, "loss": 0.9233, "step": 1391 }, { "epoch": 4.007027027027027, "grad_norm": 3.970811367034912, "learning_rate": 5e-05, "loss": 0.0748, "step": 1392 }, { "epoch": 4.007060810810811, "grad_norm": 21.19884490966797, "learning_rate": 5e-05, "loss": 0.2208, "step": 1393 }, { "epoch": 4.007094594594594, "grad_norm": 0.8351012468338013, "learning_rate": 5e-05, "loss": 0.0119, "step": 1394 }, { "epoch": 4.007128378378378, "grad_norm": 0.09022592753171921, "learning_rate": 5e-05, "loss": 0.0035, "step": 1395 }, { "epoch": 4.007162162162162, "grad_norm": 26.466691970825195, "learning_rate": 5e-05, "loss": 0.4429, "step": 1396 }, { "epoch": 4.007195945945946, "grad_norm": 9.891855239868164, "learning_rate": 5e-05, "loss": 0.5784, "step": 1397 }, { "epoch": 4.00722972972973, "grad_norm": 19.292612075805664, "learning_rate": 5e-05, "loss": 0.418, "step": 1398 }, { "epoch": 4.0072635135135135, "grad_norm": 12.55517578125, "learning_rate": 5e-05, "loss": 0.5006, "step": 1399 }, { "epoch": 4.007297297297297, "grad_norm": 0.9213710427284241, "learning_rate": 5e-05, "loss": 0.0116, "step": 1400 }, { "epoch": 4.007331081081081, "grad_norm": 0.24535100162029266, "learning_rate": 5e-05, "loss": 0.0076, "step": 1401 }, { "epoch": 4.007364864864865, "grad_norm": 0.19396227598190308, "learning_rate": 5e-05, "loss": 0.0045, "step": 1402 }, { "epoch": 4.007398648648649, "grad_norm": 2.62324857711792, "learning_rate": 5e-05, "loss": 0.0336, "step": 1403 }, { "epoch": 4.007432432432433, "grad_norm": 1.8450448513031006, "learning_rate": 5e-05, "loss": 0.0566, "step": 1404 }, { "epoch": 4.0074662162162165, "grad_norm": 0.18289968371391296, "learning_rate": 5e-05, "loss": 0.0059, "step": 1405 }, { "epoch": 4.0075, "grad_norm": 1.6636778116226196, "learning_rate": 5e-05, "loss": 0.0354, "step": 1406 }, { "epoch": 4.007533783783784, "grad_norm": 5.364724636077881, "learning_rate": 5e-05, "loss": 0.0848, "step": 1407 }, { "epoch": 4.007567567567568, "grad_norm": 5.326550006866455, "learning_rate": 5e-05, "loss": 0.052, "step": 1408 }, { "epoch": 4.007601351351352, "grad_norm": 0.9380009174346924, "learning_rate": 5e-05, "loss": 0.0162, "step": 1409 }, { "epoch": 4.007635135135135, "grad_norm": 1.546087622642517, "learning_rate": 5e-05, "loss": 0.0089, "step": 1410 }, { "epoch": 4.0076689189189185, "grad_norm": 0.07086139917373657, "learning_rate": 5e-05, "loss": 0.0021, "step": 1411 }, { "epoch": 4.007702702702702, "grad_norm": 0.10418368875980377, "learning_rate": 5e-05, "loss": 0.0035, "step": 1412 }, { "epoch": 4.007736486486486, "grad_norm": 6.037375450134277, "learning_rate": 5e-05, "loss": 0.6038, "step": 1413 }, { "epoch": 4.00777027027027, "grad_norm": 11.690402030944824, "learning_rate": 5e-05, "loss": 0.3366, "step": 1414 }, { "epoch": 4.007804054054054, "grad_norm": 15.999996185302734, "learning_rate": 5e-05, "loss": 0.2925, "step": 1415 }, { "epoch": 4.007837837837838, "grad_norm": 21.163896560668945, "learning_rate": 5e-05, "loss": 0.2847, "step": 1416 }, { "epoch": 4.0078716216216215, "grad_norm": 0.32954761385917664, "learning_rate": 5e-05, "loss": 0.0068, "step": 1417 }, { "epoch": 4.007905405405405, "grad_norm": 31.6840763092041, "learning_rate": 5e-05, "loss": 0.3193, "step": 1418 }, { "epoch": 4.007939189189189, "grad_norm": 0.3403964936733246, "learning_rate": 5e-05, "loss": 0.0076, "step": 1419 }, { "epoch": 4.007972972972973, "grad_norm": 17.08206558227539, "learning_rate": 5e-05, "loss": 0.3514, "step": 1420 }, { "epoch": 4.008006756756757, "grad_norm": 7.371789455413818, "learning_rate": 5e-05, "loss": 0.6461, "step": 1421 }, { "epoch": 4.008040540540541, "grad_norm": 8.908066749572754, "learning_rate": 5e-05, "loss": 0.1622, "step": 1422 }, { "epoch": 4.008074324324324, "grad_norm": 4.574301242828369, "learning_rate": 5e-05, "loss": 0.0255, "step": 1423 }, { "epoch": 4.008108108108108, "grad_norm": 11.779115676879883, "learning_rate": 5e-05, "loss": 0.0822, "step": 1424 }, { "epoch": 4.008141891891892, "grad_norm": 19.958080291748047, "learning_rate": 5e-05, "loss": 0.2928, "step": 1425 }, { "epoch": 4.008175675675676, "grad_norm": 9.270084381103516, "learning_rate": 5e-05, "loss": 0.1197, "step": 1426 }, { "epoch": 4.00820945945946, "grad_norm": 0.4011274576187134, "learning_rate": 5e-05, "loss": 0.0042, "step": 1427 }, { "epoch": 4.0082432432432435, "grad_norm": 8.679191589355469, "learning_rate": 5e-05, "loss": 0.4092, "step": 1428 }, { "epoch": 4.008277027027027, "grad_norm": 7.032505512237549, "learning_rate": 5e-05, "loss": 0.0638, "step": 1429 }, { "epoch": 4.008310810810811, "grad_norm": 2.438603639602661, "learning_rate": 5e-05, "loss": 0.0246, "step": 1430 }, { "epoch": 4.008344594594595, "grad_norm": 0.7059155106544495, "learning_rate": 5e-05, "loss": 0.0101, "step": 1431 }, { "epoch": 4.008378378378378, "grad_norm": 0.39255473017692566, "learning_rate": 5e-05, "loss": 0.0063, "step": 1432 }, { "epoch": 4.008412162162162, "grad_norm": 0.07073517888784409, "learning_rate": 5e-05, "loss": 0.0025, "step": 1433 }, { "epoch": 4.008445945945946, "grad_norm": 0.6806878447532654, "learning_rate": 5e-05, "loss": 0.0069, "step": 1434 }, { "epoch": 4.008479729729729, "grad_norm": 12.798789024353027, "learning_rate": 5e-05, "loss": 0.0692, "step": 1435 }, { "epoch": 4.008513513513513, "grad_norm": 21.317472457885742, "learning_rate": 5e-05, "loss": 0.3528, "step": 1436 }, { "epoch": 4.008547297297297, "grad_norm": 2.5849947929382324, "learning_rate": 5e-05, "loss": 0.0265, "step": 1437 }, { "epoch": 4.008581081081081, "grad_norm": 0.12100938707590103, "learning_rate": 5e-05, "loss": 0.002, "step": 1438 }, { "epoch": 4.008614864864865, "grad_norm": 0.037478018552064896, "learning_rate": 5e-05, "loss": 0.0014, "step": 1439 }, { "epoch": 4.008648648648649, "grad_norm": 3.445856809616089, "learning_rate": 5e-05, "loss": 0.0784, "step": 1440 }, { "epoch": 4.008682432432432, "grad_norm": 0.5356605648994446, "learning_rate": 5e-05, "loss": 0.0039, "step": 1441 }, { "epoch": 4.008716216216216, "grad_norm": 21.617460250854492, "learning_rate": 5e-05, "loss": 0.9642, "step": 1442 }, { "epoch": 4.00875, "grad_norm": 0.987088680267334, "learning_rate": 5e-05, "loss": 0.0136, "step": 1443 }, { "epoch": 4.008783783783784, "grad_norm": 0.6525653004646301, "learning_rate": 5e-05, "loss": 0.0054, "step": 1444 }, { "epoch": 4.008817567567568, "grad_norm": 3.5284361839294434, "learning_rate": 5e-05, "loss": 0.0247, "step": 1445 }, { "epoch": 4.0088513513513515, "grad_norm": 1.4834717512130737, "learning_rate": 5e-05, "loss": 0.0235, "step": 1446 }, { "epoch": 4.008885135135135, "grad_norm": 20.48880386352539, "learning_rate": 5e-05, "loss": 0.217, "step": 1447 }, { "epoch": 4.008918918918919, "grad_norm": 25.089874267578125, "learning_rate": 5e-05, "loss": 0.1354, "step": 1448 }, { "epoch": 4.008952702702703, "grad_norm": 18.980449676513672, "learning_rate": 5e-05, "loss": 1.1663, "step": 1449 }, { "epoch": 4.008986486486487, "grad_norm": 23.278099060058594, "learning_rate": 5e-05, "loss": 0.2743, "step": 1450 }, { "epoch": 4.009020270270271, "grad_norm": 0.4415324628353119, "learning_rate": 5e-05, "loss": 0.0073, "step": 1451 }, { "epoch": 4.0090540540540545, "grad_norm": 0.8443409204483032, "learning_rate": 5e-05, "loss": 0.0077, "step": 1452 }, { "epoch": 4.009087837837837, "grad_norm": 0.7258096933364868, "learning_rate": 5e-05, "loss": 0.0049, "step": 1453 }, { "epoch": 4.009121621621621, "grad_norm": 0.06747034192085266, "learning_rate": 5e-05, "loss": 0.0014, "step": 1454 }, { "epoch": 4.009155405405405, "grad_norm": 6.6766133308410645, "learning_rate": 5e-05, "loss": 0.5851, "step": 1455 }, { "epoch": 4.009189189189189, "grad_norm": 32.420963287353516, "learning_rate": 5e-05, "loss": 0.4753, "step": 1456 }, { "epoch": 4.009222972972973, "grad_norm": 0.016221173107624054, "learning_rate": 5e-05, "loss": 0.0005, "step": 1457 }, { "epoch": 4.0092567567567565, "grad_norm": 0.10565615445375443, "learning_rate": 5e-05, "loss": 0.0025, "step": 1458 }, { "epoch": 4.00929054054054, "grad_norm": 26.179458618164062, "learning_rate": 5e-05, "loss": 0.2175, "step": 1459 }, { "epoch": 4.009324324324324, "grad_norm": 4.5088605880737305, "learning_rate": 5e-05, "loss": 0.0554, "step": 1460 }, { "epoch": 4.009358108108108, "grad_norm": 12.873054504394531, "learning_rate": 5e-05, "loss": 0.0981, "step": 1461 }, { "epoch": 4.009391891891892, "grad_norm": 6.411178112030029, "learning_rate": 5e-05, "loss": 0.8896, "step": 1462 }, { "epoch": 4.009425675675676, "grad_norm": 22.388038635253906, "learning_rate": 5e-05, "loss": 0.5231, "step": 1463 }, { "epoch": 4.0094594594594595, "grad_norm": 0.16632691025733948, "learning_rate": 5e-05, "loss": 0.002, "step": 1464 }, { "epoch": 4.009493243243243, "grad_norm": 0.07684429734945297, "learning_rate": 5e-05, "loss": 0.0014, "step": 1465 }, { "epoch": 4.009527027027027, "grad_norm": 0.49472662806510925, "learning_rate": 5e-05, "loss": 0.0239, "step": 1466 }, { "epoch": 4.009560810810811, "grad_norm": 13.336262702941895, "learning_rate": 5e-05, "loss": 0.0911, "step": 1467 }, { "epoch": 4.009594594594595, "grad_norm": 28.02340316772461, "learning_rate": 5e-05, "loss": 0.4255, "step": 1468 }, { "epoch": 4.009628378378379, "grad_norm": 0.3531465530395508, "learning_rate": 5e-05, "loss": 0.0056, "step": 1469 }, { "epoch": 4.009662162162162, "grad_norm": 13.427985191345215, "learning_rate": 5e-05, "loss": 0.4899, "step": 1470 }, { "epoch": 4.009695945945946, "grad_norm": 1.8061902523040771, "learning_rate": 5e-05, "loss": 0.0074, "step": 1471 }, { "epoch": 4.00972972972973, "grad_norm": 39.848724365234375, "learning_rate": 5e-05, "loss": 0.6756, "step": 1472 }, { "epoch": 4.009763513513514, "grad_norm": 23.022462844848633, "learning_rate": 5e-05, "loss": 0.7682, "step": 1473 }, { "epoch": 4.009797297297298, "grad_norm": 16.78392791748047, "learning_rate": 5e-05, "loss": 0.1827, "step": 1474 }, { "epoch": 4.009831081081081, "grad_norm": 0.012714478187263012, "learning_rate": 5e-05, "loss": 0.0005, "step": 1475 }, { "epoch": 4.0098648648648645, "grad_norm": 1.7886379957199097, "learning_rate": 5e-05, "loss": 0.0154, "step": 1476 }, { "epoch": 4.009898648648648, "grad_norm": 11.336533546447754, "learning_rate": 5e-05, "loss": 0.335, "step": 1477 }, { "epoch": 4.009932432432432, "grad_norm": 4.3830366134643555, "learning_rate": 5e-05, "loss": 0.0351, "step": 1478 }, { "epoch": 4.009966216216216, "grad_norm": 13.653178215026855, "learning_rate": 5e-05, "loss": 0.0784, "step": 1479 }, { "epoch": 4.01, "grad_norm": 1.1875946521759033, "learning_rate": 5e-05, "loss": 0.0271, "step": 1480 }, { "epoch": 4.01, "eval_accuracy": 0.8675282714054927, "eval_loss": 0.4465777277946472, "eval_runtime": 32.0608, "eval_samples_per_second": 19.307, "eval_steps_per_second": 2.433, "step": 1480 }, { "epoch": 5.000033783783784, "grad_norm": 4.3033857345581055, "learning_rate": 5e-05, "loss": 0.045, "step": 1481 }, { "epoch": 5.000067567567568, "grad_norm": 0.2896711826324463, "learning_rate": 5e-05, "loss": 0.0044, "step": 1482 }, { "epoch": 5.0001013513513515, "grad_norm": 0.04189551621675491, "learning_rate": 5e-05, "loss": 0.0015, "step": 1483 }, { "epoch": 5.000135135135135, "grad_norm": 2.243940830230713, "learning_rate": 5e-05, "loss": 0.0128, "step": 1484 }, { "epoch": 5.000168918918919, "grad_norm": 0.36532774567604065, "learning_rate": 5e-05, "loss": 0.0049, "step": 1485 }, { "epoch": 5.000202702702703, "grad_norm": 25.575490951538086, "learning_rate": 5e-05, "loss": 0.2358, "step": 1486 }, { "epoch": 5.000236486486487, "grad_norm": 0.27128052711486816, "learning_rate": 5e-05, "loss": 0.0059, "step": 1487 }, { "epoch": 5.000270270270271, "grad_norm": 19.42753028869629, "learning_rate": 5e-05, "loss": 0.3317, "step": 1488 }, { "epoch": 5.000304054054054, "grad_norm": 9.161480903625488, "learning_rate": 5e-05, "loss": 0.8166, "step": 1489 }, { "epoch": 5.000337837837838, "grad_norm": 0.030747467651963234, "learning_rate": 5e-05, "loss": 0.0008, "step": 1490 }, { "epoch": 5.000371621621621, "grad_norm": 1.055471420288086, "learning_rate": 5e-05, "loss": 0.0085, "step": 1491 }, { "epoch": 5.000405405405405, "grad_norm": 14.444806098937988, "learning_rate": 5e-05, "loss": 0.5418, "step": 1492 }, { "epoch": 5.000439189189189, "grad_norm": 0.574225902557373, "learning_rate": 5e-05, "loss": 0.005, "step": 1493 }, { "epoch": 5.000472972972973, "grad_norm": 0.4923744201660156, "learning_rate": 5e-05, "loss": 0.0047, "step": 1494 }, { "epoch": 5.0005067567567565, "grad_norm": 0.011888268403708935, "learning_rate": 5e-05, "loss": 0.0005, "step": 1495 }, { "epoch": 5.00054054054054, "grad_norm": 0.04029150679707527, "learning_rate": 5e-05, "loss": 0.0013, "step": 1496 }, { "epoch": 5.000574324324324, "grad_norm": 0.12671352922916412, "learning_rate": 5e-05, "loss": 0.0034, "step": 1497 }, { "epoch": 5.000608108108108, "grad_norm": 0.06552094966173172, "learning_rate": 5e-05, "loss": 0.0012, "step": 1498 }, { "epoch": 5.000641891891892, "grad_norm": 20.404767990112305, "learning_rate": 5e-05, "loss": 0.224, "step": 1499 }, { "epoch": 5.000675675675676, "grad_norm": 0.9097607731819153, "learning_rate": 5e-05, "loss": 0.0287, "step": 1500 }, { "epoch": 5.000709459459459, "grad_norm": 0.025107141584157944, "learning_rate": 5e-05, "loss": 0.0009, "step": 1501 }, { "epoch": 5.000743243243243, "grad_norm": 1.3833320140838623, "learning_rate": 5e-05, "loss": 0.0098, "step": 1502 }, { "epoch": 5.000777027027027, "grad_norm": 0.03603808954358101, "learning_rate": 5e-05, "loss": 0.0012, "step": 1503 }, { "epoch": 5.000810810810811, "grad_norm": 1.6665304899215698, "learning_rate": 5e-05, "loss": 0.0106, "step": 1504 }, { "epoch": 5.000844594594595, "grad_norm": 0.06701340526342392, "learning_rate": 5e-05, "loss": 0.0012, "step": 1505 }, { "epoch": 5.000878378378379, "grad_norm": 0.08706233650445938, "learning_rate": 5e-05, "loss": 0.0014, "step": 1506 }, { "epoch": 5.000912162162162, "grad_norm": 0.22398465871810913, "learning_rate": 5e-05, "loss": 0.0081, "step": 1507 }, { "epoch": 5.000945945945946, "grad_norm": 0.4093005657196045, "learning_rate": 5e-05, "loss": 0.0069, "step": 1508 }, { "epoch": 5.00097972972973, "grad_norm": 0.4173017144203186, "learning_rate": 5e-05, "loss": 0.0026, "step": 1509 }, { "epoch": 5.001013513513514, "grad_norm": 0.027959313243627548, "learning_rate": 5e-05, "loss": 0.0008, "step": 1510 }, { "epoch": 5.001047297297298, "grad_norm": 0.027330562472343445, "learning_rate": 5e-05, "loss": 0.0009, "step": 1511 }, { "epoch": 5.0010810810810815, "grad_norm": 33.70766067504883, "learning_rate": 5e-05, "loss": 0.2359, "step": 1512 }, { "epoch": 5.0011148648648645, "grad_norm": 0.10219435393810272, "learning_rate": 5e-05, "loss": 0.0037, "step": 1513 }, { "epoch": 5.001148648648648, "grad_norm": 1.277001142501831, "learning_rate": 5e-05, "loss": 0.0066, "step": 1514 }, { "epoch": 5.001182432432432, "grad_norm": 6.92288875579834, "learning_rate": 5e-05, "loss": 0.9159, "step": 1515 }, { "epoch": 5.001216216216216, "grad_norm": 19.316341400146484, "learning_rate": 5e-05, "loss": 0.4039, "step": 1516 }, { "epoch": 5.00125, "grad_norm": 38.213951110839844, "learning_rate": 5e-05, "loss": 0.4025, "step": 1517 }, { "epoch": 5.001283783783784, "grad_norm": 22.54627227783203, "learning_rate": 5e-05, "loss": 0.3138, "step": 1518 }, { "epoch": 5.001317567567567, "grad_norm": 0.8906470537185669, "learning_rate": 5e-05, "loss": 0.007, "step": 1519 }, { "epoch": 5.001351351351351, "grad_norm": 2.421049118041992, "learning_rate": 5e-05, "loss": 0.237, "step": 1520 }, { "epoch": 5.001385135135135, "grad_norm": 10.265822410583496, "learning_rate": 5e-05, "loss": 0.0737, "step": 1521 }, { "epoch": 5.001418918918919, "grad_norm": 16.1036434173584, "learning_rate": 5e-05, "loss": 0.2144, "step": 1522 }, { "epoch": 5.001452702702703, "grad_norm": 0.009121760725975037, "learning_rate": 5e-05, "loss": 0.0004, "step": 1523 }, { "epoch": 5.0014864864864865, "grad_norm": 31.508684158325195, "learning_rate": 5e-05, "loss": 0.2377, "step": 1524 }, { "epoch": 5.00152027027027, "grad_norm": 0.12415117770433426, "learning_rate": 5e-05, "loss": 0.0019, "step": 1525 }, { "epoch": 5.001554054054054, "grad_norm": 0.16412389278411865, "learning_rate": 5e-05, "loss": 0.0038, "step": 1526 }, { "epoch": 5.001587837837838, "grad_norm": 13.173813819885254, "learning_rate": 5e-05, "loss": 0.0747, "step": 1527 }, { "epoch": 5.001621621621622, "grad_norm": 27.129486083984375, "learning_rate": 5e-05, "loss": 0.2349, "step": 1528 }, { "epoch": 5.001655405405406, "grad_norm": 31.71654510498047, "learning_rate": 5e-05, "loss": 0.5585, "step": 1529 }, { "epoch": 5.0016891891891895, "grad_norm": 6.509718894958496, "learning_rate": 5e-05, "loss": 0.1004, "step": 1530 }, { "epoch": 5.001722972972973, "grad_norm": 16.792400360107422, "learning_rate": 5e-05, "loss": 0.0783, "step": 1531 }, { "epoch": 5.001756756756757, "grad_norm": 0.10600975900888443, "learning_rate": 5e-05, "loss": 0.0028, "step": 1532 }, { "epoch": 5.001790540540541, "grad_norm": 14.282816886901855, "learning_rate": 5e-05, "loss": 0.7411, "step": 1533 }, { "epoch": 5.001824324324324, "grad_norm": 19.09868812561035, "learning_rate": 5e-05, "loss": 0.3686, "step": 1534 }, { "epoch": 5.001858108108108, "grad_norm": 2.5930051803588867, "learning_rate": 5e-05, "loss": 0.0139, "step": 1535 }, { "epoch": 5.0018918918918915, "grad_norm": 0.05432658642530441, "learning_rate": 5e-05, "loss": 0.0018, "step": 1536 }, { "epoch": 5.001925675675675, "grad_norm": 3.6443562507629395, "learning_rate": 5e-05, "loss": 0.056, "step": 1537 }, { "epoch": 5.001959459459459, "grad_norm": 0.08170074224472046, "learning_rate": 5e-05, "loss": 0.0027, "step": 1538 }, { "epoch": 5.001993243243243, "grad_norm": 0.168882817029953, "learning_rate": 5e-05, "loss": 0.0034, "step": 1539 }, { "epoch": 5.002027027027027, "grad_norm": 19.828554153442383, "learning_rate": 5e-05, "loss": 0.1051, "step": 1540 }, { "epoch": 5.002060810810811, "grad_norm": 12.70754623413086, "learning_rate": 5e-05, "loss": 0.0853, "step": 1541 }, { "epoch": 5.0020945945945945, "grad_norm": 0.05616901442408562, "learning_rate": 5e-05, "loss": 0.0015, "step": 1542 }, { "epoch": 5.002128378378378, "grad_norm": 22.12007713317871, "learning_rate": 5e-05, "loss": 0.218, "step": 1543 }, { "epoch": 5.002162162162162, "grad_norm": 1.4602683782577515, "learning_rate": 5e-05, "loss": 0.0123, "step": 1544 }, { "epoch": 5.002195945945946, "grad_norm": 13.595677375793457, "learning_rate": 5e-05, "loss": 0.7012, "step": 1545 }, { "epoch": 5.00222972972973, "grad_norm": 8.469050407409668, "learning_rate": 5e-05, "loss": 0.3879, "step": 1546 }, { "epoch": 5.002263513513514, "grad_norm": 0.31636127829551697, "learning_rate": 5e-05, "loss": 0.0041, "step": 1547 }, { "epoch": 5.002297297297297, "grad_norm": 23.598316192626953, "learning_rate": 5e-05, "loss": 0.6163, "step": 1548 }, { "epoch": 5.002331081081081, "grad_norm": 0.021951472386717796, "learning_rate": 5e-05, "loss": 0.0007, "step": 1549 }, { "epoch": 5.002364864864865, "grad_norm": 2.077895164489746, "learning_rate": 5e-05, "loss": 0.1384, "step": 1550 }, { "epoch": 5.002398648648649, "grad_norm": 12.859502792358398, "learning_rate": 5e-05, "loss": 0.3575, "step": 1551 }, { "epoch": 5.002432432432433, "grad_norm": 13.849201202392578, "learning_rate": 5e-05, "loss": 0.0957, "step": 1552 }, { "epoch": 5.002466216216217, "grad_norm": 28.478328704833984, "learning_rate": 5e-05, "loss": 0.5858, "step": 1553 }, { "epoch": 5.0025, "grad_norm": 8.657326698303223, "learning_rate": 5e-05, "loss": 0.16, "step": 1554 }, { "epoch": 5.002533783783784, "grad_norm": 8.940923690795898, "learning_rate": 5e-05, "loss": 0.0851, "step": 1555 }, { "epoch": 5.002567567567567, "grad_norm": 0.5113046169281006, "learning_rate": 5e-05, "loss": 0.0219, "step": 1556 }, { "epoch": 5.002601351351351, "grad_norm": 9.338139533996582, "learning_rate": 5e-05, "loss": 0.0871, "step": 1557 }, { "epoch": 5.002635135135135, "grad_norm": 0.25237125158309937, "learning_rate": 5e-05, "loss": 0.0056, "step": 1558 }, { "epoch": 5.002668918918919, "grad_norm": 0.1996578574180603, "learning_rate": 5e-05, "loss": 0.0036, "step": 1559 }, { "epoch": 5.0027027027027025, "grad_norm": 0.3372069001197815, "learning_rate": 5e-05, "loss": 0.0054, "step": 1560 }, { "epoch": 5.002736486486486, "grad_norm": 10.730425834655762, "learning_rate": 5e-05, "loss": 0.256, "step": 1561 }, { "epoch": 5.00277027027027, "grad_norm": 39.700294494628906, "learning_rate": 5e-05, "loss": 0.896, "step": 1562 }, { "epoch": 5.002804054054054, "grad_norm": 2.7898709774017334, "learning_rate": 5e-05, "loss": 0.041, "step": 1563 }, { "epoch": 5.002837837837838, "grad_norm": 0.5393791198730469, "learning_rate": 5e-05, "loss": 0.0133, "step": 1564 }, { "epoch": 5.002871621621622, "grad_norm": 2.9545416831970215, "learning_rate": 5e-05, "loss": 0.014, "step": 1565 }, { "epoch": 5.002905405405405, "grad_norm": 1.090669870376587, "learning_rate": 5e-05, "loss": 0.081, "step": 1566 }, { "epoch": 5.002939189189189, "grad_norm": 0.11708708852529526, "learning_rate": 5e-05, "loss": 0.0021, "step": 1567 }, { "epoch": 5.002972972972973, "grad_norm": 0.09402810782194138, "learning_rate": 5e-05, "loss": 0.0018, "step": 1568 }, { "epoch": 5.003006756756757, "grad_norm": 0.017942195758223534, "learning_rate": 5e-05, "loss": 0.0005, "step": 1569 }, { "epoch": 5.003040540540541, "grad_norm": 0.308289110660553, "learning_rate": 5e-05, "loss": 0.0043, "step": 1570 }, { "epoch": 5.0030743243243245, "grad_norm": 2.480778217315674, "learning_rate": 5e-05, "loss": 0.0172, "step": 1571 }, { "epoch": 5.003108108108108, "grad_norm": 0.058814145624637604, "learning_rate": 5e-05, "loss": 0.0021, "step": 1572 }, { "epoch": 5.003141891891892, "grad_norm": 17.178329467773438, "learning_rate": 5e-05, "loss": 0.6015, "step": 1573 }, { "epoch": 5.003175675675676, "grad_norm": 0.5755367875099182, "learning_rate": 5e-05, "loss": 0.0102, "step": 1574 }, { "epoch": 5.00320945945946, "grad_norm": 0.015401470474898815, "learning_rate": 5e-05, "loss": 0.0006, "step": 1575 }, { "epoch": 5.003243243243244, "grad_norm": 0.05809813365340233, "learning_rate": 5e-05, "loss": 0.0013, "step": 1576 }, { "epoch": 5.003277027027027, "grad_norm": 17.787809371948242, "learning_rate": 5e-05, "loss": 0.6017, "step": 1577 }, { "epoch": 5.00331081081081, "grad_norm": 0.04512163624167442, "learning_rate": 5e-05, "loss": 0.001, "step": 1578 }, { "epoch": 5.003344594594594, "grad_norm": 0.07922250777482986, "learning_rate": 5e-05, "loss": 0.002, "step": 1579 }, { "epoch": 5.003378378378378, "grad_norm": 8.96746826171875, "learning_rate": 5e-05, "loss": 0.1528, "step": 1580 }, { "epoch": 5.003412162162162, "grad_norm": 8.215766906738281, "learning_rate": 5e-05, "loss": 0.0679, "step": 1581 }, { "epoch": 5.003445945945946, "grad_norm": 8.362344741821289, "learning_rate": 5e-05, "loss": 0.0583, "step": 1582 }, { "epoch": 5.0034797297297295, "grad_norm": 0.009468805976212025, "learning_rate": 5e-05, "loss": 0.0004, "step": 1583 }, { "epoch": 5.003513513513513, "grad_norm": 0.051366083323955536, "learning_rate": 5e-05, "loss": 0.0008, "step": 1584 }, { "epoch": 5.003547297297297, "grad_norm": 0.21868552267551422, "learning_rate": 5e-05, "loss": 0.0016, "step": 1585 }, { "epoch": 5.003581081081081, "grad_norm": 0.019113317131996155, "learning_rate": 5e-05, "loss": 0.0007, "step": 1586 }, { "epoch": 5.003614864864865, "grad_norm": 0.23874948918819427, "learning_rate": 5e-05, "loss": 0.0022, "step": 1587 }, { "epoch": 5.003648648648649, "grad_norm": 0.05612446367740631, "learning_rate": 5e-05, "loss": 0.0014, "step": 1588 }, { "epoch": 5.0036824324324325, "grad_norm": 20.779600143432617, "learning_rate": 5e-05, "loss": 0.0951, "step": 1589 }, { "epoch": 5.003716216216216, "grad_norm": 23.359058380126953, "learning_rate": 5e-05, "loss": 0.7426, "step": 1590 }, { "epoch": 5.00375, "grad_norm": 23.10438346862793, "learning_rate": 5e-05, "loss": 0.1412, "step": 1591 }, { "epoch": 5.003783783783784, "grad_norm": 15.664324760437012, "learning_rate": 5e-05, "loss": 0.2241, "step": 1592 }, { "epoch": 5.003817567567568, "grad_norm": 15.128107070922852, "learning_rate": 5e-05, "loss": 0.2417, "step": 1593 }, { "epoch": 5.003851351351352, "grad_norm": 0.020984727889299393, "learning_rate": 5e-05, "loss": 0.0006, "step": 1594 }, { "epoch": 5.003885135135135, "grad_norm": 0.12120147794485092, "learning_rate": 5e-05, "loss": 0.0025, "step": 1595 }, { "epoch": 5.003918918918919, "grad_norm": 1.171081781387329, "learning_rate": 5e-05, "loss": 0.0052, "step": 1596 }, { "epoch": 5.003952702702703, "grad_norm": 20.170621871948242, "learning_rate": 5e-05, "loss": 0.3533, "step": 1597 }, { "epoch": 5.003986486486487, "grad_norm": 0.891694188117981, "learning_rate": 5e-05, "loss": 0.0033, "step": 1598 }, { "epoch": 5.00402027027027, "grad_norm": 0.024652771651744843, "learning_rate": 5e-05, "loss": 0.0007, "step": 1599 }, { "epoch": 5.004054054054054, "grad_norm": 0.21370287239551544, "learning_rate": 5e-05, "loss": 0.0023, "step": 1600 }, { "epoch": 5.0040878378378375, "grad_norm": 0.027128364890813828, "learning_rate": 5e-05, "loss": 0.0008, "step": 1601 }, { "epoch": 5.004121621621621, "grad_norm": 0.31084007024765015, "learning_rate": 5e-05, "loss": 0.0041, "step": 1602 }, { "epoch": 5.004155405405405, "grad_norm": 0.5558644533157349, "learning_rate": 5e-05, "loss": 0.0059, "step": 1603 }, { "epoch": 5.004189189189189, "grad_norm": 0.33981046080589294, "learning_rate": 5e-05, "loss": 0.0038, "step": 1604 }, { "epoch": 5.004222972972973, "grad_norm": 4.5722575187683105, "learning_rate": 5e-05, "loss": 0.0188, "step": 1605 }, { "epoch": 5.004256756756757, "grad_norm": 17.028472900390625, "learning_rate": 5e-05, "loss": 0.3266, "step": 1606 }, { "epoch": 5.0042905405405405, "grad_norm": 5.368931293487549, "learning_rate": 5e-05, "loss": 0.0199, "step": 1607 }, { "epoch": 5.004324324324324, "grad_norm": 0.11018476635217667, "learning_rate": 5e-05, "loss": 0.0014, "step": 1608 }, { "epoch": 5.004358108108108, "grad_norm": 0.08224356174468994, "learning_rate": 5e-05, "loss": 0.0011, "step": 1609 }, { "epoch": 5.004391891891892, "grad_norm": 1.6576436758041382, "learning_rate": 5e-05, "loss": 0.0092, "step": 1610 }, { "epoch": 5.004425675675676, "grad_norm": 21.08892250061035, "learning_rate": 5e-05, "loss": 0.343, "step": 1611 }, { "epoch": 5.00445945945946, "grad_norm": 3.3654563426971436, "learning_rate": 5e-05, "loss": 0.0865, "step": 1612 }, { "epoch": 5.004493243243243, "grad_norm": 6.247069358825684, "learning_rate": 5e-05, "loss": 0.0957, "step": 1613 }, { "epoch": 5.004527027027027, "grad_norm": 0.49655190110206604, "learning_rate": 5e-05, "loss": 0.0038, "step": 1614 }, { "epoch": 5.004560810810811, "grad_norm": 0.04470464959740639, "learning_rate": 5e-05, "loss": 0.0007, "step": 1615 }, { "epoch": 5.004594594594595, "grad_norm": 5.32329797744751, "learning_rate": 5e-05, "loss": 0.0432, "step": 1616 }, { "epoch": 5.004628378378379, "grad_norm": 35.22166061401367, "learning_rate": 5e-05, "loss": 0.2419, "step": 1617 }, { "epoch": 5.0046621621621625, "grad_norm": 0.0174701027572155, "learning_rate": 5e-05, "loss": 0.0006, "step": 1618 }, { "epoch": 5.004695945945946, "grad_norm": 0.0043756249360740185, "learning_rate": 5e-05, "loss": 0.0001, "step": 1619 }, { "epoch": 5.004729729729729, "grad_norm": 32.59823989868164, "learning_rate": 5e-05, "loss": 0.4937, "step": 1620 }, { "epoch": 5.004763513513513, "grad_norm": 30.44869041442871, "learning_rate": 5e-05, "loss": 0.1, "step": 1621 }, { "epoch": 5.004797297297297, "grad_norm": 11.182904243469238, "learning_rate": 5e-05, "loss": 0.0922, "step": 1622 }, { "epoch": 5.004831081081081, "grad_norm": 8.078532218933105, "learning_rate": 5e-05, "loss": 0.0375, "step": 1623 }, { "epoch": 5.004864864864865, "grad_norm": 33.38128662109375, "learning_rate": 5e-05, "loss": 0.5748, "step": 1624 }, { "epoch": 5.004898648648648, "grad_norm": 9.565945625305176, "learning_rate": 5e-05, "loss": 0.5624, "step": 1625 }, { "epoch": 5.004932432432432, "grad_norm": 0.06914130598306656, "learning_rate": 5e-05, "loss": 0.0009, "step": 1626 }, { "epoch": 5.004966216216216, "grad_norm": 23.07594871520996, "learning_rate": 5e-05, "loss": 0.0654, "step": 1627 }, { "epoch": 5.005, "grad_norm": 0.31924763321876526, "learning_rate": 5e-05, "loss": 0.0056, "step": 1628 }, { "epoch": 5.005033783783784, "grad_norm": 0.589286208152771, "learning_rate": 5e-05, "loss": 0.005, "step": 1629 }, { "epoch": 5.0050675675675675, "grad_norm": 0.1788860559463501, "learning_rate": 5e-05, "loss": 0.0024, "step": 1630 }, { "epoch": 5.005101351351351, "grad_norm": 0.016523372381925583, "learning_rate": 5e-05, "loss": 0.0005, "step": 1631 }, { "epoch": 5.005135135135135, "grad_norm": 0.10490395128726959, "learning_rate": 5e-05, "loss": 0.0008, "step": 1632 }, { "epoch": 5.005168918918919, "grad_norm": 48.686058044433594, "learning_rate": 5e-05, "loss": 0.6169, "step": 1633 }, { "epoch": 5.005202702702703, "grad_norm": 12.366721153259277, "learning_rate": 5e-05, "loss": 0.5438, "step": 1634 }, { "epoch": 5.005236486486487, "grad_norm": 8.198272705078125, "learning_rate": 5e-05, "loss": 0.2735, "step": 1635 }, { "epoch": 5.0052702702702705, "grad_norm": 6.268960952758789, "learning_rate": 5e-05, "loss": 0.0398, "step": 1636 }, { "epoch": 5.005304054054054, "grad_norm": 0.4462999105453491, "learning_rate": 5e-05, "loss": 0.003, "step": 1637 }, { "epoch": 5.005337837837838, "grad_norm": 1.7609432935714722, "learning_rate": 5e-05, "loss": 0.0125, "step": 1638 }, { "epoch": 5.005371621621622, "grad_norm": 0.10691969096660614, "learning_rate": 5e-05, "loss": 0.0012, "step": 1639 }, { "epoch": 5.005405405405406, "grad_norm": 33.29607009887695, "learning_rate": 5e-05, "loss": 0.2528, "step": 1640 }, { "epoch": 5.00543918918919, "grad_norm": 12.845821380615234, "learning_rate": 5e-05, "loss": 0.1004, "step": 1641 }, { "epoch": 5.005472972972973, "grad_norm": 0.011560282669961452, "learning_rate": 5e-05, "loss": 0.0004, "step": 1642 }, { "epoch": 5.005506756756756, "grad_norm": 0.0275545883923769, "learning_rate": 5e-05, "loss": 0.0005, "step": 1643 }, { "epoch": 5.00554054054054, "grad_norm": 6.863248348236084, "learning_rate": 5e-05, "loss": 1.0534, "step": 1644 }, { "epoch": 5.005574324324324, "grad_norm": 22.610389709472656, "learning_rate": 5e-05, "loss": 0.1207, "step": 1645 }, { "epoch": 5.005608108108108, "grad_norm": 13.101726531982422, "learning_rate": 5e-05, "loss": 0.0483, "step": 1646 }, { "epoch": 5.005641891891892, "grad_norm": 0.008396188728511333, "learning_rate": 5e-05, "loss": 0.0003, "step": 1647 }, { "epoch": 5.0056756756756755, "grad_norm": 0.017242159694433212, "learning_rate": 5e-05, "loss": 0.0004, "step": 1648 }, { "epoch": 5.005709459459459, "grad_norm": 0.06582677364349365, "learning_rate": 5e-05, "loss": 0.0011, "step": 1649 }, { "epoch": 5.005743243243243, "grad_norm": 12.209273338317871, "learning_rate": 5e-05, "loss": 0.0648, "step": 1650 }, { "epoch": 5.005777027027027, "grad_norm": 0.4799362123012543, "learning_rate": 5e-05, "loss": 0.0068, "step": 1651 }, { "epoch": 5.005810810810811, "grad_norm": 0.8809301257133484, "learning_rate": 5e-05, "loss": 0.0071, "step": 1652 }, { "epoch": 5.005844594594595, "grad_norm": 0.0984296202659607, "learning_rate": 5e-05, "loss": 0.0009, "step": 1653 }, { "epoch": 5.0058783783783785, "grad_norm": 2.4744465351104736, "learning_rate": 5e-05, "loss": 0.0124, "step": 1654 }, { "epoch": 5.005912162162162, "grad_norm": 5.357427597045898, "learning_rate": 5e-05, "loss": 0.0942, "step": 1655 }, { "epoch": 5.005945945945946, "grad_norm": 1.473984956741333, "learning_rate": 5e-05, "loss": 0.009, "step": 1656 }, { "epoch": 5.00597972972973, "grad_norm": 20.65157127380371, "learning_rate": 5e-05, "loss": 0.631, "step": 1657 }, { "epoch": 5.006013513513514, "grad_norm": 13.801727294921875, "learning_rate": 5e-05, "loss": 0.7645, "step": 1658 }, { "epoch": 5.006047297297298, "grad_norm": 17.63668441772461, "learning_rate": 5e-05, "loss": 0.1401, "step": 1659 }, { "epoch": 5.006081081081081, "grad_norm": 2.8509488105773926, "learning_rate": 5e-05, "loss": 0.0198, "step": 1660 }, { "epoch": 5.006114864864865, "grad_norm": 6.857617378234863, "learning_rate": 5e-05, "loss": 0.758, "step": 1661 }, { "epoch": 5.006148648648649, "grad_norm": 1.4854180812835693, "learning_rate": 5e-05, "loss": 0.0105, "step": 1662 }, { "epoch": 5.006182432432432, "grad_norm": 41.94755172729492, "learning_rate": 5e-05, "loss": 0.713, "step": 1663 }, { "epoch": 5.006216216216216, "grad_norm": 9.436025619506836, "learning_rate": 5e-05, "loss": 0.0444, "step": 1664 }, { "epoch": 5.00625, "grad_norm": 8.345372200012207, "learning_rate": 5e-05, "loss": 0.8686, "step": 1665 }, { "epoch": 5.0062837837837835, "grad_norm": 2.8937246799468994, "learning_rate": 5e-05, "loss": 0.0152, "step": 1666 }, { "epoch": 5.006317567567567, "grad_norm": 11.916192054748535, "learning_rate": 5e-05, "loss": 0.832, "step": 1667 }, { "epoch": 5.006351351351351, "grad_norm": 14.49351978302002, "learning_rate": 5e-05, "loss": 0.1043, "step": 1668 }, { "epoch": 5.006385135135135, "grad_norm": 12.113361358642578, "learning_rate": 5e-05, "loss": 0.1032, "step": 1669 }, { "epoch": 5.006418918918919, "grad_norm": 0.010070166550576687, "learning_rate": 5e-05, "loss": 0.0003, "step": 1670 }, { "epoch": 5.006452702702703, "grad_norm": 0.16841688752174377, "learning_rate": 5e-05, "loss": 0.0019, "step": 1671 }, { "epoch": 5.006486486486486, "grad_norm": 25.034175872802734, "learning_rate": 5e-05, "loss": 0.54, "step": 1672 }, { "epoch": 5.00652027027027, "grad_norm": 0.06816627830266953, "learning_rate": 5e-05, "loss": 0.0015, "step": 1673 }, { "epoch": 5.006554054054054, "grad_norm": 1.3310327529907227, "learning_rate": 5e-05, "loss": 0.0308, "step": 1674 }, { "epoch": 5.006587837837838, "grad_norm": 0.02583722211420536, "learning_rate": 5e-05, "loss": 0.0006, "step": 1675 }, { "epoch": 5.006621621621622, "grad_norm": 10.707114219665527, "learning_rate": 5e-05, "loss": 0.1282, "step": 1676 }, { "epoch": 5.0066554054054055, "grad_norm": 21.726415634155273, "learning_rate": 5e-05, "loss": 0.1387, "step": 1677 }, { "epoch": 5.006689189189189, "grad_norm": 0.04763065278530121, "learning_rate": 5e-05, "loss": 0.0013, "step": 1678 }, { "epoch": 5.006722972972973, "grad_norm": 24.200727462768555, "learning_rate": 5e-05, "loss": 1.0444, "step": 1679 }, { "epoch": 5.006756756756757, "grad_norm": 18.76099967956543, "learning_rate": 5e-05, "loss": 0.2635, "step": 1680 }, { "epoch": 5.006790540540541, "grad_norm": 0.017699142917990685, "learning_rate": 5e-05, "loss": 0.0005, "step": 1681 }, { "epoch": 5.006824324324325, "grad_norm": 19.276525497436523, "learning_rate": 5e-05, "loss": 0.4674, "step": 1682 }, { "epoch": 5.0068581081081085, "grad_norm": 6.693361759185791, "learning_rate": 5e-05, "loss": 0.5418, "step": 1683 }, { "epoch": 5.006891891891892, "grad_norm": 0.06999342888593674, "learning_rate": 5e-05, "loss": 0.0022, "step": 1684 }, { "epoch": 5.006925675675675, "grad_norm": 6.352852821350098, "learning_rate": 5e-05, "loss": 0.111, "step": 1685 }, { "epoch": 5.006959459459459, "grad_norm": 8.433215141296387, "learning_rate": 5e-05, "loss": 0.5816, "step": 1686 }, { "epoch": 5.006993243243243, "grad_norm": 3.865403890609741, "learning_rate": 5e-05, "loss": 0.3222, "step": 1687 }, { "epoch": 5.007027027027027, "grad_norm": 2.9133944511413574, "learning_rate": 5e-05, "loss": 0.0599, "step": 1688 }, { "epoch": 5.007060810810811, "grad_norm": 0.3893556296825409, "learning_rate": 5e-05, "loss": 0.0068, "step": 1689 }, { "epoch": 5.007094594594594, "grad_norm": 8.837952613830566, "learning_rate": 5e-05, "loss": 0.0488, "step": 1690 }, { "epoch": 5.007128378378378, "grad_norm": 0.4491424560546875, "learning_rate": 5e-05, "loss": 0.0122, "step": 1691 }, { "epoch": 5.007162162162162, "grad_norm": 23.543880462646484, "learning_rate": 5e-05, "loss": 0.1823, "step": 1692 }, { "epoch": 5.007195945945946, "grad_norm": 29.195720672607422, "learning_rate": 5e-05, "loss": 0.5283, "step": 1693 }, { "epoch": 5.00722972972973, "grad_norm": 1.4150770902633667, "learning_rate": 5e-05, "loss": 0.025, "step": 1694 }, { "epoch": 5.0072635135135135, "grad_norm": 8.243961334228516, "learning_rate": 5e-05, "loss": 0.2659, "step": 1695 }, { "epoch": 5.007297297297297, "grad_norm": 4.7711567878723145, "learning_rate": 5e-05, "loss": 0.0376, "step": 1696 }, { "epoch": 5.007331081081081, "grad_norm": 0.8521038293838501, "learning_rate": 5e-05, "loss": 0.0242, "step": 1697 }, { "epoch": 5.007364864864865, "grad_norm": 8.891558647155762, "learning_rate": 5e-05, "loss": 0.5691, "step": 1698 }, { "epoch": 5.007398648648649, "grad_norm": 0.2948383688926697, "learning_rate": 5e-05, "loss": 0.0074, "step": 1699 }, { "epoch": 5.007432432432433, "grad_norm": 5.018725395202637, "learning_rate": 5e-05, "loss": 0.1053, "step": 1700 }, { "epoch": 5.0074662162162165, "grad_norm": 5.086929798126221, "learning_rate": 5e-05, "loss": 0.4298, "step": 1701 }, { "epoch": 5.0075, "grad_norm": 0.758733332157135, "learning_rate": 5e-05, "loss": 0.0208, "step": 1702 }, { "epoch": 5.007533783783784, "grad_norm": 0.12151909619569778, "learning_rate": 5e-05, "loss": 0.0021, "step": 1703 }, { "epoch": 5.007567567567568, "grad_norm": 11.38137435913086, "learning_rate": 5e-05, "loss": 0.7847, "step": 1704 }, { "epoch": 5.007601351351352, "grad_norm": 0.13621269166469574, "learning_rate": 5e-05, "loss": 0.0038, "step": 1705 }, { "epoch": 5.007635135135135, "grad_norm": 3.4095051288604736, "learning_rate": 5e-05, "loss": 0.1671, "step": 1706 }, { "epoch": 5.0076689189189185, "grad_norm": 8.46708869934082, "learning_rate": 5e-05, "loss": 0.0797, "step": 1707 }, { "epoch": 5.007702702702702, "grad_norm": 0.05250133201479912, "learning_rate": 5e-05, "loss": 0.002, "step": 1708 }, { "epoch": 5.007736486486486, "grad_norm": 5.386491775512695, "learning_rate": 5e-05, "loss": 0.4082, "step": 1709 }, { "epoch": 5.00777027027027, "grad_norm": 16.75005531311035, "learning_rate": 5e-05, "loss": 0.2103, "step": 1710 }, { "epoch": 5.007804054054054, "grad_norm": 12.04323959350586, "learning_rate": 5e-05, "loss": 0.3923, "step": 1711 }, { "epoch": 5.007837837837838, "grad_norm": 3.2311201095581055, "learning_rate": 5e-05, "loss": 0.0351, "step": 1712 }, { "epoch": 5.0078716216216215, "grad_norm": 5.269421100616455, "learning_rate": 5e-05, "loss": 0.3006, "step": 1713 }, { "epoch": 5.007905405405405, "grad_norm": 0.33905309438705444, "learning_rate": 5e-05, "loss": 0.0078, "step": 1714 }, { "epoch": 5.007939189189189, "grad_norm": 12.117852210998535, "learning_rate": 5e-05, "loss": 0.1481, "step": 1715 }, { "epoch": 5.007972972972973, "grad_norm": 5.385222911834717, "learning_rate": 5e-05, "loss": 0.5179, "step": 1716 }, { "epoch": 5.008006756756757, "grad_norm": 2.269894599914551, "learning_rate": 5e-05, "loss": 0.0658, "step": 1717 }, { "epoch": 5.008040540540541, "grad_norm": 0.16243185102939606, "learning_rate": 5e-05, "loss": 0.0042, "step": 1718 }, { "epoch": 5.008074324324324, "grad_norm": 1.890541672706604, "learning_rate": 5e-05, "loss": 0.0523, "step": 1719 }, { "epoch": 5.008108108108108, "grad_norm": 4.5951247215271, "learning_rate": 5e-05, "loss": 0.1937, "step": 1720 }, { "epoch": 5.008141891891892, "grad_norm": 4.048150062561035, "learning_rate": 5e-05, "loss": 0.0599, "step": 1721 }, { "epoch": 5.008175675675676, "grad_norm": 31.342315673828125, "learning_rate": 5e-05, "loss": 0.4266, "step": 1722 }, { "epoch": 5.00820945945946, "grad_norm": 0.29359227418899536, "learning_rate": 5e-05, "loss": 0.0078, "step": 1723 }, { "epoch": 5.0082432432432435, "grad_norm": 3.1201043128967285, "learning_rate": 5e-05, "loss": 0.0699, "step": 1724 }, { "epoch": 5.008277027027027, "grad_norm": 26.469196319580078, "learning_rate": 5e-05, "loss": 0.5546, "step": 1725 }, { "epoch": 5.008310810810811, "grad_norm": 8.920289993286133, "learning_rate": 5e-05, "loss": 0.2542, "step": 1726 }, { "epoch": 5.008344594594595, "grad_norm": 1.980745553970337, "learning_rate": 5e-05, "loss": 0.1072, "step": 1727 }, { "epoch": 5.008378378378378, "grad_norm": 0.7181796431541443, "learning_rate": 5e-05, "loss": 0.0139, "step": 1728 }, { "epoch": 5.008412162162162, "grad_norm": 9.331476211547852, "learning_rate": 5e-05, "loss": 0.772, "step": 1729 }, { "epoch": 5.008445945945946, "grad_norm": 2.698669910430908, "learning_rate": 5e-05, "loss": 0.1215, "step": 1730 }, { "epoch": 5.008479729729729, "grad_norm": 0.07359219342470169, "learning_rate": 5e-05, "loss": 0.0024, "step": 1731 }, { "epoch": 5.008513513513513, "grad_norm": 0.193208709359169, "learning_rate": 5e-05, "loss": 0.0018, "step": 1732 }, { "epoch": 5.008547297297297, "grad_norm": 10.448134422302246, "learning_rate": 5e-05, "loss": 0.52, "step": 1733 }, { "epoch": 5.008581081081081, "grad_norm": 17.185527801513672, "learning_rate": 5e-05, "loss": 0.177, "step": 1734 }, { "epoch": 5.008614864864865, "grad_norm": 6.810908317565918, "learning_rate": 5e-05, "loss": 0.4907, "step": 1735 }, { "epoch": 5.008648648648649, "grad_norm": 15.714237213134766, "learning_rate": 5e-05, "loss": 0.2736, "step": 1736 }, { "epoch": 5.008682432432432, "grad_norm": 9.023649215698242, "learning_rate": 5e-05, "loss": 0.0622, "step": 1737 }, { "epoch": 5.008716216216216, "grad_norm": 8.088159561157227, "learning_rate": 5e-05, "loss": 0.7122, "step": 1738 }, { "epoch": 5.00875, "grad_norm": 3.3666207790374756, "learning_rate": 5e-05, "loss": 0.0623, "step": 1739 }, { "epoch": 5.008783783783784, "grad_norm": 1.4371492862701416, "learning_rate": 5e-05, "loss": 0.0162, "step": 1740 }, { "epoch": 5.008817567567568, "grad_norm": 9.957343101501465, "learning_rate": 5e-05, "loss": 0.5426, "step": 1741 }, { "epoch": 5.0088513513513515, "grad_norm": 11.938361167907715, "learning_rate": 5e-05, "loss": 0.2192, "step": 1742 }, { "epoch": 5.008885135135135, "grad_norm": 4.402652263641357, "learning_rate": 5e-05, "loss": 0.0654, "step": 1743 }, { "epoch": 5.008918918918919, "grad_norm": 4.985011577606201, "learning_rate": 5e-05, "loss": 0.1251, "step": 1744 }, { "epoch": 5.008952702702703, "grad_norm": 5.347353458404541, "learning_rate": 5e-05, "loss": 0.1783, "step": 1745 }, { "epoch": 5.008986486486487, "grad_norm": 1.3776054382324219, "learning_rate": 5e-05, "loss": 0.0199, "step": 1746 }, { "epoch": 5.009020270270271, "grad_norm": 0.5275123119354248, "learning_rate": 5e-05, "loss": 0.0119, "step": 1747 }, { "epoch": 5.0090540540540545, "grad_norm": 0.620480477809906, "learning_rate": 5e-05, "loss": 0.0105, "step": 1748 }, { "epoch": 5.009087837837837, "grad_norm": 13.620659828186035, "learning_rate": 5e-05, "loss": 0.1196, "step": 1749 }, { "epoch": 5.009121621621621, "grad_norm": 13.62907600402832, "learning_rate": 5e-05, "loss": 0.4609, "step": 1750 }, { "epoch": 5.009155405405405, "grad_norm": 8.54165267944336, "learning_rate": 5e-05, "loss": 0.1036, "step": 1751 }, { "epoch": 5.009189189189189, "grad_norm": 0.2723406255245209, "learning_rate": 5e-05, "loss": 0.0101, "step": 1752 }, { "epoch": 5.009222972972973, "grad_norm": 23.21828269958496, "learning_rate": 5e-05, "loss": 0.2464, "step": 1753 }, { "epoch": 5.0092567567567565, "grad_norm": 8.13559627532959, "learning_rate": 5e-05, "loss": 0.459, "step": 1754 }, { "epoch": 5.00929054054054, "grad_norm": 12.676884651184082, "learning_rate": 5e-05, "loss": 0.1615, "step": 1755 }, { "epoch": 5.009324324324324, "grad_norm": 23.29253387451172, "learning_rate": 5e-05, "loss": 0.3546, "step": 1756 }, { "epoch": 5.009358108108108, "grad_norm": 19.13679313659668, "learning_rate": 5e-05, "loss": 0.2756, "step": 1757 }, { "epoch": 5.009391891891892, "grad_norm": 17.375688552856445, "learning_rate": 5e-05, "loss": 0.1551, "step": 1758 }, { "epoch": 5.009425675675676, "grad_norm": 0.04018796980381012, "learning_rate": 5e-05, "loss": 0.0011, "step": 1759 }, { "epoch": 5.0094594594594595, "grad_norm": 0.37574827671051025, "learning_rate": 5e-05, "loss": 0.0031, "step": 1760 }, { "epoch": 5.009493243243243, "grad_norm": 1.3527752161026, "learning_rate": 5e-05, "loss": 0.1189, "step": 1761 }, { "epoch": 5.009527027027027, "grad_norm": 0.05006188526749611, "learning_rate": 5e-05, "loss": 0.0012, "step": 1762 }, { "epoch": 5.009560810810811, "grad_norm": 0.4743427038192749, "learning_rate": 5e-05, "loss": 0.0145, "step": 1763 }, { "epoch": 5.009594594594595, "grad_norm": 9.244105339050293, "learning_rate": 5e-05, "loss": 0.0498, "step": 1764 }, { "epoch": 5.009628378378379, "grad_norm": 17.926071166992188, "learning_rate": 5e-05, "loss": 0.405, "step": 1765 }, { "epoch": 5.009662162162162, "grad_norm": 8.351137161254883, "learning_rate": 5e-05, "loss": 0.0754, "step": 1766 }, { "epoch": 5.009695945945946, "grad_norm": 0.17853251099586487, "learning_rate": 5e-05, "loss": 0.0031, "step": 1767 }, { "epoch": 5.00972972972973, "grad_norm": 7.424759864807129, "learning_rate": 5e-05, "loss": 0.0658, "step": 1768 }, { "epoch": 5.009763513513514, "grad_norm": 0.33864593505859375, "learning_rate": 5e-05, "loss": 0.0047, "step": 1769 }, { "epoch": 5.009797297297298, "grad_norm": 4.869662761688232, "learning_rate": 5e-05, "loss": 0.4253, "step": 1770 }, { "epoch": 5.009831081081081, "grad_norm": 0.5147989988327026, "learning_rate": 5e-05, "loss": 0.0123, "step": 1771 }, { "epoch": 5.0098648648648645, "grad_norm": 9.281559944152832, "learning_rate": 5e-05, "loss": 0.548, "step": 1772 }, { "epoch": 5.009898648648648, "grad_norm": 22.42140769958496, "learning_rate": 5e-05, "loss": 0.3452, "step": 1773 }, { "epoch": 5.009932432432432, "grad_norm": 0.295469731092453, "learning_rate": 5e-05, "loss": 0.0086, "step": 1774 }, { "epoch": 5.009966216216216, "grad_norm": 14.292773246765137, "learning_rate": 5e-05, "loss": 0.0819, "step": 1775 }, { "epoch": 5.01, "grad_norm": 7.311014175415039, "learning_rate": 5e-05, "loss": 0.4371, "step": 1776 }, { "epoch": 5.01, "eval_accuracy": 0.8497576736672051, "eval_loss": 0.5004553198814392, "eval_runtime": 32.908, "eval_samples_per_second": 18.81, "eval_steps_per_second": 2.37, "step": 1776 }, { "epoch": 6.000033783783784, "grad_norm": 2.7588870525360107, "learning_rate": 5e-05, "loss": 0.0294, "step": 1777 }, { "epoch": 6.000067567567568, "grad_norm": 8.992206573486328, "learning_rate": 5e-05, "loss": 0.1125, "step": 1778 }, { "epoch": 6.0001013513513515, "grad_norm": 7.692394733428955, "learning_rate": 5e-05, "loss": 0.0696, "step": 1779 }, { "epoch": 6.000135135135135, "grad_norm": 1.2249552011489868, "learning_rate": 5e-05, "loss": 0.0119, "step": 1780 }, { "epoch": 6.000168918918919, "grad_norm": 0.19300377368927002, "learning_rate": 5e-05, "loss": 0.0048, "step": 1781 }, { "epoch": 6.000202702702703, "grad_norm": 0.09021002054214478, "learning_rate": 5e-05, "loss": 0.0018, "step": 1782 }, { "epoch": 6.000236486486487, "grad_norm": 0.0736730620265007, "learning_rate": 5e-05, "loss": 0.0023, "step": 1783 }, { "epoch": 6.000270270270271, "grad_norm": 0.09549248963594437, "learning_rate": 5e-05, "loss": 0.0019, "step": 1784 }, { "epoch": 6.000304054054054, "grad_norm": 26.657194137573242, "learning_rate": 5e-05, "loss": 0.29, "step": 1785 }, { "epoch": 6.000337837837838, "grad_norm": 7.876435279846191, "learning_rate": 5e-05, "loss": 0.3877, "step": 1786 }, { "epoch": 6.000371621621621, "grad_norm": 6.047584533691406, "learning_rate": 5e-05, "loss": 0.0368, "step": 1787 }, { "epoch": 6.000405405405405, "grad_norm": 0.7406677007675171, "learning_rate": 5e-05, "loss": 0.0204, "step": 1788 }, { "epoch": 6.000439189189189, "grad_norm": 11.000194549560547, "learning_rate": 5e-05, "loss": 0.0894, "step": 1789 }, { "epoch": 6.000472972972973, "grad_norm": 1.6982403993606567, "learning_rate": 5e-05, "loss": 0.021, "step": 1790 }, { "epoch": 6.0005067567567565, "grad_norm": 0.23139116168022156, "learning_rate": 5e-05, "loss": 0.005, "step": 1791 }, { "epoch": 6.00054054054054, "grad_norm": 14.490854263305664, "learning_rate": 5e-05, "loss": 0.0988, "step": 1792 }, { "epoch": 6.000574324324324, "grad_norm": 1.090686321258545, "learning_rate": 5e-05, "loss": 0.014, "step": 1793 }, { "epoch": 6.000608108108108, "grad_norm": 1.6922599077224731, "learning_rate": 5e-05, "loss": 0.1599, "step": 1794 }, { "epoch": 6.000641891891892, "grad_norm": 0.5269129872322083, "learning_rate": 5e-05, "loss": 0.0067, "step": 1795 }, { "epoch": 6.000675675675676, "grad_norm": 1.499758005142212, "learning_rate": 5e-05, "loss": 0.1282, "step": 1796 }, { "epoch": 6.000709459459459, "grad_norm": 20.763507843017578, "learning_rate": 5e-05, "loss": 0.4056, "step": 1797 }, { "epoch": 6.000743243243243, "grad_norm": 1.12918221950531, "learning_rate": 5e-05, "loss": 0.0865, "step": 1798 }, { "epoch": 6.000777027027027, "grad_norm": 28.55466651916504, "learning_rate": 5e-05, "loss": 0.3086, "step": 1799 }, { "epoch": 6.000810810810811, "grad_norm": 6.4117279052734375, "learning_rate": 5e-05, "loss": 0.0488, "step": 1800 }, { "epoch": 6.000844594594595, "grad_norm": 2.5925798416137695, "learning_rate": 5e-05, "loss": 0.0489, "step": 1801 }, { "epoch": 6.000878378378379, "grad_norm": 12.379590034484863, "learning_rate": 5e-05, "loss": 0.0756, "step": 1802 }, { "epoch": 6.000912162162162, "grad_norm": 0.05971335992217064, "learning_rate": 5e-05, "loss": 0.001, "step": 1803 }, { "epoch": 6.000945945945946, "grad_norm": 23.6202335357666, "learning_rate": 5e-05, "loss": 0.0862, "step": 1804 }, { "epoch": 6.00097972972973, "grad_norm": 42.50374984741211, "learning_rate": 5e-05, "loss": 1.0835, "step": 1805 }, { "epoch": 6.001013513513514, "grad_norm": 0.08465567976236343, "learning_rate": 5e-05, "loss": 0.0015, "step": 1806 }, { "epoch": 6.001047297297298, "grad_norm": 0.5832580327987671, "learning_rate": 5e-05, "loss": 0.006, "step": 1807 }, { "epoch": 6.0010810810810815, "grad_norm": 35.61567306518555, "learning_rate": 5e-05, "loss": 0.2723, "step": 1808 }, { "epoch": 6.0011148648648645, "grad_norm": 19.292516708374023, "learning_rate": 5e-05, "loss": 1.4469, "step": 1809 }, { "epoch": 6.001148648648648, "grad_norm": 0.016667069867253304, "learning_rate": 5e-05, "loss": 0.0005, "step": 1810 }, { "epoch": 6.001182432432432, "grad_norm": 4.489448070526123, "learning_rate": 5e-05, "loss": 0.0133, "step": 1811 }, { "epoch": 6.001216216216216, "grad_norm": 11.549699783325195, "learning_rate": 5e-05, "loss": 0.5011, "step": 1812 }, { "epoch": 6.00125, "grad_norm": 18.21803092956543, "learning_rate": 5e-05, "loss": 0.5711, "step": 1813 }, { "epoch": 6.001283783783784, "grad_norm": 0.02466808445751667, "learning_rate": 5e-05, "loss": 0.0008, "step": 1814 }, { "epoch": 6.001317567567567, "grad_norm": 18.87037467956543, "learning_rate": 5e-05, "loss": 0.514, "step": 1815 }, { "epoch": 6.001351351351351, "grad_norm": 23.008441925048828, "learning_rate": 5e-05, "loss": 0.4612, "step": 1816 }, { "epoch": 6.001385135135135, "grad_norm": 0.013981284573674202, "learning_rate": 5e-05, "loss": 0.0005, "step": 1817 }, { "epoch": 6.001418918918919, "grad_norm": 0.8365600109100342, "learning_rate": 5e-05, "loss": 0.0066, "step": 1818 }, { "epoch": 6.001452702702703, "grad_norm": 0.16636532545089722, "learning_rate": 5e-05, "loss": 0.0035, "step": 1819 }, { "epoch": 6.0014864864864865, "grad_norm": 12.557854652404785, "learning_rate": 5e-05, "loss": 0.083, "step": 1820 }, { "epoch": 6.00152027027027, "grad_norm": 0.6896426677703857, "learning_rate": 5e-05, "loss": 0.0144, "step": 1821 }, { "epoch": 6.001554054054054, "grad_norm": 0.20396538078784943, "learning_rate": 5e-05, "loss": 0.0045, "step": 1822 }, { "epoch": 6.001587837837838, "grad_norm": 0.3038482367992401, "learning_rate": 5e-05, "loss": 0.0037, "step": 1823 }, { "epoch": 6.001621621621622, "grad_norm": 2.7335259914398193, "learning_rate": 5e-05, "loss": 0.0189, "step": 1824 }, { "epoch": 6.001655405405406, "grad_norm": 26.4488468170166, "learning_rate": 5e-05, "loss": 0.1027, "step": 1825 }, { "epoch": 6.0016891891891895, "grad_norm": 10.016544342041016, "learning_rate": 5e-05, "loss": 0.225, "step": 1826 }, { "epoch": 6.001722972972973, "grad_norm": 15.286067008972168, "learning_rate": 5e-05, "loss": 0.5889, "step": 1827 }, { "epoch": 6.001756756756757, "grad_norm": 0.06721122562885284, "learning_rate": 5e-05, "loss": 0.0019, "step": 1828 }, { "epoch": 6.001790540540541, "grad_norm": 1.263869285583496, "learning_rate": 5e-05, "loss": 0.0162, "step": 1829 }, { "epoch": 6.001824324324324, "grad_norm": 0.03527368605136871, "learning_rate": 5e-05, "loss": 0.0008, "step": 1830 }, { "epoch": 6.001858108108108, "grad_norm": 0.08841827511787415, "learning_rate": 5e-05, "loss": 0.0019, "step": 1831 }, { "epoch": 6.0018918918918915, "grad_norm": 9.587873458862305, "learning_rate": 5e-05, "loss": 0.1384, "step": 1832 }, { "epoch": 6.001925675675675, "grad_norm": 6.13158655166626, "learning_rate": 5e-05, "loss": 0.0263, "step": 1833 }, { "epoch": 6.001959459459459, "grad_norm": 15.495856285095215, "learning_rate": 5e-05, "loss": 0.6537, "step": 1834 }, { "epoch": 6.001993243243243, "grad_norm": 1.5951896905899048, "learning_rate": 5e-05, "loss": 0.0106, "step": 1835 }, { "epoch": 6.002027027027027, "grad_norm": 0.3703780174255371, "learning_rate": 5e-05, "loss": 0.0032, "step": 1836 }, { "epoch": 6.002060810810811, "grad_norm": 20.27166175842285, "learning_rate": 5e-05, "loss": 0.408, "step": 1837 }, { "epoch": 6.0020945945945945, "grad_norm": 12.29081916809082, "learning_rate": 5e-05, "loss": 0.7249, "step": 1838 }, { "epoch": 6.002128378378378, "grad_norm": 0.017890412360429764, "learning_rate": 5e-05, "loss": 0.0006, "step": 1839 }, { "epoch": 6.002162162162162, "grad_norm": 0.5222967267036438, "learning_rate": 5e-05, "loss": 0.0039, "step": 1840 }, { "epoch": 6.002195945945946, "grad_norm": 0.09290409833192825, "learning_rate": 5e-05, "loss": 0.0013, "step": 1841 }, { "epoch": 6.00222972972973, "grad_norm": 0.059473611414432526, "learning_rate": 5e-05, "loss": 0.0014, "step": 1842 }, { "epoch": 6.002263513513514, "grad_norm": 33.043968200683594, "learning_rate": 5e-05, "loss": 0.4538, "step": 1843 }, { "epoch": 6.002297297297297, "grad_norm": 0.057186029851436615, "learning_rate": 5e-05, "loss": 0.0009, "step": 1844 }, { "epoch": 6.002331081081081, "grad_norm": 0.06545168161392212, "learning_rate": 5e-05, "loss": 0.0016, "step": 1845 }, { "epoch": 6.002364864864865, "grad_norm": 1.0227142572402954, "learning_rate": 5e-05, "loss": 0.0089, "step": 1846 }, { "epoch": 6.002398648648649, "grad_norm": 1.871518611907959, "learning_rate": 5e-05, "loss": 0.0106, "step": 1847 }, { "epoch": 6.002432432432433, "grad_norm": 6.963522911071777, "learning_rate": 5e-05, "loss": 0.0678, "step": 1848 }, { "epoch": 6.002466216216217, "grad_norm": 0.7290064692497253, "learning_rate": 5e-05, "loss": 0.0191, "step": 1849 }, { "epoch": 6.0025, "grad_norm": 0.32016053795814514, "learning_rate": 5e-05, "loss": 0.0071, "step": 1850 }, { "epoch": 6.002533783783784, "grad_norm": 0.10804538428783417, "learning_rate": 5e-05, "loss": 0.0022, "step": 1851 }, { "epoch": 6.002567567567567, "grad_norm": 3.7657341957092285, "learning_rate": 5e-05, "loss": 0.0249, "step": 1852 }, { "epoch": 6.002601351351351, "grad_norm": 1.1677212715148926, "learning_rate": 5e-05, "loss": 0.0025, "step": 1853 }, { "epoch": 6.002635135135135, "grad_norm": 0.8399659991264343, "learning_rate": 5e-05, "loss": 0.0085, "step": 1854 }, { "epoch": 6.002668918918919, "grad_norm": 1.2627474069595337, "learning_rate": 5e-05, "loss": 0.0092, "step": 1855 }, { "epoch": 6.0027027027027025, "grad_norm": 0.9960414171218872, "learning_rate": 5e-05, "loss": 0.0093, "step": 1856 }, { "epoch": 6.002736486486486, "grad_norm": 3.830951690673828, "learning_rate": 5e-05, "loss": 0.0422, "step": 1857 }, { "epoch": 6.00277027027027, "grad_norm": 0.8197696805000305, "learning_rate": 5e-05, "loss": 0.0063, "step": 1858 }, { "epoch": 6.002804054054054, "grad_norm": 0.029217146337032318, "learning_rate": 5e-05, "loss": 0.0008, "step": 1859 }, { "epoch": 6.002837837837838, "grad_norm": 17.31867027282715, "learning_rate": 5e-05, "loss": 0.4476, "step": 1860 }, { "epoch": 6.002871621621622, "grad_norm": 0.31854093074798584, "learning_rate": 5e-05, "loss": 0.0027, "step": 1861 }, { "epoch": 6.002905405405405, "grad_norm": 14.969334602355957, "learning_rate": 5e-05, "loss": 0.8899, "step": 1862 }, { "epoch": 6.002939189189189, "grad_norm": 27.432085037231445, "learning_rate": 5e-05, "loss": 0.4641, "step": 1863 }, { "epoch": 6.002972972972973, "grad_norm": 13.126320838928223, "learning_rate": 5e-05, "loss": 0.2034, "step": 1864 }, { "epoch": 6.003006756756757, "grad_norm": 0.014224598184227943, "learning_rate": 5e-05, "loss": 0.0004, "step": 1865 }, { "epoch": 6.003040540540541, "grad_norm": 1.9783481359481812, "learning_rate": 5e-05, "loss": 0.0073, "step": 1866 }, { "epoch": 6.0030743243243245, "grad_norm": 0.00820860080420971, "learning_rate": 5e-05, "loss": 0.0003, "step": 1867 }, { "epoch": 6.003108108108108, "grad_norm": 0.05753011628985405, "learning_rate": 5e-05, "loss": 0.0007, "step": 1868 }, { "epoch": 6.003141891891892, "grad_norm": 0.0859554260969162, "learning_rate": 5e-05, "loss": 0.0013, "step": 1869 }, { "epoch": 6.003175675675676, "grad_norm": 0.006816410925239325, "learning_rate": 5e-05, "loss": 0.0002, "step": 1870 }, { "epoch": 6.00320945945946, "grad_norm": 11.323225975036621, "learning_rate": 5e-05, "loss": 0.0757, "step": 1871 }, { "epoch": 6.003243243243244, "grad_norm": 0.6732067465782166, "learning_rate": 5e-05, "loss": 0.0061, "step": 1872 }, { "epoch": 6.003277027027027, "grad_norm": 19.429662704467773, "learning_rate": 5e-05, "loss": 0.5462, "step": 1873 }, { "epoch": 6.00331081081081, "grad_norm": 0.41651448607444763, "learning_rate": 5e-05, "loss": 0.0042, "step": 1874 }, { "epoch": 6.003344594594594, "grad_norm": 4.897641181945801, "learning_rate": 5e-05, "loss": 0.364, "step": 1875 }, { "epoch": 6.003378378378378, "grad_norm": 0.4117446541786194, "learning_rate": 5e-05, "loss": 0.0031, "step": 1876 }, { "epoch": 6.003412162162162, "grad_norm": 8.181010246276855, "learning_rate": 5e-05, "loss": 0.034, "step": 1877 }, { "epoch": 6.003445945945946, "grad_norm": 4.120474338531494, "learning_rate": 5e-05, "loss": 0.0143, "step": 1878 }, { "epoch": 6.0034797297297295, "grad_norm": 0.026677455753087997, "learning_rate": 5e-05, "loss": 0.0008, "step": 1879 }, { "epoch": 6.003513513513513, "grad_norm": 0.03729546070098877, "learning_rate": 5e-05, "loss": 0.0005, "step": 1880 }, { "epoch": 6.003547297297297, "grad_norm": 0.01981762796640396, "learning_rate": 5e-05, "loss": 0.0003, "step": 1881 }, { "epoch": 6.003581081081081, "grad_norm": 2.4388279914855957, "learning_rate": 5e-05, "loss": 0.3117, "step": 1882 }, { "epoch": 6.003614864864865, "grad_norm": 11.35088062286377, "learning_rate": 5e-05, "loss": 0.0139, "step": 1883 }, { "epoch": 6.003648648648649, "grad_norm": 0.2041662633419037, "learning_rate": 5e-05, "loss": 0.0027, "step": 1884 }, { "epoch": 6.0036824324324325, "grad_norm": 2.068455457687378, "learning_rate": 5e-05, "loss": 0.0179, "step": 1885 }, { "epoch": 6.003716216216216, "grad_norm": 0.014860978350043297, "learning_rate": 5e-05, "loss": 0.0004, "step": 1886 }, { "epoch": 6.00375, "grad_norm": 2.548527240753174, "learning_rate": 5e-05, "loss": 0.0497, "step": 1887 }, { "epoch": 6.003783783783784, "grad_norm": 8.986519813537598, "learning_rate": 5e-05, "loss": 0.1737, "step": 1888 }, { "epoch": 6.003817567567568, "grad_norm": 0.2100134640932083, "learning_rate": 5e-05, "loss": 0.0034, "step": 1889 }, { "epoch": 6.003851351351352, "grad_norm": 0.3344561457633972, "learning_rate": 5e-05, "loss": 0.0032, "step": 1890 }, { "epoch": 6.003885135135135, "grad_norm": 17.55861473083496, "learning_rate": 5e-05, "loss": 0.6131, "step": 1891 }, { "epoch": 6.003918918918919, "grad_norm": 1.650989055633545, "learning_rate": 5e-05, "loss": 0.0573, "step": 1892 }, { "epoch": 6.003952702702703, "grad_norm": 28.32549476623535, "learning_rate": 5e-05, "loss": 0.8174, "step": 1893 }, { "epoch": 6.003986486486487, "grad_norm": 1.3079702854156494, "learning_rate": 5e-05, "loss": 0.012, "step": 1894 }, { "epoch": 6.00402027027027, "grad_norm": 0.02394024096429348, "learning_rate": 5e-05, "loss": 0.0007, "step": 1895 }, { "epoch": 6.004054054054054, "grad_norm": 5.2281365394592285, "learning_rate": 5e-05, "loss": 0.0482, "step": 1896 }, { "epoch": 6.0040878378378375, "grad_norm": 36.055137634277344, "learning_rate": 5e-05, "loss": 0.2932, "step": 1897 }, { "epoch": 6.004121621621621, "grad_norm": 0.0918693020939827, "learning_rate": 5e-05, "loss": 0.0012, "step": 1898 }, { "epoch": 6.004155405405405, "grad_norm": 0.11932863295078278, "learning_rate": 5e-05, "loss": 0.0015, "step": 1899 }, { "epoch": 6.004189189189189, "grad_norm": 7.099946975708008, "learning_rate": 5e-05, "loss": 0.9566, "step": 1900 }, { "epoch": 6.004222972972973, "grad_norm": 5.83871603012085, "learning_rate": 5e-05, "loss": 0.0226, "step": 1901 }, { "epoch": 6.004256756756757, "grad_norm": 0.33274054527282715, "learning_rate": 5e-05, "loss": 0.0074, "step": 1902 }, { "epoch": 6.0042905405405405, "grad_norm": 0.03488533943891525, "learning_rate": 5e-05, "loss": 0.0011, "step": 1903 }, { "epoch": 6.004324324324324, "grad_norm": 20.396116256713867, "learning_rate": 5e-05, "loss": 0.2046, "step": 1904 }, { "epoch": 6.004358108108108, "grad_norm": 0.17402514815330505, "learning_rate": 5e-05, "loss": 0.0019, "step": 1905 }, { "epoch": 6.004391891891892, "grad_norm": 12.342816352844238, "learning_rate": 5e-05, "loss": 0.2223, "step": 1906 }, { "epoch": 6.004425675675676, "grad_norm": 0.016704929992556572, "learning_rate": 5e-05, "loss": 0.0005, "step": 1907 }, { "epoch": 6.00445945945946, "grad_norm": 21.687053680419922, "learning_rate": 5e-05, "loss": 0.1563, "step": 1908 }, { "epoch": 6.004493243243243, "grad_norm": 0.164342001080513, "learning_rate": 5e-05, "loss": 0.0023, "step": 1909 }, { "epoch": 6.004527027027027, "grad_norm": 11.36880874633789, "learning_rate": 5e-05, "loss": 0.9268, "step": 1910 }, { "epoch": 6.004560810810811, "grad_norm": 10.228264808654785, "learning_rate": 5e-05, "loss": 0.5295, "step": 1911 }, { "epoch": 6.004594594594595, "grad_norm": 22.558521270751953, "learning_rate": 5e-05, "loss": 0.6342, "step": 1912 }, { "epoch": 6.004628378378379, "grad_norm": 14.158387184143066, "learning_rate": 5e-05, "loss": 0.5875, "step": 1913 }, { "epoch": 6.0046621621621625, "grad_norm": 0.12436309456825256, "learning_rate": 5e-05, "loss": 0.0014, "step": 1914 }, { "epoch": 6.004695945945946, "grad_norm": 1.7407649755477905, "learning_rate": 5e-05, "loss": 0.0428, "step": 1915 }, { "epoch": 6.004729729729729, "grad_norm": 6.327157497406006, "learning_rate": 5e-05, "loss": 0.0477, "step": 1916 }, { "epoch": 6.004763513513513, "grad_norm": 6.552639484405518, "learning_rate": 5e-05, "loss": 0.2698, "step": 1917 }, { "epoch": 6.004797297297297, "grad_norm": 20.97304344177246, "learning_rate": 5e-05, "loss": 1.0349, "step": 1918 }, { "epoch": 6.004831081081081, "grad_norm": 15.474186897277832, "learning_rate": 5e-05, "loss": 0.6208, "step": 1919 }, { "epoch": 6.004864864864865, "grad_norm": 0.18650558590888977, "learning_rate": 5e-05, "loss": 0.0021, "step": 1920 }, { "epoch": 6.004898648648648, "grad_norm": 2.2763497829437256, "learning_rate": 5e-05, "loss": 0.0476, "step": 1921 }, { "epoch": 6.004932432432432, "grad_norm": 9.389820098876953, "learning_rate": 5e-05, "loss": 0.1494, "step": 1922 }, { "epoch": 6.004966216216216, "grad_norm": 4.943981170654297, "learning_rate": 5e-05, "loss": 0.0475, "step": 1923 }, { "epoch": 6.005, "grad_norm": 5.012251377105713, "learning_rate": 5e-05, "loss": 0.0288, "step": 1924 }, { "epoch": 6.005033783783784, "grad_norm": 21.32411766052246, "learning_rate": 5e-05, "loss": 0.2295, "step": 1925 }, { "epoch": 6.0050675675675675, "grad_norm": 14.928879737854004, "learning_rate": 5e-05, "loss": 0.6246, "step": 1926 }, { "epoch": 6.005101351351351, "grad_norm": 0.06770685315132141, "learning_rate": 5e-05, "loss": 0.0013, "step": 1927 }, { "epoch": 6.005135135135135, "grad_norm": 0.36054351925849915, "learning_rate": 5e-05, "loss": 0.0058, "step": 1928 }, { "epoch": 6.005168918918919, "grad_norm": 2.802499771118164, "learning_rate": 5e-05, "loss": 0.032, "step": 1929 }, { "epoch": 6.005202702702703, "grad_norm": 0.07030037045478821, "learning_rate": 5e-05, "loss": 0.0022, "step": 1930 }, { "epoch": 6.005236486486487, "grad_norm": 7.193667888641357, "learning_rate": 5e-05, "loss": 0.3395, "step": 1931 }, { "epoch": 6.0052702702702705, "grad_norm": 0.11465185880661011, "learning_rate": 5e-05, "loss": 0.0036, "step": 1932 }, { "epoch": 6.005304054054054, "grad_norm": 13.157487869262695, "learning_rate": 5e-05, "loss": 0.5034, "step": 1933 }, { "epoch": 6.005337837837838, "grad_norm": 18.85061264038086, "learning_rate": 5e-05, "loss": 0.2507, "step": 1934 }, { "epoch": 6.005371621621622, "grad_norm": 0.16563062369823456, "learning_rate": 5e-05, "loss": 0.0038, "step": 1935 }, { "epoch": 6.005405405405406, "grad_norm": 12.045029640197754, "learning_rate": 5e-05, "loss": 1.17, "step": 1936 }, { "epoch": 6.00543918918919, "grad_norm": 11.556281089782715, "learning_rate": 5e-05, "loss": 0.2055, "step": 1937 }, { "epoch": 6.005472972972973, "grad_norm": 0.6418059468269348, "learning_rate": 5e-05, "loss": 0.0147, "step": 1938 }, { "epoch": 6.005506756756756, "grad_norm": 10.67901611328125, "learning_rate": 5e-05, "loss": 0.0845, "step": 1939 }, { "epoch": 6.00554054054054, "grad_norm": 21.25682830810547, "learning_rate": 5e-05, "loss": 0.21, "step": 1940 }, { "epoch": 6.005574324324324, "grad_norm": 0.04806722328066826, "learning_rate": 5e-05, "loss": 0.0015, "step": 1941 }, { "epoch": 6.005608108108108, "grad_norm": 12.976702690124512, "learning_rate": 5e-05, "loss": 0.4921, "step": 1942 }, { "epoch": 6.005641891891892, "grad_norm": 5.452323913574219, "learning_rate": 5e-05, "loss": 0.7479, "step": 1943 }, { "epoch": 6.0056756756756755, "grad_norm": 6.149107456207275, "learning_rate": 5e-05, "loss": 0.2909, "step": 1944 }, { "epoch": 6.005709459459459, "grad_norm": 0.2162931263446808, "learning_rate": 5e-05, "loss": 0.005, "step": 1945 }, { "epoch": 6.005743243243243, "grad_norm": 2.1148011684417725, "learning_rate": 5e-05, "loss": 0.036, "step": 1946 }, { "epoch": 6.005777027027027, "grad_norm": 4.490005970001221, "learning_rate": 5e-05, "loss": 0.0625, "step": 1947 }, { "epoch": 6.005810810810811, "grad_norm": 1.496354341506958, "learning_rate": 5e-05, "loss": 0.0107, "step": 1948 }, { "epoch": 6.005844594594595, "grad_norm": 0.24352334439754486, "learning_rate": 5e-05, "loss": 0.006, "step": 1949 }, { "epoch": 6.0058783783783785, "grad_norm": 5.191158771514893, "learning_rate": 5e-05, "loss": 0.5186, "step": 1950 }, { "epoch": 6.005912162162162, "grad_norm": 0.5030950307846069, "learning_rate": 5e-05, "loss": 0.0126, "step": 1951 }, { "epoch": 6.005945945945946, "grad_norm": 0.41631606221199036, "learning_rate": 5e-05, "loss": 0.0056, "step": 1952 }, { "epoch": 6.00597972972973, "grad_norm": 1.867934226989746, "learning_rate": 5e-05, "loss": 0.0428, "step": 1953 }, { "epoch": 6.006013513513514, "grad_norm": 0.023642994463443756, "learning_rate": 5e-05, "loss": 0.0009, "step": 1954 }, { "epoch": 6.006047297297298, "grad_norm": 10.207080841064453, "learning_rate": 5e-05, "loss": 0.7076, "step": 1955 }, { "epoch": 6.006081081081081, "grad_norm": 0.09119213372468948, "learning_rate": 5e-05, "loss": 0.0033, "step": 1956 }, { "epoch": 6.006114864864865, "grad_norm": 30.130910873413086, "learning_rate": 5e-05, "loss": 0.3344, "step": 1957 }, { "epoch": 6.006148648648649, "grad_norm": 14.728409767150879, "learning_rate": 5e-05, "loss": 0.3789, "step": 1958 }, { "epoch": 6.006182432432432, "grad_norm": 0.9050992727279663, "learning_rate": 5e-05, "loss": 0.0107, "step": 1959 }, { "epoch": 6.006216216216216, "grad_norm": 0.018691979348659515, "learning_rate": 5e-05, "loss": 0.0007, "step": 1960 }, { "epoch": 6.00625, "grad_norm": 2.3380231857299805, "learning_rate": 5e-05, "loss": 0.0367, "step": 1961 }, { "epoch": 6.0062837837837835, "grad_norm": 22.767658233642578, "learning_rate": 5e-05, "loss": 0.1751, "step": 1962 }, { "epoch": 6.006317567567567, "grad_norm": 35.663612365722656, "learning_rate": 5e-05, "loss": 0.2421, "step": 1963 }, { "epoch": 6.006351351351351, "grad_norm": 0.06307879090309143, "learning_rate": 5e-05, "loss": 0.0014, "step": 1964 }, { "epoch": 6.006385135135135, "grad_norm": 36.14234924316406, "learning_rate": 5e-05, "loss": 0.7058, "step": 1965 }, { "epoch": 6.006418918918919, "grad_norm": 0.9600571393966675, "learning_rate": 5e-05, "loss": 0.0202, "step": 1966 }, { "epoch": 6.006452702702703, "grad_norm": 11.006872177124023, "learning_rate": 5e-05, "loss": 0.1169, "step": 1967 }, { "epoch": 6.006486486486486, "grad_norm": 7.817658424377441, "learning_rate": 5e-05, "loss": 0.0582, "step": 1968 }, { "epoch": 6.00652027027027, "grad_norm": 4.236488342285156, "learning_rate": 5e-05, "loss": 0.0342, "step": 1969 }, { "epoch": 6.006554054054054, "grad_norm": 4.750302314758301, "learning_rate": 5e-05, "loss": 0.0615, "step": 1970 }, { "epoch": 6.006587837837838, "grad_norm": 0.3307317793369293, "learning_rate": 5e-05, "loss": 0.0057, "step": 1971 }, { "epoch": 6.006621621621622, "grad_norm": 1.1461209058761597, "learning_rate": 5e-05, "loss": 0.0107, "step": 1972 }, { "epoch": 6.0066554054054055, "grad_norm": 11.590350151062012, "learning_rate": 5e-05, "loss": 0.1147, "step": 1973 }, { "epoch": 6.006689189189189, "grad_norm": 18.8985595703125, "learning_rate": 5e-05, "loss": 0.4032, "step": 1974 }, { "epoch": 6.006722972972973, "grad_norm": 0.4356536269187927, "learning_rate": 5e-05, "loss": 0.0034, "step": 1975 }, { "epoch": 6.006756756756757, "grad_norm": 0.0711522176861763, "learning_rate": 5e-05, "loss": 0.002, "step": 1976 }, { "epoch": 6.006790540540541, "grad_norm": 0.10092207789421082, "learning_rate": 5e-05, "loss": 0.0026, "step": 1977 }, { "epoch": 6.006824324324325, "grad_norm": 8.4664888381958, "learning_rate": 5e-05, "loss": 0.429, "step": 1978 }, { "epoch": 6.0068581081081085, "grad_norm": 0.022938579320907593, "learning_rate": 5e-05, "loss": 0.0009, "step": 1979 }, { "epoch": 6.006891891891892, "grad_norm": 0.9770704507827759, "learning_rate": 5e-05, "loss": 0.021, "step": 1980 }, { "epoch": 6.006925675675675, "grad_norm": 1.0734848976135254, "learning_rate": 5e-05, "loss": 0.0213, "step": 1981 }, { "epoch": 6.006959459459459, "grad_norm": 2.2289931774139404, "learning_rate": 5e-05, "loss": 0.0125, "step": 1982 }, { "epoch": 6.006993243243243, "grad_norm": 0.8382478356361389, "learning_rate": 5e-05, "loss": 0.0102, "step": 1983 }, { "epoch": 6.007027027027027, "grad_norm": 0.06175365671515465, "learning_rate": 5e-05, "loss": 0.0013, "step": 1984 }, { "epoch": 6.007060810810811, "grad_norm": 6.70506477355957, "learning_rate": 5e-05, "loss": 0.0648, "step": 1985 }, { "epoch": 6.007094594594594, "grad_norm": 0.011878696270287037, "learning_rate": 5e-05, "loss": 0.0004, "step": 1986 }, { "epoch": 6.007128378378378, "grad_norm": 0.1421385556459427, "learning_rate": 5e-05, "loss": 0.0011, "step": 1987 }, { "epoch": 6.007162162162162, "grad_norm": 1.636307954788208, "learning_rate": 5e-05, "loss": 0.011, "step": 1988 }, { "epoch": 6.007195945945946, "grad_norm": 11.217211723327637, "learning_rate": 5e-05, "loss": 0.8352, "step": 1989 }, { "epoch": 6.00722972972973, "grad_norm": 49.1063232421875, "learning_rate": 5e-05, "loss": 0.4188, "step": 1990 }, { "epoch": 6.0072635135135135, "grad_norm": 16.933382034301758, "learning_rate": 5e-05, "loss": 0.7072, "step": 1991 }, { "epoch": 6.007297297297297, "grad_norm": 0.005439423490315676, "learning_rate": 5e-05, "loss": 0.0002, "step": 1992 }, { "epoch": 6.007331081081081, "grad_norm": 0.341806560754776, "learning_rate": 5e-05, "loss": 0.0027, "step": 1993 }, { "epoch": 6.007364864864865, "grad_norm": 28.26555824279785, "learning_rate": 5e-05, "loss": 0.36, "step": 1994 }, { "epoch": 6.007398648648649, "grad_norm": 14.21964168548584, "learning_rate": 5e-05, "loss": 0.0928, "step": 1995 }, { "epoch": 6.007432432432433, "grad_norm": 25.595245361328125, "learning_rate": 5e-05, "loss": 0.0971, "step": 1996 }, { "epoch": 6.0074662162162165, "grad_norm": 5.5482707023620605, "learning_rate": 5e-05, "loss": 0.0213, "step": 1997 }, { "epoch": 6.0075, "grad_norm": 9.50156021118164, "learning_rate": 5e-05, "loss": 0.0864, "step": 1998 }, { "epoch": 6.007533783783784, "grad_norm": 1.226300597190857, "learning_rate": 5e-05, "loss": 0.0084, "step": 1999 }, { "epoch": 6.007567567567568, "grad_norm": 5.896008014678955, "learning_rate": 5e-05, "loss": 0.0255, "step": 2000 }, { "epoch": 6.007601351351352, "grad_norm": 0.35035738348960876, "learning_rate": 5e-05, "loss": 0.0049, "step": 2001 }, { "epoch": 6.007635135135135, "grad_norm": 0.0466594360768795, "learning_rate": 5e-05, "loss": 0.0011, "step": 2002 }, { "epoch": 6.0076689189189185, "grad_norm": 5.8562846183776855, "learning_rate": 5e-05, "loss": 0.0236, "step": 2003 }, { "epoch": 6.007702702702702, "grad_norm": 17.695945739746094, "learning_rate": 5e-05, "loss": 1.2084, "step": 2004 }, { "epoch": 6.007736486486486, "grad_norm": 2.6068525314331055, "learning_rate": 5e-05, "loss": 0.0125, "step": 2005 }, { "epoch": 6.00777027027027, "grad_norm": 21.540821075439453, "learning_rate": 5e-05, "loss": 0.3114, "step": 2006 }, { "epoch": 6.007804054054054, "grad_norm": 0.028623495250940323, "learning_rate": 5e-05, "loss": 0.0008, "step": 2007 }, { "epoch": 6.007837837837838, "grad_norm": 16.644428253173828, "learning_rate": 5e-05, "loss": 0.2666, "step": 2008 }, { "epoch": 6.0078716216216215, "grad_norm": 16.338699340820312, "learning_rate": 5e-05, "loss": 1.0391, "step": 2009 }, { "epoch": 6.007905405405405, "grad_norm": 26.81963539123535, "learning_rate": 5e-05, "loss": 0.9311, "step": 2010 }, { "epoch": 6.007939189189189, "grad_norm": 3.9777374267578125, "learning_rate": 5e-05, "loss": 0.1955, "step": 2011 }, { "epoch": 6.007972972972973, "grad_norm": 5.94364070892334, "learning_rate": 5e-05, "loss": 0.0266, "step": 2012 }, { "epoch": 6.008006756756757, "grad_norm": 0.3813297152519226, "learning_rate": 5e-05, "loss": 0.0055, "step": 2013 }, { "epoch": 6.008040540540541, "grad_norm": 57.39351272583008, "learning_rate": 5e-05, "loss": 0.2453, "step": 2014 }, { "epoch": 6.008074324324324, "grad_norm": 7.502597808837891, "learning_rate": 5e-05, "loss": 0.0305, "step": 2015 }, { "epoch": 6.008108108108108, "grad_norm": 4.1780924797058105, "learning_rate": 5e-05, "loss": 0.1312, "step": 2016 }, { "epoch": 6.008141891891892, "grad_norm": 0.6993635892868042, "learning_rate": 5e-05, "loss": 0.0139, "step": 2017 }, { "epoch": 6.008175675675676, "grad_norm": 0.8333580493927002, "learning_rate": 5e-05, "loss": 0.0124, "step": 2018 }, { "epoch": 6.00820945945946, "grad_norm": 0.229800745844841, "learning_rate": 5e-05, "loss": 0.0021, "step": 2019 }, { "epoch": 6.0082432432432435, "grad_norm": 1.4739024639129639, "learning_rate": 5e-05, "loss": 0.0066, "step": 2020 }, { "epoch": 6.008277027027027, "grad_norm": 19.178550720214844, "learning_rate": 5e-05, "loss": 0.0973, "step": 2021 }, { "epoch": 6.008310810810811, "grad_norm": 3.0265841484069824, "learning_rate": 5e-05, "loss": 0.1198, "step": 2022 }, { "epoch": 6.008344594594595, "grad_norm": 9.35561752319336, "learning_rate": 5e-05, "loss": 0.1051, "step": 2023 }, { "epoch": 6.008378378378378, "grad_norm": 6.249094009399414, "learning_rate": 5e-05, "loss": 0.188, "step": 2024 }, { "epoch": 6.008412162162162, "grad_norm": 21.458477020263672, "learning_rate": 5e-05, "loss": 0.1009, "step": 2025 }, { "epoch": 6.008445945945946, "grad_norm": 3.2394790649414062, "learning_rate": 5e-05, "loss": 0.0368, "step": 2026 }, { "epoch": 6.008479729729729, "grad_norm": 3.630964994430542, "learning_rate": 5e-05, "loss": 0.7892, "step": 2027 }, { "epoch": 6.008513513513513, "grad_norm": 5.632106781005859, "learning_rate": 5e-05, "loss": 0.0997, "step": 2028 }, { "epoch": 6.008547297297297, "grad_norm": 2.155853509902954, "learning_rate": 5e-05, "loss": 0.2551, "step": 2029 }, { "epoch": 6.008581081081081, "grad_norm": 12.267677307128906, "learning_rate": 5e-05, "loss": 0.0765, "step": 2030 }, { "epoch": 6.008614864864865, "grad_norm": 9.154311180114746, "learning_rate": 5e-05, "loss": 0.5844, "step": 2031 }, { "epoch": 6.008648648648649, "grad_norm": 1.4563469886779785, "learning_rate": 5e-05, "loss": 0.0104, "step": 2032 }, { "epoch": 6.008682432432432, "grad_norm": 2.2754809856414795, "learning_rate": 5e-05, "loss": 0.0164, "step": 2033 }, { "epoch": 6.008716216216216, "grad_norm": 3.5648536682128906, "learning_rate": 5e-05, "loss": 0.6214, "step": 2034 }, { "epoch": 6.00875, "grad_norm": 0.01424784492701292, "learning_rate": 5e-05, "loss": 0.0005, "step": 2035 }, { "epoch": 6.008783783783784, "grad_norm": 32.23719787597656, "learning_rate": 5e-05, "loss": 0.7586, "step": 2036 }, { "epoch": 6.008817567567568, "grad_norm": 7.218935012817383, "learning_rate": 5e-05, "loss": 0.1125, "step": 2037 }, { "epoch": 6.0088513513513515, "grad_norm": 13.340131759643555, "learning_rate": 5e-05, "loss": 0.3403, "step": 2038 }, { "epoch": 6.008885135135135, "grad_norm": 1.8523515462875366, "learning_rate": 5e-05, "loss": 0.0161, "step": 2039 }, { "epoch": 6.008918918918919, "grad_norm": 0.7845292687416077, "learning_rate": 5e-05, "loss": 0.0137, "step": 2040 }, { "epoch": 6.008952702702703, "grad_norm": 28.830896377563477, "learning_rate": 5e-05, "loss": 0.3129, "step": 2041 }, { "epoch": 6.008986486486487, "grad_norm": 21.340900421142578, "learning_rate": 5e-05, "loss": 0.3482, "step": 2042 }, { "epoch": 6.009020270270271, "grad_norm": 9.193009376525879, "learning_rate": 5e-05, "loss": 0.2271, "step": 2043 }, { "epoch": 6.0090540540540545, "grad_norm": 3.4592103958129883, "learning_rate": 5e-05, "loss": 0.0212, "step": 2044 }, { "epoch": 6.009087837837837, "grad_norm": 0.040853120386600494, "learning_rate": 5e-05, "loss": 0.0011, "step": 2045 }, { "epoch": 6.009121621621621, "grad_norm": 3.5744988918304443, "learning_rate": 5e-05, "loss": 0.1528, "step": 2046 }, { "epoch": 6.009155405405405, "grad_norm": 13.185524940490723, "learning_rate": 5e-05, "loss": 0.315, "step": 2047 }, { "epoch": 6.009189189189189, "grad_norm": 1.3712024688720703, "learning_rate": 5e-05, "loss": 0.0904, "step": 2048 }, { "epoch": 6.009222972972973, "grad_norm": 23.91023826599121, "learning_rate": 5e-05, "loss": 0.1833, "step": 2049 }, { "epoch": 6.0092567567567565, "grad_norm": 32.30268096923828, "learning_rate": 5e-05, "loss": 0.1761, "step": 2050 }, { "epoch": 6.00929054054054, "grad_norm": 2.666043996810913, "learning_rate": 5e-05, "loss": 0.0376, "step": 2051 }, { "epoch": 6.009324324324324, "grad_norm": 7.071811199188232, "learning_rate": 5e-05, "loss": 0.0467, "step": 2052 }, { "epoch": 6.009358108108108, "grad_norm": 27.60280418395996, "learning_rate": 5e-05, "loss": 0.7561, "step": 2053 }, { "epoch": 6.009391891891892, "grad_norm": 4.715237140655518, "learning_rate": 5e-05, "loss": 0.4068, "step": 2054 }, { "epoch": 6.009425675675676, "grad_norm": 13.577132225036621, "learning_rate": 5e-05, "loss": 0.1514, "step": 2055 }, { "epoch": 6.0094594594594595, "grad_norm": 7.102494716644287, "learning_rate": 5e-05, "loss": 0.7026, "step": 2056 }, { "epoch": 6.009493243243243, "grad_norm": 4.172069072723389, "learning_rate": 5e-05, "loss": 0.0532, "step": 2057 }, { "epoch": 6.009527027027027, "grad_norm": 8.408449172973633, "learning_rate": 5e-05, "loss": 0.278, "step": 2058 }, { "epoch": 6.009560810810811, "grad_norm": 16.022985458374023, "learning_rate": 5e-05, "loss": 0.747, "step": 2059 }, { "epoch": 6.009594594594595, "grad_norm": 4.077399730682373, "learning_rate": 5e-05, "loss": 0.3521, "step": 2060 }, { "epoch": 6.009628378378379, "grad_norm": 0.24953767657279968, "learning_rate": 5e-05, "loss": 0.0058, "step": 2061 }, { "epoch": 6.009662162162162, "grad_norm": 18.882911682128906, "learning_rate": 5e-05, "loss": 0.1693, "step": 2062 }, { "epoch": 6.009695945945946, "grad_norm": 0.16422326862812042, "learning_rate": 5e-05, "loss": 0.0028, "step": 2063 }, { "epoch": 6.00972972972973, "grad_norm": 3.9541015625, "learning_rate": 5e-05, "loss": 0.124, "step": 2064 }, { "epoch": 6.009763513513514, "grad_norm": 7.802035808563232, "learning_rate": 5e-05, "loss": 0.1312, "step": 2065 }, { "epoch": 6.009797297297298, "grad_norm": 3.731193780899048, "learning_rate": 5e-05, "loss": 0.0914, "step": 2066 }, { "epoch": 6.009831081081081, "grad_norm": 4.8191142082214355, "learning_rate": 5e-05, "loss": 0.0782, "step": 2067 }, { "epoch": 6.0098648648648645, "grad_norm": 4.0894622802734375, "learning_rate": 5e-05, "loss": 0.0381, "step": 2068 }, { "epoch": 6.009898648648648, "grad_norm": 2.507362127304077, "learning_rate": 5e-05, "loss": 0.0316, "step": 2069 }, { "epoch": 6.009932432432432, "grad_norm": 0.3631543517112732, "learning_rate": 5e-05, "loss": 0.0075, "step": 2070 }, { "epoch": 6.009966216216216, "grad_norm": 0.036821793764829636, "learning_rate": 5e-05, "loss": 0.0014, "step": 2071 }, { "epoch": 6.01, "grad_norm": 1.006964921951294, "learning_rate": 5e-05, "loss": 0.0269, "step": 2072 }, { "epoch": 6.01, "eval_accuracy": 0.8497576736672051, "eval_loss": 0.5683692097663879, "eval_runtime": 32.1253, "eval_samples_per_second": 19.268, "eval_steps_per_second": 2.428, "step": 2072 }, { "epoch": 7.000033783783784, "grad_norm": 11.838115692138672, "learning_rate": 2.5e-05, "loss": 0.0737, "step": 2073 }, { "epoch": 7.000067567567568, "grad_norm": 5.0079779624938965, "learning_rate": 2.5e-05, "loss": 0.0287, "step": 2074 }, { "epoch": 7.0001013513513515, "grad_norm": 0.08469763398170471, "learning_rate": 2.5e-05, "loss": 0.002, "step": 2075 }, { "epoch": 7.000135135135135, "grad_norm": 0.22270579636096954, "learning_rate": 2.5e-05, "loss": 0.0025, "step": 2076 }, { "epoch": 7.000168918918919, "grad_norm": 28.308013916015625, "learning_rate": 2.5e-05, "loss": 1.0314, "step": 2077 }, { "epoch": 7.000202702702703, "grad_norm": 0.34093910455703735, "learning_rate": 2.5e-05, "loss": 0.0056, "step": 2078 }, { "epoch": 7.000236486486487, "grad_norm": 9.493703842163086, "learning_rate": 2.5e-05, "loss": 0.6754, "step": 2079 }, { "epoch": 7.000270270270271, "grad_norm": 2.1635282039642334, "learning_rate": 2.5e-05, "loss": 0.1266, "step": 2080 }, { "epoch": 7.000304054054054, "grad_norm": 0.07931061089038849, "learning_rate": 2.5e-05, "loss": 0.0022, "step": 2081 }, { "epoch": 7.000337837837838, "grad_norm": 18.687686920166016, "learning_rate": 2.5e-05, "loss": 0.1274, "step": 2082 }, { "epoch": 7.000371621621621, "grad_norm": 35.720821380615234, "learning_rate": 2.5e-05, "loss": 0.4019, "step": 2083 }, { "epoch": 7.000405405405405, "grad_norm": 0.13612768054008484, "learning_rate": 2.5e-05, "loss": 0.0029, "step": 2084 }, { "epoch": 7.000439189189189, "grad_norm": 19.048128128051758, "learning_rate": 2.5e-05, "loss": 1.1594, "step": 2085 }, { "epoch": 7.000472972972973, "grad_norm": 50.99811935424805, "learning_rate": 2.5e-05, "loss": 0.2828, "step": 2086 }, { "epoch": 7.0005067567567565, "grad_norm": 0.8390342593193054, "learning_rate": 2.5e-05, "loss": 0.0083, "step": 2087 }, { "epoch": 7.00054054054054, "grad_norm": 0.5928789377212524, "learning_rate": 2.5e-05, "loss": 0.0091, "step": 2088 }, { "epoch": 7.000574324324324, "grad_norm": 0.009993667714297771, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2089 }, { "epoch": 7.000608108108108, "grad_norm": 27.698705673217773, "learning_rate": 2.5e-05, "loss": 0.1185, "step": 2090 }, { "epoch": 7.000641891891892, "grad_norm": 0.2881213426589966, "learning_rate": 2.5e-05, "loss": 0.0027, "step": 2091 }, { "epoch": 7.000675675675676, "grad_norm": 8.287544250488281, "learning_rate": 2.5e-05, "loss": 0.3119, "step": 2092 }, { "epoch": 7.000709459459459, "grad_norm": 7.085123538970947, "learning_rate": 2.5e-05, "loss": 0.5819, "step": 2093 }, { "epoch": 7.000743243243243, "grad_norm": 0.21090731024742126, "learning_rate": 2.5e-05, "loss": 0.0028, "step": 2094 }, { "epoch": 7.000777027027027, "grad_norm": 0.8507442474365234, "learning_rate": 2.5e-05, "loss": 0.008, "step": 2095 }, { "epoch": 7.000810810810811, "grad_norm": 0.4405113756656647, "learning_rate": 2.5e-05, "loss": 0.0084, "step": 2096 }, { "epoch": 7.000844594594595, "grad_norm": 0.0416698157787323, "learning_rate": 2.5e-05, "loss": 0.001, "step": 2097 }, { "epoch": 7.000878378378379, "grad_norm": 2.7252912521362305, "learning_rate": 2.5e-05, "loss": 0.0301, "step": 2098 }, { "epoch": 7.000912162162162, "grad_norm": 0.036296676844358444, "learning_rate": 2.5e-05, "loss": 0.0011, "step": 2099 }, { "epoch": 7.000945945945946, "grad_norm": 0.5025346279144287, "learning_rate": 2.5e-05, "loss": 0.0053, "step": 2100 }, { "epoch": 7.00097972972973, "grad_norm": 4.9710588455200195, "learning_rate": 2.5e-05, "loss": 0.2996, "step": 2101 }, { "epoch": 7.001013513513514, "grad_norm": 0.08484555035829544, "learning_rate": 2.5e-05, "loss": 0.0017, "step": 2102 }, { "epoch": 7.001047297297298, "grad_norm": 0.014995367266237736, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2103 }, { "epoch": 7.0010810810810815, "grad_norm": 0.08140140026807785, "learning_rate": 2.5e-05, "loss": 0.0015, "step": 2104 }, { "epoch": 7.0011148648648645, "grad_norm": 13.918614387512207, "learning_rate": 2.5e-05, "loss": 0.2578, "step": 2105 }, { "epoch": 7.001148648648648, "grad_norm": 0.6123678684234619, "learning_rate": 2.5e-05, "loss": 0.0063, "step": 2106 }, { "epoch": 7.001182432432432, "grad_norm": 0.033878080546855927, "learning_rate": 2.5e-05, "loss": 0.0009, "step": 2107 }, { "epoch": 7.001216216216216, "grad_norm": 0.3061028718948364, "learning_rate": 2.5e-05, "loss": 0.0045, "step": 2108 }, { "epoch": 7.00125, "grad_norm": 21.30211067199707, "learning_rate": 2.5e-05, "loss": 0.1193, "step": 2109 }, { "epoch": 7.001283783783784, "grad_norm": 10.024972915649414, "learning_rate": 2.5e-05, "loss": 0.5, "step": 2110 }, { "epoch": 7.001317567567567, "grad_norm": 1.4779887199401855, "learning_rate": 2.5e-05, "loss": 0.0123, "step": 2111 }, { "epoch": 7.001351351351351, "grad_norm": 5.888952255249023, "learning_rate": 2.5e-05, "loss": 0.0388, "step": 2112 }, { "epoch": 7.001385135135135, "grad_norm": 32.461273193359375, "learning_rate": 2.5e-05, "loss": 0.3089, "step": 2113 }, { "epoch": 7.001418918918919, "grad_norm": 0.17502495646476746, "learning_rate": 2.5e-05, "loss": 0.0025, "step": 2114 }, { "epoch": 7.001452702702703, "grad_norm": 0.3083450198173523, "learning_rate": 2.5e-05, "loss": 0.0053, "step": 2115 }, { "epoch": 7.0014864864864865, "grad_norm": 0.010301760397851467, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2116 }, { "epoch": 7.00152027027027, "grad_norm": 0.049824267625808716, "learning_rate": 2.5e-05, "loss": 0.0015, "step": 2117 }, { "epoch": 7.001554054054054, "grad_norm": 0.049825266003608704, "learning_rate": 2.5e-05, "loss": 0.001, "step": 2118 }, { "epoch": 7.001587837837838, "grad_norm": 2.2832846641540527, "learning_rate": 2.5e-05, "loss": 0.0095, "step": 2119 }, { "epoch": 7.001621621621622, "grad_norm": 1.0495891571044922, "learning_rate": 2.5e-05, "loss": 0.0435, "step": 2120 }, { "epoch": 7.001655405405406, "grad_norm": 0.014272164553403854, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2121 }, { "epoch": 7.0016891891891895, "grad_norm": 0.09615354239940643, "learning_rate": 2.5e-05, "loss": 0.0016, "step": 2122 }, { "epoch": 7.001722972972973, "grad_norm": 0.3429509401321411, "learning_rate": 2.5e-05, "loss": 0.0046, "step": 2123 }, { "epoch": 7.001756756756757, "grad_norm": 0.021396001800894737, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2124 }, { "epoch": 7.001790540540541, "grad_norm": 0.8327059745788574, "learning_rate": 2.5e-05, "loss": 0.0086, "step": 2125 }, { "epoch": 7.001824324324324, "grad_norm": 0.5763586163520813, "learning_rate": 2.5e-05, "loss": 0.0065, "step": 2126 }, { "epoch": 7.001858108108108, "grad_norm": 0.5204915404319763, "learning_rate": 2.5e-05, "loss": 0.0202, "step": 2127 }, { "epoch": 7.0018918918918915, "grad_norm": 0.020525744184851646, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2128 }, { "epoch": 7.001925675675675, "grad_norm": 34.46744918823242, "learning_rate": 2.5e-05, "loss": 0.2338, "step": 2129 }, { "epoch": 7.001959459459459, "grad_norm": 0.1942436844110489, "learning_rate": 2.5e-05, "loss": 0.0013, "step": 2130 }, { "epoch": 7.001993243243243, "grad_norm": 0.01660415343940258, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2131 }, { "epoch": 7.002027027027027, "grad_norm": 0.006938444450497627, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2132 }, { "epoch": 7.002060810810811, "grad_norm": 6.547630786895752, "learning_rate": 2.5e-05, "loss": 0.6853, "step": 2133 }, { "epoch": 7.0020945945945945, "grad_norm": 0.008633033372461796, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2134 }, { "epoch": 7.002128378378378, "grad_norm": 13.949463844299316, "learning_rate": 2.5e-05, "loss": 0.3604, "step": 2135 }, { "epoch": 7.002162162162162, "grad_norm": 6.120707035064697, "learning_rate": 2.5e-05, "loss": 0.4963, "step": 2136 }, { "epoch": 7.002195945945946, "grad_norm": 19.414047241210938, "learning_rate": 2.5e-05, "loss": 1.1015, "step": 2137 }, { "epoch": 7.00222972972973, "grad_norm": 0.4948062300682068, "learning_rate": 2.5e-05, "loss": 0.0068, "step": 2138 }, { "epoch": 7.002263513513514, "grad_norm": 4.7696967124938965, "learning_rate": 2.5e-05, "loss": 0.0226, "step": 2139 }, { "epoch": 7.002297297297297, "grad_norm": 0.2274077832698822, "learning_rate": 2.5e-05, "loss": 0.0027, "step": 2140 }, { "epoch": 7.002331081081081, "grad_norm": 4.828996658325195, "learning_rate": 2.5e-05, "loss": 0.0325, "step": 2141 }, { "epoch": 7.002364864864865, "grad_norm": 0.03774681314826012, "learning_rate": 2.5e-05, "loss": 0.0014, "step": 2142 }, { "epoch": 7.002398648648649, "grad_norm": 0.02031874842941761, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2143 }, { "epoch": 7.002432432432433, "grad_norm": 6.173173904418945, "learning_rate": 2.5e-05, "loss": 0.6418, "step": 2144 }, { "epoch": 7.002466216216217, "grad_norm": 0.7711488604545593, "learning_rate": 2.5e-05, "loss": 0.0105, "step": 2145 }, { "epoch": 7.0025, "grad_norm": 0.1567314714193344, "learning_rate": 2.5e-05, "loss": 0.0031, "step": 2146 }, { "epoch": 7.002533783783784, "grad_norm": 0.20095333456993103, "learning_rate": 2.5e-05, "loss": 0.0056, "step": 2147 }, { "epoch": 7.002567567567567, "grad_norm": 2.3799726963043213, "learning_rate": 2.5e-05, "loss": 0.0085, "step": 2148 }, { "epoch": 7.002601351351351, "grad_norm": 0.020446596667170525, "learning_rate": 2.5e-05, "loss": 0.0008, "step": 2149 }, { "epoch": 7.002635135135135, "grad_norm": 0.06281173229217529, "learning_rate": 2.5e-05, "loss": 0.001, "step": 2150 }, { "epoch": 7.002668918918919, "grad_norm": 0.16408835351467133, "learning_rate": 2.5e-05, "loss": 0.0039, "step": 2151 }, { "epoch": 7.0027027027027025, "grad_norm": 14.033869743347168, "learning_rate": 2.5e-05, "loss": 0.6982, "step": 2152 }, { "epoch": 7.002736486486486, "grad_norm": 0.12495056539773941, "learning_rate": 2.5e-05, "loss": 0.0019, "step": 2153 }, { "epoch": 7.00277027027027, "grad_norm": 0.04673611372709274, "learning_rate": 2.5e-05, "loss": 0.0009, "step": 2154 }, { "epoch": 7.002804054054054, "grad_norm": 3.5553574562072754, "learning_rate": 2.5e-05, "loss": 0.0211, "step": 2155 }, { "epoch": 7.002837837837838, "grad_norm": 0.11122360080480576, "learning_rate": 2.5e-05, "loss": 0.0022, "step": 2156 }, { "epoch": 7.002871621621622, "grad_norm": 0.03730053827166557, "learning_rate": 2.5e-05, "loss": 0.0011, "step": 2157 }, { "epoch": 7.002905405405405, "grad_norm": 7.47406005859375, "learning_rate": 2.5e-05, "loss": 0.8217, "step": 2158 }, { "epoch": 7.002939189189189, "grad_norm": 1.2732338905334473, "learning_rate": 2.5e-05, "loss": 0.0907, "step": 2159 }, { "epoch": 7.002972972972973, "grad_norm": 0.0578949972987175, "learning_rate": 2.5e-05, "loss": 0.0017, "step": 2160 }, { "epoch": 7.003006756756757, "grad_norm": 0.3545185625553131, "learning_rate": 2.5e-05, "loss": 0.0066, "step": 2161 }, { "epoch": 7.003040540540541, "grad_norm": 0.5995778441429138, "learning_rate": 2.5e-05, "loss": 0.0088, "step": 2162 }, { "epoch": 7.0030743243243245, "grad_norm": 2.8241350650787354, "learning_rate": 2.5e-05, "loss": 0.1277, "step": 2163 }, { "epoch": 7.003108108108108, "grad_norm": 1.009212851524353, "learning_rate": 2.5e-05, "loss": 0.0083, "step": 2164 }, { "epoch": 7.003141891891892, "grad_norm": 0.020942138507962227, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 2165 }, { "epoch": 7.003175675675676, "grad_norm": 0.17854514718055725, "learning_rate": 2.5e-05, "loss": 0.0031, "step": 2166 }, { "epoch": 7.00320945945946, "grad_norm": 23.175174713134766, "learning_rate": 2.5e-05, "loss": 0.0862, "step": 2167 }, { "epoch": 7.003243243243244, "grad_norm": 0.22156263887882233, "learning_rate": 2.5e-05, "loss": 0.0079, "step": 2168 }, { "epoch": 7.003277027027027, "grad_norm": 0.05275692418217659, "learning_rate": 2.5e-05, "loss": 0.0018, "step": 2169 }, { "epoch": 7.00331081081081, "grad_norm": 10.950981140136719, "learning_rate": 2.5e-05, "loss": 0.0635, "step": 2170 }, { "epoch": 7.003344594594594, "grad_norm": 4.083075046539307, "learning_rate": 2.5e-05, "loss": 0.0793, "step": 2171 }, { "epoch": 7.003378378378378, "grad_norm": 8.19741439819336, "learning_rate": 2.5e-05, "loss": 0.0478, "step": 2172 }, { "epoch": 7.003412162162162, "grad_norm": 0.4394233822822571, "learning_rate": 2.5e-05, "loss": 0.002, "step": 2173 }, { "epoch": 7.003445945945946, "grad_norm": 0.011694693937897682, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2174 }, { "epoch": 7.0034797297297295, "grad_norm": 9.91814136505127, "learning_rate": 2.5e-05, "loss": 0.4353, "step": 2175 }, { "epoch": 7.003513513513513, "grad_norm": 0.10636698454618454, "learning_rate": 2.5e-05, "loss": 0.0019, "step": 2176 }, { "epoch": 7.003547297297297, "grad_norm": 5.158403396606445, "learning_rate": 2.5e-05, "loss": 0.0263, "step": 2177 }, { "epoch": 7.003581081081081, "grad_norm": 17.810754776000977, "learning_rate": 2.5e-05, "loss": 0.0904, "step": 2178 }, { "epoch": 7.003614864864865, "grad_norm": 5.603930473327637, "learning_rate": 2.5e-05, "loss": 0.0355, "step": 2179 }, { "epoch": 7.003648648648649, "grad_norm": 0.14425817131996155, "learning_rate": 2.5e-05, "loss": 0.0017, "step": 2180 }, { "epoch": 7.0036824324324325, "grad_norm": 0.018597137182950974, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2181 }, { "epoch": 7.003716216216216, "grad_norm": 0.04916888475418091, "learning_rate": 2.5e-05, "loss": 0.0011, "step": 2182 }, { "epoch": 7.00375, "grad_norm": 0.010699950158596039, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2183 }, { "epoch": 7.003783783783784, "grad_norm": 0.07976066321134567, "learning_rate": 2.5e-05, "loss": 0.0032, "step": 2184 }, { "epoch": 7.003817567567568, "grad_norm": 0.3677041530609131, "learning_rate": 2.5e-05, "loss": 0.0048, "step": 2185 }, { "epoch": 7.003851351351352, "grad_norm": 10.977533340454102, "learning_rate": 2.5e-05, "loss": 0.074, "step": 2186 }, { "epoch": 7.003885135135135, "grad_norm": 2.547430992126465, "learning_rate": 2.5e-05, "loss": 0.0781, "step": 2187 }, { "epoch": 7.003918918918919, "grad_norm": 0.32741713523864746, "learning_rate": 2.5e-05, "loss": 0.0051, "step": 2188 }, { "epoch": 7.003952702702703, "grad_norm": 0.03310281038284302, "learning_rate": 2.5e-05, "loss": 0.001, "step": 2189 }, { "epoch": 7.003986486486487, "grad_norm": 23.258054733276367, "learning_rate": 2.5e-05, "loss": 0.3899, "step": 2190 }, { "epoch": 7.00402027027027, "grad_norm": 7.538224220275879, "learning_rate": 2.5e-05, "loss": 0.0863, "step": 2191 }, { "epoch": 7.004054054054054, "grad_norm": 0.09127640724182129, "learning_rate": 2.5e-05, "loss": 0.0038, "step": 2192 }, { "epoch": 7.0040878378378375, "grad_norm": 2.844147205352783, "learning_rate": 2.5e-05, "loss": 0.0287, "step": 2193 }, { "epoch": 7.004121621621621, "grad_norm": 4.461415767669678, "learning_rate": 2.5e-05, "loss": 0.0244, "step": 2194 }, { "epoch": 7.004155405405405, "grad_norm": 0.20203472673892975, "learning_rate": 2.5e-05, "loss": 0.0029, "step": 2195 }, { "epoch": 7.004189189189189, "grad_norm": 6.8406982421875, "learning_rate": 2.5e-05, "loss": 0.6035, "step": 2196 }, { "epoch": 7.004222972972973, "grad_norm": 0.24867156147956848, "learning_rate": 2.5e-05, "loss": 0.0047, "step": 2197 }, { "epoch": 7.004256756756757, "grad_norm": 1.0148838758468628, "learning_rate": 2.5e-05, "loss": 0.0204, "step": 2198 }, { "epoch": 7.0042905405405405, "grad_norm": 0.6481841802597046, "learning_rate": 2.5e-05, "loss": 0.0065, "step": 2199 }, { "epoch": 7.004324324324324, "grad_norm": 0.16803120076656342, "learning_rate": 2.5e-05, "loss": 0.0053, "step": 2200 }, { "epoch": 7.004358108108108, "grad_norm": 6.887407302856445, "learning_rate": 2.5e-05, "loss": 0.0393, "step": 2201 }, { "epoch": 7.004391891891892, "grad_norm": 1.2651556730270386, "learning_rate": 2.5e-05, "loss": 0.0066, "step": 2202 }, { "epoch": 7.004425675675676, "grad_norm": 4.883862018585205, "learning_rate": 2.5e-05, "loss": 0.0462, "step": 2203 }, { "epoch": 7.00445945945946, "grad_norm": 0.013531984761357307, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2204 }, { "epoch": 7.004493243243243, "grad_norm": 3.391636371612549, "learning_rate": 2.5e-05, "loss": 0.0151, "step": 2205 }, { "epoch": 7.004527027027027, "grad_norm": 0.04161980375647545, "learning_rate": 2.5e-05, "loss": 0.0011, "step": 2206 }, { "epoch": 7.004560810810811, "grad_norm": 7.326340198516846, "learning_rate": 2.5e-05, "loss": 0.3922, "step": 2207 }, { "epoch": 7.004594594594595, "grad_norm": 3.436185598373413, "learning_rate": 2.5e-05, "loss": 0.5023, "step": 2208 }, { "epoch": 7.004628378378379, "grad_norm": 0.8993191719055176, "learning_rate": 2.5e-05, "loss": 0.0054, "step": 2209 }, { "epoch": 7.0046621621621625, "grad_norm": 0.3986317813396454, "learning_rate": 2.5e-05, "loss": 0.0052, "step": 2210 }, { "epoch": 7.004695945945946, "grad_norm": 5.037693023681641, "learning_rate": 2.5e-05, "loss": 0.0202, "step": 2211 }, { "epoch": 7.004729729729729, "grad_norm": 3.272120237350464, "learning_rate": 2.5e-05, "loss": 0.013, "step": 2212 }, { "epoch": 7.004763513513513, "grad_norm": 0.7546171545982361, "learning_rate": 2.5e-05, "loss": 0.0191, "step": 2213 }, { "epoch": 7.004797297297297, "grad_norm": 0.0690222680568695, "learning_rate": 2.5e-05, "loss": 0.001, "step": 2214 }, { "epoch": 7.004831081081081, "grad_norm": 0.038757383823394775, "learning_rate": 2.5e-05, "loss": 0.0012, "step": 2215 }, { "epoch": 7.004864864864865, "grad_norm": 0.19985322654247284, "learning_rate": 2.5e-05, "loss": 0.0026, "step": 2216 }, { "epoch": 7.004898648648648, "grad_norm": 0.0331125408411026, "learning_rate": 2.5e-05, "loss": 0.0008, "step": 2217 }, { "epoch": 7.004932432432432, "grad_norm": 0.012740207836031914, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2218 }, { "epoch": 7.004966216216216, "grad_norm": 1.0984736680984497, "learning_rate": 2.5e-05, "loss": 0.0104, "step": 2219 }, { "epoch": 7.005, "grad_norm": 0.02887587808072567, "learning_rate": 2.5e-05, "loss": 0.0008, "step": 2220 }, { "epoch": 7.005033783783784, "grad_norm": 0.030541572719812393, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2221 }, { "epoch": 7.0050675675675675, "grad_norm": 15.37231159210205, "learning_rate": 2.5e-05, "loss": 0.1582, "step": 2222 }, { "epoch": 7.005101351351351, "grad_norm": 38.826202392578125, "learning_rate": 2.5e-05, "loss": 0.2921, "step": 2223 }, { "epoch": 7.005135135135135, "grad_norm": 0.05044591799378395, "learning_rate": 2.5e-05, "loss": 0.0011, "step": 2224 }, { "epoch": 7.005168918918919, "grad_norm": 0.08264810591936111, "learning_rate": 2.5e-05, "loss": 0.0019, "step": 2225 }, { "epoch": 7.005202702702703, "grad_norm": 0.46002820134162903, "learning_rate": 2.5e-05, "loss": 0.0044, "step": 2226 }, { "epoch": 7.005236486486487, "grad_norm": 1.2922803163528442, "learning_rate": 2.5e-05, "loss": 0.0116, "step": 2227 }, { "epoch": 7.0052702702702705, "grad_norm": 0.008898830972611904, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2228 }, { "epoch": 7.005304054054054, "grad_norm": 26.409669876098633, "learning_rate": 2.5e-05, "loss": 0.193, "step": 2229 }, { "epoch": 7.005337837837838, "grad_norm": 0.007248961366713047, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2230 }, { "epoch": 7.005371621621622, "grad_norm": 0.13014894723892212, "learning_rate": 2.5e-05, "loss": 0.0025, "step": 2231 }, { "epoch": 7.005405405405406, "grad_norm": 14.485461235046387, "learning_rate": 2.5e-05, "loss": 0.1156, "step": 2232 }, { "epoch": 7.00543918918919, "grad_norm": 0.1346706598997116, "learning_rate": 2.5e-05, "loss": 0.0045, "step": 2233 }, { "epoch": 7.005472972972973, "grad_norm": 0.03126766160130501, "learning_rate": 2.5e-05, "loss": 0.0009, "step": 2234 }, { "epoch": 7.005506756756756, "grad_norm": 0.012242333963513374, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2235 }, { "epoch": 7.00554054054054, "grad_norm": 2.35636305809021, "learning_rate": 2.5e-05, "loss": 0.2783, "step": 2236 }, { "epoch": 7.005574324324324, "grad_norm": 0.011658442206680775, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2237 }, { "epoch": 7.005608108108108, "grad_norm": 1.4075361490249634, "learning_rate": 2.5e-05, "loss": 0.0075, "step": 2238 }, { "epoch": 7.005641891891892, "grad_norm": 0.030318958684802055, "learning_rate": 2.5e-05, "loss": 0.0009, "step": 2239 }, { "epoch": 7.0056756756756755, "grad_norm": 1.3913321495056152, "learning_rate": 2.5e-05, "loss": 0.0048, "step": 2240 }, { "epoch": 7.005709459459459, "grad_norm": 0.0597166009247303, "learning_rate": 2.5e-05, "loss": 0.0011, "step": 2241 }, { "epoch": 7.005743243243243, "grad_norm": 0.009394572116434574, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2242 }, { "epoch": 7.005777027027027, "grad_norm": 0.8993475437164307, "learning_rate": 2.5e-05, "loss": 0.0048, "step": 2243 }, { "epoch": 7.005810810810811, "grad_norm": 0.17828145623207092, "learning_rate": 2.5e-05, "loss": 0.0066, "step": 2244 }, { "epoch": 7.005844594594595, "grad_norm": 0.18449759483337402, "learning_rate": 2.5e-05, "loss": 0.0018, "step": 2245 }, { "epoch": 7.0058783783783785, "grad_norm": 0.6504025459289551, "learning_rate": 2.5e-05, "loss": 0.0048, "step": 2246 }, { "epoch": 7.005912162162162, "grad_norm": 0.024674497544765472, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2247 }, { "epoch": 7.005945945945946, "grad_norm": 0.7228708863258362, "learning_rate": 2.5e-05, "loss": 0.0341, "step": 2248 }, { "epoch": 7.00597972972973, "grad_norm": 5.032560348510742, "learning_rate": 2.5e-05, "loss": 0.0071, "step": 2249 }, { "epoch": 7.006013513513514, "grad_norm": 0.46117788553237915, "learning_rate": 2.5e-05, "loss": 0.0019, "step": 2250 }, { "epoch": 7.006047297297298, "grad_norm": 0.21207088232040405, "learning_rate": 2.5e-05, "loss": 0.0018, "step": 2251 }, { "epoch": 7.006081081081081, "grad_norm": 0.14278702437877655, "learning_rate": 2.5e-05, "loss": 0.0054, "step": 2252 }, { "epoch": 7.006114864864865, "grad_norm": 8.49287223815918, "learning_rate": 2.5e-05, "loss": 0.027, "step": 2253 }, { "epoch": 7.006148648648649, "grad_norm": 0.4635685980319977, "learning_rate": 2.5e-05, "loss": 0.004, "step": 2254 }, { "epoch": 7.006182432432432, "grad_norm": 11.194714546203613, "learning_rate": 2.5e-05, "loss": 0.0624, "step": 2255 }, { "epoch": 7.006216216216216, "grad_norm": 0.0073005096055567265, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2256 }, { "epoch": 7.00625, "grad_norm": 1.895395040512085, "learning_rate": 2.5e-05, "loss": 0.1813, "step": 2257 }, { "epoch": 7.0062837837837835, "grad_norm": 0.006971444468945265, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2258 }, { "epoch": 7.006317567567567, "grad_norm": 0.00431853486225009, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2259 }, { "epoch": 7.006351351351351, "grad_norm": 0.21049518883228302, "learning_rate": 2.5e-05, "loss": 0.0012, "step": 2260 }, { "epoch": 7.006385135135135, "grad_norm": 0.2273964285850525, "learning_rate": 2.5e-05, "loss": 0.0022, "step": 2261 }, { "epoch": 7.006418918918919, "grad_norm": 0.09836996346712112, "learning_rate": 2.5e-05, "loss": 0.0037, "step": 2262 }, { "epoch": 7.006452702702703, "grad_norm": 44.97425079345703, "learning_rate": 2.5e-05, "loss": 0.2919, "step": 2263 }, { "epoch": 7.006486486486486, "grad_norm": 0.047781288623809814, "learning_rate": 2.5e-05, "loss": 0.001, "step": 2264 }, { "epoch": 7.00652027027027, "grad_norm": 0.180389404296875, "learning_rate": 2.5e-05, "loss": 0.004, "step": 2265 }, { "epoch": 7.006554054054054, "grad_norm": 0.775824248790741, "learning_rate": 2.5e-05, "loss": 0.003, "step": 2266 }, { "epoch": 7.006587837837838, "grad_norm": 0.06143687665462494, "learning_rate": 2.5e-05, "loss": 0.0023, "step": 2267 }, { "epoch": 7.006621621621622, "grad_norm": 4.162306785583496, "learning_rate": 2.5e-05, "loss": 0.044, "step": 2268 }, { "epoch": 7.0066554054054055, "grad_norm": 0.24718092381954193, "learning_rate": 2.5e-05, "loss": 0.0027, "step": 2269 }, { "epoch": 7.006689189189189, "grad_norm": 3.4523677825927734, "learning_rate": 2.5e-05, "loss": 0.4899, "step": 2270 }, { "epoch": 7.006722972972973, "grad_norm": 0.007485318463295698, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2271 }, { "epoch": 7.006756756756757, "grad_norm": 27.201698303222656, "learning_rate": 2.5e-05, "loss": 0.6429, "step": 2272 }, { "epoch": 7.006790540540541, "grad_norm": 0.03135404363274574, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2273 }, { "epoch": 7.006824324324325, "grad_norm": 0.17841656506061554, "learning_rate": 2.5e-05, "loss": 0.0051, "step": 2274 }, { "epoch": 7.0068581081081085, "grad_norm": 15.524478912353516, "learning_rate": 2.5e-05, "loss": 0.2077, "step": 2275 }, { "epoch": 7.006891891891892, "grad_norm": 0.007746970746666193, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2276 }, { "epoch": 7.006925675675675, "grad_norm": 14.402918815612793, "learning_rate": 2.5e-05, "loss": 0.9192, "step": 2277 }, { "epoch": 7.006959459459459, "grad_norm": 32.36726379394531, "learning_rate": 2.5e-05, "loss": 0.1315, "step": 2278 }, { "epoch": 7.006993243243243, "grad_norm": 0.0053564622066915035, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2279 }, { "epoch": 7.007027027027027, "grad_norm": 0.00763353705406189, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2280 }, { "epoch": 7.007060810810811, "grad_norm": 0.2233002483844757, "learning_rate": 2.5e-05, "loss": 0.0087, "step": 2281 }, { "epoch": 7.007094594594594, "grad_norm": 1.8183001279830933, "learning_rate": 2.5e-05, "loss": 0.0275, "step": 2282 }, { "epoch": 7.007128378378378, "grad_norm": 0.26465991139411926, "learning_rate": 2.5e-05, "loss": 0.0018, "step": 2283 }, { "epoch": 7.007162162162162, "grad_norm": 1.7508102655410767, "learning_rate": 2.5e-05, "loss": 0.0039, "step": 2284 }, { "epoch": 7.007195945945946, "grad_norm": 60.313507080078125, "learning_rate": 2.5e-05, "loss": 0.1826, "step": 2285 }, { "epoch": 7.00722972972973, "grad_norm": 41.29826736450195, "learning_rate": 2.5e-05, "loss": 0.1918, "step": 2286 }, { "epoch": 7.0072635135135135, "grad_norm": 0.018291903659701347, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2287 }, { "epoch": 7.007297297297297, "grad_norm": 10.779451370239258, "learning_rate": 2.5e-05, "loss": 0.0517, "step": 2288 }, { "epoch": 7.007331081081081, "grad_norm": 0.003137378254905343, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2289 }, { "epoch": 7.007364864864865, "grad_norm": 0.017077911645174026, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2290 }, { "epoch": 7.007398648648649, "grad_norm": 0.05087236315011978, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2291 }, { "epoch": 7.007432432432433, "grad_norm": 0.32101041078567505, "learning_rate": 2.5e-05, "loss": 0.0056, "step": 2292 }, { "epoch": 7.0074662162162165, "grad_norm": 0.10238036513328552, "learning_rate": 2.5e-05, "loss": 0.0008, "step": 2293 }, { "epoch": 7.0075, "grad_norm": 0.04788544774055481, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2294 }, { "epoch": 7.007533783783784, "grad_norm": 0.1403277963399887, "learning_rate": 2.5e-05, "loss": 0.0013, "step": 2295 }, { "epoch": 7.007567567567568, "grad_norm": 0.5826812386512756, "learning_rate": 2.5e-05, "loss": 0.0019, "step": 2296 }, { "epoch": 7.007601351351352, "grad_norm": 0.011696889996528625, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2297 }, { "epoch": 7.007635135135135, "grad_norm": 0.060627296566963196, "learning_rate": 2.5e-05, "loss": 0.0013, "step": 2298 }, { "epoch": 7.0076689189189185, "grad_norm": 12.789608001708984, "learning_rate": 2.5e-05, "loss": 0.6056, "step": 2299 }, { "epoch": 7.007702702702702, "grad_norm": 0.022982968017458916, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2300 }, { "epoch": 7.007736486486486, "grad_norm": 26.50075912475586, "learning_rate": 2.5e-05, "loss": 0.1194, "step": 2301 }, { "epoch": 7.00777027027027, "grad_norm": 0.004511190578341484, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2302 }, { "epoch": 7.007804054054054, "grad_norm": 0.012930788099765778, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2303 }, { "epoch": 7.007837837837838, "grad_norm": 40.72500991821289, "learning_rate": 2.5e-05, "loss": 0.2315, "step": 2304 }, { "epoch": 7.0078716216216215, "grad_norm": 0.04584289342164993, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 2305 }, { "epoch": 7.007905405405405, "grad_norm": 0.004822248592972755, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2306 }, { "epoch": 7.007939189189189, "grad_norm": 0.02093713916838169, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2307 }, { "epoch": 7.007972972972973, "grad_norm": 0.1031356006860733, "learning_rate": 2.5e-05, "loss": 0.0011, "step": 2308 }, { "epoch": 7.008006756756757, "grad_norm": 0.1234685629606247, "learning_rate": 2.5e-05, "loss": 0.0047, "step": 2309 }, { "epoch": 7.008040540540541, "grad_norm": 0.006548271048814058, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2310 }, { "epoch": 7.008074324324324, "grad_norm": 17.16607093811035, "learning_rate": 2.5e-05, "loss": 1.0081, "step": 2311 }, { "epoch": 7.008108108108108, "grad_norm": 0.019971853122115135, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2312 }, { "epoch": 7.008141891891892, "grad_norm": 0.0032640693243592978, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2313 }, { "epoch": 7.008175675675676, "grad_norm": 47.64530944824219, "learning_rate": 2.5e-05, "loss": 0.2076, "step": 2314 }, { "epoch": 7.00820945945946, "grad_norm": 0.026151472702622414, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2315 }, { "epoch": 7.0082432432432435, "grad_norm": 0.1470959186553955, "learning_rate": 2.5e-05, "loss": 0.0054, "step": 2316 }, { "epoch": 7.008277027027027, "grad_norm": 0.16525748372077942, "learning_rate": 2.5e-05, "loss": 0.0011, "step": 2317 }, { "epoch": 7.008310810810811, "grad_norm": 0.1605999618768692, "learning_rate": 2.5e-05, "loss": 0.0059, "step": 2318 }, { "epoch": 7.008344594594595, "grad_norm": 0.005523439031094313, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2319 }, { "epoch": 7.008378378378378, "grad_norm": 0.5105994343757629, "learning_rate": 2.5e-05, "loss": 0.0021, "step": 2320 }, { "epoch": 7.008412162162162, "grad_norm": 15.55146312713623, "learning_rate": 2.5e-05, "loss": 0.1229, "step": 2321 }, { "epoch": 7.008445945945946, "grad_norm": 0.07609590142965317, "learning_rate": 2.5e-05, "loss": 0.0016, "step": 2322 }, { "epoch": 7.008479729729729, "grad_norm": 0.009549305774271488, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2323 }, { "epoch": 7.008513513513513, "grad_norm": 84.50398254394531, "learning_rate": 2.5e-05, "loss": 0.4081, "step": 2324 }, { "epoch": 7.008547297297297, "grad_norm": 51.4271125793457, "learning_rate": 2.5e-05, "loss": 0.2214, "step": 2325 }, { "epoch": 7.008581081081081, "grad_norm": 0.016669945791363716, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2326 }, { "epoch": 7.008614864864865, "grad_norm": 0.003406940260902047, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2327 }, { "epoch": 7.008648648648649, "grad_norm": 0.3110619783401489, "learning_rate": 2.5e-05, "loss": 0.0019, "step": 2328 }, { "epoch": 7.008682432432432, "grad_norm": 0.0988016426563263, "learning_rate": 2.5e-05, "loss": 0.0013, "step": 2329 }, { "epoch": 7.008716216216216, "grad_norm": 14.828872680664062, "learning_rate": 2.5e-05, "loss": 0.7826, "step": 2330 }, { "epoch": 7.00875, "grad_norm": 16.00920867919922, "learning_rate": 2.5e-05, "loss": 0.5275, "step": 2331 }, { "epoch": 7.008783783783784, "grad_norm": 6.132001876831055, "learning_rate": 2.5e-05, "loss": 0.1333, "step": 2332 }, { "epoch": 7.008817567567568, "grad_norm": 0.021765606477856636, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 2333 }, { "epoch": 7.0088513513513515, "grad_norm": 0.00891153048723936, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2334 }, { "epoch": 7.008885135135135, "grad_norm": 0.5231658816337585, "learning_rate": 2.5e-05, "loss": 0.007, "step": 2335 }, { "epoch": 7.008918918918919, "grad_norm": 0.06768934428691864, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2336 }, { "epoch": 7.008952702702703, "grad_norm": 0.00675536785274744, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2337 }, { "epoch": 7.008986486486487, "grad_norm": 0.10906250029802322, "learning_rate": 2.5e-05, "loss": 0.0041, "step": 2338 }, { "epoch": 7.009020270270271, "grad_norm": 0.08724691718816757, "learning_rate": 2.5e-05, "loss": 0.0034, "step": 2339 }, { "epoch": 7.0090540540540545, "grad_norm": 0.0919005498290062, "learning_rate": 2.5e-05, "loss": 0.0025, "step": 2340 }, { "epoch": 7.009087837837837, "grad_norm": 9.636896133422852, "learning_rate": 2.5e-05, "loss": 0.9389, "step": 2341 }, { "epoch": 7.009121621621621, "grad_norm": 0.1816040724515915, "learning_rate": 2.5e-05, "loss": 0.001, "step": 2342 }, { "epoch": 7.009155405405405, "grad_norm": 0.052052486687898636, "learning_rate": 2.5e-05, "loss": 0.0012, "step": 2343 }, { "epoch": 7.009189189189189, "grad_norm": 23.24197006225586, "learning_rate": 2.5e-05, "loss": 1.0994, "step": 2344 }, { "epoch": 7.009222972972973, "grad_norm": 0.06614867597818375, "learning_rate": 2.5e-05, "loss": 0.0012, "step": 2345 }, { "epoch": 7.0092567567567565, "grad_norm": 0.019022740423679352, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2346 }, { "epoch": 7.00929054054054, "grad_norm": 2.7292754650115967, "learning_rate": 2.5e-05, "loss": 0.0726, "step": 2347 }, { "epoch": 7.009324324324324, "grad_norm": 12.323110580444336, "learning_rate": 2.5e-05, "loss": 0.8317, "step": 2348 }, { "epoch": 7.009358108108108, "grad_norm": 0.06250607222318649, "learning_rate": 2.5e-05, "loss": 0.0012, "step": 2349 }, { "epoch": 7.009391891891892, "grad_norm": 13.156206130981445, "learning_rate": 2.5e-05, "loss": 0.053, "step": 2350 }, { "epoch": 7.009425675675676, "grad_norm": 14.092199325561523, "learning_rate": 2.5e-05, "loss": 0.2604, "step": 2351 }, { "epoch": 7.0094594594594595, "grad_norm": 12.839349746704102, "learning_rate": 2.5e-05, "loss": 0.6504, "step": 2352 }, { "epoch": 7.009493243243243, "grad_norm": 0.010670802555978298, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2353 }, { "epoch": 7.009527027027027, "grad_norm": 0.7998191118240356, "learning_rate": 2.5e-05, "loss": 0.0066, "step": 2354 }, { "epoch": 7.009560810810811, "grad_norm": 16.50786781311035, "learning_rate": 2.5e-05, "loss": 0.1883, "step": 2355 }, { "epoch": 7.009594594594595, "grad_norm": 0.05222555994987488, "learning_rate": 2.5e-05, "loss": 0.0009, "step": 2356 }, { "epoch": 7.009628378378379, "grad_norm": 0.43836018443107605, "learning_rate": 2.5e-05, "loss": 0.0016, "step": 2357 }, { "epoch": 7.009662162162162, "grad_norm": 0.009243079461157322, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2358 }, { "epoch": 7.009695945945946, "grad_norm": 0.010578581131994724, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2359 }, { "epoch": 7.00972972972973, "grad_norm": 4.9612579345703125, "learning_rate": 2.5e-05, "loss": 0.0155, "step": 2360 }, { "epoch": 7.009763513513514, "grad_norm": 1.0338678359985352, "learning_rate": 2.5e-05, "loss": 0.0546, "step": 2361 }, { "epoch": 7.009797297297298, "grad_norm": 0.28653913736343384, "learning_rate": 2.5e-05, "loss": 0.0019, "step": 2362 }, { "epoch": 7.009831081081081, "grad_norm": 0.8407981991767883, "learning_rate": 2.5e-05, "loss": 0.0047, "step": 2363 }, { "epoch": 7.0098648648648645, "grad_norm": 0.09573577344417572, "learning_rate": 2.5e-05, "loss": 0.0015, "step": 2364 }, { "epoch": 7.009898648648648, "grad_norm": 0.048200301826000214, "learning_rate": 2.5e-05, "loss": 0.0009, "step": 2365 }, { "epoch": 7.009932432432432, "grad_norm": 0.01718473993241787, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2366 }, { "epoch": 7.009966216216216, "grad_norm": 0.5537585020065308, "learning_rate": 2.5e-05, "loss": 0.0044, "step": 2367 }, { "epoch": 7.01, "grad_norm": 0.8631910681724548, "learning_rate": 2.5e-05, "loss": 0.0061, "step": 2368 }, { "epoch": 7.01, "eval_accuracy": 0.864297253634895, "eval_loss": 0.5858303904533386, "eval_runtime": 32.0185, "eval_samples_per_second": 19.333, "eval_steps_per_second": 2.436, "step": 2368 }, { "epoch": 8.000033783783783, "grad_norm": 0.053939685225486755, "learning_rate": 2.5e-05, "loss": 0.001, "step": 2369 }, { "epoch": 8.000067567567568, "grad_norm": 0.29599860310554504, "learning_rate": 2.5e-05, "loss": 0.0034, "step": 2370 }, { "epoch": 8.00010135135135, "grad_norm": 0.09814243763685226, "learning_rate": 2.5e-05, "loss": 0.001, "step": 2371 }, { "epoch": 8.000135135135135, "grad_norm": 0.006469881162047386, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2372 }, { "epoch": 8.000168918918918, "grad_norm": 0.09508483856916428, "learning_rate": 2.5e-05, "loss": 0.0013, "step": 2373 }, { "epoch": 8.000202702702703, "grad_norm": 0.3306155204772949, "learning_rate": 2.5e-05, "loss": 0.0212, "step": 2374 }, { "epoch": 8.000236486486486, "grad_norm": 2.559230089187622, "learning_rate": 2.5e-05, "loss": 0.0129, "step": 2375 }, { "epoch": 8.00027027027027, "grad_norm": 0.42572343349456787, "learning_rate": 2.5e-05, "loss": 0.0266, "step": 2376 }, { "epoch": 8.000304054054054, "grad_norm": 0.009735827334225178, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2377 }, { "epoch": 8.000337837837838, "grad_norm": 0.014413788914680481, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2378 }, { "epoch": 8.000371621621621, "grad_norm": 0.016223080456256866, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2379 }, { "epoch": 8.000405405405406, "grad_norm": 0.06649023294448853, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 2380 }, { "epoch": 8.000439189189189, "grad_norm": 0.10241413116455078, "learning_rate": 2.5e-05, "loss": 0.0012, "step": 2381 }, { "epoch": 8.000472972972974, "grad_norm": 0.026807524263858795, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2382 }, { "epoch": 8.000506756756756, "grad_norm": 0.07545031607151031, "learning_rate": 2.5e-05, "loss": 0.0029, "step": 2383 }, { "epoch": 8.000540540540541, "grad_norm": 0.5180581212043762, "learning_rate": 2.5e-05, "loss": 0.0022, "step": 2384 }, { "epoch": 8.000574324324324, "grad_norm": 2.41278338432312, "learning_rate": 2.5e-05, "loss": 0.0056, "step": 2385 }, { "epoch": 8.000608108108109, "grad_norm": 1.4378224611282349, "learning_rate": 2.5e-05, "loss": 0.0072, "step": 2386 }, { "epoch": 8.000641891891892, "grad_norm": 22.342281341552734, "learning_rate": 2.5e-05, "loss": 0.612, "step": 2387 }, { "epoch": 8.000675675675677, "grad_norm": 0.04324118047952652, "learning_rate": 2.5e-05, "loss": 0.0009, "step": 2388 }, { "epoch": 8.00070945945946, "grad_norm": 5.556407451629639, "learning_rate": 2.5e-05, "loss": 0.0245, "step": 2389 }, { "epoch": 8.000743243243242, "grad_norm": 2.1584835052490234, "learning_rate": 2.5e-05, "loss": 0.0448, "step": 2390 }, { "epoch": 8.000777027027027, "grad_norm": 2.266899824142456, "learning_rate": 2.5e-05, "loss": 0.1779, "step": 2391 }, { "epoch": 8.00081081081081, "grad_norm": 0.27090761065483093, "learning_rate": 2.5e-05, "loss": 0.0019, "step": 2392 }, { "epoch": 8.000844594594595, "grad_norm": 0.0035397622268646955, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2393 }, { "epoch": 8.000878378378378, "grad_norm": 0.19772523641586304, "learning_rate": 2.5e-05, "loss": 0.0015, "step": 2394 }, { "epoch": 8.000912162162162, "grad_norm": 0.01131783239543438, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2395 }, { "epoch": 8.000945945945945, "grad_norm": 26.77949333190918, "learning_rate": 2.5e-05, "loss": 0.215, "step": 2396 }, { "epoch": 8.00097972972973, "grad_norm": 0.09226993471384048, "learning_rate": 2.5e-05, "loss": 0.0019, "step": 2397 }, { "epoch": 8.001013513513513, "grad_norm": 0.10641827434301376, "learning_rate": 2.5e-05, "loss": 0.0029, "step": 2398 }, { "epoch": 8.001047297297298, "grad_norm": 0.1198970153927803, "learning_rate": 2.5e-05, "loss": 0.0009, "step": 2399 }, { "epoch": 8.00108108108108, "grad_norm": 20.081281661987305, "learning_rate": 2.5e-05, "loss": 0.5432, "step": 2400 }, { "epoch": 8.001114864864865, "grad_norm": 25.993160247802734, "learning_rate": 2.5e-05, "loss": 0.1195, "step": 2401 }, { "epoch": 8.001148648648648, "grad_norm": 0.34632608294487, "learning_rate": 2.5e-05, "loss": 0.003, "step": 2402 }, { "epoch": 8.001182432432433, "grad_norm": 0.0032412486616522074, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2403 }, { "epoch": 8.001216216216216, "grad_norm": 44.46741485595703, "learning_rate": 2.5e-05, "loss": 0.2751, "step": 2404 }, { "epoch": 8.00125, "grad_norm": 0.06547223776578903, "learning_rate": 2.5e-05, "loss": 0.001, "step": 2405 }, { "epoch": 8.001283783783784, "grad_norm": 0.7065528035163879, "learning_rate": 2.5e-05, "loss": 0.0051, "step": 2406 }, { "epoch": 8.001317567567568, "grad_norm": 0.0046921223402023315, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2407 }, { "epoch": 8.001351351351351, "grad_norm": 0.013681279495358467, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2408 }, { "epoch": 8.001385135135136, "grad_norm": 0.08152979612350464, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2409 }, { "epoch": 8.001418918918919, "grad_norm": 13.99307632446289, "learning_rate": 2.5e-05, "loss": 0.8204, "step": 2410 }, { "epoch": 8.001452702702704, "grad_norm": 0.08110644668340683, "learning_rate": 2.5e-05, "loss": 0.0016, "step": 2411 }, { "epoch": 8.001486486486487, "grad_norm": 0.11648593842983246, "learning_rate": 2.5e-05, "loss": 0.0019, "step": 2412 }, { "epoch": 8.00152027027027, "grad_norm": 4.264178276062012, "learning_rate": 2.5e-05, "loss": 0.011, "step": 2413 }, { "epoch": 8.001554054054054, "grad_norm": 40.358829498291016, "learning_rate": 2.5e-05, "loss": 0.2673, "step": 2414 }, { "epoch": 8.001587837837837, "grad_norm": 0.0032910637091845274, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2415 }, { "epoch": 8.001621621621622, "grad_norm": 0.014740820974111557, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2416 }, { "epoch": 8.001655405405405, "grad_norm": 0.04813484102487564, "learning_rate": 2.5e-05, "loss": 0.0009, "step": 2417 }, { "epoch": 8.00168918918919, "grad_norm": 0.018056314438581467, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2418 }, { "epoch": 8.001722972972972, "grad_norm": 0.7347312569618225, "learning_rate": 2.5e-05, "loss": 0.0129, "step": 2419 }, { "epoch": 8.001756756756757, "grad_norm": 6.328540802001953, "learning_rate": 2.5e-05, "loss": 0.0202, "step": 2420 }, { "epoch": 8.00179054054054, "grad_norm": 1.8533700704574585, "learning_rate": 2.5e-05, "loss": 0.159, "step": 2421 }, { "epoch": 8.001824324324325, "grad_norm": 0.47511565685272217, "learning_rate": 2.5e-05, "loss": 0.0019, "step": 2422 }, { "epoch": 8.001858108108108, "grad_norm": 16.14890480041504, "learning_rate": 2.5e-05, "loss": 0.3879, "step": 2423 }, { "epoch": 8.001891891891892, "grad_norm": 0.2748814821243286, "learning_rate": 2.5e-05, "loss": 0.0038, "step": 2424 }, { "epoch": 8.001925675675675, "grad_norm": 16.341140747070312, "learning_rate": 2.5e-05, "loss": 0.3504, "step": 2425 }, { "epoch": 8.00195945945946, "grad_norm": 3.1435773372650146, "learning_rate": 2.5e-05, "loss": 0.0223, "step": 2426 }, { "epoch": 8.001993243243243, "grad_norm": 0.05810944736003876, "learning_rate": 2.5e-05, "loss": 0.0022, "step": 2427 }, { "epoch": 8.002027027027028, "grad_norm": 10.264327049255371, "learning_rate": 2.5e-05, "loss": 0.7979, "step": 2428 }, { "epoch": 8.00206081081081, "grad_norm": 0.11605419218540192, "learning_rate": 2.5e-05, "loss": 0.0044, "step": 2429 }, { "epoch": 8.002094594594595, "grad_norm": 0.04627155512571335, "learning_rate": 2.5e-05, "loss": 0.0018, "step": 2430 }, { "epoch": 8.002128378378378, "grad_norm": 1.9201273918151855, "learning_rate": 2.5e-05, "loss": 0.0066, "step": 2431 }, { "epoch": 8.002162162162163, "grad_norm": 4.937418460845947, "learning_rate": 2.5e-05, "loss": 0.0711, "step": 2432 }, { "epoch": 8.002195945945946, "grad_norm": 41.08331298828125, "learning_rate": 2.5e-05, "loss": 0.1292, "step": 2433 }, { "epoch": 8.002229729729729, "grad_norm": 1.797671914100647, "learning_rate": 2.5e-05, "loss": 0.0112, "step": 2434 }, { "epoch": 8.002263513513514, "grad_norm": 0.04532792046666145, "learning_rate": 2.5e-05, "loss": 0.0017, "step": 2435 }, { "epoch": 8.002297297297297, "grad_norm": 0.006573617458343506, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2436 }, { "epoch": 8.002331081081081, "grad_norm": 0.23063543438911438, "learning_rate": 2.5e-05, "loss": 0.0021, "step": 2437 }, { "epoch": 8.002364864864864, "grad_norm": 0.6958153247833252, "learning_rate": 2.5e-05, "loss": 0.0017, "step": 2438 }, { "epoch": 8.002398648648649, "grad_norm": 0.08456752449274063, "learning_rate": 2.5e-05, "loss": 0.001, "step": 2439 }, { "epoch": 8.002432432432432, "grad_norm": 1.1425526142120361, "learning_rate": 2.5e-05, "loss": 0.01, "step": 2440 }, { "epoch": 8.002466216216217, "grad_norm": 0.20706550776958466, "learning_rate": 2.5e-05, "loss": 0.0022, "step": 2441 }, { "epoch": 8.0025, "grad_norm": 0.26955777406692505, "learning_rate": 2.5e-05, "loss": 0.0012, "step": 2442 }, { "epoch": 8.002533783783784, "grad_norm": 0.7376745939254761, "learning_rate": 2.5e-05, "loss": 0.0096, "step": 2443 }, { "epoch": 8.002567567567567, "grad_norm": 4.597662925720215, "learning_rate": 2.5e-05, "loss": 0.0186, "step": 2444 }, { "epoch": 8.002601351351352, "grad_norm": 0.009460926055908203, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2445 }, { "epoch": 8.002635135135135, "grad_norm": 0.13898274302482605, "learning_rate": 2.5e-05, "loss": 0.0021, "step": 2446 }, { "epoch": 8.00266891891892, "grad_norm": 2.9034464359283447, "learning_rate": 2.5e-05, "loss": 0.0121, "step": 2447 }, { "epoch": 8.002702702702702, "grad_norm": 0.032826781272888184, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 2448 }, { "epoch": 8.002736486486487, "grad_norm": 4.683267116546631, "learning_rate": 2.5e-05, "loss": 0.7538, "step": 2449 }, { "epoch": 8.00277027027027, "grad_norm": 8.648504257202148, "learning_rate": 2.5e-05, "loss": 0.0785, "step": 2450 }, { "epoch": 8.002804054054055, "grad_norm": 0.0035870864521712065, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2451 }, { "epoch": 8.002837837837838, "grad_norm": 0.05739198625087738, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2452 }, { "epoch": 8.002871621621622, "grad_norm": 0.005460518412292004, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2453 }, { "epoch": 8.002905405405405, "grad_norm": 1.3619186878204346, "learning_rate": 2.5e-05, "loss": 0.0073, "step": 2454 }, { "epoch": 8.002939189189188, "grad_norm": 44.735145568847656, "learning_rate": 2.5e-05, "loss": 0.5265, "step": 2455 }, { "epoch": 8.002972972972973, "grad_norm": 9.599691390991211, "learning_rate": 2.5e-05, "loss": 0.0367, "step": 2456 }, { "epoch": 8.003006756756756, "grad_norm": 6.670742034912109, "learning_rate": 2.5e-05, "loss": 0.8603, "step": 2457 }, { "epoch": 8.00304054054054, "grad_norm": 0.010700997896492481, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2458 }, { "epoch": 8.003074324324324, "grad_norm": 2.800629138946533, "learning_rate": 2.5e-05, "loss": 0.1226, "step": 2459 }, { "epoch": 8.003108108108108, "grad_norm": 0.0044896528124809265, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2460 }, { "epoch": 8.003141891891891, "grad_norm": 0.006918872240930796, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2461 }, { "epoch": 8.003175675675676, "grad_norm": 5.74686861038208, "learning_rate": 2.5e-05, "loss": 0.1338, "step": 2462 }, { "epoch": 8.003209459459459, "grad_norm": 0.013566081412136555, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2463 }, { "epoch": 8.003243243243244, "grad_norm": 15.364291191101074, "learning_rate": 2.5e-05, "loss": 0.6721, "step": 2464 }, { "epoch": 8.003277027027027, "grad_norm": 0.10803493857383728, "learning_rate": 2.5e-05, "loss": 0.0014, "step": 2465 }, { "epoch": 8.003310810810811, "grad_norm": 0.012462585233151913, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2466 }, { "epoch": 8.003344594594594, "grad_norm": 0.38852712512016296, "learning_rate": 2.5e-05, "loss": 0.0066, "step": 2467 }, { "epoch": 8.003378378378379, "grad_norm": 0.020167995244264603, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2468 }, { "epoch": 8.003412162162162, "grad_norm": 0.00751761207357049, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2469 }, { "epoch": 8.003445945945947, "grad_norm": 0.008674140088260174, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2470 }, { "epoch": 8.00347972972973, "grad_norm": 35.380916595458984, "learning_rate": 2.5e-05, "loss": 0.2971, "step": 2471 }, { "epoch": 8.003513513513514, "grad_norm": 0.008288795128464699, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2472 }, { "epoch": 8.003547297297297, "grad_norm": 0.004230547696352005, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2473 }, { "epoch": 8.003581081081082, "grad_norm": 0.4110335409641266, "learning_rate": 2.5e-05, "loss": 0.0042, "step": 2474 }, { "epoch": 8.003614864864865, "grad_norm": 4.006132125854492, "learning_rate": 2.5e-05, "loss": 0.5046, "step": 2475 }, { "epoch": 8.003648648648648, "grad_norm": 0.07893639802932739, "learning_rate": 2.5e-05, "loss": 0.0008, "step": 2476 }, { "epoch": 8.003682432432432, "grad_norm": 0.032893721014261246, "learning_rate": 2.5e-05, "loss": 0.0008, "step": 2477 }, { "epoch": 8.003716216216215, "grad_norm": 0.39101558923721313, "learning_rate": 2.5e-05, "loss": 0.004, "step": 2478 }, { "epoch": 8.00375, "grad_norm": 0.02928292565047741, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 2479 }, { "epoch": 8.003783783783783, "grad_norm": 0.006295429542660713, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2480 }, { "epoch": 8.003817567567568, "grad_norm": 4.4750471115112305, "learning_rate": 2.5e-05, "loss": 0.2016, "step": 2481 }, { "epoch": 8.00385135135135, "grad_norm": 10.749003410339355, "learning_rate": 2.5e-05, "loss": 0.6058, "step": 2482 }, { "epoch": 8.003885135135135, "grad_norm": 13.28110122680664, "learning_rate": 2.5e-05, "loss": 0.0639, "step": 2483 }, { "epoch": 8.003918918918918, "grad_norm": 25.398113250732422, "learning_rate": 2.5e-05, "loss": 0.2352, "step": 2484 }, { "epoch": 8.003952702702703, "grad_norm": 2.3143858909606934, "learning_rate": 2.5e-05, "loss": 0.0246, "step": 2485 }, { "epoch": 8.003986486486486, "grad_norm": 14.05184268951416, "learning_rate": 2.5e-05, "loss": 0.2509, "step": 2486 }, { "epoch": 8.00402027027027, "grad_norm": 0.008208248764276505, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2487 }, { "epoch": 8.004054054054054, "grad_norm": 0.01156510878354311, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2488 }, { "epoch": 8.004087837837838, "grad_norm": 0.9987685084342957, "learning_rate": 2.5e-05, "loss": 0.0053, "step": 2489 }, { "epoch": 8.004121621621621, "grad_norm": 3.3874058723449707, "learning_rate": 2.5e-05, "loss": 0.012, "step": 2490 }, { "epoch": 8.004155405405406, "grad_norm": 0.044379349797964096, "learning_rate": 2.5e-05, "loss": 0.0011, "step": 2491 }, { "epoch": 8.004189189189189, "grad_norm": 33.24074172973633, "learning_rate": 2.5e-05, "loss": 0.3438, "step": 2492 }, { "epoch": 8.004222972972974, "grad_norm": 0.005784013774245977, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2493 }, { "epoch": 8.004256756756757, "grad_norm": 1.6408271789550781, "learning_rate": 2.5e-05, "loss": 0.0151, "step": 2494 }, { "epoch": 8.004290540540541, "grad_norm": 2.4821243286132812, "learning_rate": 2.5e-05, "loss": 0.2134, "step": 2495 }, { "epoch": 8.004324324324324, "grad_norm": 16.209199905395508, "learning_rate": 2.5e-05, "loss": 0.0637, "step": 2496 }, { "epoch": 8.004358108108109, "grad_norm": 9.157625198364258, "learning_rate": 2.5e-05, "loss": 0.0986, "step": 2497 }, { "epoch": 8.004391891891892, "grad_norm": 15.041952133178711, "learning_rate": 2.5e-05, "loss": 0.8383, "step": 2498 }, { "epoch": 8.004425675675675, "grad_norm": 0.12030471116304398, "learning_rate": 2.5e-05, "loss": 0.002, "step": 2499 }, { "epoch": 8.00445945945946, "grad_norm": 0.28426533937454224, "learning_rate": 2.5e-05, "loss": 0.0041, "step": 2500 }, { "epoch": 8.004493243243243, "grad_norm": 0.01484223548322916, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2501 }, { "epoch": 8.004527027027027, "grad_norm": 0.6542966961860657, "learning_rate": 2.5e-05, "loss": 0.0412, "step": 2502 }, { "epoch": 8.00456081081081, "grad_norm": 0.01088249683380127, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2503 }, { "epoch": 8.004594594594595, "grad_norm": 15.078142166137695, "learning_rate": 2.5e-05, "loss": 0.2673, "step": 2504 }, { "epoch": 8.004628378378378, "grad_norm": 0.4121009111404419, "learning_rate": 2.5e-05, "loss": 0.004, "step": 2505 }, { "epoch": 8.004662162162163, "grad_norm": 6.321603298187256, "learning_rate": 2.5e-05, "loss": 0.4539, "step": 2506 }, { "epoch": 8.004695945945945, "grad_norm": 0.033617570996284485, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 2507 }, { "epoch": 8.00472972972973, "grad_norm": 0.005927556660026312, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2508 }, { "epoch": 8.004763513513513, "grad_norm": 0.8952799439430237, "learning_rate": 2.5e-05, "loss": 0.0591, "step": 2509 }, { "epoch": 8.004797297297298, "grad_norm": 10.335201263427734, "learning_rate": 2.5e-05, "loss": 0.4859, "step": 2510 }, { "epoch": 8.00483108108108, "grad_norm": 6.65460729598999, "learning_rate": 2.5e-05, "loss": 0.0401, "step": 2511 }, { "epoch": 8.004864864864865, "grad_norm": 14.977509498596191, "learning_rate": 2.5e-05, "loss": 0.8109, "step": 2512 }, { "epoch": 8.004898648648648, "grad_norm": 0.12269959598779678, "learning_rate": 2.5e-05, "loss": 0.0018, "step": 2513 }, { "epoch": 8.004932432432433, "grad_norm": 1.0759652853012085, "learning_rate": 2.5e-05, "loss": 0.0074, "step": 2514 }, { "epoch": 8.004966216216216, "grad_norm": 64.24642181396484, "learning_rate": 2.5e-05, "loss": 0.3016, "step": 2515 }, { "epoch": 8.005, "grad_norm": 0.011967379599809647, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2516 }, { "epoch": 8.005033783783784, "grad_norm": 0.4824622869491577, "learning_rate": 2.5e-05, "loss": 0.0121, "step": 2517 }, { "epoch": 8.005067567567568, "grad_norm": 23.840946197509766, "learning_rate": 2.5e-05, "loss": 0.1878, "step": 2518 }, { "epoch": 8.005101351351351, "grad_norm": 1.933635950088501, "learning_rate": 2.5e-05, "loss": 0.0298, "step": 2519 }, { "epoch": 8.005135135135134, "grad_norm": 0.024653971195220947, "learning_rate": 2.5e-05, "loss": 0.0008, "step": 2520 }, { "epoch": 8.005168918918919, "grad_norm": 0.6032394766807556, "learning_rate": 2.5e-05, "loss": 0.0116, "step": 2521 }, { "epoch": 8.005202702702702, "grad_norm": 10.818012237548828, "learning_rate": 2.5e-05, "loss": 0.0449, "step": 2522 }, { "epoch": 8.005236486486487, "grad_norm": 5.97047758102417, "learning_rate": 2.5e-05, "loss": 0.3839, "step": 2523 }, { "epoch": 8.00527027027027, "grad_norm": 10.348895072937012, "learning_rate": 2.5e-05, "loss": 0.0775, "step": 2524 }, { "epoch": 8.005304054054054, "grad_norm": 58.036285400390625, "learning_rate": 2.5e-05, "loss": 1.008, "step": 2525 }, { "epoch": 8.005337837837837, "grad_norm": 0.794752836227417, "learning_rate": 2.5e-05, "loss": 0.008, "step": 2526 }, { "epoch": 8.005371621621622, "grad_norm": 0.13596314191818237, "learning_rate": 2.5e-05, "loss": 0.0025, "step": 2527 }, { "epoch": 8.005405405405405, "grad_norm": 0.15740127861499786, "learning_rate": 2.5e-05, "loss": 0.0026, "step": 2528 }, { "epoch": 8.00543918918919, "grad_norm": 0.5537694096565247, "learning_rate": 2.5e-05, "loss": 0.0375, "step": 2529 }, { "epoch": 8.005472972972973, "grad_norm": 5.487738132476807, "learning_rate": 2.5e-05, "loss": 0.6355, "step": 2530 }, { "epoch": 8.005506756756757, "grad_norm": 0.006011145189404488, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2531 }, { "epoch": 8.00554054054054, "grad_norm": 4.86250114440918, "learning_rate": 2.5e-05, "loss": 0.0145, "step": 2532 }, { "epoch": 8.005574324324325, "grad_norm": 0.20218153297901154, "learning_rate": 2.5e-05, "loss": 0.0027, "step": 2533 }, { "epoch": 8.005608108108108, "grad_norm": 3.3543968200683594, "learning_rate": 2.5e-05, "loss": 0.4001, "step": 2534 }, { "epoch": 8.005641891891893, "grad_norm": 0.008614901453256607, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2535 }, { "epoch": 8.005675675675676, "grad_norm": 0.20744109153747559, "learning_rate": 2.5e-05, "loss": 0.0047, "step": 2536 }, { "epoch": 8.00570945945946, "grad_norm": 0.3596441149711609, "learning_rate": 2.5e-05, "loss": 0.0038, "step": 2537 }, { "epoch": 8.005743243243243, "grad_norm": 0.25445297360420227, "learning_rate": 2.5e-05, "loss": 0.0018, "step": 2538 }, { "epoch": 8.005777027027028, "grad_norm": 5.304027557373047, "learning_rate": 2.5e-05, "loss": 0.0086, "step": 2539 }, { "epoch": 8.00581081081081, "grad_norm": 0.2902889549732208, "learning_rate": 2.5e-05, "loss": 0.0045, "step": 2540 }, { "epoch": 8.005844594594594, "grad_norm": 24.292299270629883, "learning_rate": 2.5e-05, "loss": 0.0585, "step": 2541 }, { "epoch": 8.005878378378378, "grad_norm": 40.154808044433594, "learning_rate": 2.5e-05, "loss": 0.2325, "step": 2542 }, { "epoch": 8.005912162162161, "grad_norm": 6.271827220916748, "learning_rate": 2.5e-05, "loss": 0.0878, "step": 2543 }, { "epoch": 8.005945945945946, "grad_norm": 0.13745123147964478, "learning_rate": 2.5e-05, "loss": 0.0018, "step": 2544 }, { "epoch": 8.005979729729729, "grad_norm": 0.3306794762611389, "learning_rate": 2.5e-05, "loss": 0.0134, "step": 2545 }, { "epoch": 8.006013513513514, "grad_norm": 0.015473795123398304, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2546 }, { "epoch": 8.006047297297297, "grad_norm": 3.920947551727295, "learning_rate": 2.5e-05, "loss": 0.0128, "step": 2547 }, { "epoch": 8.006081081081081, "grad_norm": 6.1508307456970215, "learning_rate": 2.5e-05, "loss": 0.4915, "step": 2548 }, { "epoch": 8.006114864864864, "grad_norm": 0.3478429615497589, "learning_rate": 2.5e-05, "loss": 0.0105, "step": 2549 }, { "epoch": 8.006148648648649, "grad_norm": 0.010263296775519848, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2550 }, { "epoch": 8.006182432432432, "grad_norm": 5.443362712860107, "learning_rate": 2.5e-05, "loss": 0.1199, "step": 2551 }, { "epoch": 8.006216216216217, "grad_norm": 4.750640869140625, "learning_rate": 2.5e-05, "loss": 0.032, "step": 2552 }, { "epoch": 8.00625, "grad_norm": 10.285428047180176, "learning_rate": 2.5e-05, "loss": 0.0811, "step": 2553 }, { "epoch": 8.006283783783784, "grad_norm": 0.011551743373274803, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2554 }, { "epoch": 8.006317567567567, "grad_norm": 0.0303008072078228, "learning_rate": 2.5e-05, "loss": 0.0008, "step": 2555 }, { "epoch": 8.006351351351352, "grad_norm": 0.4716910719871521, "learning_rate": 2.5e-05, "loss": 0.0023, "step": 2556 }, { "epoch": 8.006385135135135, "grad_norm": 3.402721405029297, "learning_rate": 2.5e-05, "loss": 0.0087, "step": 2557 }, { "epoch": 8.00641891891892, "grad_norm": 1.8141367435455322, "learning_rate": 2.5e-05, "loss": 0.2266, "step": 2558 }, { "epoch": 8.006452702702703, "grad_norm": 4.44830846786499, "learning_rate": 2.5e-05, "loss": 0.1354, "step": 2559 }, { "epoch": 8.006486486486487, "grad_norm": 3.147564649581909, "learning_rate": 2.5e-05, "loss": 0.3398, "step": 2560 }, { "epoch": 8.00652027027027, "grad_norm": 0.060450199991464615, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2561 }, { "epoch": 8.006554054054053, "grad_norm": 0.0421120747923851, "learning_rate": 2.5e-05, "loss": 0.001, "step": 2562 }, { "epoch": 8.006587837837838, "grad_norm": 13.043493270874023, "learning_rate": 2.5e-05, "loss": 0.0971, "step": 2563 }, { "epoch": 8.00662162162162, "grad_norm": 0.356738805770874, "learning_rate": 2.5e-05, "loss": 0.0035, "step": 2564 }, { "epoch": 8.006655405405406, "grad_norm": 1.9463775157928467, "learning_rate": 2.5e-05, "loss": 0.0197, "step": 2565 }, { "epoch": 8.006689189189188, "grad_norm": 0.07784704118967056, "learning_rate": 2.5e-05, "loss": 0.0014, "step": 2566 }, { "epoch": 8.006722972972973, "grad_norm": 0.08622562140226364, "learning_rate": 2.5e-05, "loss": 0.001, "step": 2567 }, { "epoch": 8.006756756756756, "grad_norm": 5.527349948883057, "learning_rate": 2.5e-05, "loss": 0.146, "step": 2568 }, { "epoch": 8.00679054054054, "grad_norm": 0.12998370826244354, "learning_rate": 2.5e-05, "loss": 0.0032, "step": 2569 }, { "epoch": 8.006824324324324, "grad_norm": 2.6951162815093994, "learning_rate": 2.5e-05, "loss": 0.0627, "step": 2570 }, { "epoch": 8.006858108108108, "grad_norm": 2.531928539276123, "learning_rate": 2.5e-05, "loss": 0.049, "step": 2571 }, { "epoch": 8.006891891891891, "grad_norm": 0.12990723550319672, "learning_rate": 2.5e-05, "loss": 0.0046, "step": 2572 }, { "epoch": 8.006925675675676, "grad_norm": 1.0544984340667725, "learning_rate": 2.5e-05, "loss": 0.0066, "step": 2573 }, { "epoch": 8.006959459459459, "grad_norm": 0.18285715579986572, "learning_rate": 2.5e-05, "loss": 0.0026, "step": 2574 }, { "epoch": 8.006993243243244, "grad_norm": 9.573326110839844, "learning_rate": 2.5e-05, "loss": 0.071, "step": 2575 }, { "epoch": 8.007027027027027, "grad_norm": 0.009763594716787338, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2576 }, { "epoch": 8.007060810810811, "grad_norm": 0.12941302359104156, "learning_rate": 2.5e-05, "loss": 0.0015, "step": 2577 }, { "epoch": 8.007094594594594, "grad_norm": 18.695606231689453, "learning_rate": 2.5e-05, "loss": 1.0351, "step": 2578 }, { "epoch": 8.007128378378379, "grad_norm": 6.505658149719238, "learning_rate": 2.5e-05, "loss": 0.0206, "step": 2579 }, { "epoch": 8.007162162162162, "grad_norm": 0.08785256743431091, "learning_rate": 2.5e-05, "loss": 0.0019, "step": 2580 }, { "epoch": 8.007195945945947, "grad_norm": 0.08663026243448257, "learning_rate": 2.5e-05, "loss": 0.0011, "step": 2581 }, { "epoch": 8.00722972972973, "grad_norm": 21.101482391357422, "learning_rate": 2.5e-05, "loss": 0.7661, "step": 2582 }, { "epoch": 8.007263513513514, "grad_norm": 1.6271377801895142, "learning_rate": 2.5e-05, "loss": 0.1877, "step": 2583 }, { "epoch": 8.007297297297297, "grad_norm": 0.0031965041998773813, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2584 }, { "epoch": 8.00733108108108, "grad_norm": 0.6866165399551392, "learning_rate": 2.5e-05, "loss": 0.0263, "step": 2585 }, { "epoch": 8.007364864864865, "grad_norm": 0.0036793265026062727, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2586 }, { "epoch": 8.007398648648648, "grad_norm": 29.887649536132812, "learning_rate": 2.5e-05, "loss": 0.6496, "step": 2587 }, { "epoch": 8.007432432432433, "grad_norm": 1.9462594985961914, "learning_rate": 2.5e-05, "loss": 0.0451, "step": 2588 }, { "epoch": 8.007466216216216, "grad_norm": 0.5993732213973999, "learning_rate": 2.5e-05, "loss": 0.0208, "step": 2589 }, { "epoch": 8.0075, "grad_norm": 0.02200436219573021, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2590 }, { "epoch": 8.007533783783783, "grad_norm": 68.59977722167969, "learning_rate": 2.5e-05, "loss": 0.7358, "step": 2591 }, { "epoch": 8.007567567567568, "grad_norm": 0.020184388384222984, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2592 }, { "epoch": 8.00760135135135, "grad_norm": 0.034115422517061234, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2593 }, { "epoch": 8.007635135135136, "grad_norm": 0.03749573975801468, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2594 }, { "epoch": 8.007668918918919, "grad_norm": 1.3725178241729736, "learning_rate": 2.5e-05, "loss": 0.1233, "step": 2595 }, { "epoch": 8.007702702702703, "grad_norm": 4.584661483764648, "learning_rate": 2.5e-05, "loss": 0.0198, "step": 2596 }, { "epoch": 8.007736486486486, "grad_norm": 0.14144645631313324, "learning_rate": 2.5e-05, "loss": 0.0015, "step": 2597 }, { "epoch": 8.00777027027027, "grad_norm": 53.03546142578125, "learning_rate": 2.5e-05, "loss": 0.545, "step": 2598 }, { "epoch": 8.007804054054054, "grad_norm": 9.85061264038086, "learning_rate": 2.5e-05, "loss": 0.2157, "step": 2599 }, { "epoch": 8.007837837837839, "grad_norm": 0.18667711317539215, "learning_rate": 2.5e-05, "loss": 0.0074, "step": 2600 }, { "epoch": 8.007871621621621, "grad_norm": 0.35880953073501587, "learning_rate": 2.5e-05, "loss": 0.0051, "step": 2601 }, { "epoch": 8.007905405405406, "grad_norm": 1.0969781875610352, "learning_rate": 2.5e-05, "loss": 0.0794, "step": 2602 }, { "epoch": 8.00793918918919, "grad_norm": 40.86524200439453, "learning_rate": 2.5e-05, "loss": 0.1449, "step": 2603 }, { "epoch": 8.007972972972974, "grad_norm": 9.486213684082031, "learning_rate": 2.5e-05, "loss": 0.7786, "step": 2604 }, { "epoch": 8.008006756756757, "grad_norm": 0.3136553168296814, "learning_rate": 2.5e-05, "loss": 0.0021, "step": 2605 }, { "epoch": 8.00804054054054, "grad_norm": 0.1622118502855301, "learning_rate": 2.5e-05, "loss": 0.0014, "step": 2606 }, { "epoch": 8.008074324324324, "grad_norm": 12.91661548614502, "learning_rate": 2.5e-05, "loss": 0.294, "step": 2607 }, { "epoch": 8.008108108108107, "grad_norm": 11.396747589111328, "learning_rate": 2.5e-05, "loss": 0.3305, "step": 2608 }, { "epoch": 8.008141891891892, "grad_norm": 0.01535375602543354, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2609 }, { "epoch": 8.008175675675675, "grad_norm": 14.772420883178711, "learning_rate": 2.5e-05, "loss": 0.483, "step": 2610 }, { "epoch": 8.00820945945946, "grad_norm": 0.037745244801044464, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 2611 }, { "epoch": 8.008243243243243, "grad_norm": 0.6404486894607544, "learning_rate": 2.5e-05, "loss": 0.0067, "step": 2612 }, { "epoch": 8.008277027027027, "grad_norm": 21.907493591308594, "learning_rate": 2.5e-05, "loss": 0.1526, "step": 2613 }, { "epoch": 8.00831081081081, "grad_norm": 5.732994556427002, "learning_rate": 2.5e-05, "loss": 0.0383, "step": 2614 }, { "epoch": 8.008344594594595, "grad_norm": 0.07118715345859528, "learning_rate": 2.5e-05, "loss": 0.0008, "step": 2615 }, { "epoch": 8.008378378378378, "grad_norm": 0.018513992428779602, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2616 }, { "epoch": 8.008412162162163, "grad_norm": 27.566570281982422, "learning_rate": 2.5e-05, "loss": 0.3508, "step": 2617 }, { "epoch": 8.008445945945946, "grad_norm": 0.11470885574817657, "learning_rate": 2.5e-05, "loss": 0.0013, "step": 2618 }, { "epoch": 8.00847972972973, "grad_norm": 6.510274887084961, "learning_rate": 2.5e-05, "loss": 0.0297, "step": 2619 }, { "epoch": 8.008513513513513, "grad_norm": 5.536726474761963, "learning_rate": 2.5e-05, "loss": 0.5468, "step": 2620 }, { "epoch": 8.008547297297298, "grad_norm": 9.595479011535645, "learning_rate": 2.5e-05, "loss": 0.1593, "step": 2621 }, { "epoch": 8.008581081081081, "grad_norm": 2.0620052814483643, "learning_rate": 2.5e-05, "loss": 0.0128, "step": 2622 }, { "epoch": 8.008614864864866, "grad_norm": 0.04131797328591347, "learning_rate": 2.5e-05, "loss": 0.001, "step": 2623 }, { "epoch": 8.008648648648649, "grad_norm": 0.10364939272403717, "learning_rate": 2.5e-05, "loss": 0.0014, "step": 2624 }, { "epoch": 8.008682432432433, "grad_norm": 1.8227038383483887, "learning_rate": 2.5e-05, "loss": 0.1065, "step": 2625 }, { "epoch": 8.008716216216216, "grad_norm": 0.21330204606056213, "learning_rate": 2.5e-05, "loss": 0.0037, "step": 2626 }, { "epoch": 8.00875, "grad_norm": 18.192550659179688, "learning_rate": 2.5e-05, "loss": 0.9127, "step": 2627 }, { "epoch": 8.008783783783784, "grad_norm": 0.017680171877145767, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2628 }, { "epoch": 8.008817567567567, "grad_norm": 12.147743225097656, "learning_rate": 2.5e-05, "loss": 0.2604, "step": 2629 }, { "epoch": 8.008851351351352, "grad_norm": 8.661871910095215, "learning_rate": 2.5e-05, "loss": 0.0272, "step": 2630 }, { "epoch": 8.008885135135134, "grad_norm": 15.462297439575195, "learning_rate": 2.5e-05, "loss": 0.1568, "step": 2631 }, { "epoch": 8.00891891891892, "grad_norm": 0.12637555599212646, "learning_rate": 2.5e-05, "loss": 0.0047, "step": 2632 }, { "epoch": 8.008952702702702, "grad_norm": 5.212715148925781, "learning_rate": 2.5e-05, "loss": 0.0182, "step": 2633 }, { "epoch": 8.008986486486487, "grad_norm": 20.076152801513672, "learning_rate": 2.5e-05, "loss": 0.5454, "step": 2634 }, { "epoch": 8.00902027027027, "grad_norm": 0.09813734889030457, "learning_rate": 2.5e-05, "loss": 0.0014, "step": 2635 }, { "epoch": 8.009054054054054, "grad_norm": 0.1336337924003601, "learning_rate": 2.5e-05, "loss": 0.002, "step": 2636 }, { "epoch": 8.009087837837837, "grad_norm": 0.5401718616485596, "learning_rate": 2.5e-05, "loss": 0.0033, "step": 2637 }, { "epoch": 8.009121621621622, "grad_norm": 0.15850506722927094, "learning_rate": 2.5e-05, "loss": 0.006, "step": 2638 }, { "epoch": 8.009155405405405, "grad_norm": 0.00900952983647585, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2639 }, { "epoch": 8.00918918918919, "grad_norm": 29.743553161621094, "learning_rate": 2.5e-05, "loss": 0.5127, "step": 2640 }, { "epoch": 8.009222972972973, "grad_norm": 26.865745544433594, "learning_rate": 2.5e-05, "loss": 0.5827, "step": 2641 }, { "epoch": 8.009256756756757, "grad_norm": 0.02534393221139908, "learning_rate": 2.5e-05, "loss": 0.0008, "step": 2642 }, { "epoch": 8.00929054054054, "grad_norm": 2.3672406673431396, "learning_rate": 2.5e-05, "loss": 0.1384, "step": 2643 }, { "epoch": 8.009324324324325, "grad_norm": 17.756467819213867, "learning_rate": 2.5e-05, "loss": 0.0691, "step": 2644 }, { "epoch": 8.009358108108108, "grad_norm": 0.2570174038410187, "learning_rate": 2.5e-05, "loss": 0.0068, "step": 2645 }, { "epoch": 8.009391891891893, "grad_norm": 17.870527267456055, "learning_rate": 2.5e-05, "loss": 0.7769, "step": 2646 }, { "epoch": 8.009425675675676, "grad_norm": 0.08170164376497269, "learning_rate": 2.5e-05, "loss": 0.0011, "step": 2647 }, { "epoch": 8.009459459459459, "grad_norm": 0.13493309915065765, "learning_rate": 2.5e-05, "loss": 0.0046, "step": 2648 }, { "epoch": 8.009493243243243, "grad_norm": 0.11252619326114655, "learning_rate": 2.5e-05, "loss": 0.0012, "step": 2649 }, { "epoch": 8.009527027027026, "grad_norm": 0.30313944816589355, "learning_rate": 2.5e-05, "loss": 0.0036, "step": 2650 }, { "epoch": 8.009560810810811, "grad_norm": 0.04179529845714569, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2651 }, { "epoch": 8.009594594594594, "grad_norm": 0.09401822090148926, "learning_rate": 2.5e-05, "loss": 0.0018, "step": 2652 }, { "epoch": 8.009628378378379, "grad_norm": 0.16005022823810577, "learning_rate": 2.5e-05, "loss": 0.0053, "step": 2653 }, { "epoch": 8.009662162162162, "grad_norm": 0.018989315256476402, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2654 }, { "epoch": 8.009695945945946, "grad_norm": 0.01813557744026184, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2655 }, { "epoch": 8.00972972972973, "grad_norm": 0.03460407257080078, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2656 }, { "epoch": 8.009763513513514, "grad_norm": 0.006976905278861523, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2657 }, { "epoch": 8.009797297297297, "grad_norm": 1.4079893827438354, "learning_rate": 2.5e-05, "loss": 0.019, "step": 2658 }, { "epoch": 8.009831081081082, "grad_norm": 33.35459899902344, "learning_rate": 2.5e-05, "loss": 0.1056, "step": 2659 }, { "epoch": 8.009864864864864, "grad_norm": 11.606756210327148, "learning_rate": 2.5e-05, "loss": 0.0599, "step": 2660 }, { "epoch": 8.00989864864865, "grad_norm": 0.9798707365989685, "learning_rate": 2.5e-05, "loss": 0.0088, "step": 2661 }, { "epoch": 8.009932432432432, "grad_norm": 0.08635081350803375, "learning_rate": 2.5e-05, "loss": 0.0022, "step": 2662 }, { "epoch": 8.009966216216217, "grad_norm": 0.004934531636536121, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2663 }, { "epoch": 8.01, "grad_norm": 9.490483283996582, "learning_rate": 2.5e-05, "loss": 0.0262, "step": 2664 }, { "epoch": 8.01, "eval_accuracy": 0.8901453957996769, "eval_loss": 0.4571356773376465, "eval_runtime": 31.5567, "eval_samples_per_second": 19.615, "eval_steps_per_second": 2.472, "step": 2664 }, { "epoch": 9.000033783783783, "grad_norm": 0.45376861095428467, "learning_rate": 2.5e-05, "loss": 0.0188, "step": 2665 }, { "epoch": 9.000067567567568, "grad_norm": 0.1025034710764885, "learning_rate": 2.5e-05, "loss": 0.0013, "step": 2666 }, { "epoch": 9.00010135135135, "grad_norm": 0.42184650897979736, "learning_rate": 2.5e-05, "loss": 0.0105, "step": 2667 }, { "epoch": 9.000135135135135, "grad_norm": 0.16587163507938385, "learning_rate": 2.5e-05, "loss": 0.0023, "step": 2668 }, { "epoch": 9.000168918918918, "grad_norm": 0.184164360165596, "learning_rate": 2.5e-05, "loss": 0.0019, "step": 2669 }, { "epoch": 9.000202702702703, "grad_norm": 0.2896256744861603, "learning_rate": 2.5e-05, "loss": 0.002, "step": 2670 }, { "epoch": 9.000236486486486, "grad_norm": 0.6031298041343689, "learning_rate": 2.5e-05, "loss": 0.0208, "step": 2671 }, { "epoch": 9.00027027027027, "grad_norm": 1.6483291387557983, "learning_rate": 2.5e-05, "loss": 0.0332, "step": 2672 }, { "epoch": 9.000304054054054, "grad_norm": 18.65342903137207, "learning_rate": 2.5e-05, "loss": 0.7247, "step": 2673 }, { "epoch": 9.000337837837838, "grad_norm": 0.039846066385507584, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2674 }, { "epoch": 9.000371621621621, "grad_norm": 0.166500523686409, "learning_rate": 2.5e-05, "loss": 0.0012, "step": 2675 }, { "epoch": 9.000405405405406, "grad_norm": 0.02278805896639824, "learning_rate": 2.5e-05, "loss": 0.0008, "step": 2676 }, { "epoch": 9.000439189189189, "grad_norm": 23.505464553833008, "learning_rate": 2.5e-05, "loss": 0.3974, "step": 2677 }, { "epoch": 9.000472972972974, "grad_norm": 0.026236718520522118, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 2678 }, { "epoch": 9.000506756756756, "grad_norm": 2.9711384773254395, "learning_rate": 2.5e-05, "loss": 0.1223, "step": 2679 }, { "epoch": 9.000540540540541, "grad_norm": 0.003582526696845889, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2680 }, { "epoch": 9.000574324324324, "grad_norm": 0.43054333329200745, "learning_rate": 2.5e-05, "loss": 0.0142, "step": 2681 }, { "epoch": 9.000608108108109, "grad_norm": 1.6982872486114502, "learning_rate": 2.5e-05, "loss": 0.0046, "step": 2682 }, { "epoch": 9.000641891891892, "grad_norm": 8.177032470703125, "learning_rate": 2.5e-05, "loss": 0.0466, "step": 2683 }, { "epoch": 9.000675675675677, "grad_norm": 0.5425007343292236, "learning_rate": 2.5e-05, "loss": 0.0105, "step": 2684 }, { "epoch": 9.00070945945946, "grad_norm": 9.75906753540039, "learning_rate": 2.5e-05, "loss": 0.0565, "step": 2685 }, { "epoch": 9.000743243243242, "grad_norm": 0.04893696680665016, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2686 }, { "epoch": 9.000777027027027, "grad_norm": 0.029402755200862885, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2687 }, { "epoch": 9.00081081081081, "grad_norm": 0.5228587985038757, "learning_rate": 2.5e-05, "loss": 0.0083, "step": 2688 }, { "epoch": 9.000844594594595, "grad_norm": 6.464563369750977, "learning_rate": 2.5e-05, "loss": 0.0185, "step": 2689 }, { "epoch": 9.000878378378378, "grad_norm": 0.008846360258758068, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2690 }, { "epoch": 9.000912162162162, "grad_norm": 0.26762574911117554, "learning_rate": 2.5e-05, "loss": 0.0011, "step": 2691 }, { "epoch": 9.000945945945945, "grad_norm": 0.8537616729736328, "learning_rate": 2.5e-05, "loss": 0.0059, "step": 2692 }, { "epoch": 9.00097972972973, "grad_norm": 0.009054595604538918, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2693 }, { "epoch": 9.001013513513513, "grad_norm": 6.603945255279541, "learning_rate": 2.5e-05, "loss": 0.0194, "step": 2694 }, { "epoch": 9.001047297297298, "grad_norm": 0.039887722581624985, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 2695 }, { "epoch": 9.00108108108108, "grad_norm": 0.022322893142700195, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2696 }, { "epoch": 9.001114864864865, "grad_norm": 0.01149584911763668, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2697 }, { "epoch": 9.001148648648648, "grad_norm": 0.1884499043226242, "learning_rate": 2.5e-05, "loss": 0.001, "step": 2698 }, { "epoch": 9.001182432432433, "grad_norm": 0.3469788432121277, "learning_rate": 2.5e-05, "loss": 0.0018, "step": 2699 }, { "epoch": 9.001216216216216, "grad_norm": 13.05125904083252, "learning_rate": 2.5e-05, "loss": 0.1128, "step": 2700 }, { "epoch": 9.00125, "grad_norm": 0.5398589372634888, "learning_rate": 2.5e-05, "loss": 0.0027, "step": 2701 }, { "epoch": 9.001283783783784, "grad_norm": 34.50339126586914, "learning_rate": 2.5e-05, "loss": 0.0534, "step": 2702 }, { "epoch": 9.001317567567568, "grad_norm": 0.052138812839984894, "learning_rate": 2.5e-05, "loss": 0.0018, "step": 2703 }, { "epoch": 9.001351351351351, "grad_norm": 0.026215052232146263, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2704 }, { "epoch": 9.001385135135136, "grad_norm": 0.15239669382572174, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 2705 }, { "epoch": 9.001418918918919, "grad_norm": 0.0026563904248178005, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2706 }, { "epoch": 9.001452702702704, "grad_norm": 0.024863436818122864, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2707 }, { "epoch": 9.001486486486487, "grad_norm": 0.10669632256031036, "learning_rate": 2.5e-05, "loss": 0.0012, "step": 2708 }, { "epoch": 9.00152027027027, "grad_norm": 0.0872611477971077, "learning_rate": 2.5e-05, "loss": 0.001, "step": 2709 }, { "epoch": 9.001554054054054, "grad_norm": 0.016561347991228104, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2710 }, { "epoch": 9.001587837837837, "grad_norm": 2.825913429260254, "learning_rate": 2.5e-05, "loss": 0.0304, "step": 2711 }, { "epoch": 9.001621621621622, "grad_norm": 0.24674829840660095, "learning_rate": 2.5e-05, "loss": 0.0016, "step": 2712 }, { "epoch": 9.001655405405405, "grad_norm": 40.499717712402344, "learning_rate": 2.5e-05, "loss": 0.8405, "step": 2713 }, { "epoch": 9.00168918918919, "grad_norm": 26.099735260009766, "learning_rate": 2.5e-05, "loss": 0.0749, "step": 2714 }, { "epoch": 9.001722972972972, "grad_norm": 0.02472006157040596, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2715 }, { "epoch": 9.001756756756757, "grad_norm": 21.497432708740234, "learning_rate": 2.5e-05, "loss": 0.0982, "step": 2716 }, { "epoch": 9.00179054054054, "grad_norm": 17.370540618896484, "learning_rate": 2.5e-05, "loss": 0.1204, "step": 2717 }, { "epoch": 9.001824324324325, "grad_norm": 0.24399715662002563, "learning_rate": 2.5e-05, "loss": 0.0018, "step": 2718 }, { "epoch": 9.001858108108108, "grad_norm": 0.007832745090126991, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2719 }, { "epoch": 9.001891891891892, "grad_norm": 0.009822309017181396, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2720 }, { "epoch": 9.001925675675675, "grad_norm": 12.0272216796875, "learning_rate": 2.5e-05, "loss": 1.0448, "step": 2721 }, { "epoch": 9.00195945945946, "grad_norm": 0.013455613516271114, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2722 }, { "epoch": 9.001993243243243, "grad_norm": 0.005380960647016764, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2723 }, { "epoch": 9.002027027027028, "grad_norm": 22.127655029296875, "learning_rate": 2.5e-05, "loss": 0.905, "step": 2724 }, { "epoch": 9.00206081081081, "grad_norm": 0.009375577792525291, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2725 }, { "epoch": 9.002094594594595, "grad_norm": 0.059141337871551514, "learning_rate": 2.5e-05, "loss": 0.002, "step": 2726 }, { "epoch": 9.002128378378378, "grad_norm": 0.002767996396869421, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2727 }, { "epoch": 9.002162162162163, "grad_norm": 0.061188556253910065, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2728 }, { "epoch": 9.002195945945946, "grad_norm": 0.011765066534280777, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2729 }, { "epoch": 9.002229729729729, "grad_norm": 0.5909886956214905, "learning_rate": 2.5e-05, "loss": 0.0026, "step": 2730 }, { "epoch": 9.002263513513514, "grad_norm": 0.9262104630470276, "learning_rate": 2.5e-05, "loss": 0.0055, "step": 2731 }, { "epoch": 9.002297297297297, "grad_norm": 0.00734118465334177, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2732 }, { "epoch": 9.002331081081081, "grad_norm": 0.007602409925311804, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2733 }, { "epoch": 9.002364864864864, "grad_norm": 4.5269575119018555, "learning_rate": 2.5e-05, "loss": 0.0172, "step": 2734 }, { "epoch": 9.002398648648649, "grad_norm": 18.411043167114258, "learning_rate": 2.5e-05, "loss": 0.7187, "step": 2735 }, { "epoch": 9.002432432432432, "grad_norm": 1.2948954105377197, "learning_rate": 2.5e-05, "loss": 0.005, "step": 2736 }, { "epoch": 9.002466216216217, "grad_norm": 2.029276132583618, "learning_rate": 2.5e-05, "loss": 0.0198, "step": 2737 }, { "epoch": 9.0025, "grad_norm": 0.004805437754839659, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2738 }, { "epoch": 9.002533783783784, "grad_norm": 8.23408031463623, "learning_rate": 2.5e-05, "loss": 0.2021, "step": 2739 }, { "epoch": 9.002567567567567, "grad_norm": 0.025187188759446144, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2740 }, { "epoch": 9.002601351351352, "grad_norm": 0.16470450162887573, "learning_rate": 2.5e-05, "loss": 0.0013, "step": 2741 }, { "epoch": 9.002635135135135, "grad_norm": 1.0224419832229614, "learning_rate": 2.5e-05, "loss": 0.0038, "step": 2742 }, { "epoch": 9.00266891891892, "grad_norm": 1.5831468105316162, "learning_rate": 2.5e-05, "loss": 0.014, "step": 2743 }, { "epoch": 9.002702702702702, "grad_norm": 0.04104980453848839, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2744 }, { "epoch": 9.002736486486487, "grad_norm": 0.04239949584007263, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2745 }, { "epoch": 9.00277027027027, "grad_norm": 1.7341241836547852, "learning_rate": 2.5e-05, "loss": 0.0064, "step": 2746 }, { "epoch": 9.002804054054055, "grad_norm": 0.006002666894346476, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2747 }, { "epoch": 9.002837837837838, "grad_norm": 8.149341583251953, "learning_rate": 2.5e-05, "loss": 0.9326, "step": 2748 }, { "epoch": 9.002871621621622, "grad_norm": 8.243292808532715, "learning_rate": 2.5e-05, "loss": 0.0184, "step": 2749 }, { "epoch": 9.002905405405405, "grad_norm": 0.016168832778930664, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2750 }, { "epoch": 9.002939189189188, "grad_norm": 0.03472185134887695, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2751 }, { "epoch": 9.002972972972973, "grad_norm": 7.086937427520752, "learning_rate": 2.5e-05, "loss": 0.4355, "step": 2752 }, { "epoch": 9.003006756756756, "grad_norm": 3.463107109069824, "learning_rate": 2.5e-05, "loss": 0.5553, "step": 2753 }, { "epoch": 9.00304054054054, "grad_norm": 0.015431771986186504, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2754 }, { "epoch": 9.003074324324324, "grad_norm": 6.527328014373779, "learning_rate": 2.5e-05, "loss": 0.1813, "step": 2755 }, { "epoch": 9.003108108108108, "grad_norm": 0.007465454749763012, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2756 }, { "epoch": 9.003141891891891, "grad_norm": 0.0033811142202466726, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2757 }, { "epoch": 9.003175675675676, "grad_norm": 2.093724012374878, "learning_rate": 2.5e-05, "loss": 0.0086, "step": 2758 }, { "epoch": 9.003209459459459, "grad_norm": 0.018633538857102394, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2759 }, { "epoch": 9.003243243243244, "grad_norm": 0.004367479123175144, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2760 }, { "epoch": 9.003277027027027, "grad_norm": 0.01206407230347395, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2761 }, { "epoch": 9.003310810810811, "grad_norm": 0.040471188724040985, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2762 }, { "epoch": 9.003344594594594, "grad_norm": 42.19588088989258, "learning_rate": 2.5e-05, "loss": 0.6169, "step": 2763 }, { "epoch": 9.003378378378379, "grad_norm": 0.011668852530419827, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2764 }, { "epoch": 9.003412162162162, "grad_norm": 0.7028252482414246, "learning_rate": 2.5e-05, "loss": 0.0042, "step": 2765 }, { "epoch": 9.003445945945947, "grad_norm": 10.113183975219727, "learning_rate": 2.5e-05, "loss": 0.8916, "step": 2766 }, { "epoch": 9.00347972972973, "grad_norm": 3.511415958404541, "learning_rate": 2.5e-05, "loss": 0.0108, "step": 2767 }, { "epoch": 9.003513513513514, "grad_norm": 5.158033847808838, "learning_rate": 2.5e-05, "loss": 0.0139, "step": 2768 }, { "epoch": 9.003547297297297, "grad_norm": 8.0009765625, "learning_rate": 2.5e-05, "loss": 0.1391, "step": 2769 }, { "epoch": 9.003581081081082, "grad_norm": 0.012987270951271057, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2770 }, { "epoch": 9.003614864864865, "grad_norm": 0.007361121475696564, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2771 }, { "epoch": 9.003648648648648, "grad_norm": 1.7296935319900513, "learning_rate": 2.5e-05, "loss": 0.0066, "step": 2772 }, { "epoch": 9.003682432432432, "grad_norm": 0.0025938572362065315, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2773 }, { "epoch": 9.003716216216215, "grad_norm": 0.009976937435567379, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2774 }, { "epoch": 9.00375, "grad_norm": 0.07847427576780319, "learning_rate": 2.5e-05, "loss": 0.0012, "step": 2775 }, { "epoch": 9.003783783783783, "grad_norm": 2.1122822761535645, "learning_rate": 2.5e-05, "loss": 0.0084, "step": 2776 }, { "epoch": 9.003817567567568, "grad_norm": 0.0147637240588665, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2777 }, { "epoch": 9.00385135135135, "grad_norm": 0.004246963188052177, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2778 }, { "epoch": 9.003885135135135, "grad_norm": 0.03120901808142662, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2779 }, { "epoch": 9.003918918918918, "grad_norm": 0.9386571645736694, "learning_rate": 2.5e-05, "loss": 0.0068, "step": 2780 }, { "epoch": 9.003952702702703, "grad_norm": 26.56302833557129, "learning_rate": 2.5e-05, "loss": 0.2332, "step": 2781 }, { "epoch": 9.003986486486486, "grad_norm": 0.01175622083246708, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2782 }, { "epoch": 9.00402027027027, "grad_norm": 2.8773465156555176, "learning_rate": 2.5e-05, "loss": 0.0101, "step": 2783 }, { "epoch": 9.004054054054054, "grad_norm": 0.6089902520179749, "learning_rate": 2.5e-05, "loss": 0.0047, "step": 2784 }, { "epoch": 9.004087837837838, "grad_norm": 0.006929157767444849, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2785 }, { "epoch": 9.004121621621621, "grad_norm": 0.26153385639190674, "learning_rate": 2.5e-05, "loss": 0.0052, "step": 2786 }, { "epoch": 9.004155405405406, "grad_norm": 0.1258014291524887, "learning_rate": 2.5e-05, "loss": 0.0028, "step": 2787 }, { "epoch": 9.004189189189189, "grad_norm": 4.5322184562683105, "learning_rate": 2.5e-05, "loss": 0.045, "step": 2788 }, { "epoch": 9.004222972972974, "grad_norm": 21.802682876586914, "learning_rate": 2.5e-05, "loss": 0.0732, "step": 2789 }, { "epoch": 9.004256756756757, "grad_norm": 0.3476675748825073, "learning_rate": 2.5e-05, "loss": 0.014, "step": 2790 }, { "epoch": 9.004290540540541, "grad_norm": 42.57630157470703, "learning_rate": 2.5e-05, "loss": 0.216, "step": 2791 }, { "epoch": 9.004324324324324, "grad_norm": 20.646753311157227, "learning_rate": 2.5e-05, "loss": 0.0917, "step": 2792 }, { "epoch": 9.004358108108109, "grad_norm": 43.32895278930664, "learning_rate": 2.5e-05, "loss": 0.8372, "step": 2793 }, { "epoch": 9.004391891891892, "grad_norm": 14.907626152038574, "learning_rate": 2.5e-05, "loss": 0.8037, "step": 2794 }, { "epoch": 9.004425675675675, "grad_norm": 36.43993377685547, "learning_rate": 2.5e-05, "loss": 0.097, "step": 2795 }, { "epoch": 9.00445945945946, "grad_norm": 55.76485824584961, "learning_rate": 2.5e-05, "loss": 0.6541, "step": 2796 }, { "epoch": 9.004493243243243, "grad_norm": 18.688119888305664, "learning_rate": 2.5e-05, "loss": 0.4062, "step": 2797 }, { "epoch": 9.004527027027027, "grad_norm": 2.957095146179199, "learning_rate": 2.5e-05, "loss": 0.0077, "step": 2798 }, { "epoch": 9.00456081081081, "grad_norm": 42.930908203125, "learning_rate": 2.5e-05, "loss": 1.3467, "step": 2799 }, { "epoch": 9.004594594594595, "grad_norm": 0.022003471851348877, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2800 }, { "epoch": 9.004628378378378, "grad_norm": 0.00928601622581482, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2801 }, { "epoch": 9.004662162162163, "grad_norm": 0.010485406965017319, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2802 }, { "epoch": 9.004695945945945, "grad_norm": 0.132320374250412, "learning_rate": 2.5e-05, "loss": 0.0014, "step": 2803 }, { "epoch": 9.00472972972973, "grad_norm": 0.005356747657060623, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2804 }, { "epoch": 9.004763513513513, "grad_norm": 0.020373869687318802, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2805 }, { "epoch": 9.004797297297298, "grad_norm": 0.0224249679595232, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2806 }, { "epoch": 9.00483108108108, "grad_norm": 3.052778959274292, "learning_rate": 2.5e-05, "loss": 0.0318, "step": 2807 }, { "epoch": 9.004864864864865, "grad_norm": 1.9578115940093994, "learning_rate": 2.5e-05, "loss": 0.0055, "step": 2808 }, { "epoch": 9.004898648648648, "grad_norm": 0.6176158785820007, "learning_rate": 2.5e-05, "loss": 0.0068, "step": 2809 }, { "epoch": 9.004932432432433, "grad_norm": 3.1908905506134033, "learning_rate": 2.5e-05, "loss": 0.0875, "step": 2810 }, { "epoch": 9.004966216216216, "grad_norm": 0.0328814834356308, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 2811 }, { "epoch": 9.005, "grad_norm": 0.0434148833155632, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2812 }, { "epoch": 9.005033783783784, "grad_norm": 1.6455779075622559, "learning_rate": 2.5e-05, "loss": 0.0098, "step": 2813 }, { "epoch": 9.005067567567568, "grad_norm": 11.378388404846191, "learning_rate": 2.5e-05, "loss": 0.0606, "step": 2814 }, { "epoch": 9.005101351351351, "grad_norm": 0.47237923741340637, "learning_rate": 2.5e-05, "loss": 0.0094, "step": 2815 }, { "epoch": 9.005135135135134, "grad_norm": 12.627161979675293, "learning_rate": 2.5e-05, "loss": 0.0469, "step": 2816 }, { "epoch": 9.005168918918919, "grad_norm": 3.3848514556884766, "learning_rate": 2.5e-05, "loss": 0.1, "step": 2817 }, { "epoch": 9.005202702702702, "grad_norm": 0.0034258838277310133, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2818 }, { "epoch": 9.005236486486487, "grad_norm": 0.034062448889017105, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2819 }, { "epoch": 9.00527027027027, "grad_norm": 59.30539321899414, "learning_rate": 2.5e-05, "loss": 0.2384, "step": 2820 }, { "epoch": 9.005304054054054, "grad_norm": 0.14526137709617615, "learning_rate": 2.5e-05, "loss": 0.0044, "step": 2821 }, { "epoch": 9.005337837837837, "grad_norm": 0.13832388818264008, "learning_rate": 2.5e-05, "loss": 0.0008, "step": 2822 }, { "epoch": 9.005371621621622, "grad_norm": 7.115419864654541, "learning_rate": 2.5e-05, "loss": 0.0166, "step": 2823 }, { "epoch": 9.005405405405405, "grad_norm": 0.005786634981632233, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2824 }, { "epoch": 9.00543918918919, "grad_norm": 0.08014335483312607, "learning_rate": 2.5e-05, "loss": 0.001, "step": 2825 }, { "epoch": 9.005472972972973, "grad_norm": 23.105241775512695, "learning_rate": 2.5e-05, "loss": 0.0477, "step": 2826 }, { "epoch": 9.005506756756757, "grad_norm": 2.350677251815796, "learning_rate": 2.5e-05, "loss": 0.0057, "step": 2827 }, { "epoch": 9.00554054054054, "grad_norm": 0.3315448760986328, "learning_rate": 2.5e-05, "loss": 0.0041, "step": 2828 }, { "epoch": 9.005574324324325, "grad_norm": 0.06676820665597916, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 2829 }, { "epoch": 9.005608108108108, "grad_norm": 0.0049192039296031, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2830 }, { "epoch": 9.005641891891893, "grad_norm": 45.894657135009766, "learning_rate": 2.5e-05, "loss": 0.4691, "step": 2831 }, { "epoch": 9.005675675675676, "grad_norm": 0.006772239692509174, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2832 }, { "epoch": 9.00570945945946, "grad_norm": 12.059556007385254, "learning_rate": 2.5e-05, "loss": 0.1772, "step": 2833 }, { "epoch": 9.005743243243243, "grad_norm": 0.042123205959796906, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2834 }, { "epoch": 9.005777027027028, "grad_norm": 5.098573207855225, "learning_rate": 2.5e-05, "loss": 0.0778, "step": 2835 }, { "epoch": 9.00581081081081, "grad_norm": 7.976466655731201, "learning_rate": 2.5e-05, "loss": 0.0147, "step": 2836 }, { "epoch": 9.005844594594594, "grad_norm": 1.8417631387710571, "learning_rate": 2.5e-05, "loss": 0.0109, "step": 2837 }, { "epoch": 9.005878378378378, "grad_norm": 1.5208925008773804, "learning_rate": 2.5e-05, "loss": 0.0057, "step": 2838 }, { "epoch": 9.005912162162161, "grad_norm": 0.017598943784832954, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2839 }, { "epoch": 9.005945945945946, "grad_norm": 0.0029582411516457796, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2840 }, { "epoch": 9.005979729729729, "grad_norm": 0.022806089371442795, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2841 }, { "epoch": 9.006013513513514, "grad_norm": 59.7873420715332, "learning_rate": 2.5e-05, "loss": 0.2102, "step": 2842 }, { "epoch": 9.006047297297297, "grad_norm": 3.3447675704956055, "learning_rate": 2.5e-05, "loss": 0.4443, "step": 2843 }, { "epoch": 9.006081081081081, "grad_norm": 0.07962895184755325, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2844 }, { "epoch": 9.006114864864864, "grad_norm": 0.03215157613158226, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2845 }, { "epoch": 9.006148648648649, "grad_norm": 0.013918336480855942, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2846 }, { "epoch": 9.006182432432432, "grad_norm": 0.006503751501441002, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2847 }, { "epoch": 9.006216216216217, "grad_norm": 9.59339714050293, "learning_rate": 2.5e-05, "loss": 0.3613, "step": 2848 }, { "epoch": 9.00625, "grad_norm": 0.003069021040573716, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2849 }, { "epoch": 9.006283783783784, "grad_norm": 1.0389481782913208, "learning_rate": 2.5e-05, "loss": 0.0124, "step": 2850 }, { "epoch": 9.006317567567567, "grad_norm": 0.6678536534309387, "learning_rate": 2.5e-05, "loss": 0.0042, "step": 2851 }, { "epoch": 9.006351351351352, "grad_norm": 0.20887398719787598, "learning_rate": 2.5e-05, "loss": 0.0079, "step": 2852 }, { "epoch": 9.006385135135135, "grad_norm": 0.14675505459308624, "learning_rate": 2.5e-05, "loss": 0.0016, "step": 2853 }, { "epoch": 9.00641891891892, "grad_norm": 0.0066605559550225735, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2854 }, { "epoch": 9.006452702702703, "grad_norm": 27.705289840698242, "learning_rate": 2.5e-05, "loss": 0.3156, "step": 2855 }, { "epoch": 9.006486486486487, "grad_norm": 0.00989788118749857, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2856 }, { "epoch": 9.00652027027027, "grad_norm": 0.1203700602054596, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 2857 }, { "epoch": 9.006554054054053, "grad_norm": 9.773773193359375, "learning_rate": 2.5e-05, "loss": 0.0193, "step": 2858 }, { "epoch": 9.006587837837838, "grad_norm": 9.751936912536621, "learning_rate": 2.5e-05, "loss": 0.845, "step": 2859 }, { "epoch": 9.00662162162162, "grad_norm": 39.34892272949219, "learning_rate": 2.5e-05, "loss": 0.8172, "step": 2860 }, { "epoch": 9.006655405405406, "grad_norm": 0.006639689672738314, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2861 }, { "epoch": 9.006689189189188, "grad_norm": 12.25720500946045, "learning_rate": 2.5e-05, "loss": 0.9268, "step": 2862 }, { "epoch": 9.006722972972973, "grad_norm": 0.00730251707136631, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2863 }, { "epoch": 9.006756756756756, "grad_norm": 0.0016092498553916812, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2864 }, { "epoch": 9.00679054054054, "grad_norm": 0.151448056101799, "learning_rate": 2.5e-05, "loss": 0.0018, "step": 2865 }, { "epoch": 9.006824324324324, "grad_norm": 0.004864237736910582, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2866 }, { "epoch": 9.006858108108108, "grad_norm": 0.019398106262087822, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2867 }, { "epoch": 9.006891891891891, "grad_norm": 0.06428961455821991, "learning_rate": 2.5e-05, "loss": 0.0009, "step": 2868 }, { "epoch": 9.006925675675676, "grad_norm": 0.13641953468322754, "learning_rate": 2.5e-05, "loss": 0.0013, "step": 2869 }, { "epoch": 9.006959459459459, "grad_norm": 0.012693758122622967, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2870 }, { "epoch": 9.006993243243244, "grad_norm": 3.0799827575683594, "learning_rate": 2.5e-05, "loss": 0.01, "step": 2871 }, { "epoch": 9.007027027027027, "grad_norm": 0.024118101224303246, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2872 }, { "epoch": 9.007060810810811, "grad_norm": 0.29301920533180237, "learning_rate": 2.5e-05, "loss": 0.0015, "step": 2873 }, { "epoch": 9.007094594594594, "grad_norm": 0.18908074498176575, "learning_rate": 2.5e-05, "loss": 0.0042, "step": 2874 }, { "epoch": 9.007128378378379, "grad_norm": 5.529666423797607, "learning_rate": 2.5e-05, "loss": 0.5724, "step": 2875 }, { "epoch": 9.007162162162162, "grad_norm": 0.24500730633735657, "learning_rate": 2.5e-05, "loss": 0.0092, "step": 2876 }, { "epoch": 9.007195945945947, "grad_norm": 11.19542121887207, "learning_rate": 2.5e-05, "loss": 0.0454, "step": 2877 }, { "epoch": 9.00722972972973, "grad_norm": 0.004235525149852037, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2878 }, { "epoch": 9.007263513513514, "grad_norm": 0.0030257769394665956, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2879 }, { "epoch": 9.007297297297297, "grad_norm": 0.024415839463472366, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2880 }, { "epoch": 9.00733108108108, "grad_norm": 0.1321890950202942, "learning_rate": 2.5e-05, "loss": 0.0013, "step": 2881 }, { "epoch": 9.007364864864865, "grad_norm": 0.6166142821311951, "learning_rate": 2.5e-05, "loss": 0.0159, "step": 2882 }, { "epoch": 9.007398648648648, "grad_norm": 0.03317435458302498, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2883 }, { "epoch": 9.007432432432433, "grad_norm": 29.022266387939453, "learning_rate": 2.5e-05, "loss": 0.0944, "step": 2884 }, { "epoch": 9.007466216216216, "grad_norm": 0.012859512120485306, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2885 }, { "epoch": 9.0075, "grad_norm": 0.0027838137466460466, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2886 }, { "epoch": 9.007533783783783, "grad_norm": 0.02846529893577099, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2887 }, { "epoch": 9.007567567567568, "grad_norm": 21.999177932739258, "learning_rate": 2.5e-05, "loss": 0.1244, "step": 2888 }, { "epoch": 9.00760135135135, "grad_norm": 0.11363056302070618, "learning_rate": 2.5e-05, "loss": 0.0017, "step": 2889 }, { "epoch": 9.007635135135136, "grad_norm": 0.16201920807361603, "learning_rate": 2.5e-05, "loss": 0.0061, "step": 2890 }, { "epoch": 9.007668918918919, "grad_norm": 0.013480272144079208, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2891 }, { "epoch": 9.007702702702703, "grad_norm": 8.507716178894043, "learning_rate": 2.5e-05, "loss": 0.0334, "step": 2892 }, { "epoch": 9.007736486486486, "grad_norm": 0.019575901329517365, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2893 }, { "epoch": 9.00777027027027, "grad_norm": 0.16166089475154877, "learning_rate": 2.5e-05, "loss": 0.0062, "step": 2894 }, { "epoch": 9.007804054054054, "grad_norm": 0.005912215448915958, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2895 }, { "epoch": 9.007837837837839, "grad_norm": 0.009091375395655632, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2896 }, { "epoch": 9.007871621621621, "grad_norm": 64.11603546142578, "learning_rate": 2.5e-05, "loss": 1.281, "step": 2897 }, { "epoch": 9.007905405405406, "grad_norm": 7.693849086761475, "learning_rate": 2.5e-05, "loss": 0.0217, "step": 2898 }, { "epoch": 9.00793918918919, "grad_norm": 0.05027497187256813, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 2899 }, { "epoch": 9.007972972972974, "grad_norm": 0.019267218187451363, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2900 }, { "epoch": 9.008006756756757, "grad_norm": 0.027201389893889427, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 2901 }, { "epoch": 9.00804054054054, "grad_norm": 0.5398640036582947, "learning_rate": 2.5e-05, "loss": 0.0032, "step": 2902 }, { "epoch": 9.008074324324324, "grad_norm": 0.00875889603048563, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2903 }, { "epoch": 9.008108108108107, "grad_norm": 0.0037450185045599937, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2904 }, { "epoch": 9.008141891891892, "grad_norm": 0.00955275446176529, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2905 }, { "epoch": 9.008175675675675, "grad_norm": 0.11446765065193176, "learning_rate": 2.5e-05, "loss": 0.0043, "step": 2906 }, { "epoch": 9.00820945945946, "grad_norm": 0.029119636863470078, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2907 }, { "epoch": 9.008243243243243, "grad_norm": 0.08698910474777222, "learning_rate": 2.5e-05, "loss": 0.0008, "step": 2908 }, { "epoch": 9.008277027027027, "grad_norm": 19.83609962463379, "learning_rate": 2.5e-05, "loss": 0.0915, "step": 2909 }, { "epoch": 9.00831081081081, "grad_norm": 1.0948148965835571, "learning_rate": 2.5e-05, "loss": 0.0178, "step": 2910 }, { "epoch": 9.008344594594595, "grad_norm": 0.005571799352765083, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2911 }, { "epoch": 9.008378378378378, "grad_norm": 48.46590042114258, "learning_rate": 2.5e-05, "loss": 0.3136, "step": 2912 }, { "epoch": 9.008412162162163, "grad_norm": 23.31871795654297, "learning_rate": 2.5e-05, "loss": 0.1636, "step": 2913 }, { "epoch": 9.008445945945946, "grad_norm": 0.15158823132514954, "learning_rate": 2.5e-05, "loss": 0.0011, "step": 2914 }, { "epoch": 9.00847972972973, "grad_norm": 29.748857498168945, "learning_rate": 2.5e-05, "loss": 0.159, "step": 2915 }, { "epoch": 9.008513513513513, "grad_norm": 0.012054501101374626, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2916 }, { "epoch": 9.008547297297298, "grad_norm": 0.018281126394867897, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2917 }, { "epoch": 9.008581081081081, "grad_norm": 0.005098014138638973, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2918 }, { "epoch": 9.008614864864866, "grad_norm": 25.149667739868164, "learning_rate": 2.5e-05, "loss": 0.8171, "step": 2919 }, { "epoch": 9.008648648648649, "grad_norm": 0.005802110303193331, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2920 }, { "epoch": 9.008682432432433, "grad_norm": 0.06299471110105515, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 2921 }, { "epoch": 9.008716216216216, "grad_norm": 0.007943270727992058, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2922 }, { "epoch": 9.00875, "grad_norm": 38.373992919921875, "learning_rate": 2.5e-05, "loss": 0.7133, "step": 2923 }, { "epoch": 9.008783783783784, "grad_norm": 0.15981894731521606, "learning_rate": 2.5e-05, "loss": 0.0046, "step": 2924 }, { "epoch": 9.008817567567567, "grad_norm": 0.0761559009552002, "learning_rate": 2.5e-05, "loss": 0.0015, "step": 2925 }, { "epoch": 9.008851351351352, "grad_norm": 27.164358139038086, "learning_rate": 2.5e-05, "loss": 0.1354, "step": 2926 }, { "epoch": 9.008885135135134, "grad_norm": 0.0036585626658052206, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2927 }, { "epoch": 9.00891891891892, "grad_norm": 0.005710509605705738, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2928 }, { "epoch": 9.008952702702702, "grad_norm": 41.15082931518555, "learning_rate": 2.5e-05, "loss": 0.8268, "step": 2929 }, { "epoch": 9.008986486486487, "grad_norm": 12.18553638458252, "learning_rate": 2.5e-05, "loss": 0.6325, "step": 2930 }, { "epoch": 9.00902027027027, "grad_norm": 0.004635723773390055, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2931 }, { "epoch": 9.009054054054054, "grad_norm": 0.011165867559611797, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2932 }, { "epoch": 9.009087837837837, "grad_norm": 0.03811636567115784, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2933 }, { "epoch": 9.009121621621622, "grad_norm": 0.09667473286390305, "learning_rate": 2.5e-05, "loss": 0.0037, "step": 2934 }, { "epoch": 9.009155405405405, "grad_norm": 39.55534744262695, "learning_rate": 2.5e-05, "loss": 0.2435, "step": 2935 }, { "epoch": 9.00918918918919, "grad_norm": 21.256370544433594, "learning_rate": 2.5e-05, "loss": 0.4759, "step": 2936 }, { "epoch": 9.009222972972973, "grad_norm": 0.0039219423197209835, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 2937 }, { "epoch": 9.009256756756757, "grad_norm": 0.06910573691129684, "learning_rate": 2.5e-05, "loss": 0.0009, "step": 2938 }, { "epoch": 9.00929054054054, "grad_norm": 1.1814038753509521, "learning_rate": 2.5e-05, "loss": 0.0047, "step": 2939 }, { "epoch": 9.009324324324325, "grad_norm": 0.06903047859668732, "learning_rate": 2.5e-05, "loss": 0.0027, "step": 2940 }, { "epoch": 9.009358108108108, "grad_norm": 0.012781685218214989, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 2941 }, { "epoch": 9.009391891891893, "grad_norm": 13.397274017333984, "learning_rate": 2.5e-05, "loss": 0.3041, "step": 2942 }, { "epoch": 9.009425675675676, "grad_norm": 0.011460286565124989, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2943 }, { "epoch": 9.009459459459459, "grad_norm": 0.08554555475711823, "learning_rate": 2.5e-05, "loss": 0.0031, "step": 2944 }, { "epoch": 9.009493243243243, "grad_norm": 0.02331492304801941, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2945 }, { "epoch": 9.009527027027026, "grad_norm": 0.0242764949798584, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2946 }, { "epoch": 9.009560810810811, "grad_norm": 18.37374496459961, "learning_rate": 2.5e-05, "loss": 0.5337, "step": 2947 }, { "epoch": 9.009594594594594, "grad_norm": 0.028460128232836723, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 2948 }, { "epoch": 9.009628378378379, "grad_norm": 3.5248539447784424, "learning_rate": 2.5e-05, "loss": 0.5175, "step": 2949 }, { "epoch": 9.009662162162162, "grad_norm": 0.023296277970075607, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2950 }, { "epoch": 9.009695945945946, "grad_norm": 0.05322543904185295, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 2951 }, { "epoch": 9.00972972972973, "grad_norm": 0.019490616396069527, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2952 }, { "epoch": 9.009763513513514, "grad_norm": 0.010692148469388485, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2953 }, { "epoch": 9.009797297297297, "grad_norm": 0.03823144733905792, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2954 }, { "epoch": 9.009831081081082, "grad_norm": 3.427539825439453, "learning_rate": 2.5e-05, "loss": 0.4479, "step": 2955 }, { "epoch": 9.009864864864864, "grad_norm": 0.1359509527683258, "learning_rate": 2.5e-05, "loss": 0.0054, "step": 2956 }, { "epoch": 9.00989864864865, "grad_norm": 9.567930221557617, "learning_rate": 2.5e-05, "loss": 0.7787, "step": 2957 }, { "epoch": 9.009932432432432, "grad_norm": 0.2245561182498932, "learning_rate": 2.5e-05, "loss": 0.0016, "step": 2958 }, { "epoch": 9.009966216216217, "grad_norm": 0.02129121869802475, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2959 }, { "epoch": 9.01, "grad_norm": 0.011642614379525185, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2960 }, { "epoch": 9.01, "eval_accuracy": 0.8626817447495961, "eval_loss": 0.6348419189453125, "eval_runtime": 31.8507, "eval_samples_per_second": 19.434, "eval_steps_per_second": 2.449, "step": 2960 }, { "epoch": 10.000033783783783, "grad_norm": 0.04416654258966446, "learning_rate": 2.5e-05, "loss": 0.0009, "step": 2961 }, { "epoch": 10.000067567567568, "grad_norm": 0.2214675396680832, "learning_rate": 2.5e-05, "loss": 0.0087, "step": 2962 }, { "epoch": 10.00010135135135, "grad_norm": 0.01765679009258747, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2963 }, { "epoch": 10.000135135135135, "grad_norm": 0.018971970304846764, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2964 }, { "epoch": 10.000168918918918, "grad_norm": 0.008303718641400337, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2965 }, { "epoch": 10.000202702702703, "grad_norm": 0.016496360301971436, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2966 }, { "epoch": 10.000236486486486, "grad_norm": 0.38423430919647217, "learning_rate": 2.5e-05, "loss": 0.0043, "step": 2967 }, { "epoch": 10.00027027027027, "grad_norm": 0.02716144546866417, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2968 }, { "epoch": 10.000304054054054, "grad_norm": 0.241395965218544, "learning_rate": 2.5e-05, "loss": 0.0093, "step": 2969 }, { "epoch": 10.000337837837838, "grad_norm": 0.13555383682250977, "learning_rate": 2.5e-05, "loss": 0.0013, "step": 2970 }, { "epoch": 10.000371621621621, "grad_norm": 0.24367433786392212, "learning_rate": 2.5e-05, "loss": 0.0095, "step": 2971 }, { "epoch": 10.000405405405406, "grad_norm": 0.04759259894490242, "learning_rate": 2.5e-05, "loss": 0.001, "step": 2972 }, { "epoch": 10.000439189189189, "grad_norm": 0.01103665865957737, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 2973 }, { "epoch": 10.000472972972974, "grad_norm": 8.311040878295898, "learning_rate": 2.5e-05, "loss": 0.4852, "step": 2974 }, { "epoch": 10.000506756756756, "grad_norm": 0.0198744535446167, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2975 }, { "epoch": 10.000540540540541, "grad_norm": 0.06017585098743439, "learning_rate": 2.5e-05, "loss": 0.0011, "step": 2976 }, { "epoch": 10.000574324324324, "grad_norm": 15.587955474853516, "learning_rate": 2.5e-05, "loss": 0.7254, "step": 2977 }, { "epoch": 10.000608108108109, "grad_norm": 0.02561771497130394, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2978 }, { "epoch": 10.000641891891892, "grad_norm": 12.606559753417969, "learning_rate": 2.5e-05, "loss": 0.1395, "step": 2979 }, { "epoch": 10.000675675675677, "grad_norm": 0.10227667540311813, "learning_rate": 2.5e-05, "loss": 0.0009, "step": 2980 }, { "epoch": 10.00070945945946, "grad_norm": 0.024741845205426216, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2981 }, { "epoch": 10.000743243243242, "grad_norm": 12.127843856811523, "learning_rate": 2.5e-05, "loss": 0.1414, "step": 2982 }, { "epoch": 10.000777027027027, "grad_norm": 0.0572376511991024, "learning_rate": 2.5e-05, "loss": 0.001, "step": 2983 }, { "epoch": 10.00081081081081, "grad_norm": 0.4008693993091583, "learning_rate": 2.5e-05, "loss": 0.0043, "step": 2984 }, { "epoch": 10.000844594594595, "grad_norm": 9.079636573791504, "learning_rate": 2.5e-05, "loss": 0.1743, "step": 2985 }, { "epoch": 10.000878378378378, "grad_norm": 0.018791573122143745, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 2986 }, { "epoch": 10.000912162162162, "grad_norm": 38.96175765991211, "learning_rate": 2.5e-05, "loss": 0.3, "step": 2987 }, { "epoch": 10.000945945945945, "grad_norm": 0.13884979486465454, "learning_rate": 2.5e-05, "loss": 0.0018, "step": 2988 }, { "epoch": 10.00097972972973, "grad_norm": 0.03773382306098938, "learning_rate": 2.5e-05, "loss": 0.0008, "step": 2989 }, { "epoch": 10.001013513513513, "grad_norm": 7.330732822418213, "learning_rate": 2.5e-05, "loss": 0.2652, "step": 2990 }, { "epoch": 10.001047297297298, "grad_norm": 1.6181690692901611, "learning_rate": 2.5e-05, "loss": 0.0113, "step": 2991 }, { "epoch": 10.00108108108108, "grad_norm": 0.02561800926923752, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2992 }, { "epoch": 10.001114864864865, "grad_norm": 2.2635974884033203, "learning_rate": 2.5e-05, "loss": 0.0211, "step": 2993 }, { "epoch": 10.001148648648648, "grad_norm": 0.025918757542967796, "learning_rate": 2.5e-05, "loss": 0.0008, "step": 2994 }, { "epoch": 10.001182432432433, "grad_norm": 8.995786666870117, "learning_rate": 2.5e-05, "loss": 0.0459, "step": 2995 }, { "epoch": 10.001216216216216, "grad_norm": 17.53583526611328, "learning_rate": 2.5e-05, "loss": 0.136, "step": 2996 }, { "epoch": 10.00125, "grad_norm": 0.0041655199602246284, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 2997 }, { "epoch": 10.001283783783784, "grad_norm": 12.496574401855469, "learning_rate": 2.5e-05, "loss": 0.0399, "step": 2998 }, { "epoch": 10.001317567567568, "grad_norm": 0.016652649268507957, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 2999 }, { "epoch": 10.001351351351351, "grad_norm": 0.013210450299084187, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 3000 }, { "epoch": 10.001385135135136, "grad_norm": 0.04107265546917915, "learning_rate": 2.5e-05, "loss": 0.001, "step": 3001 }, { "epoch": 10.001418918918919, "grad_norm": 0.2761267423629761, "learning_rate": 2.5e-05, "loss": 0.0045, "step": 3002 }, { "epoch": 10.001452702702704, "grad_norm": 8.443647384643555, "learning_rate": 2.5e-05, "loss": 0.5185, "step": 3003 }, { "epoch": 10.001486486486487, "grad_norm": 0.3856331408023834, "learning_rate": 2.5e-05, "loss": 0.005, "step": 3004 }, { "epoch": 10.00152027027027, "grad_norm": 0.2234409749507904, "learning_rate": 2.5e-05, "loss": 0.0029, "step": 3005 }, { "epoch": 10.001554054054054, "grad_norm": 32.549049377441406, "learning_rate": 2.5e-05, "loss": 0.129, "step": 3006 }, { "epoch": 10.001587837837837, "grad_norm": 11.476303100585938, "learning_rate": 2.5e-05, "loss": 0.5945, "step": 3007 }, { "epoch": 10.001621621621622, "grad_norm": 0.15701842308044434, "learning_rate": 2.5e-05, "loss": 0.0063, "step": 3008 }, { "epoch": 10.001655405405405, "grad_norm": 10.87949275970459, "learning_rate": 2.5e-05, "loss": 0.2353, "step": 3009 }, { "epoch": 10.00168918918919, "grad_norm": 0.09735726565122604, "learning_rate": 2.5e-05, "loss": 0.0023, "step": 3010 }, { "epoch": 10.001722972972972, "grad_norm": 0.5342214107513428, "learning_rate": 2.5e-05, "loss": 0.0116, "step": 3011 }, { "epoch": 10.001756756756757, "grad_norm": 0.1827017366886139, "learning_rate": 2.5e-05, "loss": 0.0061, "step": 3012 }, { "epoch": 10.00179054054054, "grad_norm": 0.4373321533203125, "learning_rate": 2.5e-05, "loss": 0.0047, "step": 3013 }, { "epoch": 10.001824324324325, "grad_norm": 3.9883108139038086, "learning_rate": 2.5e-05, "loss": 0.0218, "step": 3014 }, { "epoch": 10.001858108108108, "grad_norm": 17.377792358398438, "learning_rate": 2.5e-05, "loss": 0.0869, "step": 3015 }, { "epoch": 10.001891891891892, "grad_norm": 2.4954638481140137, "learning_rate": 2.5e-05, "loss": 0.0116, "step": 3016 }, { "epoch": 10.001925675675675, "grad_norm": 0.40453004837036133, "learning_rate": 2.5e-05, "loss": 0.0022, "step": 3017 }, { "epoch": 10.00195945945946, "grad_norm": 7.0007734298706055, "learning_rate": 2.5e-05, "loss": 0.2547, "step": 3018 }, { "epoch": 10.001993243243243, "grad_norm": 0.3208724558353424, "learning_rate": 2.5e-05, "loss": 0.0027, "step": 3019 }, { "epoch": 10.002027027027028, "grad_norm": 0.41649389266967773, "learning_rate": 2.5e-05, "loss": 0.0029, "step": 3020 }, { "epoch": 10.00206081081081, "grad_norm": 0.02120283804833889, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 3021 }, { "epoch": 10.002094594594595, "grad_norm": 31.762983322143555, "learning_rate": 2.5e-05, "loss": 0.5771, "step": 3022 }, { "epoch": 10.002128378378378, "grad_norm": 0.7036697864532471, "learning_rate": 2.5e-05, "loss": 0.0053, "step": 3023 }, { "epoch": 10.002162162162163, "grad_norm": 4.719839572906494, "learning_rate": 2.5e-05, "loss": 0.2536, "step": 3024 }, { "epoch": 10.002195945945946, "grad_norm": 2.2540383338928223, "learning_rate": 2.5e-05, "loss": 0.0929, "step": 3025 }, { "epoch": 10.002229729729729, "grad_norm": 0.011769942939281464, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 3026 }, { "epoch": 10.002263513513514, "grad_norm": 0.0922541618347168, "learning_rate": 2.5e-05, "loss": 0.0009, "step": 3027 }, { "epoch": 10.002297297297297, "grad_norm": 57.85964584350586, "learning_rate": 2.5e-05, "loss": 0.481, "step": 3028 }, { "epoch": 10.002331081081081, "grad_norm": 11.629168510437012, "learning_rate": 2.5e-05, "loss": 0.6312, "step": 3029 }, { "epoch": 10.002364864864864, "grad_norm": 0.007025241386145353, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3030 }, { "epoch": 10.002398648648649, "grad_norm": 0.38813838362693787, "learning_rate": 2.5e-05, "loss": 0.0028, "step": 3031 }, { "epoch": 10.002432432432432, "grad_norm": 0.03143941983580589, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 3032 }, { "epoch": 10.002466216216217, "grad_norm": 0.03402760252356529, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 3033 }, { "epoch": 10.0025, "grad_norm": 0.06745758652687073, "learning_rate": 2.5e-05, "loss": 0.0009, "step": 3034 }, { "epoch": 10.002533783783784, "grad_norm": 0.003506062086671591, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 3035 }, { "epoch": 10.002567567567567, "grad_norm": 0.05127835273742676, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 3036 }, { "epoch": 10.002601351351352, "grad_norm": 0.012362822890281677, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 3037 }, { "epoch": 10.002635135135135, "grad_norm": 0.032919127494096756, "learning_rate": 2.5e-05, "loss": 0.0008, "step": 3038 }, { "epoch": 10.00266891891892, "grad_norm": 3.9526655673980713, "learning_rate": 2.5e-05, "loss": 0.144, "step": 3039 }, { "epoch": 10.002702702702702, "grad_norm": 0.672848105430603, "learning_rate": 2.5e-05, "loss": 0.0063, "step": 3040 }, { "epoch": 10.002736486486487, "grad_norm": 0.0025483930949121714, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 3041 }, { "epoch": 10.00277027027027, "grad_norm": 0.028933009132742882, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 3042 }, { "epoch": 10.002804054054055, "grad_norm": 0.013250785879790783, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 3043 }, { "epoch": 10.002837837837838, "grad_norm": 0.005902869161218405, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3044 }, { "epoch": 10.002871621621622, "grad_norm": 0.005185825750231743, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3045 }, { "epoch": 10.002905405405405, "grad_norm": 0.07650744915008545, "learning_rate": 2.5e-05, "loss": 0.0011, "step": 3046 }, { "epoch": 10.002939189189188, "grad_norm": 4.4589948654174805, "learning_rate": 2.5e-05, "loss": 0.2059, "step": 3047 }, { "epoch": 10.002972972972973, "grad_norm": 0.07913918048143387, "learning_rate": 2.5e-05, "loss": 0.0011, "step": 3048 }, { "epoch": 10.003006756756756, "grad_norm": 13.735847473144531, "learning_rate": 2.5e-05, "loss": 0.3585, "step": 3049 }, { "epoch": 10.00304054054054, "grad_norm": 0.008173160254955292, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3050 }, { "epoch": 10.003074324324324, "grad_norm": 0.01086642500013113, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3051 }, { "epoch": 10.003108108108108, "grad_norm": 0.013825980015099049, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 3052 }, { "epoch": 10.003141891891891, "grad_norm": 2.148866891860962, "learning_rate": 2.5e-05, "loss": 0.0189, "step": 3053 }, { "epoch": 10.003175675675676, "grad_norm": 0.11372450739145279, "learning_rate": 2.5e-05, "loss": 0.0008, "step": 3054 }, { "epoch": 10.003209459459459, "grad_norm": 0.04432906582951546, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 3055 }, { "epoch": 10.003243243243244, "grad_norm": 0.019816933199763298, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 3056 }, { "epoch": 10.003277027027027, "grad_norm": 9.60677719116211, "learning_rate": 2.5e-05, "loss": 0.0289, "step": 3057 }, { "epoch": 10.003310810810811, "grad_norm": 86.67936706542969, "learning_rate": 2.5e-05, "loss": 0.1515, "step": 3058 }, { "epoch": 10.003344594594594, "grad_norm": 46.236228942871094, "learning_rate": 2.5e-05, "loss": 0.7658, "step": 3059 }, { "epoch": 10.003378378378379, "grad_norm": 41.54511642456055, "learning_rate": 2.5e-05, "loss": 0.6525, "step": 3060 }, { "epoch": 10.003412162162162, "grad_norm": 0.029310261830687523, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 3061 }, { "epoch": 10.003445945945947, "grad_norm": 1.1620014905929565, "learning_rate": 2.5e-05, "loss": 0.0055, "step": 3062 }, { "epoch": 10.00347972972973, "grad_norm": 0.0037479056045413017, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 3063 }, { "epoch": 10.003513513513514, "grad_norm": 0.013206077739596367, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3064 }, { "epoch": 10.003547297297297, "grad_norm": 0.1311561018228531, "learning_rate": 2.5e-05, "loss": 0.0045, "step": 3065 }, { "epoch": 10.003581081081082, "grad_norm": 0.00867279339581728, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 3066 }, { "epoch": 10.003614864864865, "grad_norm": 0.10875871032476425, "learning_rate": 2.5e-05, "loss": 0.0041, "step": 3067 }, { "epoch": 10.003648648648648, "grad_norm": 0.9563208818435669, "learning_rate": 2.5e-05, "loss": 0.0152, "step": 3068 }, { "epoch": 10.003682432432432, "grad_norm": 0.010396613739430904, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3069 }, { "epoch": 10.003716216216215, "grad_norm": 0.004124055150896311, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3070 }, { "epoch": 10.00375, "grad_norm": 0.060662828385829926, "learning_rate": 2.5e-05, "loss": 0.001, "step": 3071 }, { "epoch": 10.003783783783783, "grad_norm": 0.027153795585036278, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 3072 }, { "epoch": 10.003817567567568, "grad_norm": 2.145601272583008, "learning_rate": 2.5e-05, "loss": 0.032, "step": 3073 }, { "epoch": 10.00385135135135, "grad_norm": 0.013859688304364681, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 3074 }, { "epoch": 10.003885135135135, "grad_norm": 0.012239094823598862, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 3075 }, { "epoch": 10.003918918918918, "grad_norm": 3.2907938957214355, "learning_rate": 2.5e-05, "loss": 0.4309, "step": 3076 }, { "epoch": 10.003952702702703, "grad_norm": 17.54439353942871, "learning_rate": 2.5e-05, "loss": 0.9055, "step": 3077 }, { "epoch": 10.003986486486486, "grad_norm": 0.01332138292491436, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 3078 }, { "epoch": 10.00402027027027, "grad_norm": 12.7789888381958, "learning_rate": 2.5e-05, "loss": 0.6252, "step": 3079 }, { "epoch": 10.004054054054054, "grad_norm": 0.015707416459918022, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 3080 }, { "epoch": 10.004087837837838, "grad_norm": 31.719152450561523, "learning_rate": 2.5e-05, "loss": 0.3757, "step": 3081 }, { "epoch": 10.004121621621621, "grad_norm": 7.272458076477051, "learning_rate": 2.5e-05, "loss": 0.119, "step": 3082 }, { "epoch": 10.004155405405406, "grad_norm": 0.006012683734297752, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3083 }, { "epoch": 10.004189189189189, "grad_norm": 0.26843076944351196, "learning_rate": 2.5e-05, "loss": 0.0013, "step": 3084 }, { "epoch": 10.004222972972974, "grad_norm": 0.03462150692939758, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 3085 }, { "epoch": 10.004256756756757, "grad_norm": 0.009704779833555222, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 3086 }, { "epoch": 10.004290540540541, "grad_norm": 0.02785942330956459, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 3087 }, { "epoch": 10.004324324324324, "grad_norm": 0.004882494453340769, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3088 }, { "epoch": 10.004358108108109, "grad_norm": 0.00621056417003274, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3089 }, { "epoch": 10.004391891891892, "grad_norm": 5.128859519958496, "learning_rate": 2.5e-05, "loss": 0.3197, "step": 3090 }, { "epoch": 10.004425675675675, "grad_norm": 80.73975372314453, "learning_rate": 2.5e-05, "loss": 0.9721, "step": 3091 }, { "epoch": 10.00445945945946, "grad_norm": 1.2687095403671265, "learning_rate": 2.5e-05, "loss": 0.0131, "step": 3092 }, { "epoch": 10.004493243243243, "grad_norm": 40.89626693725586, "learning_rate": 2.5e-05, "loss": 0.256, "step": 3093 }, { "epoch": 10.004527027027027, "grad_norm": 0.5674986839294434, "learning_rate": 2.5e-05, "loss": 0.0026, "step": 3094 }, { "epoch": 10.00456081081081, "grad_norm": 0.257142037153244, "learning_rate": 2.5e-05, "loss": 0.0018, "step": 3095 }, { "epoch": 10.004594594594595, "grad_norm": 0.12796524167060852, "learning_rate": 2.5e-05, "loss": 0.0013, "step": 3096 }, { "epoch": 10.004628378378378, "grad_norm": 0.3528797924518585, "learning_rate": 2.5e-05, "loss": 0.0135, "step": 3097 }, { "epoch": 10.004662162162163, "grad_norm": 0.007260435726493597, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3098 }, { "epoch": 10.004695945945945, "grad_norm": 0.006168826017528772, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3099 }, { "epoch": 10.00472972972973, "grad_norm": 0.0029385865200310946, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 3100 }, { "epoch": 10.004763513513513, "grad_norm": 4.230224609375, "learning_rate": 2.5e-05, "loss": 0.0132, "step": 3101 }, { "epoch": 10.004797297297298, "grad_norm": 7.87785005569458, "learning_rate": 2.5e-05, "loss": 0.062, "step": 3102 }, { "epoch": 10.00483108108108, "grad_norm": 25.1749267578125, "learning_rate": 2.5e-05, "loss": 0.3169, "step": 3103 }, { "epoch": 10.004864864864865, "grad_norm": 0.006060474086552858, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3104 }, { "epoch": 10.004898648648648, "grad_norm": 0.03872549906373024, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 3105 }, { "epoch": 10.004932432432433, "grad_norm": 0.32494843006134033, "learning_rate": 2.5e-05, "loss": 0.0016, "step": 3106 }, { "epoch": 10.004966216216216, "grad_norm": 26.732114791870117, "learning_rate": 2.5e-05, "loss": 0.3992, "step": 3107 }, { "epoch": 10.005, "grad_norm": 0.008858904242515564, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 3108 }, { "epoch": 10.005033783783784, "grad_norm": 28.990745544433594, "learning_rate": 2.5e-05, "loss": 0.1772, "step": 3109 }, { "epoch": 10.005067567567568, "grad_norm": 16.609153747558594, "learning_rate": 2.5e-05, "loss": 0.4144, "step": 3110 }, { "epoch": 10.005101351351351, "grad_norm": 1.1152794361114502, "learning_rate": 2.5e-05, "loss": 0.0042, "step": 3111 }, { "epoch": 10.005135135135134, "grad_norm": 0.8400949239730835, "learning_rate": 2.5e-05, "loss": 0.0093, "step": 3112 }, { "epoch": 10.005168918918919, "grad_norm": 1.8166135549545288, "learning_rate": 2.5e-05, "loss": 0.0051, "step": 3113 }, { "epoch": 10.005202702702702, "grad_norm": 1.261557936668396, "learning_rate": 2.5e-05, "loss": 0.047, "step": 3114 }, { "epoch": 10.005236486486487, "grad_norm": 0.34998056292533875, "learning_rate": 2.5e-05, "loss": 0.0137, "step": 3115 }, { "epoch": 10.00527027027027, "grad_norm": 0.6783799529075623, "learning_rate": 2.5e-05, "loss": 0.0037, "step": 3116 }, { "epoch": 10.005304054054054, "grad_norm": 0.5865479111671448, "learning_rate": 2.5e-05, "loss": 0.0201, "step": 3117 }, { "epoch": 10.005337837837837, "grad_norm": 0.037711918354034424, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 3118 }, { "epoch": 10.005371621621622, "grad_norm": 4.668238162994385, "learning_rate": 2.5e-05, "loss": 0.0144, "step": 3119 }, { "epoch": 10.005405405405405, "grad_norm": 0.00794792827218771, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 3120 }, { "epoch": 10.00543918918919, "grad_norm": 0.0037836097180843353, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3121 }, { "epoch": 10.005472972972973, "grad_norm": 0.006718456745147705, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3122 }, { "epoch": 10.005506756756757, "grad_norm": 46.06835174560547, "learning_rate": 2.5e-05, "loss": 0.1949, "step": 3123 }, { "epoch": 10.00554054054054, "grad_norm": 39.249244689941406, "learning_rate": 2.5e-05, "loss": 0.3233, "step": 3124 }, { "epoch": 10.005574324324325, "grad_norm": 0.007692958693951368, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3125 }, { "epoch": 10.005608108108108, "grad_norm": 0.008700132369995117, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 3126 }, { "epoch": 10.005641891891893, "grad_norm": 0.001975542865693569, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 3127 }, { "epoch": 10.005675675675676, "grad_norm": 0.33256593346595764, "learning_rate": 2.5e-05, "loss": 0.0013, "step": 3128 }, { "epoch": 10.00570945945946, "grad_norm": 0.14552097022533417, "learning_rate": 2.5e-05, "loss": 0.0017, "step": 3129 }, { "epoch": 10.005743243243243, "grad_norm": 43.049964904785156, "learning_rate": 2.5e-05, "loss": 0.2416, "step": 3130 }, { "epoch": 10.005777027027028, "grad_norm": 0.12900488078594208, "learning_rate": 2.5e-05, "loss": 0.0008, "step": 3131 }, { "epoch": 10.00581081081081, "grad_norm": 18.21747589111328, "learning_rate": 2.5e-05, "loss": 0.0707, "step": 3132 }, { "epoch": 10.005844594594594, "grad_norm": 19.9259090423584, "learning_rate": 2.5e-05, "loss": 0.0629, "step": 3133 }, { "epoch": 10.005878378378378, "grad_norm": 18.862123489379883, "learning_rate": 2.5e-05, "loss": 0.0365, "step": 3134 }, { "epoch": 10.005912162162161, "grad_norm": 0.017885982990264893, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 3135 }, { "epoch": 10.005945945945946, "grad_norm": 0.008041263557970524, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3136 }, { "epoch": 10.005979729729729, "grad_norm": 0.021982496604323387, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 3137 }, { "epoch": 10.006013513513514, "grad_norm": 0.10479248315095901, "learning_rate": 2.5e-05, "loss": 0.0012, "step": 3138 }, { "epoch": 10.006047297297297, "grad_norm": 5.893826007843018, "learning_rate": 2.5e-05, "loss": 0.0204, "step": 3139 }, { "epoch": 10.006081081081081, "grad_norm": 1.4442253112792969, "learning_rate": 2.5e-05, "loss": 0.0183, "step": 3140 }, { "epoch": 10.006114864864864, "grad_norm": 18.88836669921875, "learning_rate": 2.5e-05, "loss": 0.164, "step": 3141 }, { "epoch": 10.006148648648649, "grad_norm": 0.5512616038322449, "learning_rate": 2.5e-05, "loss": 0.0173, "step": 3142 }, { "epoch": 10.006182432432432, "grad_norm": 0.004258190747350454, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3143 }, { "epoch": 10.006216216216217, "grad_norm": 0.0026201948057860136, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 3144 }, { "epoch": 10.00625, "grad_norm": 0.3336643576622009, "learning_rate": 2.5e-05, "loss": 0.006, "step": 3145 }, { "epoch": 10.006283783783784, "grad_norm": 37.42595291137695, "learning_rate": 2.5e-05, "loss": 0.4694, "step": 3146 }, { "epoch": 10.006317567567567, "grad_norm": 13.75846004486084, "learning_rate": 2.5e-05, "loss": 0.8479, "step": 3147 }, { "epoch": 10.006351351351352, "grad_norm": 30.404220581054688, "learning_rate": 2.5e-05, "loss": 0.0776, "step": 3148 }, { "epoch": 10.006385135135135, "grad_norm": 0.023160122334957123, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 3149 }, { "epoch": 10.00641891891892, "grad_norm": 0.007048849482089281, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 3150 }, { "epoch": 10.006452702702703, "grad_norm": 0.017847279086709023, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3151 }, { "epoch": 10.006486486486487, "grad_norm": 0.013573499396443367, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 3152 }, { "epoch": 10.00652027027027, "grad_norm": 0.2523849904537201, "learning_rate": 2.5e-05, "loss": 0.0051, "step": 3153 }, { "epoch": 10.006554054054053, "grad_norm": 32.962562561035156, "learning_rate": 2.5e-05, "loss": 0.211, "step": 3154 }, { "epoch": 10.006587837837838, "grad_norm": 3.2567074298858643, "learning_rate": 2.5e-05, "loss": 0.4054, "step": 3155 }, { "epoch": 10.00662162162162, "grad_norm": 0.15517684817314148, "learning_rate": 2.5e-05, "loss": 0.006, "step": 3156 }, { "epoch": 10.006655405405406, "grad_norm": 1.0216166973114014, "learning_rate": 2.5e-05, "loss": 0.008, "step": 3157 }, { "epoch": 10.006689189189188, "grad_norm": 0.1354767382144928, "learning_rate": 2.5e-05, "loss": 0.0016, "step": 3158 }, { "epoch": 10.006722972972973, "grad_norm": 0.16818256676197052, "learning_rate": 2.5e-05, "loss": 0.0041, "step": 3159 }, { "epoch": 10.006756756756756, "grad_norm": 1.090006709098816, "learning_rate": 2.5e-05, "loss": 0.0404, "step": 3160 }, { "epoch": 10.00679054054054, "grad_norm": 0.15626809000968933, "learning_rate": 2.5e-05, "loss": 0.0059, "step": 3161 }, { "epoch": 10.006824324324324, "grad_norm": 2.9360175132751465, "learning_rate": 2.5e-05, "loss": 0.0073, "step": 3162 }, { "epoch": 10.006858108108108, "grad_norm": 0.06169930472970009, "learning_rate": 2.5e-05, "loss": 0.0008, "step": 3163 }, { "epoch": 10.006891891891891, "grad_norm": 0.004209875129163265, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 3164 }, { "epoch": 10.006925675675676, "grad_norm": 0.040416840463876724, "learning_rate": 2.5e-05, "loss": 0.0009, "step": 3165 }, { "epoch": 10.006959459459459, "grad_norm": 0.040836889296770096, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 3166 }, { "epoch": 10.006993243243244, "grad_norm": 0.011099823750555515, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 3167 }, { "epoch": 10.007027027027027, "grad_norm": 0.04686718061566353, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 3168 }, { "epoch": 10.007060810810811, "grad_norm": 0.006719560828059912, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3169 }, { "epoch": 10.007094594594594, "grad_norm": 0.45053723454475403, "learning_rate": 2.5e-05, "loss": 0.0172, "step": 3170 }, { "epoch": 10.007128378378379, "grad_norm": 25.196807861328125, "learning_rate": 2.5e-05, "loss": 0.7581, "step": 3171 }, { "epoch": 10.007162162162162, "grad_norm": 0.006554577499628067, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3172 }, { "epoch": 10.007195945945947, "grad_norm": 0.11235051602125168, "learning_rate": 2.5e-05, "loss": 0.0016, "step": 3173 }, { "epoch": 10.00722972972973, "grad_norm": 4.480849266052246, "learning_rate": 2.5e-05, "loss": 0.3964, "step": 3174 }, { "epoch": 10.007263513513514, "grad_norm": 10.41819953918457, "learning_rate": 2.5e-05, "loss": 0.0582, "step": 3175 }, { "epoch": 10.007297297297297, "grad_norm": 4.364681720733643, "learning_rate": 2.5e-05, "loss": 0.037, "step": 3176 }, { "epoch": 10.00733108108108, "grad_norm": 0.006808661390095949, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3177 }, { "epoch": 10.007364864864865, "grad_norm": 0.01774785667657852, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 3178 }, { "epoch": 10.007398648648648, "grad_norm": 0.0036745192483067513, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 3179 }, { "epoch": 10.007432432432433, "grad_norm": 1.563067078590393, "learning_rate": 2.5e-05, "loss": 0.0046, "step": 3180 }, { "epoch": 10.007466216216216, "grad_norm": 0.008251618593931198, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 3181 }, { "epoch": 10.0075, "grad_norm": 0.0032794703729450703, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 3182 }, { "epoch": 10.007533783783783, "grad_norm": 0.008111330680549145, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3183 }, { "epoch": 10.007567567567568, "grad_norm": 0.06930431723594666, "learning_rate": 2.5e-05, "loss": 0.0009, "step": 3184 }, { "epoch": 10.00760135135135, "grad_norm": 0.8176154494285583, "learning_rate": 2.5e-05, "loss": 0.0023, "step": 3185 }, { "epoch": 10.007635135135136, "grad_norm": 20.883413314819336, "learning_rate": 2.5e-05, "loss": 0.1166, "step": 3186 }, { "epoch": 10.007668918918919, "grad_norm": 0.024337556213140488, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 3187 }, { "epoch": 10.007702702702703, "grad_norm": 0.7166610956192017, "learning_rate": 2.5e-05, "loss": 0.0022, "step": 3188 }, { "epoch": 10.007736486486486, "grad_norm": 21.40473747253418, "learning_rate": 2.5e-05, "loss": 0.7128, "step": 3189 }, { "epoch": 10.00777027027027, "grad_norm": 15.48318862915039, "learning_rate": 2.5e-05, "loss": 0.9141, "step": 3190 }, { "epoch": 10.007804054054054, "grad_norm": 7.534517765045166, "learning_rate": 2.5e-05, "loss": 0.9386, "step": 3191 }, { "epoch": 10.007837837837839, "grad_norm": 36.67161560058594, "learning_rate": 2.5e-05, "loss": 0.5273, "step": 3192 }, { "epoch": 10.007871621621621, "grad_norm": 0.2052333801984787, "learning_rate": 2.5e-05, "loss": 0.0082, "step": 3193 }, { "epoch": 10.007905405405406, "grad_norm": 2.7258920669555664, "learning_rate": 2.5e-05, "loss": 0.0709, "step": 3194 }, { "epoch": 10.00793918918919, "grad_norm": 0.02387695014476776, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 3195 }, { "epoch": 10.007972972972974, "grad_norm": 0.4568084180355072, "learning_rate": 2.5e-05, "loss": 0.0059, "step": 3196 }, { "epoch": 10.008006756756757, "grad_norm": 0.014689420349895954, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 3197 }, { "epoch": 10.00804054054054, "grad_norm": 7.120307922363281, "learning_rate": 2.5e-05, "loss": 0.0124, "step": 3198 }, { "epoch": 10.008074324324324, "grad_norm": 0.043648798018693924, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 3199 }, { "epoch": 10.008108108108107, "grad_norm": 1.561883568763733, "learning_rate": 2.5e-05, "loss": 0.034, "step": 3200 }, { "epoch": 10.008141891891892, "grad_norm": 7.4580183029174805, "learning_rate": 2.5e-05, "loss": 0.026, "step": 3201 }, { "epoch": 10.008175675675675, "grad_norm": 0.06279745697975159, "learning_rate": 2.5e-05, "loss": 0.0015, "step": 3202 }, { "epoch": 10.00820945945946, "grad_norm": 0.4106455445289612, "learning_rate": 2.5e-05, "loss": 0.003, "step": 3203 }, { "epoch": 10.008243243243243, "grad_norm": 31.55225944519043, "learning_rate": 2.5e-05, "loss": 0.2011, "step": 3204 }, { "epoch": 10.008277027027027, "grad_norm": 38.269287109375, "learning_rate": 2.5e-05, "loss": 0.8345, "step": 3205 }, { "epoch": 10.00831081081081, "grad_norm": 8.48319149017334, "learning_rate": 2.5e-05, "loss": 0.0436, "step": 3206 }, { "epoch": 10.008344594594595, "grad_norm": 0.043747853487730026, "learning_rate": 2.5e-05, "loss": 0.0008, "step": 3207 }, { "epoch": 10.008378378378378, "grad_norm": 0.003212964627891779, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 3208 }, { "epoch": 10.008412162162163, "grad_norm": 0.21386559307575226, "learning_rate": 2.5e-05, "loss": 0.008, "step": 3209 }, { "epoch": 10.008445945945946, "grad_norm": 0.15149220824241638, "learning_rate": 2.5e-05, "loss": 0.0059, "step": 3210 }, { "epoch": 10.00847972972973, "grad_norm": 0.011341387405991554, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3211 }, { "epoch": 10.008513513513513, "grad_norm": 0.003934393171221018, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3212 }, { "epoch": 10.008547297297298, "grad_norm": 0.31588977575302124, "learning_rate": 2.5e-05, "loss": 0.0015, "step": 3213 }, { "epoch": 10.008581081081081, "grad_norm": 0.02534402348101139, "learning_rate": 2.5e-05, "loss": 0.0005, "step": 3214 }, { "epoch": 10.008614864864866, "grad_norm": 53.679386138916016, "learning_rate": 2.5e-05, "loss": 0.1254, "step": 3215 }, { "epoch": 10.008648648648649, "grad_norm": 0.041623979806900024, "learning_rate": 2.5e-05, "loss": 0.0008, "step": 3216 }, { "epoch": 10.008682432432433, "grad_norm": 37.36592102050781, "learning_rate": 2.5e-05, "loss": 0.2503, "step": 3217 }, { "epoch": 10.008716216216216, "grad_norm": 13.803362846374512, "learning_rate": 2.5e-05, "loss": 0.1176, "step": 3218 }, { "epoch": 10.00875, "grad_norm": 37.814754486083984, "learning_rate": 2.5e-05, "loss": 0.3014, "step": 3219 }, { "epoch": 10.008783783783784, "grad_norm": 0.003956497646868229, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 3220 }, { "epoch": 10.008817567567567, "grad_norm": 0.006113695912063122, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 3221 }, { "epoch": 10.008851351351352, "grad_norm": 0.05242941528558731, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 3222 }, { "epoch": 10.008885135135134, "grad_norm": 0.830529510974884, "learning_rate": 2.5e-05, "loss": 0.0161, "step": 3223 }, { "epoch": 10.00891891891892, "grad_norm": 0.003948455210775137, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 3224 }, { "epoch": 10.008952702702702, "grad_norm": 0.026800060644745827, "learning_rate": 2.5e-05, "loss": 0.0006, "step": 3225 }, { "epoch": 10.008986486486487, "grad_norm": 0.004031202755868435, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 3226 }, { "epoch": 10.00902027027027, "grad_norm": 60.72530746459961, "learning_rate": 2.5e-05, "loss": 0.8075, "step": 3227 }, { "epoch": 10.009054054054054, "grad_norm": 0.003317770082503557, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 3228 }, { "epoch": 10.009087837837837, "grad_norm": 1.4179056882858276, "learning_rate": 2.5e-05, "loss": 0.0038, "step": 3229 }, { "epoch": 10.009121621621622, "grad_norm": 0.04366854205727577, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 3230 }, { "epoch": 10.009155405405405, "grad_norm": 1.1738100051879883, "learning_rate": 2.5e-05, "loss": 0.006, "step": 3231 }, { "epoch": 10.00918918918919, "grad_norm": 9.583860397338867, "learning_rate": 2.5e-05, "loss": 0.4812, "step": 3232 }, { "epoch": 10.009222972972973, "grad_norm": 79.38129425048828, "learning_rate": 2.5e-05, "loss": 0.7311, "step": 3233 }, { "epoch": 10.009256756756757, "grad_norm": 0.0025300465058535337, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 3234 }, { "epoch": 10.00929054054054, "grad_norm": 44.09068298339844, "learning_rate": 2.5e-05, "loss": 0.2635, "step": 3235 }, { "epoch": 10.009324324324325, "grad_norm": 0.0025396994315087795, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 3236 }, { "epoch": 10.009358108108108, "grad_norm": 40.005611419677734, "learning_rate": 2.5e-05, "loss": 0.2302, "step": 3237 }, { "epoch": 10.009391891891893, "grad_norm": 0.09863624721765518, "learning_rate": 2.5e-05, "loss": 0.0009, "step": 3238 }, { "epoch": 10.009425675675676, "grad_norm": 0.273107647895813, "learning_rate": 2.5e-05, "loss": 0.001, "step": 3239 }, { "epoch": 10.009459459459459, "grad_norm": 0.020605046302080154, "learning_rate": 2.5e-05, "loss": 0.0003, "step": 3240 }, { "epoch": 10.009493243243243, "grad_norm": 55.876136779785156, "learning_rate": 2.5e-05, "loss": 0.3473, "step": 3241 }, { "epoch": 10.009527027027026, "grad_norm": 9.56721305847168, "learning_rate": 2.5e-05, "loss": 0.503, "step": 3242 }, { "epoch": 10.009560810810811, "grad_norm": 1.6558979749679565, "learning_rate": 2.5e-05, "loss": 0.0156, "step": 3243 }, { "epoch": 10.009594594594594, "grad_norm": 25.359210968017578, "learning_rate": 2.5e-05, "loss": 1.4922, "step": 3244 }, { "epoch": 10.009628378378379, "grad_norm": 0.08090026676654816, "learning_rate": 2.5e-05, "loss": 0.0007, "step": 3245 }, { "epoch": 10.009662162162162, "grad_norm": 0.7270247936248779, "learning_rate": 2.5e-05, "loss": 0.007, "step": 3246 }, { "epoch": 10.009695945945946, "grad_norm": 0.4693688750267029, "learning_rate": 2.5e-05, "loss": 0.0032, "step": 3247 }, { "epoch": 10.00972972972973, "grad_norm": 0.6888940334320068, "learning_rate": 2.5e-05, "loss": 0.0038, "step": 3248 }, { "epoch": 10.009763513513514, "grad_norm": 0.004004486370831728, "learning_rate": 2.5e-05, "loss": 0.0001, "step": 3249 }, { "epoch": 10.009797297297297, "grad_norm": 0.6225452423095703, "learning_rate": 2.5e-05, "loss": 0.0056, "step": 3250 }, { "epoch": 10.009831081081082, "grad_norm": 57.13733673095703, "learning_rate": 2.5e-05, "loss": 0.3554, "step": 3251 }, { "epoch": 10.009864864864864, "grad_norm": 20.080852508544922, "learning_rate": 2.5e-05, "loss": 0.8655, "step": 3252 }, { "epoch": 10.00989864864865, "grad_norm": 0.37475940585136414, "learning_rate": 2.5e-05, "loss": 0.0122, "step": 3253 }, { "epoch": 10.009932432432432, "grad_norm": 34.52098846435547, "learning_rate": 2.5e-05, "loss": 0.3777, "step": 3254 }, { "epoch": 10.009966216216217, "grad_norm": 0.10360803455114365, "learning_rate": 2.5e-05, "loss": 0.004, "step": 3255 }, { "epoch": 10.01, "grad_norm": 0.017702830955386162, "learning_rate": 2.5e-05, "loss": 0.0004, "step": 3256 }, { "epoch": 10.01, "eval_accuracy": 0.8707592891760905, "eval_loss": 0.6531233787536621, "eval_runtime": 32.1103, "eval_samples_per_second": 19.277, "eval_steps_per_second": 2.429, "step": 3256 }, { "epoch": 11.000033783783783, "grad_norm": 0.009430949576199055, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3257 }, { "epoch": 11.000067567567568, "grad_norm": 0.2944971024990082, "learning_rate": 1.25e-05, "loss": 0.0037, "step": 3258 }, { "epoch": 11.00010135135135, "grad_norm": 0.6834940910339355, "learning_rate": 1.25e-05, "loss": 0.008, "step": 3259 }, { "epoch": 11.000135135135135, "grad_norm": 0.015893608331680298, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3260 }, { "epoch": 11.000168918918918, "grad_norm": 0.005178962368518114, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3261 }, { "epoch": 11.000202702702703, "grad_norm": 0.009797090664505959, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3262 }, { "epoch": 11.000236486486486, "grad_norm": 0.08540967106819153, "learning_rate": 1.25e-05, "loss": 0.0032, "step": 3263 }, { "epoch": 11.00027027027027, "grad_norm": 0.004134880378842354, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3264 }, { "epoch": 11.000304054054054, "grad_norm": 9.559202194213867, "learning_rate": 1.25e-05, "loss": 0.1764, "step": 3265 }, { "epoch": 11.000337837837838, "grad_norm": 0.4433368444442749, "learning_rate": 1.25e-05, "loss": 0.0033, "step": 3266 }, { "epoch": 11.000371621621621, "grad_norm": 0.11089876294136047, "learning_rate": 1.25e-05, "loss": 0.0015, "step": 3267 }, { "epoch": 11.000405405405406, "grad_norm": 30.81318473815918, "learning_rate": 1.25e-05, "loss": 0.6477, "step": 3268 }, { "epoch": 11.000439189189189, "grad_norm": 0.36276406049728394, "learning_rate": 1.25e-05, "loss": 0.003, "step": 3269 }, { "epoch": 11.000472972972974, "grad_norm": 0.8441568613052368, "learning_rate": 1.25e-05, "loss": 0.0034, "step": 3270 }, { "epoch": 11.000506756756756, "grad_norm": 1.4459517002105713, "learning_rate": 1.25e-05, "loss": 0.0321, "step": 3271 }, { "epoch": 11.000540540540541, "grad_norm": 0.1291145533323288, "learning_rate": 1.25e-05, "loss": 0.0017, "step": 3272 }, { "epoch": 11.000574324324324, "grad_norm": 0.03440826013684273, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3273 }, { "epoch": 11.000608108108109, "grad_norm": 0.15234123170375824, "learning_rate": 1.25e-05, "loss": 0.0012, "step": 3274 }, { "epoch": 11.000641891891892, "grad_norm": 29.01829719543457, "learning_rate": 1.25e-05, "loss": 0.6325, "step": 3275 }, { "epoch": 11.000675675675677, "grad_norm": 8.144396781921387, "learning_rate": 1.25e-05, "loss": 0.8389, "step": 3276 }, { "epoch": 11.00070945945946, "grad_norm": 31.179122924804688, "learning_rate": 1.25e-05, "loss": 0.4101, "step": 3277 }, { "epoch": 11.000743243243242, "grad_norm": 6.653017997741699, "learning_rate": 1.25e-05, "loss": 0.0235, "step": 3278 }, { "epoch": 11.000777027027027, "grad_norm": 0.004561029374599457, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3279 }, { "epoch": 11.00081081081081, "grad_norm": 0.004452510736882687, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3280 }, { "epoch": 11.000844594594595, "grad_norm": 0.014041207730770111, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3281 }, { "epoch": 11.000878378378378, "grad_norm": 0.010046529583632946, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3282 }, { "epoch": 11.000912162162162, "grad_norm": 12.476547241210938, "learning_rate": 1.25e-05, "loss": 0.3384, "step": 3283 }, { "epoch": 11.000945945945945, "grad_norm": 0.006726646795868874, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3284 }, { "epoch": 11.00097972972973, "grad_norm": 0.02683371864259243, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 3285 }, { "epoch": 11.001013513513513, "grad_norm": 0.7202988862991333, "learning_rate": 1.25e-05, "loss": 0.002, "step": 3286 }, { "epoch": 11.001047297297298, "grad_norm": 0.13324324786663055, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 3287 }, { "epoch": 11.00108108108108, "grad_norm": 0.012332754209637642, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3288 }, { "epoch": 11.001114864864865, "grad_norm": 2.50685453414917, "learning_rate": 1.25e-05, "loss": 0.0118, "step": 3289 }, { "epoch": 11.001148648648648, "grad_norm": 0.03767123445868492, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 3290 }, { "epoch": 11.001182432432433, "grad_norm": 0.005590951070189476, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3291 }, { "epoch": 11.001216216216216, "grad_norm": 0.031642574816942215, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 3292 }, { "epoch": 11.00125, "grad_norm": 0.02339051477611065, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3293 }, { "epoch": 11.001283783783784, "grad_norm": 0.008459299802780151, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3294 }, { "epoch": 11.001317567567568, "grad_norm": 11.692890167236328, "learning_rate": 1.25e-05, "loss": 0.0863, "step": 3295 }, { "epoch": 11.001351351351351, "grad_norm": 0.18842214345932007, "learning_rate": 1.25e-05, "loss": 0.0038, "step": 3296 }, { "epoch": 11.001385135135136, "grad_norm": 0.08375514298677444, "learning_rate": 1.25e-05, "loss": 0.0012, "step": 3297 }, { "epoch": 11.001418918918919, "grad_norm": 0.006434972397983074, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3298 }, { "epoch": 11.001452702702704, "grad_norm": 0.32795026898384094, "learning_rate": 1.25e-05, "loss": 0.0038, "step": 3299 }, { "epoch": 11.001486486486487, "grad_norm": 0.004689591005444527, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3300 }, { "epoch": 11.00152027027027, "grad_norm": 7.912988185882568, "learning_rate": 1.25e-05, "loss": 0.6714, "step": 3301 }, { "epoch": 11.001554054054054, "grad_norm": 4.452192306518555, "learning_rate": 1.25e-05, "loss": 0.0548, "step": 3302 }, { "epoch": 11.001587837837837, "grad_norm": 0.00772069301456213, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3303 }, { "epoch": 11.001621621621622, "grad_norm": 0.006086453329771757, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3304 }, { "epoch": 11.001655405405405, "grad_norm": 0.012416571378707886, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3305 }, { "epoch": 11.00168918918919, "grad_norm": 0.09228328615427017, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 3306 }, { "epoch": 11.001722972972972, "grad_norm": 0.9024906754493713, "learning_rate": 1.25e-05, "loss": 0.0074, "step": 3307 }, { "epoch": 11.001756756756757, "grad_norm": 0.12018401175737381, "learning_rate": 1.25e-05, "loss": 0.0011, "step": 3308 }, { "epoch": 11.00179054054054, "grad_norm": 0.016591820865869522, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3309 }, { "epoch": 11.001824324324325, "grad_norm": 5.672643184661865, "learning_rate": 1.25e-05, "loss": 0.035, "step": 3310 }, { "epoch": 11.001858108108108, "grad_norm": 0.06037301570177078, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3311 }, { "epoch": 11.001891891891892, "grad_norm": 9.960017204284668, "learning_rate": 1.25e-05, "loss": 0.4766, "step": 3312 }, { "epoch": 11.001925675675675, "grad_norm": 1.493920922279358, "learning_rate": 1.25e-05, "loss": 0.0163, "step": 3313 }, { "epoch": 11.00195945945946, "grad_norm": 0.030429182574152946, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 3314 }, { "epoch": 11.001993243243243, "grad_norm": 2.4157614707946777, "learning_rate": 1.25e-05, "loss": 0.03, "step": 3315 }, { "epoch": 11.002027027027028, "grad_norm": 0.015126821584999561, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3316 }, { "epoch": 11.00206081081081, "grad_norm": 0.023065369576215744, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3317 }, { "epoch": 11.002094594594595, "grad_norm": 0.4823996424674988, "learning_rate": 1.25e-05, "loss": 0.0029, "step": 3318 }, { "epoch": 11.002128378378378, "grad_norm": 0.4270573556423187, "learning_rate": 1.25e-05, "loss": 0.0135, "step": 3319 }, { "epoch": 11.002162162162163, "grad_norm": 3.936591148376465, "learning_rate": 1.25e-05, "loss": 0.0232, "step": 3320 }, { "epoch": 11.002195945945946, "grad_norm": 1.6002157926559448, "learning_rate": 1.25e-05, "loss": 0.0246, "step": 3321 }, { "epoch": 11.002229729729729, "grad_norm": 7.082553386688232, "learning_rate": 1.25e-05, "loss": 0.3531, "step": 3322 }, { "epoch": 11.002263513513514, "grad_norm": 0.7558157444000244, "learning_rate": 1.25e-05, "loss": 0.0083, "step": 3323 }, { "epoch": 11.002297297297297, "grad_norm": 0.005201042629778385, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3324 }, { "epoch": 11.002331081081081, "grad_norm": 0.04378516972064972, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 3325 }, { "epoch": 11.002364864864864, "grad_norm": 19.28797149658203, "learning_rate": 1.25e-05, "loss": 0.8584, "step": 3326 }, { "epoch": 11.002398648648649, "grad_norm": 0.1025533601641655, "learning_rate": 1.25e-05, "loss": 0.0014, "step": 3327 }, { "epoch": 11.002432432432432, "grad_norm": 0.009145105257630348, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3328 }, { "epoch": 11.002466216216217, "grad_norm": 0.01174064353108406, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3329 }, { "epoch": 11.0025, "grad_norm": 3.17364501953125, "learning_rate": 1.25e-05, "loss": 0.1534, "step": 3330 }, { "epoch": 11.002533783783784, "grad_norm": 0.007260601967573166, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3331 }, { "epoch": 11.002567567567567, "grad_norm": 0.17578886449337006, "learning_rate": 1.25e-05, "loss": 0.0074, "step": 3332 }, { "epoch": 11.002601351351352, "grad_norm": 0.14886561036109924, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 3333 }, { "epoch": 11.002635135135135, "grad_norm": 0.07281675934791565, "learning_rate": 1.25e-05, "loss": 0.003, "step": 3334 }, { "epoch": 11.00266891891892, "grad_norm": 9.923303604125977, "learning_rate": 1.25e-05, "loss": 0.2167, "step": 3335 }, { "epoch": 11.002702702702702, "grad_norm": 0.05679670721292496, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3336 }, { "epoch": 11.002736486486487, "grad_norm": 0.14493271708488464, "learning_rate": 1.25e-05, "loss": 0.0036, "step": 3337 }, { "epoch": 11.00277027027027, "grad_norm": 1.8330676555633545, "learning_rate": 1.25e-05, "loss": 0.0042, "step": 3338 }, { "epoch": 11.002804054054055, "grad_norm": 0.009941864758729935, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3339 }, { "epoch": 11.002837837837838, "grad_norm": 6.8176469802856445, "learning_rate": 1.25e-05, "loss": 0.0281, "step": 3340 }, { "epoch": 11.002871621621622, "grad_norm": 4.0461320877075195, "learning_rate": 1.25e-05, "loss": 0.0194, "step": 3341 }, { "epoch": 11.002905405405405, "grad_norm": 0.12569059431552887, "learning_rate": 1.25e-05, "loss": 0.0041, "step": 3342 }, { "epoch": 11.002939189189188, "grad_norm": 2.0123226642608643, "learning_rate": 1.25e-05, "loss": 0.018, "step": 3343 }, { "epoch": 11.002972972972973, "grad_norm": 0.030536601319909096, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3344 }, { "epoch": 11.003006756756756, "grad_norm": 0.08142699301242828, "learning_rate": 1.25e-05, "loss": 0.0031, "step": 3345 }, { "epoch": 11.00304054054054, "grad_norm": 0.01510409452021122, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3346 }, { "epoch": 11.003074324324324, "grad_norm": 0.006317440886050463, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3347 }, { "epoch": 11.003108108108108, "grad_norm": 0.05309094116091728, "learning_rate": 1.25e-05, "loss": 0.0018, "step": 3348 }, { "epoch": 11.003141891891891, "grad_norm": 4.476630687713623, "learning_rate": 1.25e-05, "loss": 0.1196, "step": 3349 }, { "epoch": 11.003175675675676, "grad_norm": 8.900751113891602, "learning_rate": 1.25e-05, "loss": 0.047, "step": 3350 }, { "epoch": 11.003209459459459, "grad_norm": 0.017327595502138138, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3351 }, { "epoch": 11.003243243243244, "grad_norm": 38.45365905761719, "learning_rate": 1.25e-05, "loss": 0.1846, "step": 3352 }, { "epoch": 11.003277027027027, "grad_norm": 6.07191801071167, "learning_rate": 1.25e-05, "loss": 0.0258, "step": 3353 }, { "epoch": 11.003310810810811, "grad_norm": 1.0666805505752563, "learning_rate": 1.25e-05, "loss": 0.0032, "step": 3354 }, { "epoch": 11.003344594594594, "grad_norm": 0.0035482938401401043, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3355 }, { "epoch": 11.003378378378379, "grad_norm": 1.36589515209198, "learning_rate": 1.25e-05, "loss": 0.0088, "step": 3356 }, { "epoch": 11.003412162162162, "grad_norm": 0.009006835520267487, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3357 }, { "epoch": 11.003445945945947, "grad_norm": 0.01193722803145647, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3358 }, { "epoch": 11.00347972972973, "grad_norm": 20.712013244628906, "learning_rate": 1.25e-05, "loss": 0.3669, "step": 3359 }, { "epoch": 11.003513513513514, "grad_norm": 0.017590954899787903, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3360 }, { "epoch": 11.003547297297297, "grad_norm": 0.0027452725917100906, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3361 }, { "epoch": 11.003581081081082, "grad_norm": 15.179483413696289, "learning_rate": 1.25e-05, "loss": 0.0675, "step": 3362 }, { "epoch": 11.003614864864865, "grad_norm": 10.818978309631348, "learning_rate": 1.25e-05, "loss": 0.1589, "step": 3363 }, { "epoch": 11.003648648648648, "grad_norm": 1.474222183227539, "learning_rate": 1.25e-05, "loss": 0.0104, "step": 3364 }, { "epoch": 11.003682432432432, "grad_norm": 0.00821912195533514, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3365 }, { "epoch": 11.003716216216215, "grad_norm": 0.008422582410275936, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3366 }, { "epoch": 11.00375, "grad_norm": 0.004350618459284306, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3367 }, { "epoch": 11.003783783783783, "grad_norm": 0.008367136120796204, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3368 }, { "epoch": 11.003817567567568, "grad_norm": 0.15326440334320068, "learning_rate": 1.25e-05, "loss": 0.0023, "step": 3369 }, { "epoch": 11.00385135135135, "grad_norm": 0.18734139204025269, "learning_rate": 1.25e-05, "loss": 0.0022, "step": 3370 }, { "epoch": 11.003885135135135, "grad_norm": 0.006389627233147621, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3371 }, { "epoch": 11.003918918918918, "grad_norm": 0.009449111297726631, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3372 }, { "epoch": 11.003952702702703, "grad_norm": 0.03487491235136986, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 3373 }, { "epoch": 11.003986486486486, "grad_norm": 0.18561793863773346, "learning_rate": 1.25e-05, "loss": 0.001, "step": 3374 }, { "epoch": 11.00402027027027, "grad_norm": 0.20547597110271454, "learning_rate": 1.25e-05, "loss": 0.0022, "step": 3375 }, { "epoch": 11.004054054054054, "grad_norm": 30.04269790649414, "learning_rate": 1.25e-05, "loss": 0.5876, "step": 3376 }, { "epoch": 11.004087837837838, "grad_norm": 0.37664318084716797, "learning_rate": 1.25e-05, "loss": 0.0041, "step": 3377 }, { "epoch": 11.004121621621621, "grad_norm": 44.660980224609375, "learning_rate": 1.25e-05, "loss": 0.4817, "step": 3378 }, { "epoch": 11.004155405405406, "grad_norm": 0.3501421809196472, "learning_rate": 1.25e-05, "loss": 0.0012, "step": 3379 }, { "epoch": 11.004189189189189, "grad_norm": 0.005756329745054245, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3380 }, { "epoch": 11.004222972972974, "grad_norm": 0.0935899168252945, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 3381 }, { "epoch": 11.004256756756757, "grad_norm": 0.017829861491918564, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3382 }, { "epoch": 11.004290540540541, "grad_norm": 0.08328653872013092, "learning_rate": 1.25e-05, "loss": 0.0024, "step": 3383 }, { "epoch": 11.004324324324324, "grad_norm": 0.008535996079444885, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3384 }, { "epoch": 11.004358108108109, "grad_norm": 0.17830312252044678, "learning_rate": 1.25e-05, "loss": 0.005, "step": 3385 }, { "epoch": 11.004391891891892, "grad_norm": 0.00676576467230916, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3386 }, { "epoch": 11.004425675675675, "grad_norm": 0.044603414833545685, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 3387 }, { "epoch": 11.00445945945946, "grad_norm": 0.0038234691601246595, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3388 }, { "epoch": 11.004493243243243, "grad_norm": 55.36600875854492, "learning_rate": 1.25e-05, "loss": 0.3949, "step": 3389 }, { "epoch": 11.004527027027027, "grad_norm": 0.003871874650940299, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3390 }, { "epoch": 11.00456081081081, "grad_norm": 0.3929213583469391, "learning_rate": 1.25e-05, "loss": 0.0036, "step": 3391 }, { "epoch": 11.004594594594595, "grad_norm": 17.28261947631836, "learning_rate": 1.25e-05, "loss": 0.9421, "step": 3392 }, { "epoch": 11.004628378378378, "grad_norm": 0.12926916778087616, "learning_rate": 1.25e-05, "loss": 0.0019, "step": 3393 }, { "epoch": 11.004662162162163, "grad_norm": 0.06365235149860382, "learning_rate": 1.25e-05, "loss": 0.0024, "step": 3394 }, { "epoch": 11.004695945945945, "grad_norm": 0.006749744061380625, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3395 }, { "epoch": 11.00472972972973, "grad_norm": 37.58001708984375, "learning_rate": 1.25e-05, "loss": 0.6697, "step": 3396 }, { "epoch": 11.004763513513513, "grad_norm": 0.008135396055877209, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3397 }, { "epoch": 11.004797297297298, "grad_norm": 0.003252744907513261, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3398 }, { "epoch": 11.00483108108108, "grad_norm": 0.06728116422891617, "learning_rate": 1.25e-05, "loss": 0.0012, "step": 3399 }, { "epoch": 11.004864864864865, "grad_norm": 0.1441178023815155, "learning_rate": 1.25e-05, "loss": 0.0012, "step": 3400 }, { "epoch": 11.004898648648648, "grad_norm": 5.640593528747559, "learning_rate": 1.25e-05, "loss": 0.8721, "step": 3401 }, { "epoch": 11.004932432432433, "grad_norm": 0.054601289331912994, "learning_rate": 1.25e-05, "loss": 0.0021, "step": 3402 }, { "epoch": 11.004966216216216, "grad_norm": 0.3318118453025818, "learning_rate": 1.25e-05, "loss": 0.0023, "step": 3403 }, { "epoch": 11.005, "grad_norm": 0.005728352349251509, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3404 }, { "epoch": 11.005033783783784, "grad_norm": 0.00705765699967742, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3405 }, { "epoch": 11.005067567567568, "grad_norm": 22.088075637817383, "learning_rate": 1.25e-05, "loss": 0.3592, "step": 3406 }, { "epoch": 11.005101351351351, "grad_norm": 5.729722023010254, "learning_rate": 1.25e-05, "loss": 0.1043, "step": 3407 }, { "epoch": 11.005135135135134, "grad_norm": 0.025527160614728928, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3408 }, { "epoch": 11.005168918918919, "grad_norm": 9.512680053710938, "learning_rate": 1.25e-05, "loss": 0.0496, "step": 3409 }, { "epoch": 11.005202702702702, "grad_norm": 1.353538990020752, "learning_rate": 1.25e-05, "loss": 0.0027, "step": 3410 }, { "epoch": 11.005236486486487, "grad_norm": 0.016296442598104477, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3411 }, { "epoch": 11.00527027027027, "grad_norm": 0.016792768612504005, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3412 }, { "epoch": 11.005304054054054, "grad_norm": 0.003676207270473242, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3413 }, { "epoch": 11.005337837837837, "grad_norm": 0.011384292505681515, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3414 }, { "epoch": 11.005371621621622, "grad_norm": 13.90300464630127, "learning_rate": 1.25e-05, "loss": 0.189, "step": 3415 }, { "epoch": 11.005405405405405, "grad_norm": 0.07619795203208923, "learning_rate": 1.25e-05, "loss": 0.0015, "step": 3416 }, { "epoch": 11.00543918918919, "grad_norm": 0.011589974164962769, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3417 }, { "epoch": 11.005472972972973, "grad_norm": 0.06701791286468506, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 3418 }, { "epoch": 11.005506756756757, "grad_norm": 0.017639199271798134, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3419 }, { "epoch": 11.00554054054054, "grad_norm": 3.99460506439209, "learning_rate": 1.25e-05, "loss": 0.1033, "step": 3420 }, { "epoch": 11.005574324324325, "grad_norm": 1.4809035062789917, "learning_rate": 1.25e-05, "loss": 0.0048, "step": 3421 }, { "epoch": 11.005608108108108, "grad_norm": 0.011793847195804119, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3422 }, { "epoch": 11.005641891891893, "grad_norm": 0.006613665726035833, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3423 }, { "epoch": 11.005675675675676, "grad_norm": 0.11845588684082031, "learning_rate": 1.25e-05, "loss": 0.0011, "step": 3424 }, { "epoch": 11.00570945945946, "grad_norm": 0.23103837668895721, "learning_rate": 1.25e-05, "loss": 0.0022, "step": 3425 }, { "epoch": 11.005743243243243, "grad_norm": 0.02206563949584961, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3426 }, { "epoch": 11.005777027027028, "grad_norm": 45.17418670654297, "learning_rate": 1.25e-05, "loss": 0.4118, "step": 3427 }, { "epoch": 11.00581081081081, "grad_norm": 0.053297579288482666, "learning_rate": 1.25e-05, "loss": 0.0014, "step": 3428 }, { "epoch": 11.005844594594594, "grad_norm": 0.004753572400659323, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3429 }, { "epoch": 11.005878378378378, "grad_norm": 5.835343837738037, "learning_rate": 1.25e-05, "loss": 0.3635, "step": 3430 }, { "epoch": 11.005912162162161, "grad_norm": 0.17676995694637299, "learning_rate": 1.25e-05, "loss": 0.0017, "step": 3431 }, { "epoch": 11.005945945945946, "grad_norm": 0.03480096161365509, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3432 }, { "epoch": 11.005979729729729, "grad_norm": 0.007614494767040014, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3433 }, { "epoch": 11.006013513513514, "grad_norm": 0.0028616636991500854, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3434 }, { "epoch": 11.006047297297297, "grad_norm": 0.4773818254470825, "learning_rate": 1.25e-05, "loss": 0.0078, "step": 3435 }, { "epoch": 11.006081081081081, "grad_norm": 0.04343424737453461, "learning_rate": 1.25e-05, "loss": 0.0007, "step": 3436 }, { "epoch": 11.006114864864864, "grad_norm": 0.011972970329225063, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3437 }, { "epoch": 11.006148648648649, "grad_norm": 0.004940755665302277, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3438 }, { "epoch": 11.006182432432432, "grad_norm": 6.696732044219971, "learning_rate": 1.25e-05, "loss": 0.0788, "step": 3439 }, { "epoch": 11.006216216216217, "grad_norm": 0.22476962208747864, "learning_rate": 1.25e-05, "loss": 0.0057, "step": 3440 }, { "epoch": 11.00625, "grad_norm": 0.07277049124240875, "learning_rate": 1.25e-05, "loss": 0.0029, "step": 3441 }, { "epoch": 11.006283783783784, "grad_norm": 0.03639241307973862, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 3442 }, { "epoch": 11.006317567567567, "grad_norm": 7.577416896820068, "learning_rate": 1.25e-05, "loss": 0.4832, "step": 3443 }, { "epoch": 11.006351351351352, "grad_norm": 0.013330637477338314, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3444 }, { "epoch": 11.006385135135135, "grad_norm": 0.008356655016541481, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3445 }, { "epoch": 11.00641891891892, "grad_norm": 52.3818359375, "learning_rate": 1.25e-05, "loss": 0.1566, "step": 3446 }, { "epoch": 11.006452702702703, "grad_norm": 0.04897336661815643, "learning_rate": 1.25e-05, "loss": 0.0011, "step": 3447 }, { "epoch": 11.006486486486487, "grad_norm": 5.329442024230957, "learning_rate": 1.25e-05, "loss": 0.9807, "step": 3448 }, { "epoch": 11.00652027027027, "grad_norm": 0.23405759036540985, "learning_rate": 1.25e-05, "loss": 0.0016, "step": 3449 }, { "epoch": 11.006554054054053, "grad_norm": 4.728113651275635, "learning_rate": 1.25e-05, "loss": 0.5089, "step": 3450 }, { "epoch": 11.006587837837838, "grad_norm": 0.04795576632022858, "learning_rate": 1.25e-05, "loss": 0.0018, "step": 3451 }, { "epoch": 11.00662162162162, "grad_norm": 14.370755195617676, "learning_rate": 1.25e-05, "loss": 0.5039, "step": 3452 }, { "epoch": 11.006655405405406, "grad_norm": 0.022998925298452377, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3453 }, { "epoch": 11.006689189189188, "grad_norm": 0.008198082447052002, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3454 }, { "epoch": 11.006722972972973, "grad_norm": 0.06267310678958893, "learning_rate": 1.25e-05, "loss": 0.0024, "step": 3455 }, { "epoch": 11.006756756756756, "grad_norm": 17.147939682006836, "learning_rate": 1.25e-05, "loss": 0.2258, "step": 3456 }, { "epoch": 11.00679054054054, "grad_norm": 0.017305340617895126, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3457 }, { "epoch": 11.006824324324324, "grad_norm": 0.0150164058431983, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3458 }, { "epoch": 11.006858108108108, "grad_norm": 1.0181961059570312, "learning_rate": 1.25e-05, "loss": 0.0033, "step": 3459 }, { "epoch": 11.006891891891891, "grad_norm": 29.461286544799805, "learning_rate": 1.25e-05, "loss": 0.1941, "step": 3460 }, { "epoch": 11.006925675675676, "grad_norm": 0.004012455698102713, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3461 }, { "epoch": 11.006959459459459, "grad_norm": 0.665885865688324, "learning_rate": 1.25e-05, "loss": 0.0033, "step": 3462 }, { "epoch": 11.006993243243244, "grad_norm": 0.1280716210603714, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 3463 }, { "epoch": 11.007027027027027, "grad_norm": 0.1724698543548584, "learning_rate": 1.25e-05, "loss": 0.0017, "step": 3464 }, { "epoch": 11.007060810810811, "grad_norm": 0.5287283062934875, "learning_rate": 1.25e-05, "loss": 0.005, "step": 3465 }, { "epoch": 11.007094594594594, "grad_norm": 36.29814147949219, "learning_rate": 1.25e-05, "loss": 0.7805, "step": 3466 }, { "epoch": 11.007128378378379, "grad_norm": 0.03447186201810837, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 3467 }, { "epoch": 11.007162162162162, "grad_norm": 0.9799807071685791, "learning_rate": 1.25e-05, "loss": 0.0039, "step": 3468 }, { "epoch": 11.007195945945947, "grad_norm": 33.1823844909668, "learning_rate": 1.25e-05, "loss": 0.1786, "step": 3469 }, { "epoch": 11.00722972972973, "grad_norm": 0.016335291787981987, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3470 }, { "epoch": 11.007263513513514, "grad_norm": 42.96317672729492, "learning_rate": 1.25e-05, "loss": 0.5012, "step": 3471 }, { "epoch": 11.007297297297297, "grad_norm": 0.004633774049580097, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3472 }, { "epoch": 11.00733108108108, "grad_norm": 30.321666717529297, "learning_rate": 1.25e-05, "loss": 0.1124, "step": 3473 }, { "epoch": 11.007364864864865, "grad_norm": 16.277673721313477, "learning_rate": 1.25e-05, "loss": 0.9798, "step": 3474 }, { "epoch": 11.007398648648648, "grad_norm": 0.013802184723317623, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3475 }, { "epoch": 11.007432432432433, "grad_norm": 0.032790157943964005, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3476 }, { "epoch": 11.007466216216216, "grad_norm": 58.091739654541016, "learning_rate": 1.25e-05, "loss": 0.3522, "step": 3477 }, { "epoch": 11.0075, "grad_norm": 0.6967940330505371, "learning_rate": 1.25e-05, "loss": 0.0058, "step": 3478 }, { "epoch": 11.007533783783783, "grad_norm": 1.0264331102371216, "learning_rate": 1.25e-05, "loss": 0.0075, "step": 3479 }, { "epoch": 11.007567567567568, "grad_norm": 0.07033387571573257, "learning_rate": 1.25e-05, "loss": 0.0015, "step": 3480 }, { "epoch": 11.00760135135135, "grad_norm": 0.04914206638932228, "learning_rate": 1.25e-05, "loss": 0.0007, "step": 3481 }, { "epoch": 11.007635135135136, "grad_norm": 0.55922532081604, "learning_rate": 1.25e-05, "loss": 0.0113, "step": 3482 }, { "epoch": 11.007668918918919, "grad_norm": 23.378345489501953, "learning_rate": 1.25e-05, "loss": 0.1578, "step": 3483 }, { "epoch": 11.007702702702703, "grad_norm": 0.10510695725679398, "learning_rate": 1.25e-05, "loss": 0.0037, "step": 3484 }, { "epoch": 11.007736486486486, "grad_norm": 0.04631996899843216, "learning_rate": 1.25e-05, "loss": 0.0007, "step": 3485 }, { "epoch": 11.00777027027027, "grad_norm": 0.011409216560423374, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3486 }, { "epoch": 11.007804054054054, "grad_norm": 0.005132983438670635, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3487 }, { "epoch": 11.007837837837839, "grad_norm": 0.03812238946557045, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3488 }, { "epoch": 11.007871621621621, "grad_norm": 0.1528882086277008, "learning_rate": 1.25e-05, "loss": 0.0011, "step": 3489 }, { "epoch": 11.007905405405406, "grad_norm": 0.012356969527900219, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3490 }, { "epoch": 11.00793918918919, "grad_norm": 1.4733535051345825, "learning_rate": 1.25e-05, "loss": 0.006, "step": 3491 }, { "epoch": 11.007972972972974, "grad_norm": 30.595584869384766, "learning_rate": 1.25e-05, "loss": 0.1126, "step": 3492 }, { "epoch": 11.008006756756757, "grad_norm": 22.001142501831055, "learning_rate": 1.25e-05, "loss": 0.4217, "step": 3493 }, { "epoch": 11.00804054054054, "grad_norm": 0.0031901535112410784, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3494 }, { "epoch": 11.008074324324324, "grad_norm": 3.3610305786132812, "learning_rate": 1.25e-05, "loss": 0.4943, "step": 3495 }, { "epoch": 11.008108108108107, "grad_norm": 0.010990919545292854, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3496 }, { "epoch": 11.008141891891892, "grad_norm": 3.958954334259033, "learning_rate": 1.25e-05, "loss": 0.1927, "step": 3497 }, { "epoch": 11.008175675675675, "grad_norm": 0.006239024456590414, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3498 }, { "epoch": 11.00820945945946, "grad_norm": 0.8937502503395081, "learning_rate": 1.25e-05, "loss": 0.0281, "step": 3499 }, { "epoch": 11.008243243243243, "grad_norm": 9.472599029541016, "learning_rate": 1.25e-05, "loss": 0.0439, "step": 3500 }, { "epoch": 11.008277027027027, "grad_norm": 9.526905059814453, "learning_rate": 1.25e-05, "loss": 0.5137, "step": 3501 }, { "epoch": 11.00831081081081, "grad_norm": 0.042067643254995346, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3502 }, { "epoch": 11.008344594594595, "grad_norm": 3.888303518295288, "learning_rate": 1.25e-05, "loss": 0.1815, "step": 3503 }, { "epoch": 11.008378378378378, "grad_norm": 7.1342597007751465, "learning_rate": 1.25e-05, "loss": 0.0268, "step": 3504 }, { "epoch": 11.008412162162163, "grad_norm": 0.0018993025878444314, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3505 }, { "epoch": 11.008445945945946, "grad_norm": 11.921996116638184, "learning_rate": 1.25e-05, "loss": 0.378, "step": 3506 }, { "epoch": 11.00847972972973, "grad_norm": 0.11769474297761917, "learning_rate": 1.25e-05, "loss": 0.002, "step": 3507 }, { "epoch": 11.008513513513513, "grad_norm": 0.01786581613123417, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3508 }, { "epoch": 11.008547297297298, "grad_norm": 0.027425948530435562, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 3509 }, { "epoch": 11.008581081081081, "grad_norm": 4.553204536437988, "learning_rate": 1.25e-05, "loss": 0.3544, "step": 3510 }, { "epoch": 11.008614864864866, "grad_norm": 9.054766654968262, "learning_rate": 1.25e-05, "loss": 0.2015, "step": 3511 }, { "epoch": 11.008648648648649, "grad_norm": 0.007903174497187138, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3512 }, { "epoch": 11.008682432432433, "grad_norm": 27.23093605041504, "learning_rate": 1.25e-05, "loss": 0.1432, "step": 3513 }, { "epoch": 11.008716216216216, "grad_norm": 0.29224157333374023, "learning_rate": 1.25e-05, "loss": 0.0014, "step": 3514 }, { "epoch": 11.00875, "grad_norm": 0.44808948040008545, "learning_rate": 1.25e-05, "loss": 0.0023, "step": 3515 }, { "epoch": 11.008783783783784, "grad_norm": 0.01603774167597294, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3516 }, { "epoch": 11.008817567567567, "grad_norm": 7.867884635925293, "learning_rate": 1.25e-05, "loss": 0.199, "step": 3517 }, { "epoch": 11.008851351351352, "grad_norm": 0.002177076181396842, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3518 }, { "epoch": 11.008885135135134, "grad_norm": 0.1240440234541893, "learning_rate": 1.25e-05, "loss": 0.0046, "step": 3519 }, { "epoch": 11.00891891891892, "grad_norm": 0.010677117854356766, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3520 }, { "epoch": 11.008952702702702, "grad_norm": 0.029463138431310654, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3521 }, { "epoch": 11.008986486486487, "grad_norm": 0.0016716273967176676, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3522 }, { "epoch": 11.00902027027027, "grad_norm": 0.016748782247304916, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3523 }, { "epoch": 11.009054054054054, "grad_norm": 28.620128631591797, "learning_rate": 1.25e-05, "loss": 0.0807, "step": 3524 }, { "epoch": 11.009087837837837, "grad_norm": 8.1410493850708, "learning_rate": 1.25e-05, "loss": 0.0639, "step": 3525 }, { "epoch": 11.009121621621622, "grad_norm": 0.5088281631469727, "learning_rate": 1.25e-05, "loss": 0.0066, "step": 3526 }, { "epoch": 11.009155405405405, "grad_norm": 0.004095427691936493, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3527 }, { "epoch": 11.00918918918919, "grad_norm": 2.6097512245178223, "learning_rate": 1.25e-05, "loss": 0.092, "step": 3528 }, { "epoch": 11.009222972972973, "grad_norm": 0.10589950531721115, "learning_rate": 1.25e-05, "loss": 0.0016, "step": 3529 }, { "epoch": 11.009256756756757, "grad_norm": 0.004012349992990494, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3530 }, { "epoch": 11.00929054054054, "grad_norm": 2.25777530670166, "learning_rate": 1.25e-05, "loss": 0.032, "step": 3531 }, { "epoch": 11.009324324324325, "grad_norm": 5.900925636291504, "learning_rate": 1.25e-05, "loss": 0.1378, "step": 3532 }, { "epoch": 11.009358108108108, "grad_norm": 0.1261860430240631, "learning_rate": 1.25e-05, "loss": 0.0048, "step": 3533 }, { "epoch": 11.009391891891893, "grad_norm": 10.606229782104492, "learning_rate": 1.25e-05, "loss": 0.1807, "step": 3534 }, { "epoch": 11.009425675675676, "grad_norm": 27.88006591796875, "learning_rate": 1.25e-05, "loss": 0.2136, "step": 3535 }, { "epoch": 11.009459459459459, "grad_norm": 0.14012812077999115, "learning_rate": 1.25e-05, "loss": 0.0012, "step": 3536 }, { "epoch": 11.009493243243243, "grad_norm": 0.009808924049139023, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3537 }, { "epoch": 11.009527027027026, "grad_norm": 0.003124311799183488, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3538 }, { "epoch": 11.009560810810811, "grad_norm": 0.04070565104484558, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3539 }, { "epoch": 11.009594594594594, "grad_norm": 0.005540458485484123, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3540 }, { "epoch": 11.009628378378379, "grad_norm": 14.416398048400879, "learning_rate": 1.25e-05, "loss": 0.0802, "step": 3541 }, { "epoch": 11.009662162162162, "grad_norm": 10.480488777160645, "learning_rate": 1.25e-05, "loss": 0.9172, "step": 3542 }, { "epoch": 11.009695945945946, "grad_norm": 3.1812736988067627, "learning_rate": 1.25e-05, "loss": 0.0084, "step": 3543 }, { "epoch": 11.00972972972973, "grad_norm": 40.04827880859375, "learning_rate": 1.25e-05, "loss": 0.2307, "step": 3544 }, { "epoch": 11.009763513513514, "grad_norm": 0.3768523931503296, "learning_rate": 1.25e-05, "loss": 0.0095, "step": 3545 }, { "epoch": 11.009797297297297, "grad_norm": 0.10946261137723923, "learning_rate": 1.25e-05, "loss": 0.0018, "step": 3546 }, { "epoch": 11.009831081081082, "grad_norm": 1.6002811193466187, "learning_rate": 1.25e-05, "loss": 0.0045, "step": 3547 }, { "epoch": 11.009864864864864, "grad_norm": 0.008697571232914925, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3548 }, { "epoch": 11.00989864864865, "grad_norm": 3.4573891162872314, "learning_rate": 1.25e-05, "loss": 0.3927, "step": 3549 }, { "epoch": 11.009932432432432, "grad_norm": 37.91227340698242, "learning_rate": 1.25e-05, "loss": 0.1577, "step": 3550 }, { "epoch": 11.009966216216217, "grad_norm": 2.2894110679626465, "learning_rate": 1.25e-05, "loss": 0.0094, "step": 3551 }, { "epoch": 11.01, "grad_norm": 0.2506448030471802, "learning_rate": 1.25e-05, "loss": 0.002, "step": 3552 }, { "epoch": 11.01, "eval_accuracy": 0.8820678513731826, "eval_loss": 0.5015593767166138, "eval_runtime": 32.2167, "eval_samples_per_second": 19.214, "eval_steps_per_second": 2.421, "step": 3552 }, { "epoch": 12.000033783783783, "grad_norm": 36.25469207763672, "learning_rate": 1.25e-05, "loss": 1.0936, "step": 3553 }, { "epoch": 12.000067567567568, "grad_norm": 0.0022030624095350504, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3554 }, { "epoch": 12.00010135135135, "grad_norm": 21.953636169433594, "learning_rate": 1.25e-05, "loss": 0.1371, "step": 3555 }, { "epoch": 12.000135135135135, "grad_norm": 11.556265830993652, "learning_rate": 1.25e-05, "loss": 0.1147, "step": 3556 }, { "epoch": 12.000168918918918, "grad_norm": 31.103544235229492, "learning_rate": 1.25e-05, "loss": 0.5126, "step": 3557 }, { "epoch": 12.000202702702703, "grad_norm": 0.018564878031611443, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3558 }, { "epoch": 12.000236486486486, "grad_norm": 0.02404961548745632, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 3559 }, { "epoch": 12.00027027027027, "grad_norm": 0.2857505977153778, "learning_rate": 1.25e-05, "loss": 0.0021, "step": 3560 }, { "epoch": 12.000304054054054, "grad_norm": 0.3915451169013977, "learning_rate": 1.25e-05, "loss": 0.003, "step": 3561 }, { "epoch": 12.000337837837838, "grad_norm": 0.03203626349568367, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3562 }, { "epoch": 12.000371621621621, "grad_norm": 13.794657707214355, "learning_rate": 1.25e-05, "loss": 0.0403, "step": 3563 }, { "epoch": 12.000405405405406, "grad_norm": 0.013354547321796417, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3564 }, { "epoch": 12.000439189189189, "grad_norm": 0.5069895386695862, "learning_rate": 1.25e-05, "loss": 0.0024, "step": 3565 }, { "epoch": 12.000472972972974, "grad_norm": 2.317856788635254, "learning_rate": 1.25e-05, "loss": 0.1426, "step": 3566 }, { "epoch": 12.000506756756756, "grad_norm": 0.0837581604719162, "learning_rate": 1.25e-05, "loss": 0.0011, "step": 3567 }, { "epoch": 12.000540540540541, "grad_norm": 0.006555377040058374, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3568 }, { "epoch": 12.000574324324324, "grad_norm": 2.040827751159668, "learning_rate": 1.25e-05, "loss": 0.056, "step": 3569 }, { "epoch": 12.000608108108109, "grad_norm": 0.01389242522418499, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3570 }, { "epoch": 12.000641891891892, "grad_norm": 0.004998476710170507, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3571 }, { "epoch": 12.000675675675677, "grad_norm": 0.045389801263809204, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 3572 }, { "epoch": 12.00070945945946, "grad_norm": 0.07452188432216644, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 3573 }, { "epoch": 12.000743243243242, "grad_norm": 12.635254859924316, "learning_rate": 1.25e-05, "loss": 0.3541, "step": 3574 }, { "epoch": 12.000777027027027, "grad_norm": 4.285444736480713, "learning_rate": 1.25e-05, "loss": 0.5265, "step": 3575 }, { "epoch": 12.00081081081081, "grad_norm": 0.17517438530921936, "learning_rate": 1.25e-05, "loss": 0.0068, "step": 3576 }, { "epoch": 12.000844594594595, "grad_norm": 0.003227874170988798, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3577 }, { "epoch": 12.000878378378378, "grad_norm": 7.795268535614014, "learning_rate": 1.25e-05, "loss": 0.0909, "step": 3578 }, { "epoch": 12.000912162162162, "grad_norm": 0.0021425678860396147, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3579 }, { "epoch": 12.000945945945945, "grad_norm": 0.13404403626918793, "learning_rate": 1.25e-05, "loss": 0.0022, "step": 3580 }, { "epoch": 12.00097972972973, "grad_norm": 0.013674803078174591, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3581 }, { "epoch": 12.001013513513513, "grad_norm": 0.02700851857662201, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 3582 }, { "epoch": 12.001047297297298, "grad_norm": 0.038223013281822205, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 3583 }, { "epoch": 12.00108108108108, "grad_norm": 3.2804958820343018, "learning_rate": 1.25e-05, "loss": 0.3842, "step": 3584 }, { "epoch": 12.001114864864865, "grad_norm": 4.118834972381592, "learning_rate": 1.25e-05, "loss": 0.3923, "step": 3585 }, { "epoch": 12.001148648648648, "grad_norm": 33.71422576904297, "learning_rate": 1.25e-05, "loss": 0.5492, "step": 3586 }, { "epoch": 12.001182432432433, "grad_norm": 0.114996537566185, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 3587 }, { "epoch": 12.001216216216216, "grad_norm": 0.017146751284599304, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3588 }, { "epoch": 12.00125, "grad_norm": 0.042720939964056015, "learning_rate": 1.25e-05, "loss": 0.0012, "step": 3589 }, { "epoch": 12.001283783783784, "grad_norm": 0.03873806074261665, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3590 }, { "epoch": 12.001317567567568, "grad_norm": 0.40498682856559753, "learning_rate": 1.25e-05, "loss": 0.0019, "step": 3591 }, { "epoch": 12.001351351351351, "grad_norm": 8.52524471282959, "learning_rate": 1.25e-05, "loss": 0.4527, "step": 3592 }, { "epoch": 12.001385135135136, "grad_norm": 0.013273733668029308, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3593 }, { "epoch": 12.001418918918919, "grad_norm": 0.010875946842133999, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3594 }, { "epoch": 12.001452702702704, "grad_norm": 31.112873077392578, "learning_rate": 1.25e-05, "loss": 0.3425, "step": 3595 }, { "epoch": 12.001486486486487, "grad_norm": 32.28062057495117, "learning_rate": 1.25e-05, "loss": 0.1783, "step": 3596 }, { "epoch": 12.00152027027027, "grad_norm": 0.09438122808933258, "learning_rate": 1.25e-05, "loss": 0.0016, "step": 3597 }, { "epoch": 12.001554054054054, "grad_norm": 0.1942460536956787, "learning_rate": 1.25e-05, "loss": 0.0045, "step": 3598 }, { "epoch": 12.001587837837837, "grad_norm": 0.05329737067222595, "learning_rate": 1.25e-05, "loss": 0.0007, "step": 3599 }, { "epoch": 12.001621621621622, "grad_norm": 0.03292876109480858, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 3600 }, { "epoch": 12.001655405405405, "grad_norm": 0.5599991679191589, "learning_rate": 1.25e-05, "loss": 0.0029, "step": 3601 }, { "epoch": 12.00168918918919, "grad_norm": 0.1315743625164032, "learning_rate": 1.25e-05, "loss": 0.0032, "step": 3602 }, { "epoch": 12.001722972972972, "grad_norm": 1.833126425743103, "learning_rate": 1.25e-05, "loss": 0.0145, "step": 3603 }, { "epoch": 12.001756756756757, "grad_norm": 0.00424657016992569, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3604 }, { "epoch": 12.00179054054054, "grad_norm": 0.13161614537239075, "learning_rate": 1.25e-05, "loss": 0.0017, "step": 3605 }, { "epoch": 12.001824324324325, "grad_norm": 0.029594503343105316, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 3606 }, { "epoch": 12.001858108108108, "grad_norm": 0.03392188996076584, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3607 }, { "epoch": 12.001891891891892, "grad_norm": 0.1369764357805252, "learning_rate": 1.25e-05, "loss": 0.0017, "step": 3608 }, { "epoch": 12.001925675675675, "grad_norm": 0.09594093263149261, "learning_rate": 1.25e-05, "loss": 0.0012, "step": 3609 }, { "epoch": 12.00195945945946, "grad_norm": 17.597278594970703, "learning_rate": 1.25e-05, "loss": 0.1162, "step": 3610 }, { "epoch": 12.001993243243243, "grad_norm": 0.14388509094715118, "learning_rate": 1.25e-05, "loss": 0.0028, "step": 3611 }, { "epoch": 12.002027027027028, "grad_norm": 0.005952046252787113, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3612 }, { "epoch": 12.00206081081081, "grad_norm": 1.7262202501296997, "learning_rate": 1.25e-05, "loss": 0.018, "step": 3613 }, { "epoch": 12.002094594594595, "grad_norm": 0.7082629203796387, "learning_rate": 1.25e-05, "loss": 0.0066, "step": 3614 }, { "epoch": 12.002128378378378, "grad_norm": 0.6347145438194275, "learning_rate": 1.25e-05, "loss": 0.0247, "step": 3615 }, { "epoch": 12.002162162162163, "grad_norm": 0.012011101469397545, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3616 }, { "epoch": 12.002195945945946, "grad_norm": 10.61294937133789, "learning_rate": 1.25e-05, "loss": 0.9635, "step": 3617 }, { "epoch": 12.002229729729729, "grad_norm": 0.012679050676524639, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3618 }, { "epoch": 12.002263513513514, "grad_norm": 0.0028965857345610857, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3619 }, { "epoch": 12.002297297297297, "grad_norm": 0.042095981538295746, "learning_rate": 1.25e-05, "loss": 0.0007, "step": 3620 }, { "epoch": 12.002331081081081, "grad_norm": 0.032264452427625656, "learning_rate": 1.25e-05, "loss": 0.0007, "step": 3621 }, { "epoch": 12.002364864864864, "grad_norm": 0.5812802910804749, "learning_rate": 1.25e-05, "loss": 0.0031, "step": 3622 }, { "epoch": 12.002398648648649, "grad_norm": 0.07800829410552979, "learning_rate": 1.25e-05, "loss": 0.0013, "step": 3623 }, { "epoch": 12.002432432432432, "grad_norm": 0.009579607285559177, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3624 }, { "epoch": 12.002466216216217, "grad_norm": 0.044350411742925644, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 3625 }, { "epoch": 12.0025, "grad_norm": 0.004029865842312574, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3626 }, { "epoch": 12.002533783783784, "grad_norm": 0.01190253160893917, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3627 }, { "epoch": 12.002567567567567, "grad_norm": 0.20034432411193848, "learning_rate": 1.25e-05, "loss": 0.0012, "step": 3628 }, { "epoch": 12.002601351351352, "grad_norm": 29.149391174316406, "learning_rate": 1.25e-05, "loss": 0.3415, "step": 3629 }, { "epoch": 12.002635135135135, "grad_norm": 0.007590669207274914, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3630 }, { "epoch": 12.00266891891892, "grad_norm": 26.552522659301758, "learning_rate": 1.25e-05, "loss": 0.1131, "step": 3631 }, { "epoch": 12.002702702702702, "grad_norm": 51.920310974121094, "learning_rate": 1.25e-05, "loss": 0.2267, "step": 3632 }, { "epoch": 12.002736486486487, "grad_norm": 0.5184452533721924, "learning_rate": 1.25e-05, "loss": 0.0053, "step": 3633 }, { "epoch": 12.00277027027027, "grad_norm": 4.662822723388672, "learning_rate": 1.25e-05, "loss": 0.0837, "step": 3634 }, { "epoch": 12.002804054054055, "grad_norm": 0.08590547740459442, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 3635 }, { "epoch": 12.002837837837838, "grad_norm": 0.03403434529900551, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3636 }, { "epoch": 12.002871621621622, "grad_norm": 0.05241626873612404, "learning_rate": 1.25e-05, "loss": 0.0007, "step": 3637 }, { "epoch": 12.002905405405405, "grad_norm": 0.2630750834941864, "learning_rate": 1.25e-05, "loss": 0.0099, "step": 3638 }, { "epoch": 12.002939189189188, "grad_norm": 3.5877342224121094, "learning_rate": 1.25e-05, "loss": 0.3225, "step": 3639 }, { "epoch": 12.002972972972973, "grad_norm": 0.015452769584953785, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3640 }, { "epoch": 12.003006756756756, "grad_norm": 0.004425662104040384, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3641 }, { "epoch": 12.00304054054054, "grad_norm": 0.35484153032302856, "learning_rate": 1.25e-05, "loss": 0.0137, "step": 3642 }, { "epoch": 12.003074324324324, "grad_norm": 4.354015827178955, "learning_rate": 1.25e-05, "loss": 0.05, "step": 3643 }, { "epoch": 12.003108108108108, "grad_norm": 0.015842532739043236, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3644 }, { "epoch": 12.003141891891891, "grad_norm": 0.3809446394443512, "learning_rate": 1.25e-05, "loss": 0.0188, "step": 3645 }, { "epoch": 12.003175675675676, "grad_norm": 0.014462895691394806, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3646 }, { "epoch": 12.003209459459459, "grad_norm": 0.005910204257816076, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3647 }, { "epoch": 12.003243243243244, "grad_norm": 0.14750303328037262, "learning_rate": 1.25e-05, "loss": 0.0025, "step": 3648 }, { "epoch": 12.003277027027027, "grad_norm": 0.03958170861005783, "learning_rate": 1.25e-05, "loss": 0.0011, "step": 3649 }, { "epoch": 12.003310810810811, "grad_norm": 0.0030852695927023888, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3650 }, { "epoch": 12.003344594594594, "grad_norm": 4.776733875274658, "learning_rate": 1.25e-05, "loss": 0.2438, "step": 3651 }, { "epoch": 12.003378378378379, "grad_norm": 0.009334038943052292, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3652 }, { "epoch": 12.003412162162162, "grad_norm": 0.0087091363966465, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3653 }, { "epoch": 12.003445945945947, "grad_norm": 1.2503058910369873, "learning_rate": 1.25e-05, "loss": 0.016, "step": 3654 }, { "epoch": 12.00347972972973, "grad_norm": 0.09262052178382874, "learning_rate": 1.25e-05, "loss": 0.0007, "step": 3655 }, { "epoch": 12.003513513513514, "grad_norm": 18.93301773071289, "learning_rate": 1.25e-05, "loss": 0.0666, "step": 3656 }, { "epoch": 12.003547297297297, "grad_norm": 0.007853791117668152, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3657 }, { "epoch": 12.003581081081082, "grad_norm": 0.19575035572052002, "learning_rate": 1.25e-05, "loss": 0.0074, "step": 3658 }, { "epoch": 12.003614864864865, "grad_norm": 0.007743285037577152, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3659 }, { "epoch": 12.003648648648648, "grad_norm": 20.052043914794922, "learning_rate": 1.25e-05, "loss": 0.7571, "step": 3660 }, { "epoch": 12.003682432432432, "grad_norm": 34.5831298828125, "learning_rate": 1.25e-05, "loss": 0.1901, "step": 3661 }, { "epoch": 12.003716216216215, "grad_norm": 0.04248589277267456, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 3662 }, { "epoch": 12.00375, "grad_norm": 0.3486841917037964, "learning_rate": 1.25e-05, "loss": 0.0133, "step": 3663 }, { "epoch": 12.003783783783783, "grad_norm": 0.07160200923681259, "learning_rate": 1.25e-05, "loss": 0.001, "step": 3664 }, { "epoch": 12.003817567567568, "grad_norm": 0.11263536661863327, "learning_rate": 1.25e-05, "loss": 0.001, "step": 3665 }, { "epoch": 12.00385135135135, "grad_norm": 16.72862434387207, "learning_rate": 1.25e-05, "loss": 0.2485, "step": 3666 }, { "epoch": 12.003885135135135, "grad_norm": 0.1541857272386551, "learning_rate": 1.25e-05, "loss": 0.0059, "step": 3667 }, { "epoch": 12.003918918918918, "grad_norm": 0.23894299566745758, "learning_rate": 1.25e-05, "loss": 0.009, "step": 3668 }, { "epoch": 12.003952702702703, "grad_norm": 0.1134490817785263, "learning_rate": 1.25e-05, "loss": 0.0011, "step": 3669 }, { "epoch": 12.003986486486486, "grad_norm": 20.230436325073242, "learning_rate": 1.25e-05, "loss": 0.5422, "step": 3670 }, { "epoch": 12.00402027027027, "grad_norm": 0.4345981180667877, "learning_rate": 1.25e-05, "loss": 0.0033, "step": 3671 }, { "epoch": 12.004054054054054, "grad_norm": 0.005686194635927677, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3672 }, { "epoch": 12.004087837837838, "grad_norm": 5.1447319984436035, "learning_rate": 1.25e-05, "loss": 0.0623, "step": 3673 }, { "epoch": 12.004121621621621, "grad_norm": 0.006181106436997652, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3674 }, { "epoch": 12.004155405405406, "grad_norm": 0.3040359616279602, "learning_rate": 1.25e-05, "loss": 0.005, "step": 3675 }, { "epoch": 12.004189189189189, "grad_norm": 0.0033749821595847607, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3676 }, { "epoch": 12.004222972972974, "grad_norm": 0.9434853792190552, "learning_rate": 1.25e-05, "loss": 0.003, "step": 3677 }, { "epoch": 12.004256756756757, "grad_norm": 0.02554657869040966, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3678 }, { "epoch": 12.004290540540541, "grad_norm": 5.222944736480713, "learning_rate": 1.25e-05, "loss": 0.0202, "step": 3679 }, { "epoch": 12.004324324324324, "grad_norm": 0.045042313635349274, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 3680 }, { "epoch": 12.004358108108109, "grad_norm": 0.008363412693142891, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3681 }, { "epoch": 12.004391891891892, "grad_norm": 0.008187299594283104, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3682 }, { "epoch": 12.004425675675675, "grad_norm": 0.016748929396271706, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3683 }, { "epoch": 12.00445945945946, "grad_norm": 0.07567288726568222, "learning_rate": 1.25e-05, "loss": 0.0013, "step": 3684 }, { "epoch": 12.004493243243243, "grad_norm": 0.03393532335758209, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 3685 }, { "epoch": 12.004527027027027, "grad_norm": 0.2215471863746643, "learning_rate": 1.25e-05, "loss": 0.0017, "step": 3686 }, { "epoch": 12.00456081081081, "grad_norm": 39.34383010864258, "learning_rate": 1.25e-05, "loss": 0.2613, "step": 3687 }, { "epoch": 12.004594594594595, "grad_norm": 0.03420770540833473, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 3688 }, { "epoch": 12.004628378378378, "grad_norm": 0.011737702414393425, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3689 }, { "epoch": 12.004662162162163, "grad_norm": 0.010968134738504887, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3690 }, { "epoch": 12.004695945945945, "grad_norm": 3.972355842590332, "learning_rate": 1.25e-05, "loss": 0.037, "step": 3691 }, { "epoch": 12.00472972972973, "grad_norm": 0.011282084509730339, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3692 }, { "epoch": 12.004763513513513, "grad_norm": 29.526229858398438, "learning_rate": 1.25e-05, "loss": 0.6922, "step": 3693 }, { "epoch": 12.004797297297298, "grad_norm": 0.08107402175664902, "learning_rate": 1.25e-05, "loss": 0.0011, "step": 3694 }, { "epoch": 12.00483108108108, "grad_norm": 9.8879976272583, "learning_rate": 1.25e-05, "loss": 0.8894, "step": 3695 }, { "epoch": 12.004864864864865, "grad_norm": 0.08573463559150696, "learning_rate": 1.25e-05, "loss": 0.0011, "step": 3696 }, { "epoch": 12.004898648648648, "grad_norm": 0.002207590965554118, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3697 }, { "epoch": 12.004932432432433, "grad_norm": 0.04957452788949013, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 3698 }, { "epoch": 12.004966216216216, "grad_norm": 9.214609146118164, "learning_rate": 1.25e-05, "loss": 0.0279, "step": 3699 }, { "epoch": 12.005, "grad_norm": 0.28952255845069885, "learning_rate": 1.25e-05, "loss": 0.0011, "step": 3700 }, { "epoch": 12.005033783783784, "grad_norm": 3.337609052658081, "learning_rate": 1.25e-05, "loss": 0.434, "step": 3701 }, { "epoch": 12.005067567567568, "grad_norm": 0.08537266403436661, "learning_rate": 1.25e-05, "loss": 0.0015, "step": 3702 }, { "epoch": 12.005101351351351, "grad_norm": 0.024208616465330124, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3703 }, { "epoch": 12.005135135135134, "grad_norm": 0.006949591916054487, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3704 }, { "epoch": 12.005168918918919, "grad_norm": 28.421066284179688, "learning_rate": 1.25e-05, "loss": 0.689, "step": 3705 }, { "epoch": 12.005202702702702, "grad_norm": 0.24232590198516846, "learning_rate": 1.25e-05, "loss": 0.0027, "step": 3706 }, { "epoch": 12.005236486486487, "grad_norm": 5.616876602172852, "learning_rate": 1.25e-05, "loss": 0.0273, "step": 3707 }, { "epoch": 12.00527027027027, "grad_norm": 0.047877244651317596, "learning_rate": 1.25e-05, "loss": 0.0012, "step": 3708 }, { "epoch": 12.005304054054054, "grad_norm": 0.015789516270160675, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3709 }, { "epoch": 12.005337837837837, "grad_norm": 0.003996913321316242, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3710 }, { "epoch": 12.005371621621622, "grad_norm": 4.294330596923828, "learning_rate": 1.25e-05, "loss": 0.0205, "step": 3711 }, { "epoch": 12.005405405405405, "grad_norm": 1.4031691551208496, "learning_rate": 1.25e-05, "loss": 0.0516, "step": 3712 }, { "epoch": 12.00543918918919, "grad_norm": 0.5278617739677429, "learning_rate": 1.25e-05, "loss": 0.0132, "step": 3713 }, { "epoch": 12.005472972972973, "grad_norm": 0.01592372916638851, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3714 }, { "epoch": 12.005506756756757, "grad_norm": 0.2685205042362213, "learning_rate": 1.25e-05, "loss": 0.0097, "step": 3715 }, { "epoch": 12.00554054054054, "grad_norm": 6.117109775543213, "learning_rate": 1.25e-05, "loss": 0.1763, "step": 3716 }, { "epoch": 12.005574324324325, "grad_norm": 0.04533051699399948, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 3717 }, { "epoch": 12.005608108108108, "grad_norm": 0.12906628847122192, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 3718 }, { "epoch": 12.005641891891893, "grad_norm": 6.002627849578857, "learning_rate": 1.25e-05, "loss": 0.2011, "step": 3719 }, { "epoch": 12.005675675675676, "grad_norm": 0.02744182199239731, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3720 }, { "epoch": 12.00570945945946, "grad_norm": 10.232691764831543, "learning_rate": 1.25e-05, "loss": 0.6493, "step": 3721 }, { "epoch": 12.005743243243243, "grad_norm": 0.03570159897208214, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3722 }, { "epoch": 12.005777027027028, "grad_norm": 21.476139068603516, "learning_rate": 1.25e-05, "loss": 0.462, "step": 3723 }, { "epoch": 12.00581081081081, "grad_norm": 9.412187576293945, "learning_rate": 1.25e-05, "loss": 0.0305, "step": 3724 }, { "epoch": 12.005844594594594, "grad_norm": 0.005317401606589556, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3725 }, { "epoch": 12.005878378378378, "grad_norm": 0.013934164308011532, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3726 }, { "epoch": 12.005912162162161, "grad_norm": 0.020638879388570786, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 3727 }, { "epoch": 12.005945945945946, "grad_norm": 0.0038459566421806812, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3728 }, { "epoch": 12.005979729729729, "grad_norm": 0.004107790067791939, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3729 }, { "epoch": 12.006013513513514, "grad_norm": 18.161415100097656, "learning_rate": 1.25e-05, "loss": 0.1858, "step": 3730 }, { "epoch": 12.006047297297297, "grad_norm": 0.07372258603572845, "learning_rate": 1.25e-05, "loss": 0.0011, "step": 3731 }, { "epoch": 12.006081081081081, "grad_norm": 0.008590003475546837, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3732 }, { "epoch": 12.006114864864864, "grad_norm": 0.0708153247833252, "learning_rate": 1.25e-05, "loss": 0.0014, "step": 3733 }, { "epoch": 12.006148648648649, "grad_norm": 0.00497641833499074, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3734 }, { "epoch": 12.006182432432432, "grad_norm": 15.466398239135742, "learning_rate": 1.25e-05, "loss": 0.4438, "step": 3735 }, { "epoch": 12.006216216216217, "grad_norm": 0.009896554052829742, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3736 }, { "epoch": 12.00625, "grad_norm": 0.014395921491086483, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3737 }, { "epoch": 12.006283783783784, "grad_norm": 0.7729188799858093, "learning_rate": 1.25e-05, "loss": 0.0076, "step": 3738 }, { "epoch": 12.006317567567567, "grad_norm": 1.2631686925888062, "learning_rate": 1.25e-05, "loss": 0.0032, "step": 3739 }, { "epoch": 12.006351351351352, "grad_norm": 7.060413837432861, "learning_rate": 1.25e-05, "loss": 0.4427, "step": 3740 }, { "epoch": 12.006385135135135, "grad_norm": 6.257072925567627, "learning_rate": 1.25e-05, "loss": 0.244, "step": 3741 }, { "epoch": 12.00641891891892, "grad_norm": 0.17216549813747406, "learning_rate": 1.25e-05, "loss": 0.0064, "step": 3742 }, { "epoch": 12.006452702702703, "grad_norm": 0.006521416828036308, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3743 }, { "epoch": 12.006486486486487, "grad_norm": 0.00570320151746273, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3744 }, { "epoch": 12.00652027027027, "grad_norm": 3.071434497833252, "learning_rate": 1.25e-05, "loss": 0.2298, "step": 3745 }, { "epoch": 12.006554054054053, "grad_norm": 3.6165449619293213, "learning_rate": 1.25e-05, "loss": 0.3386, "step": 3746 }, { "epoch": 12.006587837837838, "grad_norm": 0.007962055504322052, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3747 }, { "epoch": 12.00662162162162, "grad_norm": 46.04301071166992, "learning_rate": 1.25e-05, "loss": 0.3288, "step": 3748 }, { "epoch": 12.006655405405406, "grad_norm": 0.06398668885231018, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 3749 }, { "epoch": 12.006689189189188, "grad_norm": 0.00471272598952055, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3750 }, { "epoch": 12.006722972972973, "grad_norm": 18.183652877807617, "learning_rate": 1.25e-05, "loss": 0.064, "step": 3751 }, { "epoch": 12.006756756756756, "grad_norm": 1.3855311870574951, "learning_rate": 1.25e-05, "loss": 0.0057, "step": 3752 }, { "epoch": 12.00679054054054, "grad_norm": 0.15191812813282013, "learning_rate": 1.25e-05, "loss": 0.0058, "step": 3753 }, { "epoch": 12.006824324324324, "grad_norm": 0.15922623872756958, "learning_rate": 1.25e-05, "loss": 0.0033, "step": 3754 }, { "epoch": 12.006858108108108, "grad_norm": 0.45582857728004456, "learning_rate": 1.25e-05, "loss": 0.0197, "step": 3755 }, { "epoch": 12.006891891891891, "grad_norm": 1.2033007144927979, "learning_rate": 1.25e-05, "loss": 0.0308, "step": 3756 }, { "epoch": 12.006925675675676, "grad_norm": 0.06794710457324982, "learning_rate": 1.25e-05, "loss": 0.0014, "step": 3757 }, { "epoch": 12.006959459459459, "grad_norm": 0.005288933403789997, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3758 }, { "epoch": 12.006993243243244, "grad_norm": 0.1478545069694519, "learning_rate": 1.25e-05, "loss": 0.0045, "step": 3759 }, { "epoch": 12.007027027027027, "grad_norm": 1.90366792678833, "learning_rate": 1.25e-05, "loss": 0.1162, "step": 3760 }, { "epoch": 12.007060810810811, "grad_norm": 0.5918194055557251, "learning_rate": 1.25e-05, "loss": 0.0084, "step": 3761 }, { "epoch": 12.007094594594594, "grad_norm": 0.014318279922008514, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3762 }, { "epoch": 12.007128378378379, "grad_norm": 0.5242448449134827, "learning_rate": 1.25e-05, "loss": 0.0039, "step": 3763 }, { "epoch": 12.007162162162162, "grad_norm": 0.09515997767448425, "learning_rate": 1.25e-05, "loss": 0.0012, "step": 3764 }, { "epoch": 12.007195945945947, "grad_norm": 0.15176111459732056, "learning_rate": 1.25e-05, "loss": 0.0059, "step": 3765 }, { "epoch": 12.00722972972973, "grad_norm": 0.026396190747618675, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 3766 }, { "epoch": 12.007263513513514, "grad_norm": 0.007894604466855526, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3767 }, { "epoch": 12.007297297297297, "grad_norm": 0.005474018398672342, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3768 }, { "epoch": 12.00733108108108, "grad_norm": 0.05059078708291054, "learning_rate": 1.25e-05, "loss": 0.0011, "step": 3769 }, { "epoch": 12.007364864864865, "grad_norm": 0.003469668095931411, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3770 }, { "epoch": 12.007398648648648, "grad_norm": 0.026357078924775124, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3771 }, { "epoch": 12.007432432432433, "grad_norm": 0.029837526381015778, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3772 }, { "epoch": 12.007466216216216, "grad_norm": 0.011664346791803837, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3773 }, { "epoch": 12.0075, "grad_norm": 7.3081231117248535, "learning_rate": 1.25e-05, "loss": 0.8393, "step": 3774 }, { "epoch": 12.007533783783783, "grad_norm": 0.17154403030872345, "learning_rate": 1.25e-05, "loss": 0.0033, "step": 3775 }, { "epoch": 12.007567567567568, "grad_norm": 0.0037363816518336535, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3776 }, { "epoch": 12.00760135135135, "grad_norm": 0.056357745081186295, "learning_rate": 1.25e-05, "loss": 0.0015, "step": 3777 }, { "epoch": 12.007635135135136, "grad_norm": 0.032848384231328964, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 3778 }, { "epoch": 12.007668918918919, "grad_norm": 0.009779221378266811, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3779 }, { "epoch": 12.007702702702703, "grad_norm": 0.08527000993490219, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3780 }, { "epoch": 12.007736486486486, "grad_norm": 2.5516178607940674, "learning_rate": 1.25e-05, "loss": 0.0086, "step": 3781 }, { "epoch": 12.00777027027027, "grad_norm": 26.196382522583008, "learning_rate": 1.25e-05, "loss": 0.4519, "step": 3782 }, { "epoch": 12.007804054054054, "grad_norm": 15.93067455291748, "learning_rate": 1.25e-05, "loss": 0.6743, "step": 3783 }, { "epoch": 12.007837837837839, "grad_norm": 0.06795913726091385, "learning_rate": 1.25e-05, "loss": 0.0013, "step": 3784 }, { "epoch": 12.007871621621621, "grad_norm": 0.05933257192373276, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 3785 }, { "epoch": 12.007905405405406, "grad_norm": 0.05147884413599968, "learning_rate": 1.25e-05, "loss": 0.0012, "step": 3786 }, { "epoch": 12.00793918918919, "grad_norm": 3.245422124862671, "learning_rate": 1.25e-05, "loss": 0.01, "step": 3787 }, { "epoch": 12.007972972972974, "grad_norm": 0.8875874280929565, "learning_rate": 1.25e-05, "loss": 0.0113, "step": 3788 }, { "epoch": 12.008006756756757, "grad_norm": 0.00411781994625926, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3789 }, { "epoch": 12.00804054054054, "grad_norm": 0.7004573941230774, "learning_rate": 1.25e-05, "loss": 0.0135, "step": 3790 }, { "epoch": 12.008074324324324, "grad_norm": 2.57253098487854, "learning_rate": 1.25e-05, "loss": 0.0377, "step": 3791 }, { "epoch": 12.008108108108107, "grad_norm": 0.02726970985531807, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3792 }, { "epoch": 12.008141891891892, "grad_norm": 1.7442891597747803, "learning_rate": 1.25e-05, "loss": 0.0076, "step": 3793 }, { "epoch": 12.008175675675675, "grad_norm": 0.01661328785121441, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3794 }, { "epoch": 12.00820945945946, "grad_norm": 2.5794870853424072, "learning_rate": 1.25e-05, "loss": 0.2637, "step": 3795 }, { "epoch": 12.008243243243243, "grad_norm": 0.159877210855484, "learning_rate": 1.25e-05, "loss": 0.0046, "step": 3796 }, { "epoch": 12.008277027027027, "grad_norm": 0.04372847452759743, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 3797 }, { "epoch": 12.00831081081081, "grad_norm": 0.006207612343132496, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3798 }, { "epoch": 12.008344594594595, "grad_norm": 1.7069991827011108, "learning_rate": 1.25e-05, "loss": 0.0881, "step": 3799 }, { "epoch": 12.008378378378378, "grad_norm": 0.06023721396923065, "learning_rate": 1.25e-05, "loss": 0.0011, "step": 3800 }, { "epoch": 12.008412162162163, "grad_norm": 0.0500987246632576, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3801 }, { "epoch": 12.008445945945946, "grad_norm": 7.691773414611816, "learning_rate": 1.25e-05, "loss": 0.0153, "step": 3802 }, { "epoch": 12.00847972972973, "grad_norm": 1.8785475492477417, "learning_rate": 1.25e-05, "loss": 0.0895, "step": 3803 }, { "epoch": 12.008513513513513, "grad_norm": 0.15167060494422913, "learning_rate": 1.25e-05, "loss": 0.0034, "step": 3804 }, { "epoch": 12.008547297297298, "grad_norm": 0.0037691278848797083, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3805 }, { "epoch": 12.008581081081081, "grad_norm": 0.02933584898710251, "learning_rate": 1.25e-05, "loss": 0.0007, "step": 3806 }, { "epoch": 12.008614864864866, "grad_norm": 0.22956174612045288, "learning_rate": 1.25e-05, "loss": 0.0086, "step": 3807 }, { "epoch": 12.008648648648649, "grad_norm": 0.006768705788999796, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3808 }, { "epoch": 12.008682432432433, "grad_norm": 0.559873640537262, "learning_rate": 1.25e-05, "loss": 0.0068, "step": 3809 }, { "epoch": 12.008716216216216, "grad_norm": 0.0078055779449641705, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3810 }, { "epoch": 12.00875, "grad_norm": 6.188182353973389, "learning_rate": 1.25e-05, "loss": 0.0945, "step": 3811 }, { "epoch": 12.008783783783784, "grad_norm": 0.039784081280231476, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3812 }, { "epoch": 12.008817567567567, "grad_norm": 40.38760757446289, "learning_rate": 1.25e-05, "loss": 0.5587, "step": 3813 }, { "epoch": 12.008851351351352, "grad_norm": 0.008070399053394794, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3814 }, { "epoch": 12.008885135135134, "grad_norm": 2.1537857055664062, "learning_rate": 1.25e-05, "loss": 0.006, "step": 3815 }, { "epoch": 12.00891891891892, "grad_norm": 1.4832587242126465, "learning_rate": 1.25e-05, "loss": 0.1606, "step": 3816 }, { "epoch": 12.008952702702702, "grad_norm": 30.439708709716797, "learning_rate": 1.25e-05, "loss": 0.1274, "step": 3817 }, { "epoch": 12.008986486486487, "grad_norm": 1.3931385278701782, "learning_rate": 1.25e-05, "loss": 0.0675, "step": 3818 }, { "epoch": 12.00902027027027, "grad_norm": 73.66333770751953, "learning_rate": 1.25e-05, "loss": 1.059, "step": 3819 }, { "epoch": 12.009054054054054, "grad_norm": 1.7356373071670532, "learning_rate": 1.25e-05, "loss": 0.0056, "step": 3820 }, { "epoch": 12.009087837837837, "grad_norm": 0.009703286923468113, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3821 }, { "epoch": 12.009121621621622, "grad_norm": 0.02072809264063835, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3822 }, { "epoch": 12.009155405405405, "grad_norm": 0.0036665969528257847, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3823 }, { "epoch": 12.00918918918919, "grad_norm": 0.16086652874946594, "learning_rate": 1.25e-05, "loss": 0.006, "step": 3824 }, { "epoch": 12.009222972972973, "grad_norm": 0.009910124354064465, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3825 }, { "epoch": 12.009256756756757, "grad_norm": 0.0012766120489686728, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3826 }, { "epoch": 12.00929054054054, "grad_norm": 13.092280387878418, "learning_rate": 1.25e-05, "loss": 0.0569, "step": 3827 }, { "epoch": 12.009324324324325, "grad_norm": 0.011533524841070175, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3828 }, { "epoch": 12.009358108108108, "grad_norm": 0.04550934210419655, "learning_rate": 1.25e-05, "loss": 0.0007, "step": 3829 }, { "epoch": 12.009391891891893, "grad_norm": 2.4159014225006104, "learning_rate": 1.25e-05, "loss": 0.01, "step": 3830 }, { "epoch": 12.009425675675676, "grad_norm": 0.17471621930599213, "learning_rate": 1.25e-05, "loss": 0.0047, "step": 3831 }, { "epoch": 12.009459459459459, "grad_norm": 0.013968714512884617, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3832 }, { "epoch": 12.009493243243243, "grad_norm": 0.3495934009552002, "learning_rate": 1.25e-05, "loss": 0.0087, "step": 3833 }, { "epoch": 12.009527027027026, "grad_norm": 0.0027013386134058237, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3834 }, { "epoch": 12.009560810810811, "grad_norm": 0.1489304006099701, "learning_rate": 1.25e-05, "loss": 0.001, "step": 3835 }, { "epoch": 12.009594594594594, "grad_norm": 0.008701212704181671, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3836 }, { "epoch": 12.009628378378379, "grad_norm": 11.19664192199707, "learning_rate": 1.25e-05, "loss": 0.1749, "step": 3837 }, { "epoch": 12.009662162162162, "grad_norm": 0.008766219019889832, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3838 }, { "epoch": 12.009695945945946, "grad_norm": 0.0017689726082608104, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3839 }, { "epoch": 12.00972972972973, "grad_norm": 5.668590068817139, "learning_rate": 1.25e-05, "loss": 0.3097, "step": 3840 }, { "epoch": 12.009763513513514, "grad_norm": 9.309393882751465, "learning_rate": 1.25e-05, "loss": 0.0313, "step": 3841 }, { "epoch": 12.009797297297297, "grad_norm": 11.938316345214844, "learning_rate": 1.25e-05, "loss": 0.7221, "step": 3842 }, { "epoch": 12.009831081081082, "grad_norm": 0.06815911084413528, "learning_rate": 1.25e-05, "loss": 0.001, "step": 3843 }, { "epoch": 12.009864864864864, "grad_norm": 6.4697113037109375, "learning_rate": 1.25e-05, "loss": 0.0243, "step": 3844 }, { "epoch": 12.00989864864865, "grad_norm": 0.0268834438174963, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3845 }, { "epoch": 12.009932432432432, "grad_norm": 0.025531016290187836, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3846 }, { "epoch": 12.009966216216217, "grad_norm": 0.012648137286305428, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3847 }, { "epoch": 12.01, "grad_norm": 0.13262048363685608, "learning_rate": 1.25e-05, "loss": 0.0048, "step": 3848 }, { "epoch": 12.01, "eval_accuracy": 0.8578352180936996, "eval_loss": 0.6594417095184326, "eval_runtime": 31.7548, "eval_samples_per_second": 19.493, "eval_steps_per_second": 2.456, "step": 3848 }, { "epoch": 13.000033783783783, "grad_norm": 5.334597110748291, "learning_rate": 1.25e-05, "loss": 0.0159, "step": 3849 }, { "epoch": 13.000067567567568, "grad_norm": 2.3893139362335205, "learning_rate": 1.25e-05, "loss": 0.0138, "step": 3850 }, { "epoch": 13.00010135135135, "grad_norm": 1.6546517610549927, "learning_rate": 1.25e-05, "loss": 0.0049, "step": 3851 }, { "epoch": 13.000135135135135, "grad_norm": 0.12448343634605408, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 3852 }, { "epoch": 13.000168918918918, "grad_norm": 6.180261135101318, "learning_rate": 1.25e-05, "loss": 0.093, "step": 3853 }, { "epoch": 13.000202702702703, "grad_norm": 0.005562610924243927, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3854 }, { "epoch": 13.000236486486486, "grad_norm": 0.47218263149261475, "learning_rate": 1.25e-05, "loss": 0.0025, "step": 3855 }, { "epoch": 13.00027027027027, "grad_norm": 0.0050223697908222675, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3856 }, { "epoch": 13.000304054054054, "grad_norm": 0.0366823710501194, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3857 }, { "epoch": 13.000337837837838, "grad_norm": 0.017113851383328438, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3858 }, { "epoch": 13.000371621621621, "grad_norm": 0.13709194958209991, "learning_rate": 1.25e-05, "loss": 0.0042, "step": 3859 }, { "epoch": 13.000405405405406, "grad_norm": 30.81071662902832, "learning_rate": 1.25e-05, "loss": 0.0631, "step": 3860 }, { "epoch": 13.000439189189189, "grad_norm": 0.09452982991933823, "learning_rate": 1.25e-05, "loss": 0.0035, "step": 3861 }, { "epoch": 13.000472972972974, "grad_norm": 0.15021389722824097, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 3862 }, { "epoch": 13.000506756756756, "grad_norm": 0.019868016242980957, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 3863 }, { "epoch": 13.000540540540541, "grad_norm": 9.724809646606445, "learning_rate": 1.25e-05, "loss": 0.5833, "step": 3864 }, { "epoch": 13.000574324324324, "grad_norm": 0.3104464113712311, "learning_rate": 1.25e-05, "loss": 0.0031, "step": 3865 }, { "epoch": 13.000608108108109, "grad_norm": 0.002120718127116561, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3866 }, { "epoch": 13.000641891891892, "grad_norm": 0.009513592347502708, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3867 }, { "epoch": 13.000675675675677, "grad_norm": 0.004542950075119734, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3868 }, { "epoch": 13.00070945945946, "grad_norm": 24.512622833251953, "learning_rate": 1.25e-05, "loss": 0.4002, "step": 3869 }, { "epoch": 13.000743243243242, "grad_norm": 2.734454393386841, "learning_rate": 1.25e-05, "loss": 0.0134, "step": 3870 }, { "epoch": 13.000777027027027, "grad_norm": 0.0032783737406134605, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3871 }, { "epoch": 13.00081081081081, "grad_norm": 0.9012727737426758, "learning_rate": 1.25e-05, "loss": 0.0043, "step": 3872 }, { "epoch": 13.000844594594595, "grad_norm": 0.03100351057946682, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3873 }, { "epoch": 13.000878378378378, "grad_norm": 0.09347531944513321, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3874 }, { "epoch": 13.000912162162162, "grad_norm": 6.5037150382995605, "learning_rate": 1.25e-05, "loss": 0.0533, "step": 3875 }, { "epoch": 13.000945945945945, "grad_norm": 1.0699206590652466, "learning_rate": 1.25e-05, "loss": 0.0059, "step": 3876 }, { "epoch": 13.00097972972973, "grad_norm": 72.80134582519531, "learning_rate": 1.25e-05, "loss": 0.7514, "step": 3877 }, { "epoch": 13.001013513513513, "grad_norm": 0.00333968922495842, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3878 }, { "epoch": 13.001047297297298, "grad_norm": 0.0012859927956014872, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3879 }, { "epoch": 13.00108108108108, "grad_norm": 3.2472102642059326, "learning_rate": 1.25e-05, "loss": 0.0213, "step": 3880 }, { "epoch": 13.001114864864865, "grad_norm": 0.03457550331950188, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3881 }, { "epoch": 13.001148648648648, "grad_norm": 3.2224597930908203, "learning_rate": 1.25e-05, "loss": 0.0286, "step": 3882 }, { "epoch": 13.001182432432433, "grad_norm": 0.02830749936401844, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3883 }, { "epoch": 13.001216216216216, "grad_norm": 0.48260965943336487, "learning_rate": 1.25e-05, "loss": 0.0072, "step": 3884 }, { "epoch": 13.00125, "grad_norm": 0.005875108763575554, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3885 }, { "epoch": 13.001283783783784, "grad_norm": 0.009280898608267307, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3886 }, { "epoch": 13.001317567567568, "grad_norm": 31.144437789916992, "learning_rate": 1.25e-05, "loss": 0.0958, "step": 3887 }, { "epoch": 13.001351351351351, "grad_norm": 0.005987465847283602, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3888 }, { "epoch": 13.001385135135136, "grad_norm": 0.04362411051988602, "learning_rate": 1.25e-05, "loss": 0.0007, "step": 3889 }, { "epoch": 13.001418918918919, "grad_norm": 0.007637316361069679, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3890 }, { "epoch": 13.001452702702704, "grad_norm": 46.86985397338867, "learning_rate": 1.25e-05, "loss": 0.5443, "step": 3891 }, { "epoch": 13.001486486486487, "grad_norm": 0.007314834278076887, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3892 }, { "epoch": 13.00152027027027, "grad_norm": 0.05902363732457161, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 3893 }, { "epoch": 13.001554054054054, "grad_norm": 0.05298617109656334, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3894 }, { "epoch": 13.001587837837837, "grad_norm": 0.06944067031145096, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 3895 }, { "epoch": 13.001621621621622, "grad_norm": 2.653614044189453, "learning_rate": 1.25e-05, "loss": 0.0127, "step": 3896 }, { "epoch": 13.001655405405405, "grad_norm": 0.18591813743114471, "learning_rate": 1.25e-05, "loss": 0.0016, "step": 3897 }, { "epoch": 13.00168918918919, "grad_norm": 46.80183029174805, "learning_rate": 1.25e-05, "loss": 0.9886, "step": 3898 }, { "epoch": 13.001722972972972, "grad_norm": 0.06822703778743744, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3899 }, { "epoch": 13.001756756756757, "grad_norm": 2.110042095184326, "learning_rate": 1.25e-05, "loss": 0.0146, "step": 3900 }, { "epoch": 13.00179054054054, "grad_norm": 0.02550845593214035, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3901 }, { "epoch": 13.001824324324325, "grad_norm": 0.15093331038951874, "learning_rate": 1.25e-05, "loss": 0.0021, "step": 3902 }, { "epoch": 13.001858108108108, "grad_norm": 0.0018164821667596698, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3903 }, { "epoch": 13.001891891891892, "grad_norm": 8.877657890319824, "learning_rate": 1.25e-05, "loss": 0.02, "step": 3904 }, { "epoch": 13.001925675675675, "grad_norm": 0.08385961502790451, "learning_rate": 1.25e-05, "loss": 0.0032, "step": 3905 }, { "epoch": 13.00195945945946, "grad_norm": 0.004240391310304403, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3906 }, { "epoch": 13.001993243243243, "grad_norm": 0.016973858699202538, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3907 }, { "epoch": 13.002027027027028, "grad_norm": 0.053015999495983124, "learning_rate": 1.25e-05, "loss": 0.0007, "step": 3908 }, { "epoch": 13.00206081081081, "grad_norm": 0.002822851063683629, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3909 }, { "epoch": 13.002094594594595, "grad_norm": 0.007100386545062065, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3910 }, { "epoch": 13.002128378378378, "grad_norm": 0.002072342438623309, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3911 }, { "epoch": 13.002162162162163, "grad_norm": 13.738682746887207, "learning_rate": 1.25e-05, "loss": 0.7427, "step": 3912 }, { "epoch": 13.002195945945946, "grad_norm": 18.99640655517578, "learning_rate": 1.25e-05, "loss": 0.0916, "step": 3913 }, { "epoch": 13.002229729729729, "grad_norm": 0.37478965520858765, "learning_rate": 1.25e-05, "loss": 0.0045, "step": 3914 }, { "epoch": 13.002263513513514, "grad_norm": 1.6677846908569336, "learning_rate": 1.25e-05, "loss": 0.0472, "step": 3915 }, { "epoch": 13.002297297297297, "grad_norm": 0.5823544263839722, "learning_rate": 1.25e-05, "loss": 0.0015, "step": 3916 }, { "epoch": 13.002331081081081, "grad_norm": 0.08227043598890305, "learning_rate": 1.25e-05, "loss": 0.003, "step": 3917 }, { "epoch": 13.002364864864864, "grad_norm": 0.7701126337051392, "learning_rate": 1.25e-05, "loss": 0.0346, "step": 3918 }, { "epoch": 13.002398648648649, "grad_norm": 0.009197819977998734, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3919 }, { "epoch": 13.002432432432432, "grad_norm": 0.04378323256969452, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 3920 }, { "epoch": 13.002466216216217, "grad_norm": 0.13840559124946594, "learning_rate": 1.25e-05, "loss": 0.0044, "step": 3921 }, { "epoch": 13.0025, "grad_norm": 0.005315011367201805, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3922 }, { "epoch": 13.002533783783784, "grad_norm": 0.07497528940439224, "learning_rate": 1.25e-05, "loss": 0.0029, "step": 3923 }, { "epoch": 13.002567567567567, "grad_norm": 0.003289205254986882, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3924 }, { "epoch": 13.002601351351352, "grad_norm": 0.005833034869283438, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3925 }, { "epoch": 13.002635135135135, "grad_norm": 0.3963639736175537, "learning_rate": 1.25e-05, "loss": 0.0024, "step": 3926 }, { "epoch": 13.00266891891892, "grad_norm": 19.704307556152344, "learning_rate": 1.25e-05, "loss": 0.0501, "step": 3927 }, { "epoch": 13.002702702702702, "grad_norm": 3.2733676433563232, "learning_rate": 1.25e-05, "loss": 0.0384, "step": 3928 }, { "epoch": 13.002736486486487, "grad_norm": 0.02148432657122612, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3929 }, { "epoch": 13.00277027027027, "grad_norm": 0.021044211462140083, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3930 }, { "epoch": 13.002804054054055, "grad_norm": 0.002527401316910982, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3931 }, { "epoch": 13.002837837837838, "grad_norm": 0.08713509142398834, "learning_rate": 1.25e-05, "loss": 0.0015, "step": 3932 }, { "epoch": 13.002871621621622, "grad_norm": 1.1520805358886719, "learning_rate": 1.25e-05, "loss": 0.0221, "step": 3933 }, { "epoch": 13.002905405405405, "grad_norm": 44.36878967285156, "learning_rate": 1.25e-05, "loss": 0.5336, "step": 3934 }, { "epoch": 13.002939189189188, "grad_norm": 0.012891916558146477, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3935 }, { "epoch": 13.002972972972973, "grad_norm": 3.096315860748291, "learning_rate": 1.25e-05, "loss": 0.3039, "step": 3936 }, { "epoch": 13.003006756756756, "grad_norm": 0.008740746416151524, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3937 }, { "epoch": 13.00304054054054, "grad_norm": 0.38214048743247986, "learning_rate": 1.25e-05, "loss": 0.0027, "step": 3938 }, { "epoch": 13.003074324324324, "grad_norm": 37.382015228271484, "learning_rate": 1.25e-05, "loss": 0.3457, "step": 3939 }, { "epoch": 13.003108108108108, "grad_norm": 39.19694137573242, "learning_rate": 1.25e-05, "loss": 0.7369, "step": 3940 }, { "epoch": 13.003141891891891, "grad_norm": 0.004374317359179258, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3941 }, { "epoch": 13.003175675675676, "grad_norm": 0.021274743601679802, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3942 }, { "epoch": 13.003209459459459, "grad_norm": 0.11707719415426254, "learning_rate": 1.25e-05, "loss": 0.0022, "step": 3943 }, { "epoch": 13.003243243243244, "grad_norm": 0.19429640471935272, "learning_rate": 1.25e-05, "loss": 0.0011, "step": 3944 }, { "epoch": 13.003277027027027, "grad_norm": 0.012851842679083347, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3945 }, { "epoch": 13.003310810810811, "grad_norm": 0.005470848176628351, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3946 }, { "epoch": 13.003344594594594, "grad_norm": 0.4651013910770416, "learning_rate": 1.25e-05, "loss": 0.0031, "step": 3947 }, { "epoch": 13.003378378378379, "grad_norm": 0.00785871222615242, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3948 }, { "epoch": 13.003412162162162, "grad_norm": 0.00299294525757432, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3949 }, { "epoch": 13.003445945945947, "grad_norm": 0.6097816824913025, "learning_rate": 1.25e-05, "loss": 0.0369, "step": 3950 }, { "epoch": 13.00347972972973, "grad_norm": 0.09220351278781891, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 3951 }, { "epoch": 13.003513513513514, "grad_norm": 0.004080015234649181, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3952 }, { "epoch": 13.003547297297297, "grad_norm": 0.6349999308586121, "learning_rate": 1.25e-05, "loss": 0.0015, "step": 3953 }, { "epoch": 13.003581081081082, "grad_norm": 0.005694235675036907, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3954 }, { "epoch": 13.003614864864865, "grad_norm": 49.318214416503906, "learning_rate": 1.25e-05, "loss": 0.6102, "step": 3955 }, { "epoch": 13.003648648648648, "grad_norm": 0.1089542806148529, "learning_rate": 1.25e-05, "loss": 0.0012, "step": 3956 }, { "epoch": 13.003682432432432, "grad_norm": 0.07277271896600723, "learning_rate": 1.25e-05, "loss": 0.0011, "step": 3957 }, { "epoch": 13.003716216216215, "grad_norm": 0.07495678961277008, "learning_rate": 1.25e-05, "loss": 0.0011, "step": 3958 }, { "epoch": 13.00375, "grad_norm": 0.008793887682259083, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3959 }, { "epoch": 13.003783783783783, "grad_norm": 0.004166723694652319, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3960 }, { "epoch": 13.003817567567568, "grad_norm": 0.002815030049532652, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3961 }, { "epoch": 13.00385135135135, "grad_norm": 0.004072244744747877, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3962 }, { "epoch": 13.003885135135135, "grad_norm": 0.07351744174957275, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 3963 }, { "epoch": 13.003918918918918, "grad_norm": 0.032098740339279175, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3964 }, { "epoch": 13.003952702702703, "grad_norm": 0.007889848202466965, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3965 }, { "epoch": 13.003986486486486, "grad_norm": 0.00590870575979352, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3966 }, { "epoch": 13.00402027027027, "grad_norm": 0.21143919229507446, "learning_rate": 1.25e-05, "loss": 0.0023, "step": 3967 }, { "epoch": 13.004054054054054, "grad_norm": 0.3522869050502777, "learning_rate": 1.25e-05, "loss": 0.0184, "step": 3968 }, { "epoch": 13.004087837837838, "grad_norm": 0.2747116982936859, "learning_rate": 1.25e-05, "loss": 0.0029, "step": 3969 }, { "epoch": 13.004121621621621, "grad_norm": 0.004110407549887896, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3970 }, { "epoch": 13.004155405405406, "grad_norm": 0.09653715044260025, "learning_rate": 1.25e-05, "loss": 0.0024, "step": 3971 }, { "epoch": 13.004189189189189, "grad_norm": 0.04933764785528183, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 3972 }, { "epoch": 13.004222972972974, "grad_norm": 0.08300919830799103, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 3973 }, { "epoch": 13.004256756756757, "grad_norm": 0.01185106672346592, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3974 }, { "epoch": 13.004290540540541, "grad_norm": 0.004433755297213793, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3975 }, { "epoch": 13.004324324324324, "grad_norm": 0.0016796804266050458, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3976 }, { "epoch": 13.004358108108109, "grad_norm": 0.05250857025384903, "learning_rate": 1.25e-05, "loss": 0.0007, "step": 3977 }, { "epoch": 13.004391891891892, "grad_norm": 0.014311753213405609, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3978 }, { "epoch": 13.004425675675675, "grad_norm": 33.70887756347656, "learning_rate": 1.25e-05, "loss": 0.5342, "step": 3979 }, { "epoch": 13.00445945945946, "grad_norm": 0.06593851745128632, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 3980 }, { "epoch": 13.004493243243243, "grad_norm": 0.009825780987739563, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3981 }, { "epoch": 13.004527027027027, "grad_norm": 0.009851811453700066, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3982 }, { "epoch": 13.00456081081081, "grad_norm": 8.518553733825684, "learning_rate": 1.25e-05, "loss": 0.6612, "step": 3983 }, { "epoch": 13.004594594594595, "grad_norm": 0.006155433598905802, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3984 }, { "epoch": 13.004628378378378, "grad_norm": 18.87858009338379, "learning_rate": 1.25e-05, "loss": 0.078, "step": 3985 }, { "epoch": 13.004662162162163, "grad_norm": 0.005277449265122414, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3986 }, { "epoch": 13.004695945945945, "grad_norm": 0.06563296914100647, "learning_rate": 1.25e-05, "loss": 0.0024, "step": 3987 }, { "epoch": 13.00472972972973, "grad_norm": 2.615453004837036, "learning_rate": 1.25e-05, "loss": 0.0402, "step": 3988 }, { "epoch": 13.004763513513513, "grad_norm": 10.921317100524902, "learning_rate": 1.25e-05, "loss": 0.0649, "step": 3989 }, { "epoch": 13.004797297297298, "grad_norm": 0.0025536140892654657, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3990 }, { "epoch": 13.00483108108108, "grad_norm": 0.005797432269901037, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 3991 }, { "epoch": 13.004864864864865, "grad_norm": 0.003832913702353835, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3992 }, { "epoch": 13.004898648648648, "grad_norm": 2.450737714767456, "learning_rate": 1.25e-05, "loss": 0.0102, "step": 3993 }, { "epoch": 13.004932432432433, "grad_norm": 2.029658555984497, "learning_rate": 1.25e-05, "loss": 0.0072, "step": 3994 }, { "epoch": 13.004966216216216, "grad_norm": 0.014606166630983353, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 3995 }, { "epoch": 13.005, "grad_norm": 0.0034942678175866604, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 3996 }, { "epoch": 13.005033783783784, "grad_norm": 1.0492421388626099, "learning_rate": 1.25e-05, "loss": 0.0041, "step": 3997 }, { "epoch": 13.005067567567568, "grad_norm": 0.14130565524101257, "learning_rate": 1.25e-05, "loss": 0.0041, "step": 3998 }, { "epoch": 13.005101351351351, "grad_norm": 0.30474814772605896, "learning_rate": 1.25e-05, "loss": 0.0038, "step": 3999 }, { "epoch": 13.005135135135134, "grad_norm": 0.0029986125882714987, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4000 }, { "epoch": 13.005168918918919, "grad_norm": 0.0062734028324484825, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4001 }, { "epoch": 13.005202702702702, "grad_norm": 0.05900018289685249, "learning_rate": 1.25e-05, "loss": 0.0023, "step": 4002 }, { "epoch": 13.005236486486487, "grad_norm": 0.4380873143672943, "learning_rate": 1.25e-05, "loss": 0.0025, "step": 4003 }, { "epoch": 13.00527027027027, "grad_norm": 0.006821878254413605, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4004 }, { "epoch": 13.005304054054054, "grad_norm": 31.549301147460938, "learning_rate": 1.25e-05, "loss": 0.0705, "step": 4005 }, { "epoch": 13.005337837837837, "grad_norm": 0.006599032785743475, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4006 }, { "epoch": 13.005371621621622, "grad_norm": 0.006522634066641331, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4007 }, { "epoch": 13.005405405405405, "grad_norm": 0.007257436867803335, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4008 }, { "epoch": 13.00543918918919, "grad_norm": 0.006553471554070711, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4009 }, { "epoch": 13.005472972972973, "grad_norm": 0.009953792206943035, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4010 }, { "epoch": 13.005506756756757, "grad_norm": 0.10097185522317886, "learning_rate": 1.25e-05, "loss": 0.0015, "step": 4011 }, { "epoch": 13.00554054054054, "grad_norm": 0.00566902058199048, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4012 }, { "epoch": 13.005574324324325, "grad_norm": 0.05262843146920204, "learning_rate": 1.25e-05, "loss": 0.002, "step": 4013 }, { "epoch": 13.005608108108108, "grad_norm": 5.509920120239258, "learning_rate": 1.25e-05, "loss": 0.0368, "step": 4014 }, { "epoch": 13.005641891891893, "grad_norm": 34.818450927734375, "learning_rate": 1.25e-05, "loss": 0.1836, "step": 4015 }, { "epoch": 13.005675675675676, "grad_norm": 3.753561496734619, "learning_rate": 1.25e-05, "loss": 0.017, "step": 4016 }, { "epoch": 13.00570945945946, "grad_norm": 0.004278865642845631, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4017 }, { "epoch": 13.005743243243243, "grad_norm": 0.0185401514172554, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4018 }, { "epoch": 13.005777027027028, "grad_norm": 0.4150669276714325, "learning_rate": 1.25e-05, "loss": 0.002, "step": 4019 }, { "epoch": 13.00581081081081, "grad_norm": 9.895881652832031, "learning_rate": 1.25e-05, "loss": 0.2183, "step": 4020 }, { "epoch": 13.005844594594594, "grad_norm": 2.6431353092193604, "learning_rate": 1.25e-05, "loss": 0.1901, "step": 4021 }, { "epoch": 13.005878378378378, "grad_norm": 0.08855798840522766, "learning_rate": 1.25e-05, "loss": 0.0007, "step": 4022 }, { "epoch": 13.005912162162161, "grad_norm": 38.80382537841797, "learning_rate": 1.25e-05, "loss": 0.8521, "step": 4023 }, { "epoch": 13.005945945945946, "grad_norm": 0.017610935494303703, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4024 }, { "epoch": 13.005979729729729, "grad_norm": 0.014696339145302773, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4025 }, { "epoch": 13.006013513513514, "grad_norm": 0.0042550633661448956, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4026 }, { "epoch": 13.006047297297297, "grad_norm": 0.006652018520981073, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4027 }, { "epoch": 13.006081081081081, "grad_norm": 12.678533554077148, "learning_rate": 1.25e-05, "loss": 0.2228, "step": 4028 }, { "epoch": 13.006114864864864, "grad_norm": 33.65654373168945, "learning_rate": 1.25e-05, "loss": 0.0725, "step": 4029 }, { "epoch": 13.006148648648649, "grad_norm": 0.05195866525173187, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 4030 }, { "epoch": 13.006182432432432, "grad_norm": 34.062862396240234, "learning_rate": 1.25e-05, "loss": 0.302, "step": 4031 }, { "epoch": 13.006216216216217, "grad_norm": 2.2020351886749268, "learning_rate": 1.25e-05, "loss": 0.0266, "step": 4032 }, { "epoch": 13.00625, "grad_norm": 0.08354686945676804, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 4033 }, { "epoch": 13.006283783783784, "grad_norm": 26.075122833251953, "learning_rate": 1.25e-05, "loss": 0.8815, "step": 4034 }, { "epoch": 13.006317567567567, "grad_norm": 0.007189242169260979, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4035 }, { "epoch": 13.006351351351352, "grad_norm": 0.006580820307135582, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4036 }, { "epoch": 13.006385135135135, "grad_norm": 0.006945538800209761, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4037 }, { "epoch": 13.00641891891892, "grad_norm": 0.005158253014087677, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4038 }, { "epoch": 13.006452702702703, "grad_norm": 0.10829120129346848, "learning_rate": 1.25e-05, "loss": 0.0013, "step": 4039 }, { "epoch": 13.006486486486487, "grad_norm": 0.007716709282249212, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4040 }, { "epoch": 13.00652027027027, "grad_norm": 0.008981484919786453, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4041 }, { "epoch": 13.006554054054053, "grad_norm": 1.4642391204833984, "learning_rate": 1.25e-05, "loss": 0.0762, "step": 4042 }, { "epoch": 13.006587837837838, "grad_norm": 0.02672124095261097, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4043 }, { "epoch": 13.00662162162162, "grad_norm": 0.026949433609843254, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 4044 }, { "epoch": 13.006655405405406, "grad_norm": 0.0019967318512499332, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4045 }, { "epoch": 13.006689189189188, "grad_norm": 0.01143887359648943, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4046 }, { "epoch": 13.006722972972973, "grad_norm": 0.015101706609129906, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4047 }, { "epoch": 13.006756756756756, "grad_norm": 0.08127736300230026, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 4048 }, { "epoch": 13.00679054054054, "grad_norm": 0.9486112594604492, "learning_rate": 1.25e-05, "loss": 0.0282, "step": 4049 }, { "epoch": 13.006824324324324, "grad_norm": 0.0077039459720253944, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4050 }, { "epoch": 13.006858108108108, "grad_norm": 0.002312984550371766, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4051 }, { "epoch": 13.006891891891891, "grad_norm": 25.035442352294922, "learning_rate": 1.25e-05, "loss": 0.6378, "step": 4052 }, { "epoch": 13.006925675675676, "grad_norm": 2.2943050861358643, "learning_rate": 1.25e-05, "loss": 0.0069, "step": 4053 }, { "epoch": 13.006959459459459, "grad_norm": 18.070627212524414, "learning_rate": 1.25e-05, "loss": 0.6952, "step": 4054 }, { "epoch": 13.006993243243244, "grad_norm": 2.4533016681671143, "learning_rate": 1.25e-05, "loss": 0.1891, "step": 4055 }, { "epoch": 13.007027027027027, "grad_norm": 0.021236076951026917, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 4056 }, { "epoch": 13.007060810810811, "grad_norm": 3.371532678604126, "learning_rate": 1.25e-05, "loss": 0.5328, "step": 4057 }, { "epoch": 13.007094594594594, "grad_norm": 0.001503115170635283, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4058 }, { "epoch": 13.007128378378379, "grad_norm": 0.0038843085058033466, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4059 }, { "epoch": 13.007162162162162, "grad_norm": 0.557022213935852, "learning_rate": 1.25e-05, "loss": 0.0019, "step": 4060 }, { "epoch": 13.007195945945947, "grad_norm": 0.09700576961040497, "learning_rate": 1.25e-05, "loss": 0.0013, "step": 4061 }, { "epoch": 13.00722972972973, "grad_norm": 0.004551882389932871, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4062 }, { "epoch": 13.007263513513514, "grad_norm": 0.836716890335083, "learning_rate": 1.25e-05, "loss": 0.0037, "step": 4063 }, { "epoch": 13.007297297297297, "grad_norm": 0.07972994446754456, "learning_rate": 1.25e-05, "loss": 0.0029, "step": 4064 }, { "epoch": 13.00733108108108, "grad_norm": 0.019841257482767105, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 4065 }, { "epoch": 13.007364864864865, "grad_norm": 0.004995239432901144, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4066 }, { "epoch": 13.007398648648648, "grad_norm": 0.08165556192398071, "learning_rate": 1.25e-05, "loss": 0.003, "step": 4067 }, { "epoch": 13.007432432432433, "grad_norm": 0.650606095790863, "learning_rate": 1.25e-05, "loss": 0.0027, "step": 4068 }, { "epoch": 13.007466216216216, "grad_norm": 0.002336975419893861, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4069 }, { "epoch": 13.0075, "grad_norm": 0.006837652064859867, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4070 }, { "epoch": 13.007533783783783, "grad_norm": 0.8479230403900146, "learning_rate": 1.25e-05, "loss": 0.0025, "step": 4071 }, { "epoch": 13.007567567567568, "grad_norm": 0.005514864344149828, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4072 }, { "epoch": 13.00760135135135, "grad_norm": 0.009714514017105103, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4073 }, { "epoch": 13.007635135135136, "grad_norm": 0.15231487154960632, "learning_rate": 1.25e-05, "loss": 0.0033, "step": 4074 }, { "epoch": 13.007668918918919, "grad_norm": 0.048048924654722214, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 4075 }, { "epoch": 13.007702702702703, "grad_norm": 0.09789964556694031, "learning_rate": 1.25e-05, "loss": 0.0036, "step": 4076 }, { "epoch": 13.007736486486486, "grad_norm": 0.09284733235836029, "learning_rate": 1.25e-05, "loss": 0.0033, "step": 4077 }, { "epoch": 13.00777027027027, "grad_norm": 9.313577651977539, "learning_rate": 1.25e-05, "loss": 0.0687, "step": 4078 }, { "epoch": 13.007804054054054, "grad_norm": 0.08233147114515305, "learning_rate": 1.25e-05, "loss": 0.0032, "step": 4079 }, { "epoch": 13.007837837837839, "grad_norm": 0.05145179107785225, "learning_rate": 1.25e-05, "loss": 0.0007, "step": 4080 }, { "epoch": 13.007871621621621, "grad_norm": 0.009225673042237759, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4081 }, { "epoch": 13.007905405405406, "grad_norm": 0.011064469814300537, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4082 }, { "epoch": 13.00793918918919, "grad_norm": 0.010993634350597858, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4083 }, { "epoch": 13.007972972972974, "grad_norm": 0.030317043885588646, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 4084 }, { "epoch": 13.008006756756757, "grad_norm": 0.10926235467195511, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 4085 }, { "epoch": 13.00804054054054, "grad_norm": 0.00597047246992588, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4086 }, { "epoch": 13.008074324324324, "grad_norm": 0.7079448103904724, "learning_rate": 1.25e-05, "loss": 0.0353, "step": 4087 }, { "epoch": 13.008108108108107, "grad_norm": 0.003360844450071454, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4088 }, { "epoch": 13.008141891891892, "grad_norm": 0.01882902905344963, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4089 }, { "epoch": 13.008175675675675, "grad_norm": 0.0010691424831748009, "learning_rate": 1.25e-05, "loss": 0.0, "step": 4090 }, { "epoch": 13.00820945945946, "grad_norm": 3.9888923168182373, "learning_rate": 1.25e-05, "loss": 0.0125, "step": 4091 }, { "epoch": 13.008243243243243, "grad_norm": 0.07085465639829636, "learning_rate": 1.25e-05, "loss": 0.001, "step": 4092 }, { "epoch": 13.008277027027027, "grad_norm": 0.0046052830293774605, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4093 }, { "epoch": 13.00831081081081, "grad_norm": 26.318870544433594, "learning_rate": 1.25e-05, "loss": 1.3312, "step": 4094 }, { "epoch": 13.008344594594595, "grad_norm": 5.183482646942139, "learning_rate": 1.25e-05, "loss": 0.0879, "step": 4095 }, { "epoch": 13.008378378378378, "grad_norm": 1.7190910577774048, "learning_rate": 1.25e-05, "loss": 0.0953, "step": 4096 }, { "epoch": 13.008412162162163, "grad_norm": 0.0015338604571297765, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4097 }, { "epoch": 13.008445945945946, "grad_norm": 0.05354468524456024, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 4098 }, { "epoch": 13.00847972972973, "grad_norm": 0.021965695545077324, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 4099 }, { "epoch": 13.008513513513513, "grad_norm": 2.117377996444702, "learning_rate": 1.25e-05, "loss": 0.0374, "step": 4100 }, { "epoch": 13.008547297297298, "grad_norm": 0.014250751584768295, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4101 }, { "epoch": 13.008581081081081, "grad_norm": 0.0013452222337946296, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4102 }, { "epoch": 13.008614864864866, "grad_norm": 4.676178932189941, "learning_rate": 1.25e-05, "loss": 0.0843, "step": 4103 }, { "epoch": 13.008648648648649, "grad_norm": 3.744504690170288, "learning_rate": 1.25e-05, "loss": 0.0161, "step": 4104 }, { "epoch": 13.008682432432433, "grad_norm": 11.43697738647461, "learning_rate": 1.25e-05, "loss": 0.1876, "step": 4105 }, { "epoch": 13.008716216216216, "grad_norm": 0.007344373967498541, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4106 }, { "epoch": 13.00875, "grad_norm": 0.005773496814072132, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4107 }, { "epoch": 13.008783783783784, "grad_norm": 0.0016996528720483184, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4108 }, { "epoch": 13.008817567567567, "grad_norm": 0.004390635993331671, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4109 }, { "epoch": 13.008851351351352, "grad_norm": 4.7858405113220215, "learning_rate": 1.25e-05, "loss": 0.0206, "step": 4110 }, { "epoch": 13.008885135135134, "grad_norm": 2.090092420578003, "learning_rate": 1.25e-05, "loss": 0.1253, "step": 4111 }, { "epoch": 13.00891891891892, "grad_norm": 0.01637669838964939, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4112 }, { "epoch": 13.008952702702702, "grad_norm": 30.487323760986328, "learning_rate": 1.25e-05, "loss": 0.0864, "step": 4113 }, { "epoch": 13.008986486486487, "grad_norm": 11.135077476501465, "learning_rate": 1.25e-05, "loss": 0.0274, "step": 4114 }, { "epoch": 13.00902027027027, "grad_norm": 0.0018432444194331765, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4115 }, { "epoch": 13.009054054054054, "grad_norm": 0.0123338857665658, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4116 }, { "epoch": 13.009087837837837, "grad_norm": 0.017297273501753807, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4117 }, { "epoch": 13.009121621621622, "grad_norm": 0.013802499510347843, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4118 }, { "epoch": 13.009155405405405, "grad_norm": 0.08627722412347794, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 4119 }, { "epoch": 13.00918918918919, "grad_norm": 18.42703628540039, "learning_rate": 1.25e-05, "loss": 0.0487, "step": 4120 }, { "epoch": 13.009222972972973, "grad_norm": 3.3647854328155518, "learning_rate": 1.25e-05, "loss": 0.0118, "step": 4121 }, { "epoch": 13.009256756756757, "grad_norm": 0.07027000188827515, "learning_rate": 1.25e-05, "loss": 0.0026, "step": 4122 }, { "epoch": 13.00929054054054, "grad_norm": 0.026196785271167755, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 4123 }, { "epoch": 13.009324324324325, "grad_norm": 0.008880098350346088, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4124 }, { "epoch": 13.009358108108108, "grad_norm": 88.16362762451172, "learning_rate": 1.25e-05, "loss": 0.1869, "step": 4125 }, { "epoch": 13.009391891891893, "grad_norm": 0.16218458116054535, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 4126 }, { "epoch": 13.009425675675676, "grad_norm": 0.5182160139083862, "learning_rate": 1.25e-05, "loss": 0.021, "step": 4127 }, { "epoch": 13.009459459459459, "grad_norm": 6.305715560913086, "learning_rate": 1.25e-05, "loss": 0.0297, "step": 4128 }, { "epoch": 13.009493243243243, "grad_norm": 0.004094934556633234, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4129 }, { "epoch": 13.009527027027026, "grad_norm": 0.054356981068849564, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 4130 }, { "epoch": 13.009560810810811, "grad_norm": 0.0947059914469719, "learning_rate": 1.25e-05, "loss": 0.0036, "step": 4131 }, { "epoch": 13.009594594594594, "grad_norm": 0.23791883885860443, "learning_rate": 1.25e-05, "loss": 0.001, "step": 4132 }, { "epoch": 13.009628378378379, "grad_norm": 0.005045165307819843, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4133 }, { "epoch": 13.009662162162162, "grad_norm": 0.009837902151048183, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4134 }, { "epoch": 13.009695945945946, "grad_norm": 0.004049657378345728, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4135 }, { "epoch": 13.00972972972973, "grad_norm": 0.23719997704029083, "learning_rate": 1.25e-05, "loss": 0.0021, "step": 4136 }, { "epoch": 13.009763513513514, "grad_norm": 0.08024848252534866, "learning_rate": 1.25e-05, "loss": 0.0007, "step": 4137 }, { "epoch": 13.009797297297297, "grad_norm": 0.0018694208702072501, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4138 }, { "epoch": 13.009831081081082, "grad_norm": 0.0037137398030608892, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4139 }, { "epoch": 13.009864864864864, "grad_norm": 0.0030785216949880123, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4140 }, { "epoch": 13.00989864864865, "grad_norm": 54.32167053222656, "learning_rate": 1.25e-05, "loss": 0.1965, "step": 4141 }, { "epoch": 13.009932432432432, "grad_norm": 0.25343501567840576, "learning_rate": 1.25e-05, "loss": 0.0014, "step": 4142 }, { "epoch": 13.009966216216217, "grad_norm": 1.523236870765686, "learning_rate": 1.25e-05, "loss": 0.0028, "step": 4143 }, { "epoch": 13.01, "grad_norm": 48.838130950927734, "learning_rate": 1.25e-05, "loss": 0.5788, "step": 4144 }, { "epoch": 13.01, "eval_accuracy": 0.8933764135702746, "eval_loss": 0.5557999014854431, "eval_runtime": 32.1727, "eval_samples_per_second": 19.24, "eval_steps_per_second": 2.424, "step": 4144 }, { "epoch": 14.000033783783783, "grad_norm": 0.09469763189554214, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 4145 }, { "epoch": 14.000067567567568, "grad_norm": 6.110011577606201, "learning_rate": 1.25e-05, "loss": 0.13, "step": 4146 }, { "epoch": 14.00010135135135, "grad_norm": 1.7765531539916992, "learning_rate": 1.25e-05, "loss": 0.015, "step": 4147 }, { "epoch": 14.000135135135135, "grad_norm": 0.0023794618900865316, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4148 }, { "epoch": 14.000168918918918, "grad_norm": 0.014819732867181301, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4149 }, { "epoch": 14.000202702702703, "grad_norm": 0.003916163928806782, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4150 }, { "epoch": 14.000236486486486, "grad_norm": 8.919940948486328, "learning_rate": 1.25e-05, "loss": 0.0705, "step": 4151 }, { "epoch": 14.00027027027027, "grad_norm": 0.07512069493532181, "learning_rate": 1.25e-05, "loss": 0.0028, "step": 4152 }, { "epoch": 14.000304054054054, "grad_norm": 0.01579599268734455, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4153 }, { "epoch": 14.000337837837838, "grad_norm": 0.004153583198785782, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4154 }, { "epoch": 14.000371621621621, "grad_norm": 0.004889020696282387, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4155 }, { "epoch": 14.000405405405406, "grad_norm": 0.004290449898689985, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4156 }, { "epoch": 14.000439189189189, "grad_norm": 0.2327135056257248, "learning_rate": 1.25e-05, "loss": 0.0023, "step": 4157 }, { "epoch": 14.000472972972974, "grad_norm": 15.624961853027344, "learning_rate": 1.25e-05, "loss": 0.1154, "step": 4158 }, { "epoch": 14.000506756756756, "grad_norm": 0.003693380393087864, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4159 }, { "epoch": 14.000540540540541, "grad_norm": 1.2132419347763062, "learning_rate": 1.25e-05, "loss": 0.0064, "step": 4160 }, { "epoch": 14.000574324324324, "grad_norm": 4.276748180389404, "learning_rate": 1.25e-05, "loss": 0.0911, "step": 4161 }, { "epoch": 14.000608108108109, "grad_norm": 0.0030923488084226847, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4162 }, { "epoch": 14.000641891891892, "grad_norm": 2.9785077571868896, "learning_rate": 1.25e-05, "loss": 0.0182, "step": 4163 }, { "epoch": 14.000675675675677, "grad_norm": 0.00530389416962862, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4164 }, { "epoch": 14.00070945945946, "grad_norm": 0.0015888542402535677, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4165 }, { "epoch": 14.000743243243242, "grad_norm": 0.003399505279958248, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4166 }, { "epoch": 14.000777027027027, "grad_norm": 8.791924476623535, "learning_rate": 1.25e-05, "loss": 0.8718, "step": 4167 }, { "epoch": 14.00081081081081, "grad_norm": 24.419469833374023, "learning_rate": 1.25e-05, "loss": 0.8097, "step": 4168 }, { "epoch": 14.000844594594595, "grad_norm": 0.007544368971139193, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4169 }, { "epoch": 14.000878378378378, "grad_norm": 0.0025444431230425835, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4170 }, { "epoch": 14.000912162162162, "grad_norm": 0.06487514823675156, "learning_rate": 1.25e-05, "loss": 0.0023, "step": 4171 }, { "epoch": 14.000945945945945, "grad_norm": 0.0024969815276563168, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4172 }, { "epoch": 14.00097972972973, "grad_norm": 26.030704498291016, "learning_rate": 1.25e-05, "loss": 0.1644, "step": 4173 }, { "epoch": 14.001013513513513, "grad_norm": 0.0030880288686603308, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4174 }, { "epoch": 14.001047297297298, "grad_norm": 0.007189328316599131, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4175 }, { "epoch": 14.00108108108108, "grad_norm": 0.7516968846321106, "learning_rate": 1.25e-05, "loss": 0.0042, "step": 4176 }, { "epoch": 14.001114864864865, "grad_norm": 0.34887373447418213, "learning_rate": 1.25e-05, "loss": 0.0013, "step": 4177 }, { "epoch": 14.001148648648648, "grad_norm": 0.0032325880602002144, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4178 }, { "epoch": 14.001182432432433, "grad_norm": 0.05565343052148819, "learning_rate": 1.25e-05, "loss": 0.002, "step": 4179 }, { "epoch": 14.001216216216216, "grad_norm": 0.0067650750279426575, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4180 }, { "epoch": 14.00125, "grad_norm": 0.009524720720946789, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4181 }, { "epoch": 14.001283783783784, "grad_norm": 0.06681918352842331, "learning_rate": 1.25e-05, "loss": 0.0012, "step": 4182 }, { "epoch": 14.001317567567568, "grad_norm": 0.0059977504424750805, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4183 }, { "epoch": 14.001351351351351, "grad_norm": 0.0013947660336270928, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4184 }, { "epoch": 14.001385135135136, "grad_norm": 0.006091209594160318, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4185 }, { "epoch": 14.001418918918919, "grad_norm": 0.20520740747451782, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 4186 }, { "epoch": 14.001452702702704, "grad_norm": 6.734187126159668, "learning_rate": 1.25e-05, "loss": 0.2506, "step": 4187 }, { "epoch": 14.001486486486487, "grad_norm": 0.005147546995431185, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4188 }, { "epoch": 14.00152027027027, "grad_norm": 16.31324577331543, "learning_rate": 1.25e-05, "loss": 0.974, "step": 4189 }, { "epoch": 14.001554054054054, "grad_norm": 0.0031605453696101904, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4190 }, { "epoch": 14.001587837837837, "grad_norm": 0.001940907328389585, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4191 }, { "epoch": 14.001621621621622, "grad_norm": 0.36680108308792114, "learning_rate": 1.25e-05, "loss": 0.0027, "step": 4192 }, { "epoch": 14.001655405405405, "grad_norm": 0.0029171158093959093, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4193 }, { "epoch": 14.00168918918919, "grad_norm": 0.03328565135598183, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4194 }, { "epoch": 14.001722972972972, "grad_norm": 0.012887048535048962, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4195 }, { "epoch": 14.001756756756757, "grad_norm": 1.9694229364395142, "learning_rate": 1.25e-05, "loss": 0.11, "step": 4196 }, { "epoch": 14.00179054054054, "grad_norm": 0.9755662083625793, "learning_rate": 1.25e-05, "loss": 0.0422, "step": 4197 }, { "epoch": 14.001824324324325, "grad_norm": 0.008140681311488152, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4198 }, { "epoch": 14.001858108108108, "grad_norm": 0.0023759184405207634, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4199 }, { "epoch": 14.001891891891892, "grad_norm": 49.29301834106445, "learning_rate": 1.25e-05, "loss": 0.7233, "step": 4200 }, { "epoch": 14.001925675675675, "grad_norm": 0.03486233949661255, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 4201 }, { "epoch": 14.00195945945946, "grad_norm": 0.010161357931792736, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4202 }, { "epoch": 14.001993243243243, "grad_norm": 0.0038502963725477457, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4203 }, { "epoch": 14.002027027027028, "grad_norm": 14.190051078796387, "learning_rate": 1.25e-05, "loss": 0.0654, "step": 4204 }, { "epoch": 14.00206081081081, "grad_norm": 0.009668910875916481, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4205 }, { "epoch": 14.002094594594595, "grad_norm": 1.3876501321792603, "learning_rate": 1.25e-05, "loss": 0.0046, "step": 4206 }, { "epoch": 14.002128378378378, "grad_norm": 30.690208435058594, "learning_rate": 1.25e-05, "loss": 0.6426, "step": 4207 }, { "epoch": 14.002162162162163, "grad_norm": 0.001965416595339775, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4208 }, { "epoch": 14.002195945945946, "grad_norm": 0.0023282889742404222, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4209 }, { "epoch": 14.002229729729729, "grad_norm": 0.01614748314023018, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4210 }, { "epoch": 14.002263513513514, "grad_norm": 39.17356872558594, "learning_rate": 1.25e-05, "loss": 0.2077, "step": 4211 }, { "epoch": 14.002297297297297, "grad_norm": 0.00724751316010952, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4212 }, { "epoch": 14.002331081081081, "grad_norm": 0.0036771653685718775, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4213 }, { "epoch": 14.002364864864864, "grad_norm": 0.0657791942358017, "learning_rate": 1.25e-05, "loss": 0.0024, "step": 4214 }, { "epoch": 14.002398648648649, "grad_norm": 0.0037715991493314505, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4215 }, { "epoch": 14.002432432432432, "grad_norm": 0.33077099919319153, "learning_rate": 1.25e-05, "loss": 0.0012, "step": 4216 }, { "epoch": 14.002466216216217, "grad_norm": 0.001295712310820818, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4217 }, { "epoch": 14.0025, "grad_norm": 0.005296532064676285, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4218 }, { "epoch": 14.002533783783784, "grad_norm": 17.2724552154541, "learning_rate": 1.25e-05, "loss": 0.0707, "step": 4219 }, { "epoch": 14.002567567567567, "grad_norm": 0.005679700989276171, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4220 }, { "epoch": 14.002601351351352, "grad_norm": 0.010529797524213791, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4221 }, { "epoch": 14.002635135135135, "grad_norm": 14.159724235534668, "learning_rate": 1.25e-05, "loss": 0.0302, "step": 4222 }, { "epoch": 14.00266891891892, "grad_norm": 3.8497836589813232, "learning_rate": 1.25e-05, "loss": 0.1829, "step": 4223 }, { "epoch": 14.002702702702702, "grad_norm": 0.0990537703037262, "learning_rate": 1.25e-05, "loss": 0.0033, "step": 4224 }, { "epoch": 14.002736486486487, "grad_norm": 0.0018416775856167078, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4225 }, { "epoch": 14.00277027027027, "grad_norm": 0.004619493614882231, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4226 }, { "epoch": 14.002804054054055, "grad_norm": 9.979084014892578, "learning_rate": 1.25e-05, "loss": 0.0875, "step": 4227 }, { "epoch": 14.002837837837838, "grad_norm": 0.006051922682672739, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4228 }, { "epoch": 14.002871621621622, "grad_norm": 0.0013397816801443696, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4229 }, { "epoch": 14.002905405405405, "grad_norm": 0.002263233531266451, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4230 }, { "epoch": 14.002939189189188, "grad_norm": 13.142802238464355, "learning_rate": 1.25e-05, "loss": 0.6343, "step": 4231 }, { "epoch": 14.002972972972973, "grad_norm": 0.00303326896391809, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4232 }, { "epoch": 14.003006756756756, "grad_norm": 52.23968505859375, "learning_rate": 1.25e-05, "loss": 0.3019, "step": 4233 }, { "epoch": 14.00304054054054, "grad_norm": 0.05142129585146904, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4234 }, { "epoch": 14.003074324324324, "grad_norm": 10.162611961364746, "learning_rate": 1.25e-05, "loss": 0.8034, "step": 4235 }, { "epoch": 14.003108108108108, "grad_norm": 0.0771118625998497, "learning_rate": 1.25e-05, "loss": 0.0029, "step": 4236 }, { "epoch": 14.003141891891891, "grad_norm": 43.64570236206055, "learning_rate": 1.25e-05, "loss": 1.6489, "step": 4237 }, { "epoch": 14.003175675675676, "grad_norm": 71.11985778808594, "learning_rate": 1.25e-05, "loss": 1.0941, "step": 4238 }, { "epoch": 14.003209459459459, "grad_norm": 0.0019541860092431307, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4239 }, { "epoch": 14.003243243243244, "grad_norm": 0.15627716481685638, "learning_rate": 1.25e-05, "loss": 0.0031, "step": 4240 }, { "epoch": 14.003277027027027, "grad_norm": 0.023107144981622696, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4241 }, { "epoch": 14.003310810810811, "grad_norm": 27.685558319091797, "learning_rate": 1.25e-05, "loss": 0.652, "step": 4242 }, { "epoch": 14.003344594594594, "grad_norm": 0.0029067285358905792, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4243 }, { "epoch": 14.003378378378379, "grad_norm": 8.576481819152832, "learning_rate": 1.25e-05, "loss": 0.188, "step": 4244 }, { "epoch": 14.003412162162162, "grad_norm": 0.003987030126154423, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4245 }, { "epoch": 14.003445945945947, "grad_norm": 0.005880508106201887, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4246 }, { "epoch": 14.00347972972973, "grad_norm": 1.0296062231063843, "learning_rate": 1.25e-05, "loss": 0.0033, "step": 4247 }, { "epoch": 14.003513513513514, "grad_norm": 46.08771514892578, "learning_rate": 1.25e-05, "loss": 0.5073, "step": 4248 }, { "epoch": 14.003547297297297, "grad_norm": 0.10267263650894165, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 4249 }, { "epoch": 14.003581081081082, "grad_norm": 23.606950759887695, "learning_rate": 1.25e-05, "loss": 0.6967, "step": 4250 }, { "epoch": 14.003614864864865, "grad_norm": 0.002028651535511017, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4251 }, { "epoch": 14.003648648648648, "grad_norm": 10.100591659545898, "learning_rate": 1.25e-05, "loss": 0.6632, "step": 4252 }, { "epoch": 14.003682432432432, "grad_norm": 0.03080536611378193, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 4253 }, { "epoch": 14.003716216216215, "grad_norm": 2.592979669570923, "learning_rate": 1.25e-05, "loss": 0.1075, "step": 4254 }, { "epoch": 14.00375, "grad_norm": 0.04830728843808174, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 4255 }, { "epoch": 14.003783783783783, "grad_norm": 0.009845731779932976, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4256 }, { "epoch": 14.003817567567568, "grad_norm": 0.004756302107125521, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4257 }, { "epoch": 14.00385135135135, "grad_norm": 23.040124893188477, "learning_rate": 1.25e-05, "loss": 0.0591, "step": 4258 }, { "epoch": 14.003885135135135, "grad_norm": 18.631620407104492, "learning_rate": 1.25e-05, "loss": 0.5983, "step": 4259 }, { "epoch": 14.003918918918918, "grad_norm": 0.02051267772912979, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 4260 }, { "epoch": 14.003952702702703, "grad_norm": 0.012121199630200863, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4261 }, { "epoch": 14.003986486486486, "grad_norm": 2.321789503097534, "learning_rate": 1.25e-05, "loss": 0.0368, "step": 4262 }, { "epoch": 14.00402027027027, "grad_norm": 0.11318904906511307, "learning_rate": 1.25e-05, "loss": 0.0015, "step": 4263 }, { "epoch": 14.004054054054054, "grad_norm": 0.06410487741231918, "learning_rate": 1.25e-05, "loss": 0.0012, "step": 4264 }, { "epoch": 14.004087837837838, "grad_norm": 13.75223445892334, "learning_rate": 1.25e-05, "loss": 0.4393, "step": 4265 }, { "epoch": 14.004121621621621, "grad_norm": 45.98180389404297, "learning_rate": 1.25e-05, "loss": 0.3737, "step": 4266 }, { "epoch": 14.004155405405406, "grad_norm": 5.756099224090576, "learning_rate": 1.25e-05, "loss": 0.6299, "step": 4267 }, { "epoch": 14.004189189189189, "grad_norm": 0.28337278962135315, "learning_rate": 1.25e-05, "loss": 0.0016, "step": 4268 }, { "epoch": 14.004222972972974, "grad_norm": 0.03271842747926712, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 4269 }, { "epoch": 14.004256756756757, "grad_norm": 0.054981544613838196, "learning_rate": 1.25e-05, "loss": 0.0021, "step": 4270 }, { "epoch": 14.004290540540541, "grad_norm": 11.636261940002441, "learning_rate": 1.25e-05, "loss": 0.0278, "step": 4271 }, { "epoch": 14.004324324324324, "grad_norm": 0.010581625625491142, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4272 }, { "epoch": 14.004358108108109, "grad_norm": 0.04512528330087662, "learning_rate": 1.25e-05, "loss": 0.0007, "step": 4273 }, { "epoch": 14.004391891891892, "grad_norm": 0.0014873344916850328, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4274 }, { "epoch": 14.004425675675675, "grad_norm": 1.626573085784912, "learning_rate": 1.25e-05, "loss": 0.0045, "step": 4275 }, { "epoch": 14.00445945945946, "grad_norm": 0.0058968267403542995, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4276 }, { "epoch": 14.004493243243243, "grad_norm": 0.8032275438308716, "learning_rate": 1.25e-05, "loss": 0.0138, "step": 4277 }, { "epoch": 14.004527027027027, "grad_norm": 0.0028783634770661592, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4278 }, { "epoch": 14.00456081081081, "grad_norm": 0.060589998960494995, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 4279 }, { "epoch": 14.004594594594595, "grad_norm": 0.0340123288333416, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 4280 }, { "epoch": 14.004628378378378, "grad_norm": 0.07200464606285095, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 4281 }, { "epoch": 14.004662162162163, "grad_norm": 0.266553670167923, "learning_rate": 1.25e-05, "loss": 0.0017, "step": 4282 }, { "epoch": 14.004695945945945, "grad_norm": 0.04194331169128418, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 4283 }, { "epoch": 14.00472972972973, "grad_norm": 6.652055740356445, "learning_rate": 1.25e-05, "loss": 0.0358, "step": 4284 }, { "epoch": 14.004763513513513, "grad_norm": 0.7364556193351746, "learning_rate": 1.25e-05, "loss": 0.0032, "step": 4285 }, { "epoch": 14.004797297297298, "grad_norm": 0.31963202357292175, "learning_rate": 1.25e-05, "loss": 0.006, "step": 4286 }, { "epoch": 14.00483108108108, "grad_norm": 8.028768539428711, "learning_rate": 1.25e-05, "loss": 0.096, "step": 4287 }, { "epoch": 14.004864864864865, "grad_norm": 0.13748669624328613, "learning_rate": 1.25e-05, "loss": 0.0012, "step": 4288 }, { "epoch": 14.004898648648648, "grad_norm": 1.1870801448822021, "learning_rate": 1.25e-05, "loss": 0.0038, "step": 4289 }, { "epoch": 14.004932432432433, "grad_norm": 0.03499755635857582, "learning_rate": 1.25e-05, "loss": 0.0007, "step": 4290 }, { "epoch": 14.004966216216216, "grad_norm": 0.11888068914413452, "learning_rate": 1.25e-05, "loss": 0.0013, "step": 4291 }, { "epoch": 14.005, "grad_norm": 3.4457216262817383, "learning_rate": 1.25e-05, "loss": 0.5285, "step": 4292 }, { "epoch": 14.005033783783784, "grad_norm": 0.0042961835861206055, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4293 }, { "epoch": 14.005067567567568, "grad_norm": 0.04090556129813194, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 4294 }, { "epoch": 14.005101351351351, "grad_norm": 2.618295907974243, "learning_rate": 1.25e-05, "loss": 0.01, "step": 4295 }, { "epoch": 14.005135135135134, "grad_norm": 0.02642042748630047, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 4296 }, { "epoch": 14.005168918918919, "grad_norm": 0.05453287437558174, "learning_rate": 1.25e-05, "loss": 0.0021, "step": 4297 }, { "epoch": 14.005202702702702, "grad_norm": 2.876755475997925, "learning_rate": 1.25e-05, "loss": 0.0088, "step": 4298 }, { "epoch": 14.005236486486487, "grad_norm": 0.012772957794368267, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4299 }, { "epoch": 14.00527027027027, "grad_norm": 2.018925189971924, "learning_rate": 1.25e-05, "loss": 0.0872, "step": 4300 }, { "epoch": 14.005304054054054, "grad_norm": 0.0033318030182272196, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4301 }, { "epoch": 14.005337837837837, "grad_norm": 0.05459606647491455, "learning_rate": 1.25e-05, "loss": 0.002, "step": 4302 }, { "epoch": 14.005371621621622, "grad_norm": 8.890654563903809, "learning_rate": 1.25e-05, "loss": 0.3446, "step": 4303 }, { "epoch": 14.005405405405405, "grad_norm": 14.443790435791016, "learning_rate": 1.25e-05, "loss": 0.062, "step": 4304 }, { "epoch": 14.00543918918919, "grad_norm": 0.04779624193906784, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 4305 }, { "epoch": 14.005472972972973, "grad_norm": 0.014932523481547832, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4306 }, { "epoch": 14.005506756756757, "grad_norm": 0.0075370236299932, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4307 }, { "epoch": 14.00554054054054, "grad_norm": 0.40289026498794556, "learning_rate": 1.25e-05, "loss": 0.0029, "step": 4308 }, { "epoch": 14.005574324324325, "grad_norm": 2.090219020843506, "learning_rate": 1.25e-05, "loss": 0.0296, "step": 4309 }, { "epoch": 14.005608108108108, "grad_norm": 0.09318365901708603, "learning_rate": 1.25e-05, "loss": 0.0028, "step": 4310 }, { "epoch": 14.005641891891893, "grad_norm": 0.005596649833023548, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4311 }, { "epoch": 14.005675675675676, "grad_norm": 0.01145176962018013, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4312 }, { "epoch": 14.00570945945946, "grad_norm": 2.4190802574157715, "learning_rate": 1.25e-05, "loss": 0.0444, "step": 4313 }, { "epoch": 14.005743243243243, "grad_norm": 3.0166964530944824, "learning_rate": 1.25e-05, "loss": 0.0089, "step": 4314 }, { "epoch": 14.005777027027028, "grad_norm": 1.2443737983703613, "learning_rate": 1.25e-05, "loss": 0.0038, "step": 4315 }, { "epoch": 14.00581081081081, "grad_norm": 44.161842346191406, "learning_rate": 1.25e-05, "loss": 0.1227, "step": 4316 }, { "epoch": 14.005844594594594, "grad_norm": 0.11245845258235931, "learning_rate": 1.25e-05, "loss": 0.0017, "step": 4317 }, { "epoch": 14.005878378378378, "grad_norm": 0.14585138857364655, "learning_rate": 1.25e-05, "loss": 0.0011, "step": 4318 }, { "epoch": 14.005912162162161, "grad_norm": 0.014478451572358608, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 4319 }, { "epoch": 14.005945945945946, "grad_norm": 0.010605006478726864, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4320 }, { "epoch": 14.005979729729729, "grad_norm": 0.10094825178384781, "learning_rate": 1.25e-05, "loss": 0.0018, "step": 4321 }, { "epoch": 14.006013513513514, "grad_norm": 3.380312442779541, "learning_rate": 1.25e-05, "loss": 0.4922, "step": 4322 }, { "epoch": 14.006047297297297, "grad_norm": 0.7937666773796082, "learning_rate": 1.25e-05, "loss": 0.0025, "step": 4323 }, { "epoch": 14.006081081081081, "grad_norm": 0.002802689792588353, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4324 }, { "epoch": 14.006114864864864, "grad_norm": 0.11173062771558762, "learning_rate": 1.25e-05, "loss": 0.001, "step": 4325 }, { "epoch": 14.006148648648649, "grad_norm": 0.686829686164856, "learning_rate": 1.25e-05, "loss": 0.0243, "step": 4326 }, { "epoch": 14.006182432432432, "grad_norm": 2.0901405811309814, "learning_rate": 1.25e-05, "loss": 0.0124, "step": 4327 }, { "epoch": 14.006216216216217, "grad_norm": 0.041395075619220734, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 4328 }, { "epoch": 14.00625, "grad_norm": 0.01107778400182724, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 4329 }, { "epoch": 14.006283783783784, "grad_norm": 0.1406898945569992, "learning_rate": 1.25e-05, "loss": 0.0046, "step": 4330 }, { "epoch": 14.006317567567567, "grad_norm": 0.00596869271248579, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4331 }, { "epoch": 14.006351351351352, "grad_norm": 0.023963909596204758, "learning_rate": 1.25e-05, "loss": 0.0007, "step": 4332 }, { "epoch": 14.006385135135135, "grad_norm": 0.7649410963058472, "learning_rate": 1.25e-05, "loss": 0.0032, "step": 4333 }, { "epoch": 14.00641891891892, "grad_norm": 0.0024789960589259863, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4334 }, { "epoch": 14.006452702702703, "grad_norm": 0.056158147752285004, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 4335 }, { "epoch": 14.006486486486487, "grad_norm": 0.02248755656182766, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 4336 }, { "epoch": 14.00652027027027, "grad_norm": 0.003690298181027174, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4337 }, { "epoch": 14.006554054054053, "grad_norm": 6.705150604248047, "learning_rate": 1.25e-05, "loss": 0.014, "step": 4338 }, { "epoch": 14.006587837837838, "grad_norm": 0.007186985574662685, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4339 }, { "epoch": 14.00662162162162, "grad_norm": 6.047940254211426, "learning_rate": 1.25e-05, "loss": 0.0431, "step": 4340 }, { "epoch": 14.006655405405406, "grad_norm": 0.07466065138578415, "learning_rate": 1.25e-05, "loss": 0.0014, "step": 4341 }, { "epoch": 14.006689189189188, "grad_norm": 0.0076417564414441586, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4342 }, { "epoch": 14.006722972972973, "grad_norm": 0.013064046390354633, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4343 }, { "epoch": 14.006756756756756, "grad_norm": 0.0012704714899882674, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4344 }, { "epoch": 14.00679054054054, "grad_norm": 9.594260215759277, "learning_rate": 1.25e-05, "loss": 0.2905, "step": 4345 }, { "epoch": 14.006824324324324, "grad_norm": 12.528426170349121, "learning_rate": 1.25e-05, "loss": 0.1829, "step": 4346 }, { "epoch": 14.006858108108108, "grad_norm": 0.041991960257291794, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4347 }, { "epoch": 14.006891891891891, "grad_norm": 12.666553497314453, "learning_rate": 1.25e-05, "loss": 0.038, "step": 4348 }, { "epoch": 14.006925675675676, "grad_norm": 0.015907080844044685, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4349 }, { "epoch": 14.006959459459459, "grad_norm": 0.012025240808725357, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4350 }, { "epoch": 14.006993243243244, "grad_norm": 0.016295747831463814, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4351 }, { "epoch": 14.007027027027027, "grad_norm": 36.20330047607422, "learning_rate": 1.25e-05, "loss": 0.119, "step": 4352 }, { "epoch": 14.007060810810811, "grad_norm": 17.06148910522461, "learning_rate": 1.25e-05, "loss": 0.6632, "step": 4353 }, { "epoch": 14.007094594594594, "grad_norm": 0.3383813798427582, "learning_rate": 1.25e-05, "loss": 0.0022, "step": 4354 }, { "epoch": 14.007128378378379, "grad_norm": 36.36656951904297, "learning_rate": 1.25e-05, "loss": 0.1663, "step": 4355 }, { "epoch": 14.007162162162162, "grad_norm": 0.1388799399137497, "learning_rate": 1.25e-05, "loss": 0.0031, "step": 4356 }, { "epoch": 14.007195945945947, "grad_norm": 0.05245058238506317, "learning_rate": 1.25e-05, "loss": 0.0007, "step": 4357 }, { "epoch": 14.00722972972973, "grad_norm": 0.11452817171812057, "learning_rate": 1.25e-05, "loss": 0.0042, "step": 4358 }, { "epoch": 14.007263513513514, "grad_norm": 0.7778671383857727, "learning_rate": 1.25e-05, "loss": 0.0354, "step": 4359 }, { "epoch": 14.007297297297297, "grad_norm": 5.1064558029174805, "learning_rate": 1.25e-05, "loss": 0.0213, "step": 4360 }, { "epoch": 14.00733108108108, "grad_norm": 0.020374931395053864, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4361 }, { "epoch": 14.007364864864865, "grad_norm": 0.08075866848230362, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 4362 }, { "epoch": 14.007398648648648, "grad_norm": 84.91732025146484, "learning_rate": 1.25e-05, "loss": 0.4631, "step": 4363 }, { "epoch": 14.007432432432433, "grad_norm": 4.367103576660156, "learning_rate": 1.25e-05, "loss": 0.3196, "step": 4364 }, { "epoch": 14.007466216216216, "grad_norm": 0.011053791269659996, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4365 }, { "epoch": 14.0075, "grad_norm": 0.28903305530548096, "learning_rate": 1.25e-05, "loss": 0.0089, "step": 4366 }, { "epoch": 14.007533783783783, "grad_norm": 0.13009412586688995, "learning_rate": 1.25e-05, "loss": 0.0038, "step": 4367 }, { "epoch": 14.007567567567568, "grad_norm": 0.08834472298622131, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 4368 }, { "epoch": 14.00760135135135, "grad_norm": 0.0020520444959402084, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4369 }, { "epoch": 14.007635135135136, "grad_norm": 0.6430972814559937, "learning_rate": 1.25e-05, "loss": 0.0081, "step": 4370 }, { "epoch": 14.007668918918919, "grad_norm": 0.005065205506980419, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4371 }, { "epoch": 14.007702702702703, "grad_norm": 0.07257259637117386, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 4372 }, { "epoch": 14.007736486486486, "grad_norm": 0.005036336835473776, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4373 }, { "epoch": 14.00777027027027, "grad_norm": 0.10227019339799881, "learning_rate": 1.25e-05, "loss": 0.0024, "step": 4374 }, { "epoch": 14.007804054054054, "grad_norm": 0.010913798585534096, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4375 }, { "epoch": 14.007837837837839, "grad_norm": 0.0018628257093951106, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4376 }, { "epoch": 14.007871621621621, "grad_norm": 0.0017613449599593878, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4377 }, { "epoch": 14.007905405405406, "grad_norm": 0.0024865821469575167, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4378 }, { "epoch": 14.00793918918919, "grad_norm": 0.06672155112028122, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 4379 }, { "epoch": 14.007972972972974, "grad_norm": 0.0030474390368908644, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4380 }, { "epoch": 14.008006756756757, "grad_norm": 3.620401620864868, "learning_rate": 1.25e-05, "loss": 0.005, "step": 4381 }, { "epoch": 14.00804054054054, "grad_norm": 0.09631366282701492, "learning_rate": 1.25e-05, "loss": 0.0035, "step": 4382 }, { "epoch": 14.008074324324324, "grad_norm": 0.002820112043991685, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4383 }, { "epoch": 14.008108108108107, "grad_norm": 0.20633813738822937, "learning_rate": 1.25e-05, "loss": 0.0046, "step": 4384 }, { "epoch": 14.008141891891892, "grad_norm": 0.010348821990191936, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4385 }, { "epoch": 14.008175675675675, "grad_norm": 16.817821502685547, "learning_rate": 1.25e-05, "loss": 0.8682, "step": 4386 }, { "epoch": 14.00820945945946, "grad_norm": 0.013836857862770557, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4387 }, { "epoch": 14.008243243243243, "grad_norm": 0.11491268128156662, "learning_rate": 1.25e-05, "loss": 0.0012, "step": 4388 }, { "epoch": 14.008277027027027, "grad_norm": 3.3170180320739746, "learning_rate": 1.25e-05, "loss": 0.4573, "step": 4389 }, { "epoch": 14.00831081081081, "grad_norm": 0.018474938347935677, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4390 }, { "epoch": 14.008344594594595, "grad_norm": 0.014081406407058239, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4391 }, { "epoch": 14.008378378378378, "grad_norm": 39.6552848815918, "learning_rate": 1.25e-05, "loss": 0.6813, "step": 4392 }, { "epoch": 14.008412162162163, "grad_norm": 0.0714251920580864, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 4393 }, { "epoch": 14.008445945945946, "grad_norm": 0.00907511729747057, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4394 }, { "epoch": 14.00847972972973, "grad_norm": 1.9791536331176758, "learning_rate": 1.25e-05, "loss": 0.0045, "step": 4395 }, { "epoch": 14.008513513513513, "grad_norm": 0.01631373167037964, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4396 }, { "epoch": 14.008547297297298, "grad_norm": 0.0024555902928113937, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4397 }, { "epoch": 14.008581081081081, "grad_norm": 0.1191953495144844, "learning_rate": 1.25e-05, "loss": 0.0045, "step": 4398 }, { "epoch": 14.008614864864866, "grad_norm": 0.008021276444196701, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4399 }, { "epoch": 14.008648648648649, "grad_norm": 0.014949419535696507, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4400 }, { "epoch": 14.008682432432433, "grad_norm": 1.7105621099472046, "learning_rate": 1.25e-05, "loss": 0.086, "step": 4401 }, { "epoch": 14.008716216216216, "grad_norm": 13.558572769165039, "learning_rate": 1.25e-05, "loss": 0.2736, "step": 4402 }, { "epoch": 14.00875, "grad_norm": 18.600631713867188, "learning_rate": 1.25e-05, "loss": 0.6745, "step": 4403 }, { "epoch": 14.008783783783784, "grad_norm": 0.0051818653009831905, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4404 }, { "epoch": 14.008817567567567, "grad_norm": 0.059754155576229095, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 4405 }, { "epoch": 14.008851351351352, "grad_norm": 0.007637556176632643, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4406 }, { "epoch": 14.008885135135134, "grad_norm": 0.012442833743989468, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4407 }, { "epoch": 14.00891891891892, "grad_norm": 0.0011488713789731264, "learning_rate": 1.25e-05, "loss": 0.0, "step": 4408 }, { "epoch": 14.008952702702702, "grad_norm": 0.0063976082019507885, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4409 }, { "epoch": 14.008986486486487, "grad_norm": 0.009642771445214748, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4410 }, { "epoch": 14.00902027027027, "grad_norm": 0.042446792125701904, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 4411 }, { "epoch": 14.009054054054054, "grad_norm": 0.0064311763271689415, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4412 }, { "epoch": 14.009087837837837, "grad_norm": 0.012291627936065197, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4413 }, { "epoch": 14.009121621621622, "grad_norm": 2.548488140106201, "learning_rate": 1.25e-05, "loss": 0.0078, "step": 4414 }, { "epoch": 14.009155405405405, "grad_norm": 0.00933004543185234, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4415 }, { "epoch": 14.00918918918919, "grad_norm": 0.04659407585859299, "learning_rate": 1.25e-05, "loss": 0.0009, "step": 4416 }, { "epoch": 14.009222972972973, "grad_norm": 42.142608642578125, "learning_rate": 1.25e-05, "loss": 0.356, "step": 4417 }, { "epoch": 14.009256756756757, "grad_norm": 0.011745333671569824, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 4418 }, { "epoch": 14.00929054054054, "grad_norm": 0.3008943200111389, "learning_rate": 1.25e-05, "loss": 0.0015, "step": 4419 }, { "epoch": 14.009324324324325, "grad_norm": 1.9819871187210083, "learning_rate": 1.25e-05, "loss": 0.017, "step": 4420 }, { "epoch": 14.009358108108108, "grad_norm": 0.031506653875112534, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 4421 }, { "epoch": 14.009391891891893, "grad_norm": 0.0016214563511312008, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4422 }, { "epoch": 14.009425675675676, "grad_norm": 0.13821037113666534, "learning_rate": 1.25e-05, "loss": 0.0055, "step": 4423 }, { "epoch": 14.009459459459459, "grad_norm": 0.009226161986589432, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4424 }, { "epoch": 14.009493243243243, "grad_norm": 8.817242622375488, "learning_rate": 1.25e-05, "loss": 0.0186, "step": 4425 }, { "epoch": 14.009527027027026, "grad_norm": 0.13883069157600403, "learning_rate": 1.25e-05, "loss": 0.0053, "step": 4426 }, { "epoch": 14.009560810810811, "grad_norm": 0.08393929898738861, "learning_rate": 1.25e-05, "loss": 0.001, "step": 4427 }, { "epoch": 14.009594594594594, "grad_norm": 0.3462557792663574, "learning_rate": 1.25e-05, "loss": 0.0149, "step": 4428 }, { "epoch": 14.009628378378379, "grad_norm": 0.004049266688525677, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4429 }, { "epoch": 14.009662162162162, "grad_norm": 0.019350890070199966, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4430 }, { "epoch": 14.009695945945946, "grad_norm": 0.14012469351291656, "learning_rate": 1.25e-05, "loss": 0.0053, "step": 4431 }, { "epoch": 14.00972972972973, "grad_norm": 0.008965477347373962, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4432 }, { "epoch": 14.009763513513514, "grad_norm": 0.055915120989084244, "learning_rate": 1.25e-05, "loss": 0.0019, "step": 4433 }, { "epoch": 14.009797297297297, "grad_norm": 0.0022254332434386015, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 4434 }, { "epoch": 14.009831081081082, "grad_norm": 0.04356268420815468, "learning_rate": 1.25e-05, "loss": 0.0004, "step": 4435 }, { "epoch": 14.009864864864864, "grad_norm": 0.13559630513191223, "learning_rate": 1.25e-05, "loss": 0.0008, "step": 4436 }, { "epoch": 14.00989864864865, "grad_norm": 0.014856355264782906, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4437 }, { "epoch": 14.009932432432432, "grad_norm": 0.08372464030981064, "learning_rate": 1.25e-05, "loss": 0.0025, "step": 4438 }, { "epoch": 14.009966216216217, "grad_norm": 0.022200606763362885, "learning_rate": 1.25e-05, "loss": 0.0003, "step": 4439 }, { "epoch": 14.01, "grad_norm": 0.013826174661517143, "learning_rate": 1.25e-05, "loss": 0.0002, "step": 4440 }, { "epoch": 14.01, "eval_accuracy": 0.8610662358642972, "eval_loss": 0.6739630699157715, "eval_runtime": 32.143, "eval_samples_per_second": 19.258, "eval_steps_per_second": 2.427, "step": 4440 }, { "epoch": 15.000033783783783, "grad_norm": 15.861814498901367, "learning_rate": 6.25e-06, "loss": 0.0406, "step": 4441 }, { "epoch": 15.000067567567568, "grad_norm": 0.002904534572735429, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4442 }, { "epoch": 15.00010135135135, "grad_norm": 0.8067782521247864, "learning_rate": 6.25e-06, "loss": 0.0028, "step": 4443 }, { "epoch": 15.000135135135135, "grad_norm": 0.13110116124153137, "learning_rate": 6.25e-06, "loss": 0.0047, "step": 4444 }, { "epoch": 15.000168918918918, "grad_norm": 0.054247722029685974, "learning_rate": 6.25e-06, "loss": 0.0012, "step": 4445 }, { "epoch": 15.000202702702703, "grad_norm": 0.004159526899456978, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4446 }, { "epoch": 15.000236486486486, "grad_norm": 0.005816435907036066, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4447 }, { "epoch": 15.00027027027027, "grad_norm": 0.030507072806358337, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4448 }, { "epoch": 15.000304054054054, "grad_norm": 1.0976324081420898, "learning_rate": 6.25e-06, "loss": 0.0055, "step": 4449 }, { "epoch": 15.000337837837838, "grad_norm": 0.011352880857884884, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4450 }, { "epoch": 15.000371621621621, "grad_norm": 0.013483465649187565, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4451 }, { "epoch": 15.000405405405406, "grad_norm": 0.02865443006157875, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 4452 }, { "epoch": 15.000439189189189, "grad_norm": 0.10672620683908463, "learning_rate": 6.25e-06, "loss": 0.001, "step": 4453 }, { "epoch": 15.000472972972974, "grad_norm": 0.05317491292953491, "learning_rate": 6.25e-06, "loss": 0.0009, "step": 4454 }, { "epoch": 15.000506756756756, "grad_norm": 0.3632298409938812, "learning_rate": 6.25e-06, "loss": 0.0009, "step": 4455 }, { "epoch": 15.000540540540541, "grad_norm": 0.012311971746385098, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4456 }, { "epoch": 15.000574324324324, "grad_norm": 0.013703395612537861, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4457 }, { "epoch": 15.000608108108109, "grad_norm": 0.0039027289021760225, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4458 }, { "epoch": 15.000641891891892, "grad_norm": 0.09647919237613678, "learning_rate": 6.25e-06, "loss": 0.0036, "step": 4459 }, { "epoch": 15.000675675675677, "grad_norm": 0.004981071688234806, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4460 }, { "epoch": 15.00070945945946, "grad_norm": 0.002613175893202424, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4461 }, { "epoch": 15.000743243243242, "grad_norm": 22.085147857666016, "learning_rate": 6.25e-06, "loss": 0.3446, "step": 4462 }, { "epoch": 15.000777027027027, "grad_norm": 0.0027047828771173954, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4463 }, { "epoch": 15.00081081081081, "grad_norm": 0.56822669506073, "learning_rate": 6.25e-06, "loss": 0.0026, "step": 4464 }, { "epoch": 15.000844594594595, "grad_norm": 0.0021148843225091696, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4465 }, { "epoch": 15.000878378378378, "grad_norm": 0.10548834502696991, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 4466 }, { "epoch": 15.000912162162162, "grad_norm": 0.04595983028411865, "learning_rate": 6.25e-06, "loss": 0.0009, "step": 4467 }, { "epoch": 15.000945945945945, "grad_norm": 0.01818728819489479, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4468 }, { "epoch": 15.00097972972973, "grad_norm": 79.85193634033203, "learning_rate": 6.25e-06, "loss": 0.6101, "step": 4469 }, { "epoch": 15.001013513513513, "grad_norm": 0.1028623953461647, "learning_rate": 6.25e-06, "loss": 0.0039, "step": 4470 }, { "epoch": 15.001047297297298, "grad_norm": 0.6973778009414673, "learning_rate": 6.25e-06, "loss": 0.0032, "step": 4471 }, { "epoch": 15.00108108108108, "grad_norm": 0.0020326124504208565, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4472 }, { "epoch": 15.001114864864865, "grad_norm": 9.396073341369629, "learning_rate": 6.25e-06, "loss": 0.9889, "step": 4473 }, { "epoch": 15.001148648648648, "grad_norm": 1.7465826272964478, "learning_rate": 6.25e-06, "loss": 0.011, "step": 4474 }, { "epoch": 15.001182432432433, "grad_norm": 0.030607959255576134, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4475 }, { "epoch": 15.001216216216216, "grad_norm": 0.4426361918449402, "learning_rate": 6.25e-06, "loss": 0.0138, "step": 4476 }, { "epoch": 15.00125, "grad_norm": 0.0025384279433637857, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4477 }, { "epoch": 15.001283783783784, "grad_norm": 0.004818986169993877, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4478 }, { "epoch": 15.001317567567568, "grad_norm": 0.007178005762398243, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4479 }, { "epoch": 15.001351351351351, "grad_norm": 1.9952337741851807, "learning_rate": 6.25e-06, "loss": 0.1013, "step": 4480 }, { "epoch": 15.001385135135136, "grad_norm": 0.01846974715590477, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4481 }, { "epoch": 15.001418918918919, "grad_norm": 0.11749624460935593, "learning_rate": 6.25e-06, "loss": 0.004, "step": 4482 }, { "epoch": 15.001452702702704, "grad_norm": 0.951183021068573, "learning_rate": 6.25e-06, "loss": 0.0043, "step": 4483 }, { "epoch": 15.001486486486487, "grad_norm": 0.1175130158662796, "learning_rate": 6.25e-06, "loss": 0.0014, "step": 4484 }, { "epoch": 15.00152027027027, "grad_norm": 1.6471517086029053, "learning_rate": 6.25e-06, "loss": 0.0216, "step": 4485 }, { "epoch": 15.001554054054054, "grad_norm": 0.000991010689176619, "learning_rate": 6.25e-06, "loss": 0.0, "step": 4486 }, { "epoch": 15.001587837837837, "grad_norm": 34.52223587036133, "learning_rate": 6.25e-06, "loss": 0.1125, "step": 4487 }, { "epoch": 15.001621621621622, "grad_norm": 5.01108980178833, "learning_rate": 6.25e-06, "loss": 0.472, "step": 4488 }, { "epoch": 15.001655405405405, "grad_norm": 0.1488996297121048, "learning_rate": 6.25e-06, "loss": 0.0014, "step": 4489 }, { "epoch": 15.00168918918919, "grad_norm": 3.4433887004852295, "learning_rate": 6.25e-06, "loss": 0.4559, "step": 4490 }, { "epoch": 15.001722972972972, "grad_norm": 1.6887387037277222, "learning_rate": 6.25e-06, "loss": 0.0086, "step": 4491 }, { "epoch": 15.001756756756757, "grad_norm": 1.869570016860962, "learning_rate": 6.25e-06, "loss": 0.0882, "step": 4492 }, { "epoch": 15.00179054054054, "grad_norm": 0.010359824635088444, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4493 }, { "epoch": 15.001824324324325, "grad_norm": 5.026346206665039, "learning_rate": 6.25e-06, "loss": 0.0205, "step": 4494 }, { "epoch": 15.001858108108108, "grad_norm": 0.11633080989122391, "learning_rate": 6.25e-06, "loss": 0.0016, "step": 4495 }, { "epoch": 15.001891891891892, "grad_norm": 10.440007209777832, "learning_rate": 6.25e-06, "loss": 0.1207, "step": 4496 }, { "epoch": 15.001925675675675, "grad_norm": 0.11297687143087387, "learning_rate": 6.25e-06, "loss": 0.0042, "step": 4497 }, { "epoch": 15.00195945945946, "grad_norm": 2.9499094486236572, "learning_rate": 6.25e-06, "loss": 0.0065, "step": 4498 }, { "epoch": 15.001993243243243, "grad_norm": 0.018681736662983894, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4499 }, { "epoch": 15.002027027027028, "grad_norm": 43.32278060913086, "learning_rate": 6.25e-06, "loss": 0.7897, "step": 4500 }, { "epoch": 15.00206081081081, "grad_norm": 0.0029982763808220625, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4501 }, { "epoch": 15.002094594594595, "grad_norm": 0.0014884448610246181, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4502 }, { "epoch": 15.002128378378378, "grad_norm": 0.015476072207093239, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4503 }, { "epoch": 15.002162162162163, "grad_norm": 0.09461264312267303, "learning_rate": 6.25e-06, "loss": 0.0009, "step": 4504 }, { "epoch": 15.002195945945946, "grad_norm": 0.16967275738716125, "learning_rate": 6.25e-06, "loss": 0.0009, "step": 4505 }, { "epoch": 15.002229729729729, "grad_norm": 0.006835583597421646, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4506 }, { "epoch": 15.002263513513514, "grad_norm": 0.5217730402946472, "learning_rate": 6.25e-06, "loss": 0.0016, "step": 4507 }, { "epoch": 15.002297297297297, "grad_norm": 1.96640944480896, "learning_rate": 6.25e-06, "loss": 0.0183, "step": 4508 }, { "epoch": 15.002331081081081, "grad_norm": 0.0035436959005892277, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4509 }, { "epoch": 15.002364864864864, "grad_norm": 0.5622414946556091, "learning_rate": 6.25e-06, "loss": 0.024, "step": 4510 }, { "epoch": 15.002398648648649, "grad_norm": 0.2026381492614746, "learning_rate": 6.25e-06, "loss": 0.0008, "step": 4511 }, { "epoch": 15.002432432432432, "grad_norm": 7.7115092277526855, "learning_rate": 6.25e-06, "loss": 0.0482, "step": 4512 }, { "epoch": 15.002466216216217, "grad_norm": 0.05082505941390991, "learning_rate": 6.25e-06, "loss": 0.0007, "step": 4513 }, { "epoch": 15.0025, "grad_norm": 0.006989434361457825, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4514 }, { "epoch": 15.002533783783784, "grad_norm": 0.0323263555765152, "learning_rate": 6.25e-06, "loss": 0.0005, "step": 4515 }, { "epoch": 15.002567567567567, "grad_norm": 3.070457696914673, "learning_rate": 6.25e-06, "loss": 0.0499, "step": 4516 }, { "epoch": 15.002601351351352, "grad_norm": 3.928833246231079, "learning_rate": 6.25e-06, "loss": 0.0235, "step": 4517 }, { "epoch": 15.002635135135135, "grad_norm": 0.003947071265429258, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4518 }, { "epoch": 15.00266891891892, "grad_norm": 0.5847400426864624, "learning_rate": 6.25e-06, "loss": 0.0035, "step": 4519 }, { "epoch": 15.002702702702702, "grad_norm": 2.269798755645752, "learning_rate": 6.25e-06, "loss": 0.1101, "step": 4520 }, { "epoch": 15.002736486486487, "grad_norm": 0.0014590010978281498, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4521 }, { "epoch": 15.00277027027027, "grad_norm": 3.8748788833618164, "learning_rate": 6.25e-06, "loss": 0.0159, "step": 4522 }, { "epoch": 15.002804054054055, "grad_norm": 0.10669246315956116, "learning_rate": 6.25e-06, "loss": 0.0038, "step": 4523 }, { "epoch": 15.002837837837838, "grad_norm": 7.058441162109375, "learning_rate": 6.25e-06, "loss": 0.1281, "step": 4524 }, { "epoch": 15.002871621621622, "grad_norm": 0.003566714469343424, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4525 }, { "epoch": 15.002905405405405, "grad_norm": 12.184050559997559, "learning_rate": 6.25e-06, "loss": 0.0224, "step": 4526 }, { "epoch": 15.002939189189188, "grad_norm": 0.0047128889709711075, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4527 }, { "epoch": 15.002972972972973, "grad_norm": 0.16854515671730042, "learning_rate": 6.25e-06, "loss": 0.001, "step": 4528 }, { "epoch": 15.003006756756756, "grad_norm": 0.02744927816092968, "learning_rate": 6.25e-06, "loss": 0.0005, "step": 4529 }, { "epoch": 15.00304054054054, "grad_norm": 0.007192050572484732, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4530 }, { "epoch": 15.003074324324324, "grad_norm": 0.008346465416252613, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4531 }, { "epoch": 15.003108108108108, "grad_norm": 0.004154311493039131, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4532 }, { "epoch": 15.003141891891891, "grad_norm": 25.331817626953125, "learning_rate": 6.25e-06, "loss": 0.1931, "step": 4533 }, { "epoch": 15.003175675675676, "grad_norm": 0.00766139617189765, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4534 }, { "epoch": 15.003209459459459, "grad_norm": 0.15481224656105042, "learning_rate": 6.25e-06, "loss": 0.0044, "step": 4535 }, { "epoch": 15.003243243243244, "grad_norm": 0.0018949246732518077, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4536 }, { "epoch": 15.003277027027027, "grad_norm": 0.12688297033309937, "learning_rate": 6.25e-06, "loss": 0.0048, "step": 4537 }, { "epoch": 15.003310810810811, "grad_norm": 0.005355563946068287, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4538 }, { "epoch": 15.003344594594594, "grad_norm": 0.0016260474221780896, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4539 }, { "epoch": 15.003378378378379, "grad_norm": 0.28351718187332153, "learning_rate": 6.25e-06, "loss": 0.0045, "step": 4540 }, { "epoch": 15.003412162162162, "grad_norm": 0.1044502779841423, "learning_rate": 6.25e-06, "loss": 0.0039, "step": 4541 }, { "epoch": 15.003445945945947, "grad_norm": 3.041670083999634, "learning_rate": 6.25e-06, "loss": 0.1814, "step": 4542 }, { "epoch": 15.00347972972973, "grad_norm": 0.1706051081418991, "learning_rate": 6.25e-06, "loss": 0.0011, "step": 4543 }, { "epoch": 15.003513513513514, "grad_norm": 0.05163000896573067, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4544 }, { "epoch": 15.003547297297297, "grad_norm": 18.233657836914062, "learning_rate": 6.25e-06, "loss": 0.1375, "step": 4545 }, { "epoch": 15.003581081081082, "grad_norm": 0.007398364134132862, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4546 }, { "epoch": 15.003614864864865, "grad_norm": 0.02239750325679779, "learning_rate": 6.25e-06, "loss": 0.0005, "step": 4547 }, { "epoch": 15.003648648648648, "grad_norm": 0.05176186189055443, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4548 }, { "epoch": 15.003682432432432, "grad_norm": 16.200307846069336, "learning_rate": 6.25e-06, "loss": 0.0451, "step": 4549 }, { "epoch": 15.003716216216215, "grad_norm": 0.002289488213136792, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4550 }, { "epoch": 15.00375, "grad_norm": 30.185392379760742, "learning_rate": 6.25e-06, "loss": 0.6095, "step": 4551 }, { "epoch": 15.003783783783783, "grad_norm": 0.023310231044888496, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4552 }, { "epoch": 15.003817567567568, "grad_norm": 0.004903547931462526, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4553 }, { "epoch": 15.00385135135135, "grad_norm": 0.0021245970856398344, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4554 }, { "epoch": 15.003885135135135, "grad_norm": 0.0032627691980451345, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4555 }, { "epoch": 15.003918918918918, "grad_norm": 0.010323860682547092, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4556 }, { "epoch": 15.003952702702703, "grad_norm": 1.9923129081726074, "learning_rate": 6.25e-06, "loss": 0.061, "step": 4557 }, { "epoch": 15.003986486486486, "grad_norm": 0.00469401478767395, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4558 }, { "epoch": 15.00402027027027, "grad_norm": 0.002683703089132905, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4559 }, { "epoch": 15.004054054054054, "grad_norm": 0.04068021476268768, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4560 }, { "epoch": 15.004087837837838, "grad_norm": 0.008295721374452114, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4561 }, { "epoch": 15.004121621621621, "grad_norm": 0.008020860143005848, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4562 }, { "epoch": 15.004155405405406, "grad_norm": 44.3681755065918, "learning_rate": 6.25e-06, "loss": 0.6197, "step": 4563 }, { "epoch": 15.004189189189189, "grad_norm": 0.003072366351261735, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4564 }, { "epoch": 15.004222972972974, "grad_norm": 0.0525803379714489, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 4565 }, { "epoch": 15.004256756756757, "grad_norm": 9.634355545043945, "learning_rate": 6.25e-06, "loss": 0.707, "step": 4566 }, { "epoch": 15.004290540540541, "grad_norm": 0.006735958158969879, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4567 }, { "epoch": 15.004324324324324, "grad_norm": 0.0014660885790362954, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4568 }, { "epoch": 15.004358108108109, "grad_norm": 0.005903958808630705, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4569 }, { "epoch": 15.004391891891892, "grad_norm": 0.01358797401189804, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4570 }, { "epoch": 15.004425675675675, "grad_norm": 12.883622169494629, "learning_rate": 6.25e-06, "loss": 0.4656, "step": 4571 }, { "epoch": 15.00445945945946, "grad_norm": 0.03696417808532715, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 4572 }, { "epoch": 15.004493243243243, "grad_norm": 0.006009475328028202, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4573 }, { "epoch": 15.004527027027027, "grad_norm": 0.09615746885538101, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 4574 }, { "epoch": 15.00456081081081, "grad_norm": 2.361473560333252, "learning_rate": 6.25e-06, "loss": 0.0168, "step": 4575 }, { "epoch": 15.004594594594595, "grad_norm": 0.005814452655613422, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4576 }, { "epoch": 15.004628378378378, "grad_norm": 53.38449478149414, "learning_rate": 6.25e-06, "loss": 0.2289, "step": 4577 }, { "epoch": 15.004662162162163, "grad_norm": 0.008738000877201557, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4578 }, { "epoch": 15.004695945945945, "grad_norm": 0.10419132560491562, "learning_rate": 6.25e-06, "loss": 0.001, "step": 4579 }, { "epoch": 15.00472972972973, "grad_norm": 0.21943189203739166, "learning_rate": 6.25e-06, "loss": 0.004, "step": 4580 }, { "epoch": 15.004763513513513, "grad_norm": 45.51744079589844, "learning_rate": 6.25e-06, "loss": 0.3751, "step": 4581 }, { "epoch": 15.004797297297298, "grad_norm": 1.313662052154541, "learning_rate": 6.25e-06, "loss": 0.0199, "step": 4582 }, { "epoch": 15.00483108108108, "grad_norm": 0.030525904148817062, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4583 }, { "epoch": 15.004864864864865, "grad_norm": 0.0013775663683190942, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4584 }, { "epoch": 15.004898648648648, "grad_norm": 0.002994287759065628, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4585 }, { "epoch": 15.004932432432433, "grad_norm": 47.042442321777344, "learning_rate": 6.25e-06, "loss": 0.087, "step": 4586 }, { "epoch": 15.004966216216216, "grad_norm": 0.004058232065290213, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4587 }, { "epoch": 15.005, "grad_norm": 8.740854263305664, "learning_rate": 6.25e-06, "loss": 0.7976, "step": 4588 }, { "epoch": 15.005033783783784, "grad_norm": 0.006811310537159443, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4589 }, { "epoch": 15.005067567567568, "grad_norm": 3.4101755619049072, "learning_rate": 6.25e-06, "loss": 0.4784, "step": 4590 }, { "epoch": 15.005101351351351, "grad_norm": 0.06651902198791504, "learning_rate": 6.25e-06, "loss": 0.0005, "step": 4591 }, { "epoch": 15.005135135135134, "grad_norm": 0.005318783689290285, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4592 }, { "epoch": 15.005168918918919, "grad_norm": 0.8892159461975098, "learning_rate": 6.25e-06, "loss": 0.0286, "step": 4593 }, { "epoch": 15.005202702702702, "grad_norm": 8.867003440856934, "learning_rate": 6.25e-06, "loss": 0.3816, "step": 4594 }, { "epoch": 15.005236486486487, "grad_norm": 0.0022752671502530575, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4595 }, { "epoch": 15.00527027027027, "grad_norm": 0.009292064234614372, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4596 }, { "epoch": 15.005304054054054, "grad_norm": 0.005000557284802198, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4597 }, { "epoch": 15.005337837837837, "grad_norm": 0.07431486994028091, "learning_rate": 6.25e-06, "loss": 0.0017, "step": 4598 }, { "epoch": 15.005371621621622, "grad_norm": 0.0033333459869027138, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4599 }, { "epoch": 15.005405405405405, "grad_norm": 0.45255014300346375, "learning_rate": 6.25e-06, "loss": 0.0071, "step": 4600 }, { "epoch": 15.00543918918919, "grad_norm": 0.020977802574634552, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4601 }, { "epoch": 15.005472972972973, "grad_norm": 0.1296669840812683, "learning_rate": 6.25e-06, "loss": 0.0049, "step": 4602 }, { "epoch": 15.005506756756757, "grad_norm": 0.053411275148391724, "learning_rate": 6.25e-06, "loss": 0.0007, "step": 4603 }, { "epoch": 15.00554054054054, "grad_norm": 0.0012309413868933916, "learning_rate": 6.25e-06, "loss": 0.0, "step": 4604 }, { "epoch": 15.005574324324325, "grad_norm": 0.0019311928190290928, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4605 }, { "epoch": 15.005608108108108, "grad_norm": 0.7785077691078186, "learning_rate": 6.25e-06, "loss": 0.0273, "step": 4606 }, { "epoch": 15.005641891891893, "grad_norm": 0.021822722628712654, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4607 }, { "epoch": 15.005675675675676, "grad_norm": 0.5561726689338684, "learning_rate": 6.25e-06, "loss": 0.0018, "step": 4608 }, { "epoch": 15.00570945945946, "grad_norm": 7.977023124694824, "learning_rate": 6.25e-06, "loss": 0.1347, "step": 4609 }, { "epoch": 15.005743243243243, "grad_norm": 1.6002050638198853, "learning_rate": 6.25e-06, "loss": 0.0131, "step": 4610 }, { "epoch": 15.005777027027028, "grad_norm": 0.3319033682346344, "learning_rate": 6.25e-06, "loss": 0.0129, "step": 4611 }, { "epoch": 15.00581081081081, "grad_norm": 0.01663653925061226, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4612 }, { "epoch": 15.005844594594594, "grad_norm": 0.1266011893749237, "learning_rate": 6.25e-06, "loss": 0.0048, "step": 4613 }, { "epoch": 15.005878378378378, "grad_norm": 1.4177260398864746, "learning_rate": 6.25e-06, "loss": 0.0192, "step": 4614 }, { "epoch": 15.005912162162161, "grad_norm": 0.0032893058378249407, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4615 }, { "epoch": 15.005945945945946, "grad_norm": 0.04875658452510834, "learning_rate": 6.25e-06, "loss": 0.0005, "step": 4616 }, { "epoch": 15.005979729729729, "grad_norm": 0.03248412534594536, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4617 }, { "epoch": 15.006013513513514, "grad_norm": 0.0017231458332389593, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4618 }, { "epoch": 15.006047297297297, "grad_norm": 0.0020604445599019527, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4619 }, { "epoch": 15.006081081081081, "grad_norm": 0.2118757963180542, "learning_rate": 6.25e-06, "loss": 0.0034, "step": 4620 }, { "epoch": 15.006114864864864, "grad_norm": 0.0038696948904544115, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4621 }, { "epoch": 15.006148648648649, "grad_norm": 0.00480728130787611, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4622 }, { "epoch": 15.006182432432432, "grad_norm": 0.002723169280216098, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4623 }, { "epoch": 15.006216216216217, "grad_norm": 0.0624435618519783, "learning_rate": 6.25e-06, "loss": 0.0014, "step": 4624 }, { "epoch": 15.00625, "grad_norm": 0.016325298696756363, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4625 }, { "epoch": 15.006283783783784, "grad_norm": 0.00450735492631793, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4626 }, { "epoch": 15.006317567567567, "grad_norm": 0.011854210868477821, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4627 }, { "epoch": 15.006351351351352, "grad_norm": 4.363121509552002, "learning_rate": 6.25e-06, "loss": 0.0368, "step": 4628 }, { "epoch": 15.006385135135135, "grad_norm": 0.04008744657039642, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4629 }, { "epoch": 15.00641891891892, "grad_norm": 0.0014623526949435472, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4630 }, { "epoch": 15.006452702702703, "grad_norm": 19.56882667541504, "learning_rate": 6.25e-06, "loss": 0.794, "step": 4631 }, { "epoch": 15.006486486486487, "grad_norm": 7.025686740875244, "learning_rate": 6.25e-06, "loss": 0.8052, "step": 4632 }, { "epoch": 15.00652027027027, "grad_norm": 10.280952453613281, "learning_rate": 6.25e-06, "loss": 0.8835, "step": 4633 }, { "epoch": 15.006554054054053, "grad_norm": 0.004971915390342474, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4634 }, { "epoch": 15.006587837837838, "grad_norm": 0.10587028414011002, "learning_rate": 6.25e-06, "loss": 0.004, "step": 4635 }, { "epoch": 15.00662162162162, "grad_norm": 0.13319407403469086, "learning_rate": 6.25e-06, "loss": 0.0011, "step": 4636 }, { "epoch": 15.006655405405406, "grad_norm": 36.11038589477539, "learning_rate": 6.25e-06, "loss": 0.3259, "step": 4637 }, { "epoch": 15.006689189189188, "grad_norm": 0.006008595693856478, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4638 }, { "epoch": 15.006722972972973, "grad_norm": 10.429726600646973, "learning_rate": 6.25e-06, "loss": 0.6599, "step": 4639 }, { "epoch": 15.006756756756756, "grad_norm": 1.4640934467315674, "learning_rate": 6.25e-06, "loss": 0.0023, "step": 4640 }, { "epoch": 15.00679054054054, "grad_norm": 0.0024422022979706526, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4641 }, { "epoch": 15.006824324324324, "grad_norm": 0.006120575126260519, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4642 }, { "epoch": 15.006858108108108, "grad_norm": 0.06787228584289551, "learning_rate": 6.25e-06, "loss": 0.002, "step": 4643 }, { "epoch": 15.006891891891891, "grad_norm": 0.0013140452792868018, "learning_rate": 6.25e-06, "loss": 0.0, "step": 4644 }, { "epoch": 15.006925675675676, "grad_norm": 0.0028057836461812258, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4645 }, { "epoch": 15.006959459459459, "grad_norm": 0.01683139242231846, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4646 }, { "epoch": 15.006993243243244, "grad_norm": 0.11909568309783936, "learning_rate": 6.25e-06, "loss": 0.0045, "step": 4647 }, { "epoch": 15.007027027027027, "grad_norm": 0.041994526982307434, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4648 }, { "epoch": 15.007060810810811, "grad_norm": 0.02645263448357582, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4649 }, { "epoch": 15.007094594594594, "grad_norm": 0.02273860201239586, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4650 }, { "epoch": 15.007128378378379, "grad_norm": 0.0024514845572412014, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4651 }, { "epoch": 15.007162162162162, "grad_norm": 0.2311098426580429, "learning_rate": 6.25e-06, "loss": 0.0008, "step": 4652 }, { "epoch": 15.007195945945947, "grad_norm": 0.3682534694671631, "learning_rate": 6.25e-06, "loss": 0.0112, "step": 4653 }, { "epoch": 15.00722972972973, "grad_norm": 11.091631889343262, "learning_rate": 6.25e-06, "loss": 1.1178, "step": 4654 }, { "epoch": 15.007263513513514, "grad_norm": 0.019316567108035088, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4655 }, { "epoch": 15.007297297297297, "grad_norm": 0.00736482348293066, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4656 }, { "epoch": 15.00733108108108, "grad_norm": 0.010966218076646328, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4657 }, { "epoch": 15.007364864864865, "grad_norm": 36.382328033447266, "learning_rate": 6.25e-06, "loss": 0.1265, "step": 4658 }, { "epoch": 15.007398648648648, "grad_norm": 0.1517287790775299, "learning_rate": 6.25e-06, "loss": 0.003, "step": 4659 }, { "epoch": 15.007432432432433, "grad_norm": 0.005232144147157669, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4660 }, { "epoch": 15.007466216216216, "grad_norm": 0.13664788007736206, "learning_rate": 6.25e-06, "loss": 0.0013, "step": 4661 }, { "epoch": 15.0075, "grad_norm": 0.11069425940513611, "learning_rate": 6.25e-06, "loss": 0.0008, "step": 4662 }, { "epoch": 15.007533783783783, "grad_norm": 0.35364532470703125, "learning_rate": 6.25e-06, "loss": 0.0075, "step": 4663 }, { "epoch": 15.007567567567568, "grad_norm": 0.002760282251983881, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4664 }, { "epoch": 15.00760135135135, "grad_norm": 0.45119091868400574, "learning_rate": 6.25e-06, "loss": 0.0125, "step": 4665 }, { "epoch": 15.007635135135136, "grad_norm": 0.006380004342645407, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4666 }, { "epoch": 15.007668918918919, "grad_norm": 5.180939197540283, "learning_rate": 6.25e-06, "loss": 0.466, "step": 4667 }, { "epoch": 15.007702702702703, "grad_norm": 0.0019108002306893468, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4668 }, { "epoch": 15.007736486486486, "grad_norm": 0.0024586382787674665, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4669 }, { "epoch": 15.00777027027027, "grad_norm": 1.0922636985778809, "learning_rate": 6.25e-06, "loss": 0.0061, "step": 4670 }, { "epoch": 15.007804054054054, "grad_norm": 0.004679888021200895, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4671 }, { "epoch": 15.007837837837839, "grad_norm": 0.013879546895623207, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4672 }, { "epoch": 15.007871621621621, "grad_norm": 2.972341537475586, "learning_rate": 6.25e-06, "loss": 0.1135, "step": 4673 }, { "epoch": 15.007905405405406, "grad_norm": 0.0058349668979644775, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4674 }, { "epoch": 15.00793918918919, "grad_norm": 0.5334742069244385, "learning_rate": 6.25e-06, "loss": 0.0047, "step": 4675 }, { "epoch": 15.007972972972974, "grad_norm": 27.167953491210938, "learning_rate": 6.25e-06, "loss": 0.106, "step": 4676 }, { "epoch": 15.008006756756757, "grad_norm": 0.08461785316467285, "learning_rate": 6.25e-06, "loss": 0.0009, "step": 4677 }, { "epoch": 15.00804054054054, "grad_norm": 0.04979405179619789, "learning_rate": 6.25e-06, "loss": 0.0005, "step": 4678 }, { "epoch": 15.008074324324324, "grad_norm": 0.004135767929255962, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4679 }, { "epoch": 15.008108108108107, "grad_norm": 0.010168968699872494, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4680 }, { "epoch": 15.008141891891892, "grad_norm": 0.006945531349629164, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4681 }, { "epoch": 15.008175675675675, "grad_norm": 0.4607633352279663, "learning_rate": 6.25e-06, "loss": 0.0025, "step": 4682 }, { "epoch": 15.00820945945946, "grad_norm": 0.04114541783928871, "learning_rate": 6.25e-06, "loss": 0.0005, "step": 4683 }, { "epoch": 15.008243243243243, "grad_norm": 0.058433666825294495, "learning_rate": 6.25e-06, "loss": 0.001, "step": 4684 }, { "epoch": 15.008277027027027, "grad_norm": 0.014658059924840927, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4685 }, { "epoch": 15.00831081081081, "grad_norm": 0.02120078355073929, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4686 }, { "epoch": 15.008344594594595, "grad_norm": 0.6607486009597778, "learning_rate": 6.25e-06, "loss": 0.0025, "step": 4687 }, { "epoch": 15.008378378378378, "grad_norm": 0.06909993290901184, "learning_rate": 6.25e-06, "loss": 0.0013, "step": 4688 }, { "epoch": 15.008412162162163, "grad_norm": 12.12982177734375, "learning_rate": 6.25e-06, "loss": 0.0218, "step": 4689 }, { "epoch": 15.008445945945946, "grad_norm": 0.01856200024485588, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4690 }, { "epoch": 15.00847972972973, "grad_norm": 5.9079084396362305, "learning_rate": 6.25e-06, "loss": 0.2845, "step": 4691 }, { "epoch": 15.008513513513513, "grad_norm": 0.041660238057374954, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 4692 }, { "epoch": 15.008547297297298, "grad_norm": 7.791632652282715, "learning_rate": 6.25e-06, "loss": 0.0285, "step": 4693 }, { "epoch": 15.008581081081081, "grad_norm": 37.970176696777344, "learning_rate": 6.25e-06, "loss": 0.1534, "step": 4694 }, { "epoch": 15.008614864864866, "grad_norm": 0.037319913506507874, "learning_rate": 6.25e-06, "loss": 0.0005, "step": 4695 }, { "epoch": 15.008648648648649, "grad_norm": 0.01148934569209814, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4696 }, { "epoch": 15.008682432432433, "grad_norm": 0.05439343303442001, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4697 }, { "epoch": 15.008716216216216, "grad_norm": 0.02538127638399601, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4698 }, { "epoch": 15.00875, "grad_norm": 0.08539656549692154, "learning_rate": 6.25e-06, "loss": 0.0008, "step": 4699 }, { "epoch": 15.008783783783784, "grad_norm": 0.11781188100576401, "learning_rate": 6.25e-06, "loss": 0.0045, "step": 4700 }, { "epoch": 15.008817567567567, "grad_norm": 0.0060653951950371265, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4701 }, { "epoch": 15.008851351351352, "grad_norm": 0.11968618631362915, "learning_rate": 6.25e-06, "loss": 0.0044, "step": 4702 }, { "epoch": 15.008885135135134, "grad_norm": 4.718908309936523, "learning_rate": 6.25e-06, "loss": 0.0325, "step": 4703 }, { "epoch": 15.00891891891892, "grad_norm": 0.0034244798589497805, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4704 }, { "epoch": 15.008952702702702, "grad_norm": 0.0023041609674692154, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4705 }, { "epoch": 15.008986486486487, "grad_norm": 0.0076097180135548115, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4706 }, { "epoch": 15.00902027027027, "grad_norm": 0.004351182375103235, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4707 }, { "epoch": 15.009054054054054, "grad_norm": 0.0035164374858140945, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4708 }, { "epoch": 15.009087837837837, "grad_norm": 0.015522463247179985, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4709 }, { "epoch": 15.009121621621622, "grad_norm": 0.0357326865196228, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 4710 }, { "epoch": 15.009155405405405, "grad_norm": 10.661150932312012, "learning_rate": 6.25e-06, "loss": 0.9519, "step": 4711 }, { "epoch": 15.00918918918919, "grad_norm": 3.6753811836242676, "learning_rate": 6.25e-06, "loss": 0.1021, "step": 4712 }, { "epoch": 15.009222972972973, "grad_norm": 9.049445152282715, "learning_rate": 6.25e-06, "loss": 0.0305, "step": 4713 }, { "epoch": 15.009256756756757, "grad_norm": 0.008127203211188316, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4714 }, { "epoch": 15.00929054054054, "grad_norm": 0.10382288694381714, "learning_rate": 6.25e-06, "loss": 0.0038, "step": 4715 }, { "epoch": 15.009324324324325, "grad_norm": 0.00117278634570539, "learning_rate": 6.25e-06, "loss": 0.0, "step": 4716 }, { "epoch": 15.009358108108108, "grad_norm": 0.00887207966297865, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4717 }, { "epoch": 15.009391891891893, "grad_norm": 0.004123630002140999, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4718 }, { "epoch": 15.009425675675676, "grad_norm": 0.032937563955783844, "learning_rate": 6.25e-06, "loss": 0.0007, "step": 4719 }, { "epoch": 15.009459459459459, "grad_norm": 0.004775387700647116, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4720 }, { "epoch": 15.009493243243243, "grad_norm": 0.003491084324195981, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4721 }, { "epoch": 15.009527027027026, "grad_norm": 2.8644473552703857, "learning_rate": 6.25e-06, "loss": 0.0157, "step": 4722 }, { "epoch": 15.009560810810811, "grad_norm": 32.349056243896484, "learning_rate": 6.25e-06, "loss": 0.0737, "step": 4723 }, { "epoch": 15.009594594594594, "grad_norm": 0.032140735536813736, "learning_rate": 6.25e-06, "loss": 0.0007, "step": 4724 }, { "epoch": 15.009628378378379, "grad_norm": 0.0033825954888015985, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4725 }, { "epoch": 15.009662162162162, "grad_norm": 23.15279197692871, "learning_rate": 6.25e-06, "loss": 0.488, "step": 4726 }, { "epoch": 15.009695945945946, "grad_norm": 0.014191142283380032, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4727 }, { "epoch": 15.00972972972973, "grad_norm": 0.8716919422149658, "learning_rate": 6.25e-06, "loss": 0.0107, "step": 4728 }, { "epoch": 15.009763513513514, "grad_norm": 0.04595033824443817, "learning_rate": 6.25e-06, "loss": 0.001, "step": 4729 }, { "epoch": 15.009797297297297, "grad_norm": 0.0012495709815993905, "learning_rate": 6.25e-06, "loss": 0.0, "step": 4730 }, { "epoch": 15.009831081081082, "grad_norm": 0.07143665105104446, "learning_rate": 6.25e-06, "loss": 0.0009, "step": 4731 }, { "epoch": 15.009864864864864, "grad_norm": 19.533954620361328, "learning_rate": 6.25e-06, "loss": 0.9613, "step": 4732 }, { "epoch": 15.00989864864865, "grad_norm": 0.36829742789268494, "learning_rate": 6.25e-06, "loss": 0.0039, "step": 4733 }, { "epoch": 15.009932432432432, "grad_norm": 0.1933998465538025, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 4734 }, { "epoch": 15.009966216216217, "grad_norm": 0.0031932636629790068, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4735 }, { "epoch": 15.01, "grad_norm": 0.0019139543874189258, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4736 }, { "epoch": 15.01, "eval_accuracy": 0.8949919224555735, "eval_loss": 0.534601628780365, "eval_runtime": 32.2026, "eval_samples_per_second": 19.222, "eval_steps_per_second": 2.422, "step": 4736 }, { "epoch": 16.000033783783785, "grad_norm": 0.003607441671192646, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4737 }, { "epoch": 16.000067567567566, "grad_norm": 0.01054768543690443, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4738 }, { "epoch": 16.00010135135135, "grad_norm": 0.0022271948400884867, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4739 }, { "epoch": 16.000135135135135, "grad_norm": 0.006086671259254217, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4740 }, { "epoch": 16.00016891891892, "grad_norm": 4.5677361488342285, "learning_rate": 6.25e-06, "loss": 0.0158, "step": 4741 }, { "epoch": 16.0002027027027, "grad_norm": 0.11750128865242004, "learning_rate": 6.25e-06, "loss": 0.0048, "step": 4742 }, { "epoch": 16.000236486486486, "grad_norm": 0.003960256930440664, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4743 }, { "epoch": 16.00027027027027, "grad_norm": 0.021768268197774887, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4744 }, { "epoch": 16.000304054054055, "grad_norm": 0.004918972961604595, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4745 }, { "epoch": 16.000337837837836, "grad_norm": 0.03256123512983322, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4746 }, { "epoch": 16.00037162162162, "grad_norm": 0.08106236904859543, "learning_rate": 6.25e-06, "loss": 0.0007, "step": 4747 }, { "epoch": 16.000405405405406, "grad_norm": 0.030695972964167595, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4748 }, { "epoch": 16.00043918918919, "grad_norm": 0.14733654260635376, "learning_rate": 6.25e-06, "loss": 0.0028, "step": 4749 }, { "epoch": 16.00047297297297, "grad_norm": 46.91869354248047, "learning_rate": 6.25e-06, "loss": 0.3025, "step": 4750 }, { "epoch": 16.000506756756756, "grad_norm": 0.0012546455254778266, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4751 }, { "epoch": 16.00054054054054, "grad_norm": 0.001810239627957344, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4752 }, { "epoch": 16.000574324324326, "grad_norm": 0.001530874869786203, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4753 }, { "epoch": 16.000608108108107, "grad_norm": 0.06499601155519485, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4754 }, { "epoch": 16.000641891891892, "grad_norm": 8.329206466674805, "learning_rate": 6.25e-06, "loss": 0.1154, "step": 4755 }, { "epoch": 16.000675675675677, "grad_norm": 0.004113026428967714, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4756 }, { "epoch": 16.00070945945946, "grad_norm": 0.06913558393716812, "learning_rate": 6.25e-06, "loss": 0.0008, "step": 4757 }, { "epoch": 16.000743243243242, "grad_norm": 0.13601937890052795, "learning_rate": 6.25e-06, "loss": 0.0024, "step": 4758 }, { "epoch": 16.000777027027027, "grad_norm": 0.1040147989988327, "learning_rate": 6.25e-06, "loss": 0.0016, "step": 4759 }, { "epoch": 16.000810810810812, "grad_norm": 0.8813984990119934, "learning_rate": 6.25e-06, "loss": 0.0123, "step": 4760 }, { "epoch": 16.000844594594593, "grad_norm": 0.6144726872444153, "learning_rate": 6.25e-06, "loss": 0.0028, "step": 4761 }, { "epoch": 16.000878378378378, "grad_norm": 0.004736985545605421, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4762 }, { "epoch": 16.000912162162162, "grad_norm": 0.008880642242729664, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4763 }, { "epoch": 16.000945945945947, "grad_norm": 0.006937429774552584, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4764 }, { "epoch": 16.00097972972973, "grad_norm": 0.1638706624507904, "learning_rate": 6.25e-06, "loss": 0.0008, "step": 4765 }, { "epoch": 16.001013513513513, "grad_norm": 0.611445426940918, "learning_rate": 6.25e-06, "loss": 0.0018, "step": 4766 }, { "epoch": 16.001047297297298, "grad_norm": 0.0026785833761096, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4767 }, { "epoch": 16.001081081081082, "grad_norm": 0.05181396007537842, "learning_rate": 6.25e-06, "loss": 0.0017, "step": 4768 }, { "epoch": 16.001114864864864, "grad_norm": 0.003600158030167222, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4769 }, { "epoch": 16.00114864864865, "grad_norm": 0.03879549726843834, "learning_rate": 6.25e-06, "loss": 0.0008, "step": 4770 }, { "epoch": 16.001182432432433, "grad_norm": 28.154409408569336, "learning_rate": 6.25e-06, "loss": 0.0891, "step": 4771 }, { "epoch": 16.001216216216218, "grad_norm": 0.011504840105772018, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4772 }, { "epoch": 16.00125, "grad_norm": 0.001523679937236011, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4773 }, { "epoch": 16.001283783783784, "grad_norm": 4.2115397453308105, "learning_rate": 6.25e-06, "loss": 0.0637, "step": 4774 }, { "epoch": 16.00131756756757, "grad_norm": 0.11935101449489594, "learning_rate": 6.25e-06, "loss": 0.0046, "step": 4775 }, { "epoch": 16.001351351351353, "grad_norm": 17.343931198120117, "learning_rate": 6.25e-06, "loss": 0.1291, "step": 4776 }, { "epoch": 16.001385135135134, "grad_norm": 0.005520268343389034, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4777 }, { "epoch": 16.00141891891892, "grad_norm": 0.004906759597361088, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4778 }, { "epoch": 16.001452702702704, "grad_norm": 0.0067098443396389484, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4779 }, { "epoch": 16.001486486486485, "grad_norm": 0.2074376344680786, "learning_rate": 6.25e-06, "loss": 0.0015, "step": 4780 }, { "epoch": 16.00152027027027, "grad_norm": 0.009150751866400242, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4781 }, { "epoch": 16.001554054054054, "grad_norm": 0.17550630867481232, "learning_rate": 6.25e-06, "loss": 0.0008, "step": 4782 }, { "epoch": 16.00158783783784, "grad_norm": 0.005203919019550085, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4783 }, { "epoch": 16.00162162162162, "grad_norm": 0.006934330798685551, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4784 }, { "epoch": 16.001655405405405, "grad_norm": 0.7627490162849426, "learning_rate": 6.25e-06, "loss": 0.0103, "step": 4785 }, { "epoch": 16.00168918918919, "grad_norm": 0.12190832197666168, "learning_rate": 6.25e-06, "loss": 0.0013, "step": 4786 }, { "epoch": 16.001722972972974, "grad_norm": 0.0024744346737861633, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4787 }, { "epoch": 16.001756756756755, "grad_norm": 0.0042849378660321236, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4788 }, { "epoch": 16.00179054054054, "grad_norm": 0.010181673802435398, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4789 }, { "epoch": 16.001824324324325, "grad_norm": 0.013069129548966885, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4790 }, { "epoch": 16.00185810810811, "grad_norm": 0.0951438918709755, "learning_rate": 6.25e-06, "loss": 0.0036, "step": 4791 }, { "epoch": 16.00189189189189, "grad_norm": 0.019681783393025398, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4792 }, { "epoch": 16.001925675675675, "grad_norm": 0.001885498408228159, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4793 }, { "epoch": 16.00195945945946, "grad_norm": 0.046800266951322556, "learning_rate": 6.25e-06, "loss": 0.0008, "step": 4794 }, { "epoch": 16.001993243243245, "grad_norm": 0.33351248502731323, "learning_rate": 6.25e-06, "loss": 0.0089, "step": 4795 }, { "epoch": 16.002027027027026, "grad_norm": 0.39327943325042725, "learning_rate": 6.25e-06, "loss": 0.0032, "step": 4796 }, { "epoch": 16.00206081081081, "grad_norm": 0.05085602030158043, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4797 }, { "epoch": 16.002094594594595, "grad_norm": 0.745928943157196, "learning_rate": 6.25e-06, "loss": 0.0187, "step": 4798 }, { "epoch": 16.00212837837838, "grad_norm": 0.28835949301719666, "learning_rate": 6.25e-06, "loss": 0.0068, "step": 4799 }, { "epoch": 16.00216216216216, "grad_norm": 0.0056665451265871525, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4800 }, { "epoch": 16.002195945945946, "grad_norm": 0.013359354808926582, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4801 }, { "epoch": 16.00222972972973, "grad_norm": 36.42983627319336, "learning_rate": 6.25e-06, "loss": 0.7756, "step": 4802 }, { "epoch": 16.002263513513512, "grad_norm": 0.003991033881902695, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4803 }, { "epoch": 16.002297297297297, "grad_norm": 0.002787065925076604, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4804 }, { "epoch": 16.00233108108108, "grad_norm": 0.0060640973970294, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4805 }, { "epoch": 16.002364864864866, "grad_norm": 0.001732114003971219, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4806 }, { "epoch": 16.002398648648647, "grad_norm": 0.030669130384922028, "learning_rate": 6.25e-06, "loss": 0.0005, "step": 4807 }, { "epoch": 16.002432432432432, "grad_norm": 5.212095260620117, "learning_rate": 6.25e-06, "loss": 0.008, "step": 4808 }, { "epoch": 16.002466216216217, "grad_norm": 24.889198303222656, "learning_rate": 6.25e-06, "loss": 0.2462, "step": 4809 }, { "epoch": 16.0025, "grad_norm": 0.004667895846068859, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4810 }, { "epoch": 16.002533783783782, "grad_norm": 0.013397675938904285, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4811 }, { "epoch": 16.002567567567567, "grad_norm": 0.41899389028549194, "learning_rate": 6.25e-06, "loss": 0.0062, "step": 4812 }, { "epoch": 16.002601351351352, "grad_norm": 0.0039589726366102695, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4813 }, { "epoch": 16.002635135135137, "grad_norm": 0.6602737903594971, "learning_rate": 6.25e-06, "loss": 0.0114, "step": 4814 }, { "epoch": 16.002668918918918, "grad_norm": 0.1328544169664383, "learning_rate": 6.25e-06, "loss": 0.0017, "step": 4815 }, { "epoch": 16.002702702702702, "grad_norm": 0.005783020053058863, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4816 }, { "epoch": 16.002736486486487, "grad_norm": 0.008518512360751629, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4817 }, { "epoch": 16.002770270270272, "grad_norm": 0.9480181932449341, "learning_rate": 6.25e-06, "loss": 0.0032, "step": 4818 }, { "epoch": 16.002804054054053, "grad_norm": 0.2147621363401413, "learning_rate": 6.25e-06, "loss": 0.0011, "step": 4819 }, { "epoch": 16.002837837837838, "grad_norm": 0.03712862730026245, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 4820 }, { "epoch": 16.002871621621622, "grad_norm": 0.002262680558487773, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4821 }, { "epoch": 16.002905405405407, "grad_norm": 32.414215087890625, "learning_rate": 6.25e-06, "loss": 0.5474, "step": 4822 }, { "epoch": 16.00293918918919, "grad_norm": 0.10650510340929031, "learning_rate": 6.25e-06, "loss": 0.0028, "step": 4823 }, { "epoch": 16.002972972972973, "grad_norm": 0.002878943458199501, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4824 }, { "epoch": 16.003006756756758, "grad_norm": 0.009672933258116245, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4825 }, { "epoch": 16.00304054054054, "grad_norm": 1.4707744121551514, "learning_rate": 6.25e-06, "loss": 0.006, "step": 4826 }, { "epoch": 16.003074324324324, "grad_norm": 0.0059664491564035416, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4827 }, { "epoch": 16.00310810810811, "grad_norm": 6.603366851806641, "learning_rate": 6.25e-06, "loss": 0.581, "step": 4828 }, { "epoch": 16.003141891891893, "grad_norm": 0.19398146867752075, "learning_rate": 6.25e-06, "loss": 0.0008, "step": 4829 }, { "epoch": 16.003175675675674, "grad_norm": 0.001884488738141954, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4830 }, { "epoch": 16.00320945945946, "grad_norm": 0.08472905308008194, "learning_rate": 6.25e-06, "loss": 0.0032, "step": 4831 }, { "epoch": 16.003243243243244, "grad_norm": 0.005220440216362476, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4832 }, { "epoch": 16.00327702702703, "grad_norm": 5.220009803771973, "learning_rate": 6.25e-06, "loss": 0.5471, "step": 4833 }, { "epoch": 16.00331081081081, "grad_norm": 3.5764620304107666, "learning_rate": 6.25e-06, "loss": 0.0161, "step": 4834 }, { "epoch": 16.003344594594594, "grad_norm": 0.0021023168228566647, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4835 }, { "epoch": 16.00337837837838, "grad_norm": 0.010062986984848976, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4836 }, { "epoch": 16.003412162162164, "grad_norm": 0.09952521324157715, "learning_rate": 6.25e-06, "loss": 0.0037, "step": 4837 }, { "epoch": 16.003445945945945, "grad_norm": 0.12663207948207855, "learning_rate": 6.25e-06, "loss": 0.0033, "step": 4838 }, { "epoch": 16.00347972972973, "grad_norm": 0.0029086016584187746, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4839 }, { "epoch": 16.003513513513514, "grad_norm": 0.008593481034040451, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4840 }, { "epoch": 16.0035472972973, "grad_norm": 11.560088157653809, "learning_rate": 6.25e-06, "loss": 0.081, "step": 4841 }, { "epoch": 16.00358108108108, "grad_norm": 0.1130993664264679, "learning_rate": 6.25e-06, "loss": 0.0042, "step": 4842 }, { "epoch": 16.003614864864865, "grad_norm": 0.013944506645202637, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4843 }, { "epoch": 16.00364864864865, "grad_norm": 0.0020039756782352924, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4844 }, { "epoch": 16.00368243243243, "grad_norm": 0.027687152847647667, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4845 }, { "epoch": 16.003716216216215, "grad_norm": 0.0024019319098442793, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4846 }, { "epoch": 16.00375, "grad_norm": 0.005939173512160778, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4847 }, { "epoch": 16.003783783783785, "grad_norm": 32.68171310424805, "learning_rate": 6.25e-06, "loss": 0.0708, "step": 4848 }, { "epoch": 16.003817567567566, "grad_norm": 3.326810598373413, "learning_rate": 6.25e-06, "loss": 0.0083, "step": 4849 }, { "epoch": 16.00385135135135, "grad_norm": 0.009277992881834507, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4850 }, { "epoch": 16.003885135135135, "grad_norm": 4.027780055999756, "learning_rate": 6.25e-06, "loss": 0.4798, "step": 4851 }, { "epoch": 16.00391891891892, "grad_norm": 0.006395410746335983, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4852 }, { "epoch": 16.0039527027027, "grad_norm": 0.6238611340522766, "learning_rate": 6.25e-06, "loss": 0.0022, "step": 4853 }, { "epoch": 16.003986486486486, "grad_norm": 25.408681869506836, "learning_rate": 6.25e-06, "loss": 0.0913, "step": 4854 }, { "epoch": 16.00402027027027, "grad_norm": 0.019738510251045227, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4855 }, { "epoch": 16.004054054054055, "grad_norm": 0.01485142856836319, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4856 }, { "epoch": 16.004087837837837, "grad_norm": 0.10546667128801346, "learning_rate": 6.25e-06, "loss": 0.004, "step": 4857 }, { "epoch": 16.00412162162162, "grad_norm": 64.85414123535156, "learning_rate": 6.25e-06, "loss": 0.1493, "step": 4858 }, { "epoch": 16.004155405405406, "grad_norm": 0.040331337600946426, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4859 }, { "epoch": 16.00418918918919, "grad_norm": 0.04870777204632759, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4860 }, { "epoch": 16.004222972972972, "grad_norm": 0.013620156794786453, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4861 }, { "epoch": 16.004256756756757, "grad_norm": 0.0046367221511900425, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4862 }, { "epoch": 16.00429054054054, "grad_norm": 0.03249865397810936, "learning_rate": 6.25e-06, "loss": 0.0005, "step": 4863 }, { "epoch": 16.004324324324326, "grad_norm": 3.652353525161743, "learning_rate": 6.25e-06, "loss": 0.0089, "step": 4864 }, { "epoch": 16.004358108108107, "grad_norm": 0.0015918437857180834, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4865 }, { "epoch": 16.004391891891892, "grad_norm": 0.06365970522165298, "learning_rate": 6.25e-06, "loss": 0.002, "step": 4866 }, { "epoch": 16.004425675675677, "grad_norm": 0.0034230982419103384, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4867 }, { "epoch": 16.004459459459458, "grad_norm": 1.2369979619979858, "learning_rate": 6.25e-06, "loss": 0.0034, "step": 4868 }, { "epoch": 16.004493243243243, "grad_norm": 0.005615451373159885, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4869 }, { "epoch": 16.004527027027027, "grad_norm": 0.006123765371739864, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4870 }, { "epoch": 16.004560810810812, "grad_norm": 0.0017031722236424685, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4871 }, { "epoch": 16.004594594594593, "grad_norm": 9.894142150878906, "learning_rate": 6.25e-06, "loss": 0.116, "step": 4872 }, { "epoch": 16.004628378378378, "grad_norm": 0.2373734563589096, "learning_rate": 6.25e-06, "loss": 0.0051, "step": 4873 }, { "epoch": 16.004662162162163, "grad_norm": 0.008108142763376236, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4874 }, { "epoch": 16.004695945945947, "grad_norm": 0.13117899000644684, "learning_rate": 6.25e-06, "loss": 0.0005, "step": 4875 }, { "epoch": 16.00472972972973, "grad_norm": 0.015460312366485596, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4876 }, { "epoch": 16.004763513513513, "grad_norm": 0.03752795606851578, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4877 }, { "epoch": 16.004797297297298, "grad_norm": 0.0019474922446534038, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4878 }, { "epoch": 16.004831081081083, "grad_norm": 0.005944485310465097, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4879 }, { "epoch": 16.004864864864864, "grad_norm": 10.093334197998047, "learning_rate": 6.25e-06, "loss": 0.2507, "step": 4880 }, { "epoch": 16.00489864864865, "grad_norm": 0.0047017596662044525, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4881 }, { "epoch": 16.004932432432433, "grad_norm": 0.057827986776828766, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 4882 }, { "epoch": 16.004966216216218, "grad_norm": 0.00940078217536211, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4883 }, { "epoch": 16.005, "grad_norm": 0.006624091416597366, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4884 }, { "epoch": 16.005033783783784, "grad_norm": 21.071338653564453, "learning_rate": 6.25e-06, "loss": 0.6019, "step": 4885 }, { "epoch": 16.00506756756757, "grad_norm": 5.632086277008057, "learning_rate": 6.25e-06, "loss": 0.5947, "step": 4886 }, { "epoch": 16.00510135135135, "grad_norm": 0.15184588730335236, "learning_rate": 6.25e-06, "loss": 0.0049, "step": 4887 }, { "epoch": 16.005135135135134, "grad_norm": 0.0019011953845620155, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4888 }, { "epoch": 16.00516891891892, "grad_norm": 0.011028450913727283, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4889 }, { "epoch": 16.005202702702704, "grad_norm": 0.06685318797826767, "learning_rate": 6.25e-06, "loss": 0.002, "step": 4890 }, { "epoch": 16.005236486486485, "grad_norm": 0.0033676812890917063, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4891 }, { "epoch": 16.00527027027027, "grad_norm": 0.0018904926255345345, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4892 }, { "epoch": 16.005304054054054, "grad_norm": 0.007314800284802914, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4893 }, { "epoch": 16.00533783783784, "grad_norm": 0.0009931056993082166, "learning_rate": 6.25e-06, "loss": 0.0, "step": 4894 }, { "epoch": 16.00537162162162, "grad_norm": 0.012698953039944172, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4895 }, { "epoch": 16.005405405405405, "grad_norm": 0.7572906613349915, "learning_rate": 6.25e-06, "loss": 0.0021, "step": 4896 }, { "epoch": 16.00543918918919, "grad_norm": 0.03853137046098709, "learning_rate": 6.25e-06, "loss": 0.0008, "step": 4897 }, { "epoch": 16.005472972972974, "grad_norm": 0.45003020763397217, "learning_rate": 6.25e-06, "loss": 0.0013, "step": 4898 }, { "epoch": 16.005506756756755, "grad_norm": 0.41512051224708557, "learning_rate": 6.25e-06, "loss": 0.0018, "step": 4899 }, { "epoch": 16.00554054054054, "grad_norm": 0.00302715296857059, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4900 }, { "epoch": 16.005574324324325, "grad_norm": 0.8991332054138184, "learning_rate": 6.25e-06, "loss": 0.0364, "step": 4901 }, { "epoch": 16.00560810810811, "grad_norm": 0.06830957531929016, "learning_rate": 6.25e-06, "loss": 0.0005, "step": 4902 }, { "epoch": 16.00564189189189, "grad_norm": 0.0035673673264682293, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4903 }, { "epoch": 16.005675675675676, "grad_norm": 40.918060302734375, "learning_rate": 6.25e-06, "loss": 0.1504, "step": 4904 }, { "epoch": 16.00570945945946, "grad_norm": 0.0009307179134339094, "learning_rate": 6.25e-06, "loss": 0.0, "step": 4905 }, { "epoch": 16.005743243243245, "grad_norm": 2.6380436420440674, "learning_rate": 6.25e-06, "loss": 0.011, "step": 4906 }, { "epoch": 16.005777027027026, "grad_norm": 0.004043609369546175, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4907 }, { "epoch": 16.00581081081081, "grad_norm": 0.0219830684363842, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4908 }, { "epoch": 16.005844594594596, "grad_norm": 0.004069931339472532, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4909 }, { "epoch": 16.005878378378377, "grad_norm": 0.010682696476578712, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4910 }, { "epoch": 16.00591216216216, "grad_norm": 2.5758934020996094, "learning_rate": 6.25e-06, "loss": 0.0711, "step": 4911 }, { "epoch": 16.005945945945946, "grad_norm": 0.008613569661974907, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4912 }, { "epoch": 16.00597972972973, "grad_norm": 0.001671627745963633, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4913 }, { "epoch": 16.006013513513512, "grad_norm": 0.010624810121953487, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4914 }, { "epoch": 16.006047297297297, "grad_norm": 0.021039661020040512, "learning_rate": 6.25e-06, "loss": 0.0005, "step": 4915 }, { "epoch": 16.00608108108108, "grad_norm": 0.03107704035937786, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4916 }, { "epoch": 16.006114864864866, "grad_norm": 0.002869896125048399, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4917 }, { "epoch": 16.006148648648647, "grad_norm": 0.09906455874443054, "learning_rate": 6.25e-06, "loss": 0.0038, "step": 4918 }, { "epoch": 16.006182432432432, "grad_norm": 0.2560790181159973, "learning_rate": 6.25e-06, "loss": 0.003, "step": 4919 }, { "epoch": 16.006216216216217, "grad_norm": 0.006288763135671616, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4920 }, { "epoch": 16.00625, "grad_norm": 0.005456862505525351, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4921 }, { "epoch": 16.006283783783783, "grad_norm": 0.08560631424188614, "learning_rate": 6.25e-06, "loss": 0.002, "step": 4922 }, { "epoch": 16.006317567567567, "grad_norm": 0.0037256821524351835, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4923 }, { "epoch": 16.006351351351352, "grad_norm": 0.05405222252011299, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 4924 }, { "epoch": 16.006385135135137, "grad_norm": 0.2713659703731537, "learning_rate": 6.25e-06, "loss": 0.0011, "step": 4925 }, { "epoch": 16.006418918918918, "grad_norm": 17.743772506713867, "learning_rate": 6.25e-06, "loss": 0.0875, "step": 4926 }, { "epoch": 16.006452702702703, "grad_norm": 0.020456694066524506, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4927 }, { "epoch": 16.006486486486487, "grad_norm": 9.205964088439941, "learning_rate": 6.25e-06, "loss": 0.0206, "step": 4928 }, { "epoch": 16.006520270270272, "grad_norm": 0.43595314025878906, "learning_rate": 6.25e-06, "loss": 0.0017, "step": 4929 }, { "epoch": 16.006554054054053, "grad_norm": 0.003789852373301983, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4930 }, { "epoch": 16.006587837837838, "grad_norm": 5.142070770263672, "learning_rate": 6.25e-06, "loss": 0.2952, "step": 4931 }, { "epoch": 16.006621621621623, "grad_norm": 0.06703338027000427, "learning_rate": 6.25e-06, "loss": 0.0014, "step": 4932 }, { "epoch": 16.006655405405404, "grad_norm": 4.196672439575195, "learning_rate": 6.25e-06, "loss": 0.0097, "step": 4933 }, { "epoch": 16.00668918918919, "grad_norm": 0.07700634002685547, "learning_rate": 6.25e-06, "loss": 0.0026, "step": 4934 }, { "epoch": 16.006722972972973, "grad_norm": 0.007709402125328779, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4935 }, { "epoch": 16.006756756756758, "grad_norm": 0.4058472216129303, "learning_rate": 6.25e-06, "loss": 0.0058, "step": 4936 }, { "epoch": 16.00679054054054, "grad_norm": 0.0038024233654141426, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4937 }, { "epoch": 16.006824324324324, "grad_norm": 0.7368614077568054, "learning_rate": 6.25e-06, "loss": 0.0023, "step": 4938 }, { "epoch": 16.00685810810811, "grad_norm": 0.05642781779170036, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 4939 }, { "epoch": 16.006891891891893, "grad_norm": 6.142965793609619, "learning_rate": 6.25e-06, "loss": 0.1507, "step": 4940 }, { "epoch": 16.006925675675674, "grad_norm": 4.831450939178467, "learning_rate": 6.25e-06, "loss": 0.2134, "step": 4941 }, { "epoch": 16.00695945945946, "grad_norm": 0.32756343483924866, "learning_rate": 6.25e-06, "loss": 0.0025, "step": 4942 }, { "epoch": 16.006993243243244, "grad_norm": 0.17633694410324097, "learning_rate": 6.25e-06, "loss": 0.001, "step": 4943 }, { "epoch": 16.00702702702703, "grad_norm": 0.11957142502069473, "learning_rate": 6.25e-06, "loss": 0.0007, "step": 4944 }, { "epoch": 16.00706081081081, "grad_norm": 0.03100096806883812, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 4945 }, { "epoch": 16.007094594594594, "grad_norm": 0.037014443427324295, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4946 }, { "epoch": 16.00712837837838, "grad_norm": 0.006669635884463787, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4947 }, { "epoch": 16.007162162162164, "grad_norm": 2.519646406173706, "learning_rate": 6.25e-06, "loss": 0.0223, "step": 4948 }, { "epoch": 16.007195945945945, "grad_norm": 0.02396101877093315, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4949 }, { "epoch": 16.00722972972973, "grad_norm": 0.0785449743270874, "learning_rate": 6.25e-06, "loss": 0.0011, "step": 4950 }, { "epoch": 16.007263513513514, "grad_norm": 0.002826197072863579, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4951 }, { "epoch": 16.007297297297296, "grad_norm": 3.0433743000030518, "learning_rate": 6.25e-06, "loss": 0.0913, "step": 4952 }, { "epoch": 16.00733108108108, "grad_norm": 0.018737636506557465, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4953 }, { "epoch": 16.007364864864865, "grad_norm": 11.860064506530762, "learning_rate": 6.25e-06, "loss": 0.2544, "step": 4954 }, { "epoch": 16.00739864864865, "grad_norm": 0.03141603618860245, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 4955 }, { "epoch": 16.00743243243243, "grad_norm": 3.3903026580810547, "learning_rate": 6.25e-06, "loss": 0.0283, "step": 4956 }, { "epoch": 16.007466216216216, "grad_norm": 0.585131824016571, "learning_rate": 6.25e-06, "loss": 0.0166, "step": 4957 }, { "epoch": 16.0075, "grad_norm": 0.009065368212759495, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4958 }, { "epoch": 16.007533783783785, "grad_norm": 0.22583328187465668, "learning_rate": 6.25e-06, "loss": 0.0015, "step": 4959 }, { "epoch": 16.007567567567566, "grad_norm": 0.002930581569671631, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4960 }, { "epoch": 16.00760135135135, "grad_norm": 0.203788623213768, "learning_rate": 6.25e-06, "loss": 0.0008, "step": 4961 }, { "epoch": 16.007635135135136, "grad_norm": 0.11909978836774826, "learning_rate": 6.25e-06, "loss": 0.0012, "step": 4962 }, { "epoch": 16.00766891891892, "grad_norm": 0.09344633668661118, "learning_rate": 6.25e-06, "loss": 0.0031, "step": 4963 }, { "epoch": 16.0077027027027, "grad_norm": 9.88237476348877, "learning_rate": 6.25e-06, "loss": 0.5157, "step": 4964 }, { "epoch": 16.007736486486486, "grad_norm": 0.7511581778526306, "learning_rate": 6.25e-06, "loss": 0.0069, "step": 4965 }, { "epoch": 16.00777027027027, "grad_norm": 0.09338562190532684, "learning_rate": 6.25e-06, "loss": 0.0009, "step": 4966 }, { "epoch": 16.007804054054056, "grad_norm": 67.02738189697266, "learning_rate": 6.25e-06, "loss": 0.4542, "step": 4967 }, { "epoch": 16.007837837837837, "grad_norm": 0.08185237646102905, "learning_rate": 6.25e-06, "loss": 0.0031, "step": 4968 }, { "epoch": 16.00787162162162, "grad_norm": 0.002147642197087407, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4969 }, { "epoch": 16.007905405405406, "grad_norm": 0.005321374628692865, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4970 }, { "epoch": 16.00793918918919, "grad_norm": 0.0036878662649542093, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4971 }, { "epoch": 16.007972972972972, "grad_norm": 5.092704772949219, "learning_rate": 6.25e-06, "loss": 0.3208, "step": 4972 }, { "epoch": 16.008006756756757, "grad_norm": 17.372831344604492, "learning_rate": 6.25e-06, "loss": 0.5557, "step": 4973 }, { "epoch": 16.00804054054054, "grad_norm": 0.0863913893699646, "learning_rate": 6.25e-06, "loss": 0.0029, "step": 4974 }, { "epoch": 16.008074324324323, "grad_norm": 0.0035033337771892548, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4975 }, { "epoch": 16.008108108108107, "grad_norm": 0.009308168664574623, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4976 }, { "epoch": 16.008141891891892, "grad_norm": 0.006389939226210117, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4977 }, { "epoch": 16.008175675675677, "grad_norm": 0.0026513966731727123, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4978 }, { "epoch": 16.008209459459458, "grad_norm": 0.0038528740406036377, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4979 }, { "epoch": 16.008243243243243, "grad_norm": 0.0017359599005430937, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4980 }, { "epoch": 16.008277027027027, "grad_norm": 0.040334150195121765, "learning_rate": 6.25e-06, "loss": 0.0008, "step": 4981 }, { "epoch": 16.008310810810812, "grad_norm": 0.381300687789917, "learning_rate": 6.25e-06, "loss": 0.0019, "step": 4982 }, { "epoch": 16.008344594594593, "grad_norm": 5.043879508972168, "learning_rate": 6.25e-06, "loss": 0.4417, "step": 4983 }, { "epoch": 16.008378378378378, "grad_norm": 0.07773904502391815, "learning_rate": 6.25e-06, "loss": 0.0025, "step": 4984 }, { "epoch": 16.008412162162163, "grad_norm": 0.10122072696685791, "learning_rate": 6.25e-06, "loss": 0.0015, "step": 4985 }, { "epoch": 16.008445945945947, "grad_norm": 0.0338716059923172, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 4986 }, { "epoch": 16.00847972972973, "grad_norm": 0.008835558779537678, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 4987 }, { "epoch": 16.008513513513513, "grad_norm": 0.026527725160121918, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4988 }, { "epoch": 16.008547297297298, "grad_norm": 1.6527888774871826, "learning_rate": 6.25e-06, "loss": 0.0071, "step": 4989 }, { "epoch": 16.008581081081083, "grad_norm": 0.046940870583057404, "learning_rate": 6.25e-06, "loss": 0.0005, "step": 4990 }, { "epoch": 16.008614864864864, "grad_norm": 0.0010377736762166023, "learning_rate": 6.25e-06, "loss": 0.0, "step": 4991 }, { "epoch": 16.00864864864865, "grad_norm": 0.0689343512058258, "learning_rate": 6.25e-06, "loss": 0.0025, "step": 4992 }, { "epoch": 16.008682432432433, "grad_norm": 1.0038541555404663, "learning_rate": 6.25e-06, "loss": 0.0457, "step": 4993 }, { "epoch": 16.008716216216218, "grad_norm": 0.01756802760064602, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 4994 }, { "epoch": 16.00875, "grad_norm": 0.09359335899353027, "learning_rate": 6.25e-06, "loss": 0.0034, "step": 4995 }, { "epoch": 16.008783783783784, "grad_norm": 5.270802974700928, "learning_rate": 6.25e-06, "loss": 0.0597, "step": 4996 }, { "epoch": 16.00881756756757, "grad_norm": 0.0023725591599941254, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4997 }, { "epoch": 16.00885135135135, "grad_norm": 3.72705078125, "learning_rate": 6.25e-06, "loss": 0.1266, "step": 4998 }, { "epoch": 16.008885135135134, "grad_norm": 0.00559825636446476, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 4999 }, { "epoch": 16.00891891891892, "grad_norm": 0.0016985677648335695, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5000 }, { "epoch": 16.008952702702704, "grad_norm": 0.0024636881425976753, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5001 }, { "epoch": 16.008986486486485, "grad_norm": 4.558000087738037, "learning_rate": 6.25e-06, "loss": 0.0223, "step": 5002 }, { "epoch": 16.00902027027027, "grad_norm": 0.002152644330635667, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5003 }, { "epoch": 16.009054054054054, "grad_norm": 6.858633995056152, "learning_rate": 6.25e-06, "loss": 0.1512, "step": 5004 }, { "epoch": 16.00908783783784, "grad_norm": 1.9638839960098267, "learning_rate": 6.25e-06, "loss": 0.0225, "step": 5005 }, { "epoch": 16.00912162162162, "grad_norm": 55.496376037597656, "learning_rate": 6.25e-06, "loss": 0.2664, "step": 5006 }, { "epoch": 16.009155405405405, "grad_norm": 0.01442588958889246, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5007 }, { "epoch": 16.00918918918919, "grad_norm": 0.15289491415023804, "learning_rate": 6.25e-06, "loss": 0.0052, "step": 5008 }, { "epoch": 16.009222972972974, "grad_norm": 0.8573735356330872, "learning_rate": 6.25e-06, "loss": 0.0119, "step": 5009 }, { "epoch": 16.009256756756756, "grad_norm": 0.00989562924951315, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5010 }, { "epoch": 16.00929054054054, "grad_norm": 0.1908716857433319, "learning_rate": 6.25e-06, "loss": 0.0028, "step": 5011 }, { "epoch": 16.009324324324325, "grad_norm": 0.10699378699064255, "learning_rate": 6.25e-06, "loss": 0.0033, "step": 5012 }, { "epoch": 16.00935810810811, "grad_norm": 5.554930686950684, "learning_rate": 6.25e-06, "loss": 0.1042, "step": 5013 }, { "epoch": 16.00939189189189, "grad_norm": 0.0062888492830097675, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5014 }, { "epoch": 16.009425675675676, "grad_norm": 0.0025779148563742638, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5015 }, { "epoch": 16.00945945945946, "grad_norm": 0.8465389609336853, "learning_rate": 6.25e-06, "loss": 0.0253, "step": 5016 }, { "epoch": 16.00949324324324, "grad_norm": 0.004908351227641106, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5017 }, { "epoch": 16.009527027027026, "grad_norm": 4.318572044372559, "learning_rate": 6.25e-06, "loss": 0.1957, "step": 5018 }, { "epoch": 16.00956081081081, "grad_norm": 8.121269226074219, "learning_rate": 6.25e-06, "loss": 0.1388, "step": 5019 }, { "epoch": 16.009594594594596, "grad_norm": 0.0024568396620452404, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5020 }, { "epoch": 16.009628378378377, "grad_norm": 0.02413526363670826, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5021 }, { "epoch": 16.00966216216216, "grad_norm": 14.378829002380371, "learning_rate": 6.25e-06, "loss": 0.7373, "step": 5022 }, { "epoch": 16.009695945945946, "grad_norm": 0.0010915056336671114, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5023 }, { "epoch": 16.00972972972973, "grad_norm": 0.007833318784832954, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5024 }, { "epoch": 16.009763513513512, "grad_norm": 4.427395343780518, "learning_rate": 6.25e-06, "loss": 0.4625, "step": 5025 }, { "epoch": 16.009797297297297, "grad_norm": 0.05383140593767166, "learning_rate": 6.25e-06, "loss": 0.0005, "step": 5026 }, { "epoch": 16.00983108108108, "grad_norm": 0.09640787541866302, "learning_rate": 6.25e-06, "loss": 0.0037, "step": 5027 }, { "epoch": 16.009864864864866, "grad_norm": 0.008078102953732014, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5028 }, { "epoch": 16.009898648648647, "grad_norm": 62.29704666137695, "learning_rate": 6.25e-06, "loss": 0.6942, "step": 5029 }, { "epoch": 16.009932432432432, "grad_norm": 3.366725444793701, "learning_rate": 6.25e-06, "loss": 0.408, "step": 5030 }, { "epoch": 16.009966216216217, "grad_norm": 0.08091257512569427, "learning_rate": 6.25e-06, "loss": 0.003, "step": 5031 }, { "epoch": 16.01, "grad_norm": 2.3660879135131836, "learning_rate": 6.25e-06, "loss": 0.007, "step": 5032 }, { "epoch": 16.01, "eval_accuracy": 0.8885298869143781, "eval_loss": 0.6109504699707031, "eval_runtime": 32.0147, "eval_samples_per_second": 19.335, "eval_steps_per_second": 2.436, "step": 5032 }, { "epoch": 17.000033783783785, "grad_norm": 0.8485234379768372, "learning_rate": 6.25e-06, "loss": 0.0038, "step": 5033 }, { "epoch": 17.000067567567566, "grad_norm": 0.0031939023174345493, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5034 }, { "epoch": 17.00010135135135, "grad_norm": 0.2827969789505005, "learning_rate": 6.25e-06, "loss": 0.003, "step": 5035 }, { "epoch": 17.000135135135135, "grad_norm": 0.38807326555252075, "learning_rate": 6.25e-06, "loss": 0.0067, "step": 5036 }, { "epoch": 17.00016891891892, "grad_norm": 0.002608648734167218, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5037 }, { "epoch": 17.0002027027027, "grad_norm": 0.003734213300049305, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5038 }, { "epoch": 17.000236486486486, "grad_norm": 2.782963991165161, "learning_rate": 6.25e-06, "loss": 0.02, "step": 5039 }, { "epoch": 17.00027027027027, "grad_norm": 0.0030107435304671526, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5040 }, { "epoch": 17.000304054054055, "grad_norm": 0.005020773038268089, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5041 }, { "epoch": 17.000337837837836, "grad_norm": 0.0033776285126805305, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5042 }, { "epoch": 17.00037162162162, "grad_norm": 0.0020367042161524296, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5043 }, { "epoch": 17.000405405405406, "grad_norm": 3.3418126106262207, "learning_rate": 6.25e-06, "loss": 0.4187, "step": 5044 }, { "epoch": 17.00043918918919, "grad_norm": 0.0019134533358737826, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5045 }, { "epoch": 17.00047297297297, "grad_norm": 5.395105838775635, "learning_rate": 6.25e-06, "loss": 0.1772, "step": 5046 }, { "epoch": 17.000506756756756, "grad_norm": 0.009575705043971539, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5047 }, { "epoch": 17.00054054054054, "grad_norm": 15.981163024902344, "learning_rate": 6.25e-06, "loss": 0.0488, "step": 5048 }, { "epoch": 17.000574324324326, "grad_norm": 0.010225341655313969, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5049 }, { "epoch": 17.000608108108107, "grad_norm": 0.3172800838947296, "learning_rate": 6.25e-06, "loss": 0.0018, "step": 5050 }, { "epoch": 17.000641891891892, "grad_norm": 0.005447928793728352, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5051 }, { "epoch": 17.000675675675677, "grad_norm": 0.1548343449831009, "learning_rate": 6.25e-06, "loss": 0.006, "step": 5052 }, { "epoch": 17.00070945945946, "grad_norm": 0.005313146393746138, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5053 }, { "epoch": 17.000743243243242, "grad_norm": 0.0038250659126788378, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5054 }, { "epoch": 17.000777027027027, "grad_norm": 0.004778042435646057, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5055 }, { "epoch": 17.000810810810812, "grad_norm": 0.007545114494860172, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5056 }, { "epoch": 17.000844594594593, "grad_norm": 0.0020391447469592094, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5057 }, { "epoch": 17.000878378378378, "grad_norm": 1.519680142402649, "learning_rate": 6.25e-06, "loss": 0.0307, "step": 5058 }, { "epoch": 17.000912162162162, "grad_norm": 4.112792015075684, "learning_rate": 6.25e-06, "loss": 0.2979, "step": 5059 }, { "epoch": 17.000945945945947, "grad_norm": 0.006209234707057476, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5060 }, { "epoch": 17.00097972972973, "grad_norm": 0.004734438378363848, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5061 }, { "epoch": 17.001013513513513, "grad_norm": 0.017707759514451027, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5062 }, { "epoch": 17.001047297297298, "grad_norm": 0.0018080599838867784, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5063 }, { "epoch": 17.001081081081082, "grad_norm": 8.359736442565918, "learning_rate": 6.25e-06, "loss": 0.5526, "step": 5064 }, { "epoch": 17.001114864864864, "grad_norm": 0.30198103189468384, "learning_rate": 6.25e-06, "loss": 0.0029, "step": 5065 }, { "epoch": 17.00114864864865, "grad_norm": 0.11384972929954529, "learning_rate": 6.25e-06, "loss": 0.003, "step": 5066 }, { "epoch": 17.001182432432433, "grad_norm": 32.76839828491211, "learning_rate": 6.25e-06, "loss": 0.1271, "step": 5067 }, { "epoch": 17.001216216216218, "grad_norm": 0.002780074253678322, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5068 }, { "epoch": 17.00125, "grad_norm": 0.0029853705782443285, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5069 }, { "epoch": 17.001283783783784, "grad_norm": 0.6235251426696777, "learning_rate": 6.25e-06, "loss": 0.0151, "step": 5070 }, { "epoch": 17.00131756756757, "grad_norm": 0.7114983797073364, "learning_rate": 6.25e-06, "loss": 0.0055, "step": 5071 }, { "epoch": 17.001351351351353, "grad_norm": 0.1506739854812622, "learning_rate": 6.25e-06, "loss": 0.001, "step": 5072 }, { "epoch": 17.001385135135134, "grad_norm": 0.5229927897453308, "learning_rate": 6.25e-06, "loss": 0.0119, "step": 5073 }, { "epoch": 17.00141891891892, "grad_norm": 0.003648686222732067, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5074 }, { "epoch": 17.001452702702704, "grad_norm": 0.004331725183874369, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5075 }, { "epoch": 17.001486486486485, "grad_norm": 0.33078059554100037, "learning_rate": 6.25e-06, "loss": 0.0064, "step": 5076 }, { "epoch": 17.00152027027027, "grad_norm": 0.0020089419558644295, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5077 }, { "epoch": 17.001554054054054, "grad_norm": 0.01834711804986, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 5078 }, { "epoch": 17.00158783783784, "grad_norm": 2.5908641815185547, "learning_rate": 6.25e-06, "loss": 0.0775, "step": 5079 }, { "epoch": 17.00162162162162, "grad_norm": 0.011708158068358898, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5080 }, { "epoch": 17.001655405405405, "grad_norm": 0.033016387373209, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 5081 }, { "epoch": 17.00168918918919, "grad_norm": 10.348281860351562, "learning_rate": 6.25e-06, "loss": 0.1341, "step": 5082 }, { "epoch": 17.001722972972974, "grad_norm": 0.11679630726575851, "learning_rate": 6.25e-06, "loss": 0.0043, "step": 5083 }, { "epoch": 17.001756756756755, "grad_norm": 11.176311492919922, "learning_rate": 6.25e-06, "loss": 0.0395, "step": 5084 }, { "epoch": 17.00179054054054, "grad_norm": 0.002197161316871643, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5085 }, { "epoch": 17.001824324324325, "grad_norm": 1.1713311672210693, "learning_rate": 6.25e-06, "loss": 0.004, "step": 5086 }, { "epoch": 17.00185810810811, "grad_norm": 0.10797170549631119, "learning_rate": 6.25e-06, "loss": 0.0034, "step": 5087 }, { "epoch": 17.00189189189189, "grad_norm": 0.004489785060286522, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5088 }, { "epoch": 17.001925675675675, "grad_norm": 0.02343318611383438, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5089 }, { "epoch": 17.00195945945946, "grad_norm": 0.003238437697291374, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5090 }, { "epoch": 17.001993243243245, "grad_norm": 0.44363877177238464, "learning_rate": 6.25e-06, "loss": 0.0174, "step": 5091 }, { "epoch": 17.002027027027026, "grad_norm": 0.03987884894013405, "learning_rate": 6.25e-06, "loss": 0.0013, "step": 5092 }, { "epoch": 17.00206081081081, "grad_norm": 0.0011463805567473173, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5093 }, { "epoch": 17.002094594594595, "grad_norm": 20.325849533081055, "learning_rate": 6.25e-06, "loss": 0.088, "step": 5094 }, { "epoch": 17.00212837837838, "grad_norm": 0.11326967179775238, "learning_rate": 6.25e-06, "loss": 0.0042, "step": 5095 }, { "epoch": 17.00216216216216, "grad_norm": 0.002683290047571063, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5096 }, { "epoch": 17.002195945945946, "grad_norm": 0.0853893831372261, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 5097 }, { "epoch": 17.00222972972973, "grad_norm": 0.0015953510301187634, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5098 }, { "epoch": 17.002263513513512, "grad_norm": 0.0025018516462296247, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5099 }, { "epoch": 17.002297297297297, "grad_norm": 0.0029412955045700073, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5100 }, { "epoch": 17.00233108108108, "grad_norm": 0.030173271894454956, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 5101 }, { "epoch": 17.002364864864866, "grad_norm": 0.010736552067101002, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5102 }, { "epoch": 17.002398648648647, "grad_norm": 0.003694235347211361, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5103 }, { "epoch": 17.002432432432432, "grad_norm": 0.15602289140224457, "learning_rate": 6.25e-06, "loss": 0.0044, "step": 5104 }, { "epoch": 17.002466216216217, "grad_norm": 2.654120683670044, "learning_rate": 6.25e-06, "loss": 0.0072, "step": 5105 }, { "epoch": 17.0025, "grad_norm": 0.012299601919949055, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5106 }, { "epoch": 17.002533783783782, "grad_norm": 0.0017824898241087794, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5107 }, { "epoch": 17.002567567567567, "grad_norm": 0.015635957941412926, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 5108 }, { "epoch": 17.002601351351352, "grad_norm": 0.6286369562149048, "learning_rate": 6.25e-06, "loss": 0.0023, "step": 5109 }, { "epoch": 17.002635135135137, "grad_norm": 64.57353210449219, "learning_rate": 6.25e-06, "loss": 0.4415, "step": 5110 }, { "epoch": 17.002668918918918, "grad_norm": 0.11592863500118256, "learning_rate": 6.25e-06, "loss": 0.0044, "step": 5111 }, { "epoch": 17.002702702702702, "grad_norm": 0.13696329295635223, "learning_rate": 6.25e-06, "loss": 0.0012, "step": 5112 }, { "epoch": 17.002736486486487, "grad_norm": 0.003248445922508836, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5113 }, { "epoch": 17.002770270270272, "grad_norm": 10.398886680603027, "learning_rate": 6.25e-06, "loss": 0.6276, "step": 5114 }, { "epoch": 17.002804054054053, "grad_norm": 0.010163244791328907, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5115 }, { "epoch": 17.002837837837838, "grad_norm": 7.671548366546631, "learning_rate": 6.25e-06, "loss": 0.6009, "step": 5116 }, { "epoch": 17.002871621621622, "grad_norm": 0.9236235618591309, "learning_rate": 6.25e-06, "loss": 0.0184, "step": 5117 }, { "epoch": 17.002905405405407, "grad_norm": 0.014575563371181488, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5118 }, { "epoch": 17.00293918918919, "grad_norm": 0.003977195359766483, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5119 }, { "epoch": 17.002972972972973, "grad_norm": 0.017466096207499504, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5120 }, { "epoch": 17.003006756756758, "grad_norm": 7.177344799041748, "learning_rate": 6.25e-06, "loss": 0.3706, "step": 5121 }, { "epoch": 17.00304054054054, "grad_norm": 0.003034824738278985, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5122 }, { "epoch": 17.003074324324324, "grad_norm": 0.27629485726356506, "learning_rate": 6.25e-06, "loss": 0.0046, "step": 5123 }, { "epoch": 17.00310810810811, "grad_norm": 0.0067587802186608315, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5124 }, { "epoch": 17.003141891891893, "grad_norm": 0.002183014526963234, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5125 }, { "epoch": 17.003175675675674, "grad_norm": 22.696674346923828, "learning_rate": 6.25e-06, "loss": 0.8364, "step": 5126 }, { "epoch": 17.00320945945946, "grad_norm": 0.011092104949057102, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5127 }, { "epoch": 17.003243243243244, "grad_norm": 4.763914108276367, "learning_rate": 6.25e-06, "loss": 0.278, "step": 5128 }, { "epoch": 17.00327702702703, "grad_norm": 0.004992254544049501, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5129 }, { "epoch": 17.00331081081081, "grad_norm": 1.812005639076233, "learning_rate": 6.25e-06, "loss": 0.0646, "step": 5130 }, { "epoch": 17.003344594594594, "grad_norm": 0.03240469470620155, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 5131 }, { "epoch": 17.00337837837838, "grad_norm": 0.0051338328048586845, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5132 }, { "epoch": 17.003412162162164, "grad_norm": 0.0019261745037510991, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5133 }, { "epoch": 17.003445945945945, "grad_norm": 0.009237338788807392, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5134 }, { "epoch": 17.00347972972973, "grad_norm": 0.008381817489862442, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5135 }, { "epoch": 17.003513513513514, "grad_norm": 10.775594711303711, "learning_rate": 6.25e-06, "loss": 0.5011, "step": 5136 }, { "epoch": 17.0035472972973, "grad_norm": 0.07048161327838898, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 5137 }, { "epoch": 17.00358108108108, "grad_norm": 0.054454583674669266, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 5138 }, { "epoch": 17.003614864864865, "grad_norm": 0.0019263906870037317, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5139 }, { "epoch": 17.00364864864865, "grad_norm": 0.001243917504325509, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5140 }, { "epoch": 17.00368243243243, "grad_norm": 0.017543066293001175, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5141 }, { "epoch": 17.003716216216215, "grad_norm": 0.0017004212131723762, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5142 }, { "epoch": 17.00375, "grad_norm": 37.17109298706055, "learning_rate": 6.25e-06, "loss": 0.7232, "step": 5143 }, { "epoch": 17.003783783783785, "grad_norm": 0.0017468318110331893, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5144 }, { "epoch": 17.003817567567566, "grad_norm": 18.64361000061035, "learning_rate": 6.25e-06, "loss": 0.1257, "step": 5145 }, { "epoch": 17.00385135135135, "grad_norm": 42.907325744628906, "learning_rate": 6.25e-06, "loss": 0.3314, "step": 5146 }, { "epoch": 17.003885135135135, "grad_norm": 0.011371614411473274, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5147 }, { "epoch": 17.00391891891892, "grad_norm": 0.15791872143745422, "learning_rate": 6.25e-06, "loss": 0.001, "step": 5148 }, { "epoch": 17.0039527027027, "grad_norm": 0.016830265522003174, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 5149 }, { "epoch": 17.003986486486486, "grad_norm": 0.1830558180809021, "learning_rate": 6.25e-06, "loss": 0.0017, "step": 5150 }, { "epoch": 17.00402027027027, "grad_norm": 0.00125721818767488, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5151 }, { "epoch": 17.004054054054055, "grad_norm": 1.2853738069534302, "learning_rate": 6.25e-06, "loss": 0.059, "step": 5152 }, { "epoch": 17.004087837837837, "grad_norm": 0.3636533319950104, "learning_rate": 6.25e-06, "loss": 0.011, "step": 5153 }, { "epoch": 17.00412162162162, "grad_norm": 0.003955158870667219, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5154 }, { "epoch": 17.004155405405406, "grad_norm": 1.1193922758102417, "learning_rate": 6.25e-06, "loss": 0.0195, "step": 5155 }, { "epoch": 17.00418918918919, "grad_norm": 0.5003724694252014, "learning_rate": 6.25e-06, "loss": 0.0148, "step": 5156 }, { "epoch": 17.004222972972972, "grad_norm": 0.1675255000591278, "learning_rate": 6.25e-06, "loss": 0.0026, "step": 5157 }, { "epoch": 17.004256756756757, "grad_norm": 0.11065497994422913, "learning_rate": 6.25e-06, "loss": 0.0018, "step": 5158 }, { "epoch": 17.00429054054054, "grad_norm": 0.0826558843255043, "learning_rate": 6.25e-06, "loss": 0.0019, "step": 5159 }, { "epoch": 17.004324324324326, "grad_norm": 0.006534358952194452, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5160 }, { "epoch": 17.004358108108107, "grad_norm": 0.014793296344578266, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5161 }, { "epoch": 17.004391891891892, "grad_norm": 0.14103065431118011, "learning_rate": 6.25e-06, "loss": 0.0045, "step": 5162 }, { "epoch": 17.004425675675677, "grad_norm": 4.153752326965332, "learning_rate": 6.25e-06, "loss": 0.0285, "step": 5163 }, { "epoch": 17.004459459459458, "grad_norm": 1.4549674987792969, "learning_rate": 6.25e-06, "loss": 0.008, "step": 5164 }, { "epoch": 17.004493243243243, "grad_norm": 3.3755149841308594, "learning_rate": 6.25e-06, "loss": 0.4546, "step": 5165 }, { "epoch": 17.004527027027027, "grad_norm": 13.91598892211914, "learning_rate": 6.25e-06, "loss": 0.733, "step": 5166 }, { "epoch": 17.004560810810812, "grad_norm": 0.003257892094552517, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5167 }, { "epoch": 17.004594594594593, "grad_norm": 2.1087615489959717, "learning_rate": 6.25e-06, "loss": 0.0053, "step": 5168 }, { "epoch": 17.004628378378378, "grad_norm": 0.11560925096273422, "learning_rate": 6.25e-06, "loss": 0.0007, "step": 5169 }, { "epoch": 17.004662162162163, "grad_norm": 0.1017506867647171, "learning_rate": 6.25e-06, "loss": 0.002, "step": 5170 }, { "epoch": 17.004695945945947, "grad_norm": 0.012939916923642159, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5171 }, { "epoch": 17.00472972972973, "grad_norm": 0.010735008865594864, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5172 }, { "epoch": 17.004763513513513, "grad_norm": 0.01834612525999546, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5173 }, { "epoch": 17.004797297297298, "grad_norm": 1.8440968990325928, "learning_rate": 6.25e-06, "loss": 0.0594, "step": 5174 }, { "epoch": 17.004831081081083, "grad_norm": 0.009740139357745647, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5175 }, { "epoch": 17.004864864864864, "grad_norm": 0.012457486242055893, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5176 }, { "epoch": 17.00489864864865, "grad_norm": 0.009876282885670662, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5177 }, { "epoch": 17.004932432432433, "grad_norm": 0.02456759661436081, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5178 }, { "epoch": 17.004966216216218, "grad_norm": 0.007504977751523256, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5179 }, { "epoch": 17.005, "grad_norm": 0.16537785530090332, "learning_rate": 6.25e-06, "loss": 0.0035, "step": 5180 }, { "epoch": 17.005033783783784, "grad_norm": 5.13346004486084, "learning_rate": 6.25e-06, "loss": 0.0829, "step": 5181 }, { "epoch": 17.00506756756757, "grad_norm": 0.02954983524978161, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 5182 }, { "epoch": 17.00510135135135, "grad_norm": 0.12411246448755264, "learning_rate": 6.25e-06, "loss": 0.0048, "step": 5183 }, { "epoch": 17.005135135135134, "grad_norm": 0.009552556090056896, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5184 }, { "epoch": 17.00516891891892, "grad_norm": 0.05612032115459442, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 5185 }, { "epoch": 17.005202702702704, "grad_norm": 0.2690412998199463, "learning_rate": 6.25e-06, "loss": 0.0021, "step": 5186 }, { "epoch": 17.005236486486485, "grad_norm": 0.7145616412162781, "learning_rate": 6.25e-06, "loss": 0.0169, "step": 5187 }, { "epoch": 17.00527027027027, "grad_norm": 0.03538921847939491, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 5188 }, { "epoch": 17.005304054054054, "grad_norm": 0.005931689869612455, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5189 }, { "epoch": 17.00533783783784, "grad_norm": 0.002676143078133464, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5190 }, { "epoch": 17.00537162162162, "grad_norm": 0.0010938404593616724, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5191 }, { "epoch": 17.005405405405405, "grad_norm": 0.003201277693733573, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5192 }, { "epoch": 17.00543918918919, "grad_norm": 0.0014141856227070093, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5193 }, { "epoch": 17.005472972972974, "grad_norm": 0.9387729167938232, "learning_rate": 6.25e-06, "loss": 0.0076, "step": 5194 }, { "epoch": 17.005506756756755, "grad_norm": 0.0015266078989952803, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5195 }, { "epoch": 17.00554054054054, "grad_norm": 0.0011609059292823076, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5196 }, { "epoch": 17.005574324324325, "grad_norm": 0.007052945904433727, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5197 }, { "epoch": 17.00560810810811, "grad_norm": 12.315557479858398, "learning_rate": 6.25e-06, "loss": 0.2335, "step": 5198 }, { "epoch": 17.00564189189189, "grad_norm": 12.343475341796875, "learning_rate": 6.25e-06, "loss": 0.5983, "step": 5199 }, { "epoch": 17.005675675675676, "grad_norm": 0.0009385208250023425, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5200 }, { "epoch": 17.00570945945946, "grad_norm": 0.37926411628723145, "learning_rate": 6.25e-06, "loss": 0.0025, "step": 5201 }, { "epoch": 17.005743243243245, "grad_norm": 0.3267117142677307, "learning_rate": 6.25e-06, "loss": 0.0011, "step": 5202 }, { "epoch": 17.005777027027026, "grad_norm": 8.005975723266602, "learning_rate": 6.25e-06, "loss": 0.0301, "step": 5203 }, { "epoch": 17.00581081081081, "grad_norm": 8.764381408691406, "learning_rate": 6.25e-06, "loss": 0.0348, "step": 5204 }, { "epoch": 17.005844594594596, "grad_norm": 27.267608642578125, "learning_rate": 6.25e-06, "loss": 0.3633, "step": 5205 }, { "epoch": 17.005878378378377, "grad_norm": 0.0032374225556850433, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5206 }, { "epoch": 17.00591216216216, "grad_norm": 0.027084989473223686, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5207 }, { "epoch": 17.005945945945946, "grad_norm": 0.003185974434018135, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5208 }, { "epoch": 17.00597972972973, "grad_norm": 0.07125826179981232, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 5209 }, { "epoch": 17.006013513513512, "grad_norm": 0.007335605099797249, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5210 }, { "epoch": 17.006047297297297, "grad_norm": 0.002198162255808711, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5211 }, { "epoch": 17.00608108108108, "grad_norm": 0.0028679906390607357, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5212 }, { "epoch": 17.006114864864866, "grad_norm": 0.24395254254341125, "learning_rate": 6.25e-06, "loss": 0.005, "step": 5213 }, { "epoch": 17.006148648648647, "grad_norm": 0.0016890134429559112, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5214 }, { "epoch": 17.006182432432432, "grad_norm": 0.0060528237372636795, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5215 }, { "epoch": 17.006216216216217, "grad_norm": 0.004404337145388126, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5216 }, { "epoch": 17.00625, "grad_norm": 0.04643971100449562, "learning_rate": 6.25e-06, "loss": 0.0008, "step": 5217 }, { "epoch": 17.006283783783783, "grad_norm": 2.6019763946533203, "learning_rate": 6.25e-06, "loss": 0.0081, "step": 5218 }, { "epoch": 17.006317567567567, "grad_norm": 0.028994591906666756, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 5219 }, { "epoch": 17.006351351351352, "grad_norm": 0.009219878353178501, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5220 }, { "epoch": 17.006385135135137, "grad_norm": 0.3491457998752594, "learning_rate": 6.25e-06, "loss": 0.0029, "step": 5221 }, { "epoch": 17.006418918918918, "grad_norm": 0.007847310043871403, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5222 }, { "epoch": 17.006452702702703, "grad_norm": 0.0016882530180737376, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5223 }, { "epoch": 17.006486486486487, "grad_norm": 0.2781285047531128, "learning_rate": 6.25e-06, "loss": 0.0014, "step": 5224 }, { "epoch": 17.006520270270272, "grad_norm": 0.24420464038848877, "learning_rate": 6.25e-06, "loss": 0.0012, "step": 5225 }, { "epoch": 17.006554054054053, "grad_norm": 0.019588163122534752, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5226 }, { "epoch": 17.006587837837838, "grad_norm": 0.713194727897644, "learning_rate": 6.25e-06, "loss": 0.0089, "step": 5227 }, { "epoch": 17.006621621621623, "grad_norm": 0.10819940268993378, "learning_rate": 6.25e-06, "loss": 0.0042, "step": 5228 }, { "epoch": 17.006655405405404, "grad_norm": 0.006334772799164057, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5229 }, { "epoch": 17.00668918918919, "grad_norm": 0.1055331751704216, "learning_rate": 6.25e-06, "loss": 0.004, "step": 5230 }, { "epoch": 17.006722972972973, "grad_norm": 0.0020978187676519156, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5231 }, { "epoch": 17.006756756756758, "grad_norm": 0.003360411152243614, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5232 }, { "epoch": 17.00679054054054, "grad_norm": 0.00255767535418272, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5233 }, { "epoch": 17.006824324324324, "grad_norm": 0.004662193823605776, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5234 }, { "epoch": 17.00685810810811, "grad_norm": 0.0030455985106527805, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5235 }, { "epoch": 17.006891891891893, "grad_norm": 0.20773518085479736, "learning_rate": 6.25e-06, "loss": 0.0079, "step": 5236 }, { "epoch": 17.006925675675674, "grad_norm": 0.0029192627407610416, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5237 }, { "epoch": 17.00695945945946, "grad_norm": 5.729275226593018, "learning_rate": 6.25e-06, "loss": 0.0909, "step": 5238 }, { "epoch": 17.006993243243244, "grad_norm": 2.6966848373413086, "learning_rate": 6.25e-06, "loss": 0.0211, "step": 5239 }, { "epoch": 17.00702702702703, "grad_norm": 0.08947363495826721, "learning_rate": 6.25e-06, "loss": 0.001, "step": 5240 }, { "epoch": 17.00706081081081, "grad_norm": 0.007375512272119522, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5241 }, { "epoch": 17.007094594594594, "grad_norm": 5.606030464172363, "learning_rate": 6.25e-06, "loss": 0.0754, "step": 5242 }, { "epoch": 17.00712837837838, "grad_norm": 0.0017530868062749505, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5243 }, { "epoch": 17.007162162162164, "grad_norm": 20.65266990661621, "learning_rate": 6.25e-06, "loss": 0.6892, "step": 5244 }, { "epoch": 17.007195945945945, "grad_norm": 85.48383331298828, "learning_rate": 6.25e-06, "loss": 0.3308, "step": 5245 }, { "epoch": 17.00722972972973, "grad_norm": 0.0008734624134376645, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5246 }, { "epoch": 17.007263513513514, "grad_norm": 0.008210535161197186, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5247 }, { "epoch": 17.007297297297296, "grad_norm": 0.10586383193731308, "learning_rate": 6.25e-06, "loss": 0.0011, "step": 5248 }, { "epoch": 17.00733108108108, "grad_norm": 0.015021346509456635, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 5249 }, { "epoch": 17.007364864864865, "grad_norm": 0.9790030121803284, "learning_rate": 6.25e-06, "loss": 0.0037, "step": 5250 }, { "epoch": 17.00739864864865, "grad_norm": 0.009083347395062447, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5251 }, { "epoch": 17.00743243243243, "grad_norm": 0.01202261820435524, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5252 }, { "epoch": 17.007466216216216, "grad_norm": 0.003376009175553918, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5253 }, { "epoch": 17.0075, "grad_norm": 0.0343497134745121, "learning_rate": 6.25e-06, "loss": 0.0007, "step": 5254 }, { "epoch": 17.007533783783785, "grad_norm": 0.004174869507551193, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5255 }, { "epoch": 17.007567567567566, "grad_norm": 14.870882034301758, "learning_rate": 6.25e-06, "loss": 0.0498, "step": 5256 }, { "epoch": 17.00760135135135, "grad_norm": 0.15260405838489532, "learning_rate": 6.25e-06, "loss": 0.0008, "step": 5257 }, { "epoch": 17.007635135135136, "grad_norm": 0.001035640249028802, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5258 }, { "epoch": 17.00766891891892, "grad_norm": 0.004990444518625736, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5259 }, { "epoch": 17.0077027027027, "grad_norm": 0.003411389421671629, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5260 }, { "epoch": 17.007736486486486, "grad_norm": 1.2221009731292725, "learning_rate": 6.25e-06, "loss": 0.0221, "step": 5261 }, { "epoch": 17.00777027027027, "grad_norm": 0.20411117374897003, "learning_rate": 6.25e-06, "loss": 0.003, "step": 5262 }, { "epoch": 17.007804054054056, "grad_norm": 0.4707574248313904, "learning_rate": 6.25e-06, "loss": 0.0024, "step": 5263 }, { "epoch": 17.007837837837837, "grad_norm": 2.314356803894043, "learning_rate": 6.25e-06, "loss": 0.0115, "step": 5264 }, { "epoch": 17.00787162162162, "grad_norm": 0.004411304369568825, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5265 }, { "epoch": 17.007905405405406, "grad_norm": 9.147323608398438, "learning_rate": 6.25e-06, "loss": 0.6531, "step": 5266 }, { "epoch": 17.00793918918919, "grad_norm": 1.8233519792556763, "learning_rate": 6.25e-06, "loss": 0.0399, "step": 5267 }, { "epoch": 17.007972972972972, "grad_norm": 48.97874450683594, "learning_rate": 6.25e-06, "loss": 0.2699, "step": 5268 }, { "epoch": 17.008006756756757, "grad_norm": 0.08017203956842422, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5269 }, { "epoch": 17.00804054054054, "grad_norm": 2.529970645904541, "learning_rate": 6.25e-06, "loss": 0.0966, "step": 5270 }, { "epoch": 17.008074324324323, "grad_norm": 0.0035403685178607702, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5271 }, { "epoch": 17.008108108108107, "grad_norm": 0.0017044200794771314, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5272 }, { "epoch": 17.008141891891892, "grad_norm": 0.0019172461470589042, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5273 }, { "epoch": 17.008175675675677, "grad_norm": 0.004355963319540024, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5274 }, { "epoch": 17.008209459459458, "grad_norm": 16.00506591796875, "learning_rate": 6.25e-06, "loss": 0.0544, "step": 5275 }, { "epoch": 17.008243243243243, "grad_norm": 0.0019245303701609373, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5276 }, { "epoch": 17.008277027027027, "grad_norm": 0.011831254698336124, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5277 }, { "epoch": 17.008310810810812, "grad_norm": 0.017747366800904274, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 5278 }, { "epoch": 17.008344594594593, "grad_norm": 0.0029708119109272957, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5279 }, { "epoch": 17.008378378378378, "grad_norm": 0.0094800665974617, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5280 }, { "epoch": 17.008412162162163, "grad_norm": 32.867591857910156, "learning_rate": 6.25e-06, "loss": 0.0997, "step": 5281 }, { "epoch": 17.008445945945947, "grad_norm": 0.15568149089813232, "learning_rate": 6.25e-06, "loss": 0.0017, "step": 5282 }, { "epoch": 17.00847972972973, "grad_norm": 0.006487546022981405, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5283 }, { "epoch": 17.008513513513513, "grad_norm": 0.2903490364551544, "learning_rate": 6.25e-06, "loss": 0.0065, "step": 5284 }, { "epoch": 17.008547297297298, "grad_norm": 0.0020892873872071505, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5285 }, { "epoch": 17.008581081081083, "grad_norm": 20.802099227905273, "learning_rate": 6.25e-06, "loss": 0.0647, "step": 5286 }, { "epoch": 17.008614864864864, "grad_norm": 0.08413363993167877, "learning_rate": 6.25e-06, "loss": 0.0032, "step": 5287 }, { "epoch": 17.00864864864865, "grad_norm": 0.002446443075314164, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5288 }, { "epoch": 17.008682432432433, "grad_norm": 37.11707305908203, "learning_rate": 6.25e-06, "loss": 0.2349, "step": 5289 }, { "epoch": 17.008716216216218, "grad_norm": 0.006167783867567778, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5290 }, { "epoch": 17.00875, "grad_norm": 0.0017397516639903188, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5291 }, { "epoch": 17.008783783783784, "grad_norm": 0.002715862588956952, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5292 }, { "epoch": 17.00881756756757, "grad_norm": 0.011930262669920921, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5293 }, { "epoch": 17.00885135135135, "grad_norm": 0.0011163451708853245, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5294 }, { "epoch": 17.008885135135134, "grad_norm": 3.1201322078704834, "learning_rate": 6.25e-06, "loss": 0.0063, "step": 5295 }, { "epoch": 17.00891891891892, "grad_norm": 29.848588943481445, "learning_rate": 6.25e-06, "loss": 0.1693, "step": 5296 }, { "epoch": 17.008952702702704, "grad_norm": 0.0784669816493988, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 5297 }, { "epoch": 17.008986486486485, "grad_norm": 26.1715087890625, "learning_rate": 6.25e-06, "loss": 0.4955, "step": 5298 }, { "epoch": 17.00902027027027, "grad_norm": 0.03780488669872284, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5299 }, { "epoch": 17.009054054054054, "grad_norm": 0.4283599555492401, "learning_rate": 6.25e-06, "loss": 0.0014, "step": 5300 }, { "epoch": 17.00908783783784, "grad_norm": 0.0139641547575593, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5301 }, { "epoch": 17.00912162162162, "grad_norm": 0.09829744696617126, "learning_rate": 6.25e-06, "loss": 0.0038, "step": 5302 }, { "epoch": 17.009155405405405, "grad_norm": 0.0026274295523762703, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5303 }, { "epoch": 17.00918918918919, "grad_norm": 1.6572014093399048, "learning_rate": 6.25e-06, "loss": 0.0547, "step": 5304 }, { "epoch": 17.009222972972974, "grad_norm": 0.0009020884754136205, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5305 }, { "epoch": 17.009256756756756, "grad_norm": 0.009888749569654465, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5306 }, { "epoch": 17.00929054054054, "grad_norm": 0.03367192670702934, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5307 }, { "epoch": 17.009324324324325, "grad_norm": 0.0008338293991982937, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5308 }, { "epoch": 17.00935810810811, "grad_norm": 0.00313071021810174, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5309 }, { "epoch": 17.00939189189189, "grad_norm": 0.01589338481426239, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5310 }, { "epoch": 17.009425675675676, "grad_norm": 0.2279815673828125, "learning_rate": 6.25e-06, "loss": 0.0018, "step": 5311 }, { "epoch": 17.00945945945946, "grad_norm": 0.005092721898108721, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5312 }, { "epoch": 17.00949324324324, "grad_norm": 0.14316755533218384, "learning_rate": 6.25e-06, "loss": 0.0016, "step": 5313 }, { "epoch": 17.009527027027026, "grad_norm": 0.002598854247480631, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5314 }, { "epoch": 17.00956081081081, "grad_norm": 0.005120580550283194, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5315 }, { "epoch": 17.009594594594596, "grad_norm": 0.016467396169900894, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5316 }, { "epoch": 17.009628378378377, "grad_norm": 0.09870320558547974, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5317 }, { "epoch": 17.00966216216216, "grad_norm": 0.4134945571422577, "learning_rate": 6.25e-06, "loss": 0.0012, "step": 5318 }, { "epoch": 17.009695945945946, "grad_norm": 0.33220174908638, "learning_rate": 6.25e-06, "loss": 0.005, "step": 5319 }, { "epoch": 17.00972972972973, "grad_norm": 3.448399782180786, "learning_rate": 6.25e-06, "loss": 0.4929, "step": 5320 }, { "epoch": 17.009763513513512, "grad_norm": 0.0034450034145265818, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5321 }, { "epoch": 17.009797297297297, "grad_norm": 0.0074546560645103455, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5322 }, { "epoch": 17.00983108108108, "grad_norm": 17.428823471069336, "learning_rate": 6.25e-06, "loss": 0.9237, "step": 5323 }, { "epoch": 17.009864864864866, "grad_norm": 0.0014727250672876835, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5324 }, { "epoch": 17.009898648648647, "grad_norm": 18.261728286743164, "learning_rate": 6.25e-06, "loss": 0.0374, "step": 5325 }, { "epoch": 17.009932432432432, "grad_norm": 0.011649698950350285, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5326 }, { "epoch": 17.009966216216217, "grad_norm": 0.0032610944472253323, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5327 }, { "epoch": 17.01, "grad_norm": 0.0034661791287362576, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5328 }, { "epoch": 17.01, "eval_accuracy": 0.8869143780290791, "eval_loss": 0.6140531897544861, "eval_runtime": 32.0558, "eval_samples_per_second": 19.31, "eval_steps_per_second": 2.433, "step": 5328 }, { "epoch": 18.000033783783785, "grad_norm": 0.003742646425962448, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5329 }, { "epoch": 18.000067567567566, "grad_norm": 0.005247204564511776, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5330 }, { "epoch": 18.00010135135135, "grad_norm": 1.7563365697860718, "learning_rate": 6.25e-06, "loss": 0.0318, "step": 5331 }, { "epoch": 18.000135135135135, "grad_norm": 0.008133582770824432, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5332 }, { "epoch": 18.00016891891892, "grad_norm": 0.4016805589199066, "learning_rate": 6.25e-06, "loss": 0.0058, "step": 5333 }, { "epoch": 18.0002027027027, "grad_norm": 0.37068745493888855, "learning_rate": 6.25e-06, "loss": 0.0013, "step": 5334 }, { "epoch": 18.000236486486486, "grad_norm": 0.002924645086750388, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5335 }, { "epoch": 18.00027027027027, "grad_norm": 0.0036420959513634443, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5336 }, { "epoch": 18.000304054054055, "grad_norm": 3.9568917751312256, "learning_rate": 6.25e-06, "loss": 0.1629, "step": 5337 }, { "epoch": 18.000337837837836, "grad_norm": 2.4280152320861816, "learning_rate": 6.25e-06, "loss": 0.0173, "step": 5338 }, { "epoch": 18.00037162162162, "grad_norm": 0.09310601651668549, "learning_rate": 6.25e-06, "loss": 0.0035, "step": 5339 }, { "epoch": 18.000405405405406, "grad_norm": 0.0011248422088101506, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5340 }, { "epoch": 18.00043918918919, "grad_norm": 0.3618309199810028, "learning_rate": 6.25e-06, "loss": 0.0015, "step": 5341 }, { "epoch": 18.00047297297297, "grad_norm": 0.07630284875631332, "learning_rate": 6.25e-06, "loss": 0.0028, "step": 5342 }, { "epoch": 18.000506756756756, "grad_norm": 0.14173702895641327, "learning_rate": 6.25e-06, "loss": 0.0049, "step": 5343 }, { "epoch": 18.00054054054054, "grad_norm": 0.03782793506979942, "learning_rate": 6.25e-06, "loss": 0.0012, "step": 5344 }, { "epoch": 18.000574324324326, "grad_norm": 0.048676498234272, "learning_rate": 6.25e-06, "loss": 0.0008, "step": 5345 }, { "epoch": 18.000608108108107, "grad_norm": 0.0009898734278976917, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5346 }, { "epoch": 18.000641891891892, "grad_norm": 0.0036973576061427593, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5347 }, { "epoch": 18.000675675675677, "grad_norm": 0.0011293411953374743, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5348 }, { "epoch": 18.00070945945946, "grad_norm": 0.004060371313244104, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5349 }, { "epoch": 18.000743243243242, "grad_norm": 0.9139227867126465, "learning_rate": 6.25e-06, "loss": 0.0063, "step": 5350 }, { "epoch": 18.000777027027027, "grad_norm": 58.589176177978516, "learning_rate": 6.25e-06, "loss": 0.159, "step": 5351 }, { "epoch": 18.000810810810812, "grad_norm": 0.17466846108436584, "learning_rate": 6.25e-06, "loss": 0.0037, "step": 5352 }, { "epoch": 18.000844594594593, "grad_norm": 19.837228775024414, "learning_rate": 6.25e-06, "loss": 0.333, "step": 5353 }, { "epoch": 18.000878378378378, "grad_norm": 0.0029193577356636524, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5354 }, { "epoch": 18.000912162162162, "grad_norm": 0.004889044910669327, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5355 }, { "epoch": 18.000945945945947, "grad_norm": 6.634990215301514, "learning_rate": 6.25e-06, "loss": 0.4374, "step": 5356 }, { "epoch": 18.00097972972973, "grad_norm": 0.001587221398949623, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5357 }, { "epoch": 18.001013513513513, "grad_norm": 0.044553060084581375, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 5358 }, { "epoch": 18.001047297297298, "grad_norm": 0.0023358725011348724, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5359 }, { "epoch": 18.001081081081082, "grad_norm": 0.02200653776526451, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5360 }, { "epoch": 18.001114864864864, "grad_norm": 0.0016804059268906713, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5361 }, { "epoch": 18.00114864864865, "grad_norm": 0.0047289407812058926, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5362 }, { "epoch": 18.001182432432433, "grad_norm": 0.0017652679234743118, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5363 }, { "epoch": 18.001216216216218, "grad_norm": 0.006844645831733942, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5364 }, { "epoch": 18.00125, "grad_norm": 0.008788123726844788, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5365 }, { "epoch": 18.001283783783784, "grad_norm": 0.08229660987854004, "learning_rate": 6.25e-06, "loss": 0.0026, "step": 5366 }, { "epoch": 18.00131756756757, "grad_norm": 0.004284827038645744, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5367 }, { "epoch": 18.001351351351353, "grad_norm": 0.0013558239443227649, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5368 }, { "epoch": 18.001385135135134, "grad_norm": 0.004132052417844534, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5369 }, { "epoch": 18.00141891891892, "grad_norm": 0.0019189108861610293, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5370 }, { "epoch": 18.001452702702704, "grad_norm": 0.06079934909939766, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5371 }, { "epoch": 18.001486486486485, "grad_norm": 0.5953890085220337, "learning_rate": 6.25e-06, "loss": 0.0067, "step": 5372 }, { "epoch": 18.00152027027027, "grad_norm": 0.45432958006858826, "learning_rate": 6.25e-06, "loss": 0.0109, "step": 5373 }, { "epoch": 18.001554054054054, "grad_norm": 17.951448440551758, "learning_rate": 6.25e-06, "loss": 0.3394, "step": 5374 }, { "epoch": 18.00158783783784, "grad_norm": 0.0025584660470485687, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5375 }, { "epoch": 18.00162162162162, "grad_norm": 0.08655966818332672, "learning_rate": 6.25e-06, "loss": 0.0029, "step": 5376 }, { "epoch": 18.001655405405405, "grad_norm": 0.0014934978680685163, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5377 }, { "epoch": 18.00168918918919, "grad_norm": 50.25209426879883, "learning_rate": 6.25e-06, "loss": 1.3805, "step": 5378 }, { "epoch": 18.001722972972974, "grad_norm": 0.008233748376369476, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5379 }, { "epoch": 18.001756756756755, "grad_norm": 0.0012129971291869879, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5380 }, { "epoch": 18.00179054054054, "grad_norm": 0.008671046234667301, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5381 }, { "epoch": 18.001824324324325, "grad_norm": 0.002431967295706272, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5382 }, { "epoch": 18.00185810810811, "grad_norm": 32.40951919555664, "learning_rate": 6.25e-06, "loss": 0.3277, "step": 5383 }, { "epoch": 18.00189189189189, "grad_norm": 0.02590378187596798, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5384 }, { "epoch": 18.001925675675675, "grad_norm": 1.3405506610870361, "learning_rate": 6.25e-06, "loss": 0.0267, "step": 5385 }, { "epoch": 18.00195945945946, "grad_norm": 4.353950023651123, "learning_rate": 6.25e-06, "loss": 0.4651, "step": 5386 }, { "epoch": 18.001993243243245, "grad_norm": 0.0035633952356874943, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5387 }, { "epoch": 18.002027027027026, "grad_norm": 0.0030537855345755816, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5388 }, { "epoch": 18.00206081081081, "grad_norm": 0.0020454316399991512, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5389 }, { "epoch": 18.002094594594595, "grad_norm": 0.0014220745069906116, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5390 }, { "epoch": 18.00212837837838, "grad_norm": 25.798418045043945, "learning_rate": 6.25e-06, "loss": 0.0971, "step": 5391 }, { "epoch": 18.00216216216216, "grad_norm": 0.03757604956626892, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5392 }, { "epoch": 18.002195945945946, "grad_norm": 0.169366255402565, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 5393 }, { "epoch": 18.00222972972973, "grad_norm": 0.00075394386658445, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5394 }, { "epoch": 18.002263513513512, "grad_norm": 0.03956903889775276, "learning_rate": 6.25e-06, "loss": 0.0008, "step": 5395 }, { "epoch": 18.002297297297297, "grad_norm": 0.0033399811945855618, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5396 }, { "epoch": 18.00233108108108, "grad_norm": 0.8040568828582764, "learning_rate": 6.25e-06, "loss": 0.0034, "step": 5397 }, { "epoch": 18.002364864864866, "grad_norm": 0.004668258596211672, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5398 }, { "epoch": 18.002398648648647, "grad_norm": 0.0015993400011211634, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5399 }, { "epoch": 18.002432432432432, "grad_norm": 0.000986258266493678, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5400 }, { "epoch": 18.002466216216217, "grad_norm": 0.8932062983512878, "learning_rate": 6.25e-06, "loss": 0.0036, "step": 5401 }, { "epoch": 18.0025, "grad_norm": 0.21905037760734558, "learning_rate": 6.25e-06, "loss": 0.0048, "step": 5402 }, { "epoch": 18.002533783783782, "grad_norm": 0.005518477875739336, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5403 }, { "epoch": 18.002567567567567, "grad_norm": 0.5210920572280884, "learning_rate": 6.25e-06, "loss": 0.0076, "step": 5404 }, { "epoch": 18.002601351351352, "grad_norm": 0.0025809581857174635, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5405 }, { "epoch": 18.002635135135137, "grad_norm": 0.18864379823207855, "learning_rate": 6.25e-06, "loss": 0.0029, "step": 5406 }, { "epoch": 18.002668918918918, "grad_norm": 0.015031958930194378, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5407 }, { "epoch": 18.002702702702702, "grad_norm": 0.0015856820391491055, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5408 }, { "epoch": 18.002736486486487, "grad_norm": 0.005694101098924875, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5409 }, { "epoch": 18.002770270270272, "grad_norm": 0.002356712007895112, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5410 }, { "epoch": 18.002804054054053, "grad_norm": 0.0011179446009919047, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5411 }, { "epoch": 18.002837837837838, "grad_norm": 0.1360425353050232, "learning_rate": 6.25e-06, "loss": 0.0026, "step": 5412 }, { "epoch": 18.002871621621622, "grad_norm": 39.60783767700195, "learning_rate": 6.25e-06, "loss": 0.6372, "step": 5413 }, { "epoch": 18.002905405405407, "grad_norm": 0.0016640520188957453, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5414 }, { "epoch": 18.00293918918919, "grad_norm": 0.06674274057149887, "learning_rate": 6.25e-06, "loss": 0.0008, "step": 5415 }, { "epoch": 18.002972972972973, "grad_norm": 0.9529625177383423, "learning_rate": 6.25e-06, "loss": 0.0068, "step": 5416 }, { "epoch": 18.003006756756758, "grad_norm": 0.0032076379284262657, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5417 }, { "epoch": 18.00304054054054, "grad_norm": 0.03820148855447769, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5418 }, { "epoch": 18.003074324324324, "grad_norm": 3.0038185119628906, "learning_rate": 6.25e-06, "loss": 0.0727, "step": 5419 }, { "epoch": 18.00310810810811, "grad_norm": 0.001047103782184422, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5420 }, { "epoch": 18.003141891891893, "grad_norm": 0.0012359449174255133, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5421 }, { "epoch": 18.003175675675674, "grad_norm": 0.0990113914012909, "learning_rate": 6.25e-06, "loss": 0.0028, "step": 5422 }, { "epoch": 18.00320945945946, "grad_norm": 8.806974411010742, "learning_rate": 6.25e-06, "loss": 0.5977, "step": 5423 }, { "epoch": 18.003243243243244, "grad_norm": 0.1519995778799057, "learning_rate": 6.25e-06, "loss": 0.0028, "step": 5424 }, { "epoch": 18.00327702702703, "grad_norm": 0.0010962013620883226, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5425 }, { "epoch": 18.00331081081081, "grad_norm": 0.597690999507904, "learning_rate": 6.25e-06, "loss": 0.0046, "step": 5426 }, { "epoch": 18.003344594594594, "grad_norm": 0.003696816274896264, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5427 }, { "epoch": 18.00337837837838, "grad_norm": 0.06600815802812576, "learning_rate": 6.25e-06, "loss": 0.0007, "step": 5428 }, { "epoch": 18.003412162162164, "grad_norm": 4.005048751831055, "learning_rate": 6.25e-06, "loss": 0.4655, "step": 5429 }, { "epoch": 18.003445945945945, "grad_norm": 0.019626745954155922, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5430 }, { "epoch": 18.00347972972973, "grad_norm": 0.007146215997636318, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5431 }, { "epoch": 18.003513513513514, "grad_norm": 0.1377347707748413, "learning_rate": 6.25e-06, "loss": 0.0051, "step": 5432 }, { "epoch": 18.0035472972973, "grad_norm": 0.05353749915957451, "learning_rate": 6.25e-06, "loss": 0.0013, "step": 5433 }, { "epoch": 18.00358108108108, "grad_norm": 1.1021132469177246, "learning_rate": 6.25e-06, "loss": 0.0166, "step": 5434 }, { "epoch": 18.003614864864865, "grad_norm": 0.09222768247127533, "learning_rate": 6.25e-06, "loss": 0.0032, "step": 5435 }, { "epoch": 18.00364864864865, "grad_norm": 16.387203216552734, "learning_rate": 6.25e-06, "loss": 0.3012, "step": 5436 }, { "epoch": 18.00368243243243, "grad_norm": 0.002527096541598439, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5437 }, { "epoch": 18.003716216216215, "grad_norm": 3.413753032684326, "learning_rate": 6.25e-06, "loss": 0.4275, "step": 5438 }, { "epoch": 18.00375, "grad_norm": 0.08824539184570312, "learning_rate": 6.25e-06, "loss": 0.0033, "step": 5439 }, { "epoch": 18.003783783783785, "grad_norm": 0.0019407800864428282, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5440 }, { "epoch": 18.003817567567566, "grad_norm": 4.197524547576904, "learning_rate": 6.25e-06, "loss": 0.0425, "step": 5441 }, { "epoch": 18.00385135135135, "grad_norm": 0.001565673272125423, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5442 }, { "epoch": 18.003885135135135, "grad_norm": 0.0014909696765244007, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5443 }, { "epoch": 18.00391891891892, "grad_norm": 0.0016708087641745806, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5444 }, { "epoch": 18.0039527027027, "grad_norm": 0.001679010922089219, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5445 }, { "epoch": 18.003986486486486, "grad_norm": 0.42710623145103455, "learning_rate": 6.25e-06, "loss": 0.0044, "step": 5446 }, { "epoch": 18.00402027027027, "grad_norm": 21.233930587768555, "learning_rate": 6.25e-06, "loss": 1.0374, "step": 5447 }, { "epoch": 18.004054054054055, "grad_norm": 0.003873799229040742, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5448 }, { "epoch": 18.004087837837837, "grad_norm": 0.005219311453402042, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5449 }, { "epoch": 18.00412162162162, "grad_norm": 0.03194380924105644, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 5450 }, { "epoch": 18.004155405405406, "grad_norm": 0.006947904825210571, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5451 }, { "epoch": 18.00418918918919, "grad_norm": 0.007036623544991016, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5452 }, { "epoch": 18.004222972972972, "grad_norm": 0.04853545129299164, "learning_rate": 6.25e-06, "loss": 0.001, "step": 5453 }, { "epoch": 18.004256756756757, "grad_norm": 0.19368648529052734, "learning_rate": 6.25e-06, "loss": 0.0027, "step": 5454 }, { "epoch": 18.00429054054054, "grad_norm": 8.441944122314453, "learning_rate": 6.25e-06, "loss": 0.896, "step": 5455 }, { "epoch": 18.004324324324326, "grad_norm": 0.004418001044541597, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5456 }, { "epoch": 18.004358108108107, "grad_norm": 0.04355396702885628, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 5457 }, { "epoch": 18.004391891891892, "grad_norm": 0.001968814991414547, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5458 }, { "epoch": 18.004425675675677, "grad_norm": 0.003146806498989463, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5459 }, { "epoch": 18.004459459459458, "grad_norm": 7.503489017486572, "learning_rate": 6.25e-06, "loss": 0.0811, "step": 5460 }, { "epoch": 18.004493243243243, "grad_norm": 0.002286783419549465, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5461 }, { "epoch": 18.004527027027027, "grad_norm": 0.0049104392528533936, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5462 }, { "epoch": 18.004560810810812, "grad_norm": 0.034613896161317825, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5463 }, { "epoch": 18.004594594594593, "grad_norm": 0.17471691966056824, "learning_rate": 6.25e-06, "loss": 0.0057, "step": 5464 }, { "epoch": 18.004628378378378, "grad_norm": 6.020370960235596, "learning_rate": 6.25e-06, "loss": 0.4773, "step": 5465 }, { "epoch": 18.004662162162163, "grad_norm": 20.198516845703125, "learning_rate": 6.25e-06, "loss": 0.2119, "step": 5466 }, { "epoch": 18.004695945945947, "grad_norm": 0.01585167832672596, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5467 }, { "epoch": 18.00472972972973, "grad_norm": 0.3838268518447876, "learning_rate": 6.25e-06, "loss": 0.0012, "step": 5468 }, { "epoch": 18.004763513513513, "grad_norm": 0.0015470918733626604, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5469 }, { "epoch": 18.004797297297298, "grad_norm": 0.009339472278952599, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5470 }, { "epoch": 18.004831081081083, "grad_norm": 0.018086416646838188, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5471 }, { "epoch": 18.004864864864864, "grad_norm": 0.0036934136878699064, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5472 }, { "epoch": 18.00489864864865, "grad_norm": 4.081578254699707, "learning_rate": 6.25e-06, "loss": 0.0075, "step": 5473 }, { "epoch": 18.004932432432433, "grad_norm": 0.2906435430049896, "learning_rate": 6.25e-06, "loss": 0.0102, "step": 5474 }, { "epoch": 18.004966216216218, "grad_norm": 1.6087809801101685, "learning_rate": 6.25e-06, "loss": 0.0052, "step": 5475 }, { "epoch": 18.005, "grad_norm": 13.782269477844238, "learning_rate": 6.25e-06, "loss": 0.2356, "step": 5476 }, { "epoch": 18.005033783783784, "grad_norm": 0.0019098311895504594, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5477 }, { "epoch": 18.00506756756757, "grad_norm": 0.0016614821506664157, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5478 }, { "epoch": 18.00510135135135, "grad_norm": 0.015211768448352814, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5479 }, { "epoch": 18.005135135135134, "grad_norm": 0.009388988837599754, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5480 }, { "epoch": 18.00516891891892, "grad_norm": 0.01557168085128069, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5481 }, { "epoch": 18.005202702702704, "grad_norm": 0.03248495236039162, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5482 }, { "epoch": 18.005236486486485, "grad_norm": 0.00364185543730855, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5483 }, { "epoch": 18.00527027027027, "grad_norm": 0.011251832358539104, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5484 }, { "epoch": 18.005304054054054, "grad_norm": 0.3544321060180664, "learning_rate": 6.25e-06, "loss": 0.0024, "step": 5485 }, { "epoch": 18.00533783783784, "grad_norm": 2.9243593215942383, "learning_rate": 6.25e-06, "loss": 0.0073, "step": 5486 }, { "epoch": 18.00537162162162, "grad_norm": 0.0012439113343134522, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5487 }, { "epoch": 18.005405405405405, "grad_norm": 0.005543914623558521, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5488 }, { "epoch": 18.00543918918919, "grad_norm": 0.011622690595686436, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5489 }, { "epoch": 18.005472972972974, "grad_norm": 0.0026158203836530447, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5490 }, { "epoch": 18.005506756756755, "grad_norm": 0.0015709949657320976, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5491 }, { "epoch": 18.00554054054054, "grad_norm": 0.7332475185394287, "learning_rate": 6.25e-06, "loss": 0.0136, "step": 5492 }, { "epoch": 18.005574324324325, "grad_norm": 27.952314376831055, "learning_rate": 6.25e-06, "loss": 0.4074, "step": 5493 }, { "epoch": 18.00560810810811, "grad_norm": 0.027101021260023117, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 5494 }, { "epoch": 18.00564189189189, "grad_norm": 38.303802490234375, "learning_rate": 6.25e-06, "loss": 0.1116, "step": 5495 }, { "epoch": 18.005675675675676, "grad_norm": 12.662712097167969, "learning_rate": 6.25e-06, "loss": 0.1245, "step": 5496 }, { "epoch": 18.00570945945946, "grad_norm": 4.052865982055664, "learning_rate": 6.25e-06, "loss": 0.008, "step": 5497 }, { "epoch": 18.005743243243245, "grad_norm": 0.0065489779226481915, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5498 }, { "epoch": 18.005777027027026, "grad_norm": 0.0013871341943740845, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5499 }, { "epoch": 18.00581081081081, "grad_norm": 41.98893737792969, "learning_rate": 6.25e-06, "loss": 0.1535, "step": 5500 }, { "epoch": 18.005844594594596, "grad_norm": 0.13757820427417755, "learning_rate": 6.25e-06, "loss": 0.0052, "step": 5501 }, { "epoch": 18.005878378378377, "grad_norm": 0.002031031297519803, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5502 }, { "epoch": 18.00591216216216, "grad_norm": 0.30062949657440186, "learning_rate": 6.25e-06, "loss": 0.0066, "step": 5503 }, { "epoch": 18.005945945945946, "grad_norm": 5.738887310028076, "learning_rate": 6.25e-06, "loss": 0.0145, "step": 5504 }, { "epoch": 18.00597972972973, "grad_norm": 0.13894641399383545, "learning_rate": 6.25e-06, "loss": 0.0054, "step": 5505 }, { "epoch": 18.006013513513512, "grad_norm": 51.17976760864258, "learning_rate": 6.25e-06, "loss": 0.5229, "step": 5506 }, { "epoch": 18.006047297297297, "grad_norm": 0.0028022623155266047, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5507 }, { "epoch": 18.00608108108108, "grad_norm": 0.11045511066913605, "learning_rate": 6.25e-06, "loss": 0.0028, "step": 5508 }, { "epoch": 18.006114864864866, "grad_norm": 0.002341953571885824, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5509 }, { "epoch": 18.006148648648647, "grad_norm": 0.016703927889466286, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5510 }, { "epoch": 18.006182432432432, "grad_norm": 0.13113366067409515, "learning_rate": 6.25e-06, "loss": 0.005, "step": 5511 }, { "epoch": 18.006216216216217, "grad_norm": 0.03963372856378555, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 5512 }, { "epoch": 18.00625, "grad_norm": 0.0028993638698011637, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5513 }, { "epoch": 18.006283783783783, "grad_norm": 0.0033080659341067076, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5514 }, { "epoch": 18.006317567567567, "grad_norm": 0.010030597448348999, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5515 }, { "epoch": 18.006351351351352, "grad_norm": 0.06408454477787018, "learning_rate": 6.25e-06, "loss": 0.0009, "step": 5516 }, { "epoch": 18.006385135135137, "grad_norm": 0.0011767159448936582, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5517 }, { "epoch": 18.006418918918918, "grad_norm": 0.13692185282707214, "learning_rate": 6.25e-06, "loss": 0.0054, "step": 5518 }, { "epoch": 18.006452702702703, "grad_norm": 0.03571814298629761, "learning_rate": 6.25e-06, "loss": 0.0007, "step": 5519 }, { "epoch": 18.006486486486487, "grad_norm": 0.0024660236667841673, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5520 }, { "epoch": 18.006520270270272, "grad_norm": 0.004428754094988108, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5521 }, { "epoch": 18.006554054054053, "grad_norm": 12.74122428894043, "learning_rate": 6.25e-06, "loss": 0.8845, "step": 5522 }, { "epoch": 18.006587837837838, "grad_norm": 0.013381407596170902, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 5523 }, { "epoch": 18.006621621621623, "grad_norm": 2.2930572032928467, "learning_rate": 6.25e-06, "loss": 0.0603, "step": 5524 }, { "epoch": 18.006655405405404, "grad_norm": 0.0009388031903654337, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5525 }, { "epoch": 18.00668918918919, "grad_norm": 0.0016030282713472843, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5526 }, { "epoch": 18.006722972972973, "grad_norm": 0.02312900312244892, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5527 }, { "epoch": 18.006756756756758, "grad_norm": 0.004110496491193771, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5528 }, { "epoch": 18.00679054054054, "grad_norm": 0.0016901901690289378, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5529 }, { "epoch": 18.006824324324324, "grad_norm": 80.00139617919922, "learning_rate": 6.25e-06, "loss": 0.8164, "step": 5530 }, { "epoch": 18.00685810810811, "grad_norm": 0.007348970044404268, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5531 }, { "epoch": 18.006891891891893, "grad_norm": 0.0033980815205723047, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5532 }, { "epoch": 18.006925675675674, "grad_norm": 0.00644799554720521, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5533 }, { "epoch": 18.00695945945946, "grad_norm": 0.582086980342865, "learning_rate": 6.25e-06, "loss": 0.0187, "step": 5534 }, { "epoch": 18.006993243243244, "grad_norm": 7.9932146072387695, "learning_rate": 6.25e-06, "loss": 0.0526, "step": 5535 }, { "epoch": 18.00702702702703, "grad_norm": 32.887359619140625, "learning_rate": 6.25e-06, "loss": 0.1458, "step": 5536 }, { "epoch": 18.00706081081081, "grad_norm": 0.007296436466276646, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5537 }, { "epoch": 18.007094594594594, "grad_norm": 0.013896353542804718, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5538 }, { "epoch": 18.00712837837838, "grad_norm": 0.010358489118516445, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5539 }, { "epoch": 18.007162162162164, "grad_norm": 0.0016511038411408663, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5540 }, { "epoch": 18.007195945945945, "grad_norm": 0.003157589351758361, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5541 }, { "epoch": 18.00722972972973, "grad_norm": 0.033251043409109116, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 5542 }, { "epoch": 18.007263513513514, "grad_norm": 30.02251434326172, "learning_rate": 6.25e-06, "loss": 0.578, "step": 5543 }, { "epoch": 18.007297297297296, "grad_norm": 0.0032062088139355183, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5544 }, { "epoch": 18.00733108108108, "grad_norm": 0.11596222221851349, "learning_rate": 6.25e-06, "loss": 0.0043, "step": 5545 }, { "epoch": 18.007364864864865, "grad_norm": 0.020125029608607292, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5546 }, { "epoch": 18.00739864864865, "grad_norm": 0.045207519084215164, "learning_rate": 6.25e-06, "loss": 0.0007, "step": 5547 }, { "epoch": 18.00743243243243, "grad_norm": 0.058339327573776245, "learning_rate": 6.25e-06, "loss": 0.0005, "step": 5548 }, { "epoch": 18.007466216216216, "grad_norm": 3.395238161087036, "learning_rate": 6.25e-06, "loss": 0.1648, "step": 5549 }, { "epoch": 18.0075, "grad_norm": 0.003835646202787757, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5550 }, { "epoch": 18.007533783783785, "grad_norm": 0.021999884396791458, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5551 }, { "epoch": 18.007567567567566, "grad_norm": 12.731283187866211, "learning_rate": 6.25e-06, "loss": 0.251, "step": 5552 }, { "epoch": 18.00760135135135, "grad_norm": 0.007430264726281166, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5553 }, { "epoch": 18.007635135135136, "grad_norm": 1.3794474601745605, "learning_rate": 6.25e-06, "loss": 0.0506, "step": 5554 }, { "epoch": 18.00766891891892, "grad_norm": 0.018120460212230682, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5555 }, { "epoch": 18.0077027027027, "grad_norm": 0.004780203104019165, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5556 }, { "epoch": 18.007736486486486, "grad_norm": 46.86671447753906, "learning_rate": 6.25e-06, "loss": 0.1292, "step": 5557 }, { "epoch": 18.00777027027027, "grad_norm": 0.15705960988998413, "learning_rate": 6.25e-06, "loss": 0.0041, "step": 5558 }, { "epoch": 18.007804054054056, "grad_norm": 0.001145820366218686, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5559 }, { "epoch": 18.007837837837837, "grad_norm": 0.44841164350509644, "learning_rate": 6.25e-06, "loss": 0.0028, "step": 5560 }, { "epoch": 18.00787162162162, "grad_norm": 0.11064527928829193, "learning_rate": 6.25e-06, "loss": 0.001, "step": 5561 }, { "epoch": 18.007905405405406, "grad_norm": 0.04718821495771408, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 5562 }, { "epoch": 18.00793918918919, "grad_norm": 0.0039081391878426075, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5563 }, { "epoch": 18.007972972972972, "grad_norm": 0.21038717031478882, "learning_rate": 6.25e-06, "loss": 0.0011, "step": 5564 }, { "epoch": 18.008006756756757, "grad_norm": 28.885196685791016, "learning_rate": 6.25e-06, "loss": 0.1484, "step": 5565 }, { "epoch": 18.00804054054054, "grad_norm": 0.026991674676537514, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5566 }, { "epoch": 18.008074324324323, "grad_norm": 0.001104566385038197, "learning_rate": 6.25e-06, "loss": 0.0, "step": 5567 }, { "epoch": 18.008108108108107, "grad_norm": 4.108168125152588, "learning_rate": 6.25e-06, "loss": 0.4391, "step": 5568 }, { "epoch": 18.008141891891892, "grad_norm": 0.0013863717904314399, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5569 }, { "epoch": 18.008175675675677, "grad_norm": 4.992063522338867, "learning_rate": 6.25e-06, "loss": 0.1067, "step": 5570 }, { "epoch": 18.008209459459458, "grad_norm": 7.491866588592529, "learning_rate": 6.25e-06, "loss": 0.4052, "step": 5571 }, { "epoch": 18.008243243243243, "grad_norm": 0.11053617298603058, "learning_rate": 6.25e-06, "loss": 0.0042, "step": 5572 }, { "epoch": 18.008277027027027, "grad_norm": 0.0033066438045352697, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5573 }, { "epoch": 18.008310810810812, "grad_norm": 0.016801778227090836, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5574 }, { "epoch": 18.008344594594593, "grad_norm": 0.047577664256095886, "learning_rate": 6.25e-06, "loss": 0.0009, "step": 5575 }, { "epoch": 18.008378378378378, "grad_norm": 0.013932404108345509, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5576 }, { "epoch": 18.008412162162163, "grad_norm": 6.717445373535156, "learning_rate": 6.25e-06, "loss": 0.4406, "step": 5577 }, { "epoch": 18.008445945945947, "grad_norm": 0.0022451423574239016, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5578 }, { "epoch": 18.00847972972973, "grad_norm": 0.001461689011193812, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5579 }, { "epoch": 18.008513513513513, "grad_norm": 0.014073166996240616, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5580 }, { "epoch": 18.008547297297298, "grad_norm": 0.004186231642961502, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5581 }, { "epoch": 18.008581081081083, "grad_norm": 0.00784520898014307, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5582 }, { "epoch": 18.008614864864864, "grad_norm": 0.5111196637153625, "learning_rate": 6.25e-06, "loss": 0.0058, "step": 5583 }, { "epoch": 18.00864864864865, "grad_norm": 0.12275556474924088, "learning_rate": 6.25e-06, "loss": 0.0041, "step": 5584 }, { "epoch": 18.008682432432433, "grad_norm": 0.004450689069926739, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5585 }, { "epoch": 18.008716216216218, "grad_norm": 0.0021462836302816868, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5586 }, { "epoch": 18.00875, "grad_norm": 11.934401512145996, "learning_rate": 6.25e-06, "loss": 1.0344, "step": 5587 }, { "epoch": 18.008783783783784, "grad_norm": 0.0013003598432987928, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5588 }, { "epoch": 18.00881756756757, "grad_norm": 0.12876072525978088, "learning_rate": 6.25e-06, "loss": 0.005, "step": 5589 }, { "epoch": 18.00885135135135, "grad_norm": 2.801797389984131, "learning_rate": 6.25e-06, "loss": 0.1392, "step": 5590 }, { "epoch": 18.008885135135134, "grad_norm": 15.838948249816895, "learning_rate": 6.25e-06, "loss": 0.0841, "step": 5591 }, { "epoch": 18.00891891891892, "grad_norm": 0.02657274715602398, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 5592 }, { "epoch": 18.008952702702704, "grad_norm": 0.003185300389304757, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5593 }, { "epoch": 18.008986486486485, "grad_norm": 0.003431904362514615, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5594 }, { "epoch": 18.00902027027027, "grad_norm": 0.5533187389373779, "learning_rate": 6.25e-06, "loss": 0.0064, "step": 5595 }, { "epoch": 18.009054054054054, "grad_norm": 0.027510832995176315, "learning_rate": 6.25e-06, "loss": 0.0004, "step": 5596 }, { "epoch": 18.00908783783784, "grad_norm": 0.04402439296245575, "learning_rate": 6.25e-06, "loss": 0.0008, "step": 5597 }, { "epoch": 18.00912162162162, "grad_norm": 0.004037074279040098, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5598 }, { "epoch": 18.009155405405405, "grad_norm": 3.3830151557922363, "learning_rate": 6.25e-06, "loss": 0.3934, "step": 5599 }, { "epoch": 18.00918918918919, "grad_norm": 0.8047794699668884, "learning_rate": 6.25e-06, "loss": 0.0188, "step": 5600 }, { "epoch": 18.009222972972974, "grad_norm": 0.0037643846590071917, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5601 }, { "epoch": 18.009256756756756, "grad_norm": 0.0016132034361362457, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5602 }, { "epoch": 18.00929054054054, "grad_norm": 0.21639913320541382, "learning_rate": 6.25e-06, "loss": 0.0056, "step": 5603 }, { "epoch": 18.009324324324325, "grad_norm": 0.005699628964066505, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5604 }, { "epoch": 18.00935810810811, "grad_norm": 0.005414705257862806, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5605 }, { "epoch": 18.00939189189189, "grad_norm": 0.016354655846953392, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5606 }, { "epoch": 18.009425675675676, "grad_norm": 0.0025984495878219604, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5607 }, { "epoch": 18.00945945945946, "grad_norm": 7.5610198974609375, "learning_rate": 6.25e-06, "loss": 0.8949, "step": 5608 }, { "epoch": 18.00949324324324, "grad_norm": 0.0053025465458631516, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5609 }, { "epoch": 18.009527027027026, "grad_norm": 0.6092489957809448, "learning_rate": 6.25e-06, "loss": 0.0053, "step": 5610 }, { "epoch": 18.00956081081081, "grad_norm": 25.09544563293457, "learning_rate": 6.25e-06, "loss": 0.3397, "step": 5611 }, { "epoch": 18.009594594594596, "grad_norm": 0.052739378064870834, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 5612 }, { "epoch": 18.009628378378377, "grad_norm": 0.09252568334341049, "learning_rate": 6.25e-06, "loss": 0.0016, "step": 5613 }, { "epoch": 18.00966216216216, "grad_norm": 14.054698944091797, "learning_rate": 6.25e-06, "loss": 0.1427, "step": 5614 }, { "epoch": 18.009695945945946, "grad_norm": 0.2615377604961395, "learning_rate": 6.25e-06, "loss": 0.0015, "step": 5615 }, { "epoch": 18.00972972972973, "grad_norm": 0.15497983992099762, "learning_rate": 6.25e-06, "loss": 0.0058, "step": 5616 }, { "epoch": 18.009763513513512, "grad_norm": 0.016890157014131546, "learning_rate": 6.25e-06, "loss": 0.0005, "step": 5617 }, { "epoch": 18.009797297297297, "grad_norm": 0.02355804666876793, "learning_rate": 6.25e-06, "loss": 0.0003, "step": 5618 }, { "epoch": 18.00983108108108, "grad_norm": 1.7949854135513306, "learning_rate": 6.25e-06, "loss": 0.0067, "step": 5619 }, { "epoch": 18.009864864864866, "grad_norm": 0.003292596200481057, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5620 }, { "epoch": 18.009898648648647, "grad_norm": 0.41285672783851624, "learning_rate": 6.25e-06, "loss": 0.006, "step": 5621 }, { "epoch": 18.009932432432432, "grad_norm": 0.003072169376537204, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 5622 }, { "epoch": 18.009966216216217, "grad_norm": 0.007875639945268631, "learning_rate": 6.25e-06, "loss": 0.0002, "step": 5623 }, { "epoch": 18.01, "grad_norm": 8.474455833435059, "learning_rate": 6.25e-06, "loss": 0.0363, "step": 5624 }, { "epoch": 18.01, "eval_accuracy": 0.8933764135702746, "eval_loss": 0.5503578186035156, "eval_runtime": 32.2462, "eval_samples_per_second": 19.196, "eval_steps_per_second": 2.419, "step": 5624 }, { "epoch": 19.000033783783785, "grad_norm": 0.014685126021504402, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5625 }, { "epoch": 19.000067567567566, "grad_norm": 0.18451842665672302, "learning_rate": 3.125e-06, "loss": 0.0069, "step": 5626 }, { "epoch": 19.00010135135135, "grad_norm": 0.019866943359375, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 5627 }, { "epoch": 19.000135135135135, "grad_norm": 0.4069860875606537, "learning_rate": 3.125e-06, "loss": 0.002, "step": 5628 }, { "epoch": 19.00016891891892, "grad_norm": 1.9400923252105713, "learning_rate": 3.125e-06, "loss": 0.0106, "step": 5629 }, { "epoch": 19.0002027027027, "grad_norm": 0.009504796005785465, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5630 }, { "epoch": 19.000236486486486, "grad_norm": 0.0027450439520180225, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5631 }, { "epoch": 19.00027027027027, "grad_norm": 2.063887357711792, "learning_rate": 3.125e-06, "loss": 0.0494, "step": 5632 }, { "epoch": 19.000304054054055, "grad_norm": 0.0054616681300103664, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5633 }, { "epoch": 19.000337837837836, "grad_norm": 0.10782387107610703, "learning_rate": 3.125e-06, "loss": 0.001, "step": 5634 }, { "epoch": 19.00037162162162, "grad_norm": 0.03177789971232414, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 5635 }, { "epoch": 19.000405405405406, "grad_norm": 0.3129858374595642, "learning_rate": 3.125e-06, "loss": 0.0038, "step": 5636 }, { "epoch": 19.00043918918919, "grad_norm": 0.0030939572025090456, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5637 }, { "epoch": 19.00047297297297, "grad_norm": 0.006913531105965376, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5638 }, { "epoch": 19.000506756756756, "grad_norm": 0.0065153818577528, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5639 }, { "epoch": 19.00054054054054, "grad_norm": 0.18006251752376556, "learning_rate": 3.125e-06, "loss": 0.0071, "step": 5640 }, { "epoch": 19.000574324324326, "grad_norm": 0.043603941798210144, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 5641 }, { "epoch": 19.000608108108107, "grad_norm": 37.98887252807617, "learning_rate": 3.125e-06, "loss": 0.101, "step": 5642 }, { "epoch": 19.000641891891892, "grad_norm": 0.008999464102089405, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5643 }, { "epoch": 19.000675675675677, "grad_norm": 1.1646724939346313, "learning_rate": 3.125e-06, "loss": 0.0132, "step": 5644 }, { "epoch": 19.00070945945946, "grad_norm": 0.020394347608089447, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5645 }, { "epoch": 19.000743243243242, "grad_norm": 0.006197542883455753, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5646 }, { "epoch": 19.000777027027027, "grad_norm": 20.839488983154297, "learning_rate": 3.125e-06, "loss": 0.04, "step": 5647 }, { "epoch": 19.000810810810812, "grad_norm": 0.02165956236422062, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 5648 }, { "epoch": 19.000844594594593, "grad_norm": 0.04489840939640999, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 5649 }, { "epoch": 19.000878378378378, "grad_norm": 0.3798849284648895, "learning_rate": 3.125e-06, "loss": 0.0025, "step": 5650 }, { "epoch": 19.000912162162162, "grad_norm": 0.24067823588848114, "learning_rate": 3.125e-06, "loss": 0.0011, "step": 5651 }, { "epoch": 19.000945945945947, "grad_norm": 0.21249660849571228, "learning_rate": 3.125e-06, "loss": 0.0068, "step": 5652 }, { "epoch": 19.00097972972973, "grad_norm": 0.005405826959758997, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5653 }, { "epoch": 19.001013513513513, "grad_norm": 0.12573537230491638, "learning_rate": 3.125e-06, "loss": 0.0046, "step": 5654 }, { "epoch": 19.001047297297298, "grad_norm": 0.02950000949203968, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 5655 }, { "epoch": 19.001081081081082, "grad_norm": 0.13503719866275787, "learning_rate": 3.125e-06, "loss": 0.0012, "step": 5656 }, { "epoch": 19.001114864864864, "grad_norm": 0.015727685764431953, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5657 }, { "epoch": 19.00114864864865, "grad_norm": 0.0014316916931420565, "learning_rate": 3.125e-06, "loss": 0.0, "step": 5658 }, { "epoch": 19.001182432432433, "grad_norm": 0.0054970611818134785, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5659 }, { "epoch": 19.001216216216218, "grad_norm": 0.00413794768974185, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5660 }, { "epoch": 19.00125, "grad_norm": 0.03922516480088234, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 5661 }, { "epoch": 19.001283783783784, "grad_norm": 12.95672607421875, "learning_rate": 3.125e-06, "loss": 0.0217, "step": 5662 }, { "epoch": 19.00131756756757, "grad_norm": 0.007827735505998135, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5663 }, { "epoch": 19.001351351351353, "grad_norm": 0.0017356903990730643, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5664 }, { "epoch": 19.001385135135134, "grad_norm": 0.0035993577912449837, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5665 }, { "epoch": 19.00141891891892, "grad_norm": 0.0015202780487015843, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5666 }, { "epoch": 19.001452702702704, "grad_norm": 6.118626594543457, "learning_rate": 3.125e-06, "loss": 0.0949, "step": 5667 }, { "epoch": 19.001486486486485, "grad_norm": 4.686212539672852, "learning_rate": 3.125e-06, "loss": 0.0863, "step": 5668 }, { "epoch": 19.00152027027027, "grad_norm": 0.003767149755731225, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5669 }, { "epoch": 19.001554054054054, "grad_norm": 10.487382888793945, "learning_rate": 3.125e-06, "loss": 0.1614, "step": 5670 }, { "epoch": 19.00158783783784, "grad_norm": 0.025486087426543236, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 5671 }, { "epoch": 19.00162162162162, "grad_norm": 2.780301570892334, "learning_rate": 3.125e-06, "loss": 0.0723, "step": 5672 }, { "epoch": 19.001655405405405, "grad_norm": 0.005384041927754879, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5673 }, { "epoch": 19.00168918918919, "grad_norm": 3.3509817123413086, "learning_rate": 3.125e-06, "loss": 0.0436, "step": 5674 }, { "epoch": 19.001722972972974, "grad_norm": 0.005122194066643715, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5675 }, { "epoch": 19.001756756756755, "grad_norm": 0.001053948188200593, "learning_rate": 3.125e-06, "loss": 0.0, "step": 5676 }, { "epoch": 19.00179054054054, "grad_norm": 0.1359415054321289, "learning_rate": 3.125e-06, "loss": 0.0051, "step": 5677 }, { "epoch": 19.001824324324325, "grad_norm": 0.0014482360566034913, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5678 }, { "epoch": 19.00185810810811, "grad_norm": 10.726536750793457, "learning_rate": 3.125e-06, "loss": 0.3146, "step": 5679 }, { "epoch": 19.00189189189189, "grad_norm": 0.0012520607560873032, "learning_rate": 3.125e-06, "loss": 0.0, "step": 5680 }, { "epoch": 19.001925675675675, "grad_norm": 0.0017341050552204251, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5681 }, { "epoch": 19.00195945945946, "grad_norm": 0.015366344712674618, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5682 }, { "epoch": 19.001993243243245, "grad_norm": 0.18949362635612488, "learning_rate": 3.125e-06, "loss": 0.0018, "step": 5683 }, { "epoch": 19.002027027027026, "grad_norm": 0.17462392151355743, "learning_rate": 3.125e-06, "loss": 0.0017, "step": 5684 }, { "epoch": 19.00206081081081, "grad_norm": 0.01465727761387825, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5685 }, { "epoch": 19.002094594594595, "grad_norm": 6.237909317016602, "learning_rate": 3.125e-06, "loss": 0.0757, "step": 5686 }, { "epoch": 19.00212837837838, "grad_norm": 0.002470956416800618, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5687 }, { "epoch": 19.00216216216216, "grad_norm": 0.002444313606247306, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5688 }, { "epoch": 19.002195945945946, "grad_norm": 0.006584628485143185, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5689 }, { "epoch": 19.00222972972973, "grad_norm": 0.15828177332878113, "learning_rate": 3.125e-06, "loss": 0.006, "step": 5690 }, { "epoch": 19.002263513513512, "grad_norm": 4.108800411224365, "learning_rate": 3.125e-06, "loss": 0.3019, "step": 5691 }, { "epoch": 19.002297297297297, "grad_norm": 0.00897333212196827, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5692 }, { "epoch": 19.00233108108108, "grad_norm": 0.0023493345361202955, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5693 }, { "epoch": 19.002364864864866, "grad_norm": 0.005450132302939892, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5694 }, { "epoch": 19.002398648648647, "grad_norm": 16.990062713623047, "learning_rate": 3.125e-06, "loss": 0.0284, "step": 5695 }, { "epoch": 19.002432432432432, "grad_norm": 0.005054935812950134, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5696 }, { "epoch": 19.002466216216217, "grad_norm": 0.20043903589248657, "learning_rate": 3.125e-06, "loss": 0.0011, "step": 5697 }, { "epoch": 19.0025, "grad_norm": 6.134324550628662, "learning_rate": 3.125e-06, "loss": 0.0704, "step": 5698 }, { "epoch": 19.002533783783782, "grad_norm": 25.001209259033203, "learning_rate": 3.125e-06, "loss": 0.9739, "step": 5699 }, { "epoch": 19.002567567567567, "grad_norm": 0.014663687907159328, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5700 }, { "epoch": 19.002601351351352, "grad_norm": 6.168490886688232, "learning_rate": 3.125e-06, "loss": 0.5476, "step": 5701 }, { "epoch": 19.002635135135137, "grad_norm": 0.02101719379425049, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5702 }, { "epoch": 19.002668918918918, "grad_norm": 0.012183710001409054, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 5703 }, { "epoch": 19.002702702702702, "grad_norm": 0.1296146661043167, "learning_rate": 3.125e-06, "loss": 0.0049, "step": 5704 }, { "epoch": 19.002736486486487, "grad_norm": 0.0030276330653578043, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5705 }, { "epoch": 19.002770270270272, "grad_norm": 0.005275408271700144, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5706 }, { "epoch": 19.002804054054053, "grad_norm": 0.0022835771087557077, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5707 }, { "epoch": 19.002837837837838, "grad_norm": 0.010236503556370735, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5708 }, { "epoch": 19.002871621621622, "grad_norm": 2.1064469814300537, "learning_rate": 3.125e-06, "loss": 0.0246, "step": 5709 }, { "epoch": 19.002905405405407, "grad_norm": 0.005650438833981752, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5710 }, { "epoch": 19.00293918918919, "grad_norm": 0.5399821996688843, "learning_rate": 3.125e-06, "loss": 0.0068, "step": 5711 }, { "epoch": 19.002972972972973, "grad_norm": 0.004145395010709763, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5712 }, { "epoch": 19.003006756756758, "grad_norm": 0.0018025500467047095, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5713 }, { "epoch": 19.00304054054054, "grad_norm": 0.004063563887029886, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5714 }, { "epoch": 19.003074324324324, "grad_norm": 0.0027354764752089977, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5715 }, { "epoch": 19.00310810810811, "grad_norm": 0.007460654713213444, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5716 }, { "epoch": 19.003141891891893, "grad_norm": 0.002138307550922036, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5717 }, { "epoch": 19.003175675675674, "grad_norm": 0.0027509357314556837, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5718 }, { "epoch": 19.00320945945946, "grad_norm": 0.06046043336391449, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 5719 }, { "epoch": 19.003243243243244, "grad_norm": 4.164306640625, "learning_rate": 3.125e-06, "loss": 0.3685, "step": 5720 }, { "epoch": 19.00327702702703, "grad_norm": 2.151951313018799, "learning_rate": 3.125e-06, "loss": 0.004, "step": 5721 }, { "epoch": 19.00331081081081, "grad_norm": 0.018402395769953728, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5722 }, { "epoch": 19.003344594594594, "grad_norm": 0.4262015223503113, "learning_rate": 3.125e-06, "loss": 0.0023, "step": 5723 }, { "epoch": 19.00337837837838, "grad_norm": 0.13790035247802734, "learning_rate": 3.125e-06, "loss": 0.0052, "step": 5724 }, { "epoch": 19.003412162162164, "grad_norm": 0.0017722602933645248, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5725 }, { "epoch": 19.003445945945945, "grad_norm": 0.011300048790872097, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5726 }, { "epoch": 19.00347972972973, "grad_norm": 0.19456847012043, "learning_rate": 3.125e-06, "loss": 0.0036, "step": 5727 }, { "epoch": 19.003513513513514, "grad_norm": 4.7117486000061035, "learning_rate": 3.125e-06, "loss": 0.1472, "step": 5728 }, { "epoch": 19.0035472972973, "grad_norm": 0.12801150977611542, "learning_rate": 3.125e-06, "loss": 0.0049, "step": 5729 }, { "epoch": 19.00358108108108, "grad_norm": 0.07664806395769119, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 5730 }, { "epoch": 19.003614864864865, "grad_norm": 1.0968588590621948, "learning_rate": 3.125e-06, "loss": 0.0031, "step": 5731 }, { "epoch": 19.00364864864865, "grad_norm": 0.0235685296356678, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 5732 }, { "epoch": 19.00368243243243, "grad_norm": 0.7919905185699463, "learning_rate": 3.125e-06, "loss": 0.0023, "step": 5733 }, { "epoch": 19.003716216216215, "grad_norm": 0.0075371526181697845, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5734 }, { "epoch": 19.00375, "grad_norm": 0.04154372960329056, "learning_rate": 3.125e-06, "loss": 0.0007, "step": 5735 }, { "epoch": 19.003783783783785, "grad_norm": 0.001591540640220046, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5736 }, { "epoch": 19.003817567567566, "grad_norm": 1.4156367778778076, "learning_rate": 3.125e-06, "loss": 0.0215, "step": 5737 }, { "epoch": 19.00385135135135, "grad_norm": 5.609053134918213, "learning_rate": 3.125e-06, "loss": 0.27, "step": 5738 }, { "epoch": 19.003885135135135, "grad_norm": 0.0009258917416445911, "learning_rate": 3.125e-06, "loss": 0.0, "step": 5739 }, { "epoch": 19.00391891891892, "grad_norm": 0.1401393711566925, "learning_rate": 3.125e-06, "loss": 0.0053, "step": 5740 }, { "epoch": 19.0039527027027, "grad_norm": 0.04089809209108353, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 5741 }, { "epoch": 19.003986486486486, "grad_norm": 0.013451620936393738, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5742 }, { "epoch": 19.00402027027027, "grad_norm": 0.005127669777721167, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5743 }, { "epoch": 19.004054054054055, "grad_norm": 0.002220334019511938, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5744 }, { "epoch": 19.004087837837837, "grad_norm": 0.0118923494592309, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 5745 }, { "epoch": 19.00412162162162, "grad_norm": 0.1499602496623993, "learning_rate": 3.125e-06, "loss": 0.0033, "step": 5746 }, { "epoch": 19.004155405405406, "grad_norm": 0.0015431055799126625, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5747 }, { "epoch": 19.00418918918919, "grad_norm": 0.001505127060227096, "learning_rate": 3.125e-06, "loss": 0.0, "step": 5748 }, { "epoch": 19.004222972972972, "grad_norm": 26.69332504272461, "learning_rate": 3.125e-06, "loss": 0.045, "step": 5749 }, { "epoch": 19.004256756756757, "grad_norm": 0.007219700608402491, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5750 }, { "epoch": 19.00429054054054, "grad_norm": 0.045310936868190765, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5751 }, { "epoch": 19.004324324324326, "grad_norm": 3.7703731060028076, "learning_rate": 3.125e-06, "loss": 0.4212, "step": 5752 }, { "epoch": 19.004358108108107, "grad_norm": 0.06143440306186676, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 5753 }, { "epoch": 19.004391891891892, "grad_norm": 1.6717920303344727, "learning_rate": 3.125e-06, "loss": 0.0045, "step": 5754 }, { "epoch": 19.004425675675677, "grad_norm": 0.2538090944290161, "learning_rate": 3.125e-06, "loss": 0.0012, "step": 5755 }, { "epoch": 19.004459459459458, "grad_norm": 0.02121971920132637, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5756 }, { "epoch": 19.004493243243243, "grad_norm": 0.00738832913339138, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5757 }, { "epoch": 19.004527027027027, "grad_norm": 0.0026595687959343195, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5758 }, { "epoch": 19.004560810810812, "grad_norm": 0.2412494271993637, "learning_rate": 3.125e-06, "loss": 0.0025, "step": 5759 }, { "epoch": 19.004594594594593, "grad_norm": 25.232587814331055, "learning_rate": 3.125e-06, "loss": 0.0518, "step": 5760 }, { "epoch": 19.004628378378378, "grad_norm": 0.00886440183967352, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5761 }, { "epoch": 19.004662162162163, "grad_norm": 41.765106201171875, "learning_rate": 3.125e-06, "loss": 0.4943, "step": 5762 }, { "epoch": 19.004695945945947, "grad_norm": 0.002600626088678837, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5763 }, { "epoch": 19.00472972972973, "grad_norm": 0.10958005487918854, "learning_rate": 3.125e-06, "loss": 0.0041, "step": 5764 }, { "epoch": 19.004763513513513, "grad_norm": 0.005458759143948555, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5765 }, { "epoch": 19.004797297297298, "grad_norm": 0.001319630304351449, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5766 }, { "epoch": 19.004831081081083, "grad_norm": 0.0059083872474730015, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5767 }, { "epoch": 19.004864864864864, "grad_norm": 0.006607471965253353, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5768 }, { "epoch": 19.00489864864865, "grad_norm": 1.8555998802185059, "learning_rate": 3.125e-06, "loss": 0.0832, "step": 5769 }, { "epoch": 19.004932432432433, "grad_norm": 0.009566046297550201, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5770 }, { "epoch": 19.004966216216218, "grad_norm": 10.373348236083984, "learning_rate": 3.125e-06, "loss": 0.071, "step": 5771 }, { "epoch": 19.005, "grad_norm": 0.0027457792311906815, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5772 }, { "epoch": 19.005033783783784, "grad_norm": 0.002522710245102644, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5773 }, { "epoch": 19.00506756756757, "grad_norm": 0.10202375799417496, "learning_rate": 3.125e-06, "loss": 0.0038, "step": 5774 }, { "epoch": 19.00510135135135, "grad_norm": 0.084376260638237, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 5775 }, { "epoch": 19.005135135135134, "grad_norm": 0.0014917001826688647, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5776 }, { "epoch": 19.00516891891892, "grad_norm": 23.062644958496094, "learning_rate": 3.125e-06, "loss": 0.0564, "step": 5777 }, { "epoch": 19.005202702702704, "grad_norm": 0.0022127132397145033, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5778 }, { "epoch": 19.005236486486485, "grad_norm": 0.30202794075012207, "learning_rate": 3.125e-06, "loss": 0.001, "step": 5779 }, { "epoch": 19.00527027027027, "grad_norm": 0.0038610119372606277, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5780 }, { "epoch": 19.005304054054054, "grad_norm": 0.5180761814117432, "learning_rate": 3.125e-06, "loss": 0.0027, "step": 5781 }, { "epoch": 19.00533783783784, "grad_norm": 0.33735814690589905, "learning_rate": 3.125e-06, "loss": 0.0087, "step": 5782 }, { "epoch": 19.00537162162162, "grad_norm": 0.007819636724889278, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5783 }, { "epoch": 19.005405405405405, "grad_norm": 0.002317032776772976, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5784 }, { "epoch": 19.00543918918919, "grad_norm": 0.0026880495715886354, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5785 }, { "epoch": 19.005472972972974, "grad_norm": 0.013287232257425785, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5786 }, { "epoch": 19.005506756756755, "grad_norm": 0.0042038969695568085, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5787 }, { "epoch": 19.00554054054054, "grad_norm": 5.074505805969238, "learning_rate": 3.125e-06, "loss": 0.0195, "step": 5788 }, { "epoch": 19.005574324324325, "grad_norm": 0.31828510761260986, "learning_rate": 3.125e-06, "loss": 0.009, "step": 5789 }, { "epoch": 19.00560810810811, "grad_norm": 0.0019130682339891791, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5790 }, { "epoch": 19.00564189189189, "grad_norm": 0.011800368316471577, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5791 }, { "epoch": 19.005675675675676, "grad_norm": 111.45338439941406, "learning_rate": 3.125e-06, "loss": 0.4166, "step": 5792 }, { "epoch": 19.00570945945946, "grad_norm": 0.0032688884530216455, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5793 }, { "epoch": 19.005743243243245, "grad_norm": 0.007522083353251219, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5794 }, { "epoch": 19.005777027027026, "grad_norm": 0.10255835950374603, "learning_rate": 3.125e-06, "loss": 0.0014, "step": 5795 }, { "epoch": 19.00581081081081, "grad_norm": 0.003521845443174243, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5796 }, { "epoch": 19.005844594594596, "grad_norm": 0.04767296090722084, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 5797 }, { "epoch": 19.005878378378377, "grad_norm": 0.002517420332878828, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5798 }, { "epoch": 19.00591216216216, "grad_norm": 0.0040013231337070465, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5799 }, { "epoch": 19.005945945945946, "grad_norm": 21.209081649780273, "learning_rate": 3.125e-06, "loss": 0.5476, "step": 5800 }, { "epoch": 19.00597972972973, "grad_norm": 0.0020897085778415203, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5801 }, { "epoch": 19.006013513513512, "grad_norm": 4.867655277252197, "learning_rate": 3.125e-06, "loss": 0.3683, "step": 5802 }, { "epoch": 19.006047297297297, "grad_norm": 0.04923569783568382, "learning_rate": 3.125e-06, "loss": 0.0007, "step": 5803 }, { "epoch": 19.00608108108108, "grad_norm": 0.0033463062718510628, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5804 }, { "epoch": 19.006114864864866, "grad_norm": 0.0027407249435782433, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5805 }, { "epoch": 19.006148648648647, "grad_norm": 0.007874956354498863, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5806 }, { "epoch": 19.006182432432432, "grad_norm": 1.7935137748718262, "learning_rate": 3.125e-06, "loss": 0.052, "step": 5807 }, { "epoch": 19.006216216216217, "grad_norm": 0.11328332126140594, "learning_rate": 3.125e-06, "loss": 0.0042, "step": 5808 }, { "epoch": 19.00625, "grad_norm": 0.05351507291197777, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 5809 }, { "epoch": 19.006283783783783, "grad_norm": 1.4963997602462769, "learning_rate": 3.125e-06, "loss": 0.0162, "step": 5810 }, { "epoch": 19.006317567567567, "grad_norm": 0.001434067147783935, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5811 }, { "epoch": 19.006351351351352, "grad_norm": 0.016054289415478706, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 5812 }, { "epoch": 19.006385135135137, "grad_norm": 0.01561115961521864, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 5813 }, { "epoch": 19.006418918918918, "grad_norm": 0.0071879783645272255, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5814 }, { "epoch": 19.006452702702703, "grad_norm": 0.002662831451743841, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5815 }, { "epoch": 19.006486486486487, "grad_norm": 0.018576500937342644, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5816 }, { "epoch": 19.006520270270272, "grad_norm": 0.009071574546396732, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5817 }, { "epoch": 19.006554054054053, "grad_norm": 0.004758299328386784, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5818 }, { "epoch": 19.006587837837838, "grad_norm": 0.1859305500984192, "learning_rate": 3.125e-06, "loss": 0.0009, "step": 5819 }, { "epoch": 19.006621621621623, "grad_norm": 0.12612438201904297, "learning_rate": 3.125e-06, "loss": 0.0046, "step": 5820 }, { "epoch": 19.006655405405404, "grad_norm": 0.0017958454554900527, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5821 }, { "epoch": 19.00668918918919, "grad_norm": 0.7652896642684937, "learning_rate": 3.125e-06, "loss": 0.0228, "step": 5822 }, { "epoch": 19.006722972972973, "grad_norm": 1.728490948677063, "learning_rate": 3.125e-06, "loss": 0.0062, "step": 5823 }, { "epoch": 19.006756756756758, "grad_norm": 0.009315239265561104, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5824 }, { "epoch": 19.00679054054054, "grad_norm": 0.001909904764033854, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5825 }, { "epoch": 19.006824324324324, "grad_norm": 0.140609011054039, "learning_rate": 3.125e-06, "loss": 0.0016, "step": 5826 }, { "epoch": 19.00685810810811, "grad_norm": 0.4972599446773529, "learning_rate": 3.125e-06, "loss": 0.0031, "step": 5827 }, { "epoch": 19.006891891891893, "grad_norm": 19.57426643371582, "learning_rate": 3.125e-06, "loss": 0.5183, "step": 5828 }, { "epoch": 19.006925675675674, "grad_norm": 0.0063064307905733585, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5829 }, { "epoch": 19.00695945945946, "grad_norm": 17.64139747619629, "learning_rate": 3.125e-06, "loss": 0.0931, "step": 5830 }, { "epoch": 19.006993243243244, "grad_norm": 0.0438225232064724, "learning_rate": 3.125e-06, "loss": 0.0008, "step": 5831 }, { "epoch": 19.00702702702703, "grad_norm": 0.0022484352812170982, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5832 }, { "epoch": 19.00706081081081, "grad_norm": 1.1483426094055176, "learning_rate": 3.125e-06, "loss": 0.0151, "step": 5833 }, { "epoch": 19.007094594594594, "grad_norm": 3.308156728744507, "learning_rate": 3.125e-06, "loss": 0.4397, "step": 5834 }, { "epoch": 19.00712837837838, "grad_norm": 0.003761749481782317, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5835 }, { "epoch": 19.007162162162164, "grad_norm": 0.001217405078932643, "learning_rate": 3.125e-06, "loss": 0.0, "step": 5836 }, { "epoch": 19.007195945945945, "grad_norm": 0.01623525097966194, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 5837 }, { "epoch": 19.00722972972973, "grad_norm": 0.0008789841085672379, "learning_rate": 3.125e-06, "loss": 0.0, "step": 5838 }, { "epoch": 19.007263513513514, "grad_norm": 0.0011298658791929483, "learning_rate": 3.125e-06, "loss": 0.0, "step": 5839 }, { "epoch": 19.007297297297296, "grad_norm": 0.005217809695750475, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5840 }, { "epoch": 19.00733108108108, "grad_norm": 0.017832184210419655, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 5841 }, { "epoch": 19.007364864864865, "grad_norm": 0.0024569183588027954, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5842 }, { "epoch": 19.00739864864865, "grad_norm": 6.551516532897949, "learning_rate": 3.125e-06, "loss": 0.3506, "step": 5843 }, { "epoch": 19.00743243243243, "grad_norm": 0.0023164518643170595, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5844 }, { "epoch": 19.007466216216216, "grad_norm": 0.0014618647983297706, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5845 }, { "epoch": 19.0075, "grad_norm": 0.004426710307598114, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5846 }, { "epoch": 19.007533783783785, "grad_norm": 0.008821259252727032, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5847 }, { "epoch": 19.007567567567566, "grad_norm": 0.002978474134579301, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5848 }, { "epoch": 19.00760135135135, "grad_norm": 0.007666551973670721, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5849 }, { "epoch": 19.007635135135136, "grad_norm": 0.017497433349490166, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 5850 }, { "epoch": 19.00766891891892, "grad_norm": 0.04479021951556206, "learning_rate": 3.125e-06, "loss": 0.0008, "step": 5851 }, { "epoch": 19.0077027027027, "grad_norm": 0.006973946932703257, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5852 }, { "epoch": 19.007736486486486, "grad_norm": 0.0018111872486770153, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5853 }, { "epoch": 19.00777027027027, "grad_norm": 11.179483413696289, "learning_rate": 3.125e-06, "loss": 0.7531, "step": 5854 }, { "epoch": 19.007804054054056, "grad_norm": 0.00364773441106081, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5855 }, { "epoch": 19.007837837837837, "grad_norm": 0.00194218335673213, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5856 }, { "epoch": 19.00787162162162, "grad_norm": 6.568029880523682, "learning_rate": 3.125e-06, "loss": 0.7921, "step": 5857 }, { "epoch": 19.007905405405406, "grad_norm": 0.10839320719242096, "learning_rate": 3.125e-06, "loss": 0.004, "step": 5858 }, { "epoch": 19.00793918918919, "grad_norm": 0.15042348206043243, "learning_rate": 3.125e-06, "loss": 0.0055, "step": 5859 }, { "epoch": 19.007972972972972, "grad_norm": 0.006650073453783989, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5860 }, { "epoch": 19.008006756756757, "grad_norm": 0.0036442442797124386, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5861 }, { "epoch": 19.00804054054054, "grad_norm": 0.002142476150766015, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5862 }, { "epoch": 19.008074324324323, "grad_norm": 11.734651565551758, "learning_rate": 3.125e-06, "loss": 0.4261, "step": 5863 }, { "epoch": 19.008108108108107, "grad_norm": 75.10063934326172, "learning_rate": 3.125e-06, "loss": 0.5216, "step": 5864 }, { "epoch": 19.008141891891892, "grad_norm": 0.0031418288126587868, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5865 }, { "epoch": 19.008175675675677, "grad_norm": 0.8048756122589111, "learning_rate": 3.125e-06, "loss": 0.0319, "step": 5866 }, { "epoch": 19.008209459459458, "grad_norm": 0.0046356250531971455, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5867 }, { "epoch": 19.008243243243243, "grad_norm": 0.006290269084274769, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5868 }, { "epoch": 19.008277027027027, "grad_norm": 0.0018068996723741293, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5869 }, { "epoch": 19.008310810810812, "grad_norm": 0.0037912088446319103, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5870 }, { "epoch": 19.008344594594593, "grad_norm": 0.015777209773659706, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5871 }, { "epoch": 19.008378378378378, "grad_norm": 64.04732513427734, "learning_rate": 3.125e-06, "loss": 0.1613, "step": 5872 }, { "epoch": 19.008412162162163, "grad_norm": 0.01886082999408245, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5873 }, { "epoch": 19.008445945945947, "grad_norm": 9.695552825927734, "learning_rate": 3.125e-06, "loss": 0.1565, "step": 5874 }, { "epoch": 19.00847972972973, "grad_norm": 16.941444396972656, "learning_rate": 3.125e-06, "loss": 0.0389, "step": 5875 }, { "epoch": 19.008513513513513, "grad_norm": 0.0032394935842603445, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5876 }, { "epoch": 19.008547297297298, "grad_norm": 0.009612558409571648, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5877 }, { "epoch": 19.008581081081083, "grad_norm": 0.0037123835645616055, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5878 }, { "epoch": 19.008614864864864, "grad_norm": 0.002257057698443532, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5879 }, { "epoch": 19.00864864864865, "grad_norm": 0.002837443258613348, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5880 }, { "epoch": 19.008682432432433, "grad_norm": 0.03241979703307152, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 5881 }, { "epoch": 19.008716216216218, "grad_norm": 0.007753712125122547, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5882 }, { "epoch": 19.00875, "grad_norm": 0.0030991453677415848, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5883 }, { "epoch": 19.008783783783784, "grad_norm": 0.2295122891664505, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 5884 }, { "epoch": 19.00881756756757, "grad_norm": 4.0709547996521, "learning_rate": 3.125e-06, "loss": 0.0172, "step": 5885 }, { "epoch": 19.00885135135135, "grad_norm": 0.15237925946712494, "learning_rate": 3.125e-06, "loss": 0.0058, "step": 5886 }, { "epoch": 19.008885135135134, "grad_norm": 0.005729808937758207, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5887 }, { "epoch": 19.00891891891892, "grad_norm": 0.008884409442543983, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5888 }, { "epoch": 19.008952702702704, "grad_norm": 0.0013652611523866653, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5889 }, { "epoch": 19.008986486486485, "grad_norm": 0.020695630460977554, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5890 }, { "epoch": 19.00902027027027, "grad_norm": 0.7592935562133789, "learning_rate": 3.125e-06, "loss": 0.0265, "step": 5891 }, { "epoch": 19.009054054054054, "grad_norm": 0.052753232419490814, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 5892 }, { "epoch": 19.00908783783784, "grad_norm": 0.662564754486084, "learning_rate": 3.125e-06, "loss": 0.0065, "step": 5893 }, { "epoch": 19.00912162162162, "grad_norm": 9.553217887878418, "learning_rate": 3.125e-06, "loss": 0.9539, "step": 5894 }, { "epoch": 19.009155405405405, "grad_norm": 0.03730585798621178, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 5895 }, { "epoch": 19.00918918918919, "grad_norm": 14.65488052368164, "learning_rate": 3.125e-06, "loss": 0.1395, "step": 5896 }, { "epoch": 19.009222972972974, "grad_norm": 0.0015640354249626398, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5897 }, { "epoch": 19.009256756756756, "grad_norm": 0.0177916307002306, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 5898 }, { "epoch": 19.00929054054054, "grad_norm": 0.12813277542591095, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 5899 }, { "epoch": 19.009324324324325, "grad_norm": 0.08352607488632202, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 5900 }, { "epoch": 19.00935810810811, "grad_norm": 0.006943795830011368, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5901 }, { "epoch": 19.00939189189189, "grad_norm": 0.002120167249813676, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5902 }, { "epoch": 19.009425675675676, "grad_norm": 0.005002297926694155, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5903 }, { "epoch": 19.00945945945946, "grad_norm": 0.7449630498886108, "learning_rate": 3.125e-06, "loss": 0.0236, "step": 5904 }, { "epoch": 19.00949324324324, "grad_norm": 0.0013524320675060153, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5905 }, { "epoch": 19.009527027027026, "grad_norm": 0.031823381781578064, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 5906 }, { "epoch": 19.00956081081081, "grad_norm": 0.01952996477484703, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5907 }, { "epoch": 19.009594594594596, "grad_norm": 0.01430537924170494, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 5908 }, { "epoch": 19.009628378378377, "grad_norm": 21.83749771118164, "learning_rate": 3.125e-06, "loss": 0.0773, "step": 5909 }, { "epoch": 19.00966216216216, "grad_norm": 0.012954726815223694, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 5910 }, { "epoch": 19.009695945945946, "grad_norm": 3.0131280422210693, "learning_rate": 3.125e-06, "loss": 0.0205, "step": 5911 }, { "epoch": 19.00972972972973, "grad_norm": 0.5303777456283569, "learning_rate": 3.125e-06, "loss": 0.0085, "step": 5912 }, { "epoch": 19.009763513513512, "grad_norm": 0.007947561331093311, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5913 }, { "epoch": 19.009797297297297, "grad_norm": 0.01598992384970188, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5914 }, { "epoch": 19.00983108108108, "grad_norm": 0.012131324969232082, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5915 }, { "epoch": 19.009864864864866, "grad_norm": 0.007626494858413935, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5916 }, { "epoch": 19.009898648648647, "grad_norm": 0.15588432550430298, "learning_rate": 3.125e-06, "loss": 0.0057, "step": 5917 }, { "epoch": 19.009932432432432, "grad_norm": 0.9402834177017212, "learning_rate": 3.125e-06, "loss": 0.0099, "step": 5918 }, { "epoch": 19.009966216216217, "grad_norm": 0.032747022807598114, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 5919 }, { "epoch": 19.01, "grad_norm": 0.005064574535936117, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5920 }, { "epoch": 19.01, "eval_accuracy": 0.8982229402261712, "eval_loss": 0.5372304320335388, "eval_runtime": 31.9565, "eval_samples_per_second": 19.37, "eval_steps_per_second": 2.441, "step": 5920 }, { "epoch": 20.000033783783785, "grad_norm": 0.0031984334345906973, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5921 }, { "epoch": 20.000067567567566, "grad_norm": 0.014314466156065464, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 5922 }, { "epoch": 20.00010135135135, "grad_norm": 0.004208180587738752, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5923 }, { "epoch": 20.000135135135135, "grad_norm": 16.038318634033203, "learning_rate": 3.125e-06, "loss": 0.2718, "step": 5924 }, { "epoch": 20.00016891891892, "grad_norm": 0.0014087847666814923, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5925 }, { "epoch": 20.0002027027027, "grad_norm": 0.061499983072280884, "learning_rate": 3.125e-06, "loss": 0.0011, "step": 5926 }, { "epoch": 20.000236486486486, "grad_norm": 0.0029153611976653337, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5927 }, { "epoch": 20.00027027027027, "grad_norm": 0.7589055895805359, "learning_rate": 3.125e-06, "loss": 0.0142, "step": 5928 }, { "epoch": 20.000304054054055, "grad_norm": 54.959449768066406, "learning_rate": 3.125e-06, "loss": 0.4299, "step": 5929 }, { "epoch": 20.000337837837836, "grad_norm": 0.007041578181087971, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5930 }, { "epoch": 20.00037162162162, "grad_norm": 0.13534902036190033, "learning_rate": 3.125e-06, "loss": 0.005, "step": 5931 }, { "epoch": 20.000405405405406, "grad_norm": 0.034556735306978226, "learning_rate": 3.125e-06, "loss": 0.0007, "step": 5932 }, { "epoch": 20.00043918918919, "grad_norm": 0.0040330300107598305, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5933 }, { "epoch": 20.00047297297297, "grad_norm": 0.0021542366594076157, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5934 }, { "epoch": 20.000506756756756, "grad_norm": 0.17456449568271637, "learning_rate": 3.125e-06, "loss": 0.0066, "step": 5935 }, { "epoch": 20.00054054054054, "grad_norm": 0.8462532758712769, "learning_rate": 3.125e-06, "loss": 0.0026, "step": 5936 }, { "epoch": 20.000574324324326, "grad_norm": 0.29842543601989746, "learning_rate": 3.125e-06, "loss": 0.0024, "step": 5937 }, { "epoch": 20.000608108108107, "grad_norm": 0.002802501665428281, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5938 }, { "epoch": 20.000641891891892, "grad_norm": 0.002235695719718933, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5939 }, { "epoch": 20.000675675675677, "grad_norm": 0.005168780218809843, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5940 }, { "epoch": 20.00070945945946, "grad_norm": 0.006299460306763649, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5941 }, { "epoch": 20.000743243243242, "grad_norm": 0.28718453645706177, "learning_rate": 3.125e-06, "loss": 0.0011, "step": 5942 }, { "epoch": 20.000777027027027, "grad_norm": 0.008894680999219418, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5943 }, { "epoch": 20.000810810810812, "grad_norm": 36.21186447143555, "learning_rate": 3.125e-06, "loss": 0.5955, "step": 5944 }, { "epoch": 20.000844594594593, "grad_norm": 0.004189350642263889, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5945 }, { "epoch": 20.000878378378378, "grad_norm": 0.002128296997398138, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5946 }, { "epoch": 20.000912162162162, "grad_norm": 0.1540898084640503, "learning_rate": 3.125e-06, "loss": 0.0013, "step": 5947 }, { "epoch": 20.000945945945947, "grad_norm": 0.0274372361600399, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 5948 }, { "epoch": 20.00097972972973, "grad_norm": 0.09294416010379791, "learning_rate": 3.125e-06, "loss": 0.0026, "step": 5949 }, { "epoch": 20.001013513513513, "grad_norm": 0.0017553342040628195, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5950 }, { "epoch": 20.001047297297298, "grad_norm": 0.012934430502355099, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 5951 }, { "epoch": 20.001081081081082, "grad_norm": 0.30420759320259094, "learning_rate": 3.125e-06, "loss": 0.0013, "step": 5952 }, { "epoch": 20.001114864864864, "grad_norm": 0.12346427142620087, "learning_rate": 3.125e-06, "loss": 0.0041, "step": 5953 }, { "epoch": 20.00114864864865, "grad_norm": 0.008236492983996868, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5954 }, { "epoch": 20.001182432432433, "grad_norm": 0.007666994351893663, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5955 }, { "epoch": 20.001216216216218, "grad_norm": 3.740659475326538, "learning_rate": 3.125e-06, "loss": 0.338, "step": 5956 }, { "epoch": 20.00125, "grad_norm": 2.1888251304626465, "learning_rate": 3.125e-06, "loss": 0.033, "step": 5957 }, { "epoch": 20.001283783783784, "grad_norm": 0.003553691552951932, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5958 }, { "epoch": 20.00131756756757, "grad_norm": 0.0012989618116989732, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5959 }, { "epoch": 20.001351351351353, "grad_norm": 0.001929356367327273, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5960 }, { "epoch": 20.001385135135134, "grad_norm": 3.3176627159118652, "learning_rate": 3.125e-06, "loss": 0.4242, "step": 5961 }, { "epoch": 20.00141891891892, "grad_norm": 0.0027592855039983988, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5962 }, { "epoch": 20.001452702702704, "grad_norm": 0.0032508557196706533, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5963 }, { "epoch": 20.001486486486485, "grad_norm": 0.0023298829328268766, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5964 }, { "epoch": 20.00152027027027, "grad_norm": 0.0019121826626360416, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5965 }, { "epoch": 20.001554054054054, "grad_norm": 0.0008033044287003577, "learning_rate": 3.125e-06, "loss": 0.0, "step": 5966 }, { "epoch": 20.00158783783784, "grad_norm": 0.003275585826486349, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5967 }, { "epoch": 20.00162162162162, "grad_norm": 0.021949075162410736, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 5968 }, { "epoch": 20.001655405405405, "grad_norm": 0.001949469791725278, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5969 }, { "epoch": 20.00168918918919, "grad_norm": 0.059205878525972366, "learning_rate": 3.125e-06, "loss": 0.0008, "step": 5970 }, { "epoch": 20.001722972972974, "grad_norm": 0.002138095209375024, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5971 }, { "epoch": 20.001756756756755, "grad_norm": 0.025455161929130554, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 5972 }, { "epoch": 20.00179054054054, "grad_norm": 0.0021695257164537907, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5973 }, { "epoch": 20.001824324324325, "grad_norm": 0.0008042330155149102, "learning_rate": 3.125e-06, "loss": 0.0, "step": 5974 }, { "epoch": 20.00185810810811, "grad_norm": 2.487604856491089, "learning_rate": 3.125e-06, "loss": 0.0302, "step": 5975 }, { "epoch": 20.00189189189189, "grad_norm": 0.7195004224777222, "learning_rate": 3.125e-06, "loss": 0.0168, "step": 5976 }, { "epoch": 20.001925675675675, "grad_norm": 12.382259368896484, "learning_rate": 3.125e-06, "loss": 0.2866, "step": 5977 }, { "epoch": 20.00195945945946, "grad_norm": 6.210673809051514, "learning_rate": 3.125e-06, "loss": 0.0466, "step": 5978 }, { "epoch": 20.001993243243245, "grad_norm": 0.13755863904953003, "learning_rate": 3.125e-06, "loss": 0.0051, "step": 5979 }, { "epoch": 20.002027027027026, "grad_norm": 0.1384689211845398, "learning_rate": 3.125e-06, "loss": 0.0013, "step": 5980 }, { "epoch": 20.00206081081081, "grad_norm": 0.002926051151007414, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5981 }, { "epoch": 20.002094594594595, "grad_norm": 0.012678239494562149, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 5982 }, { "epoch": 20.00212837837838, "grad_norm": 40.697330474853516, "learning_rate": 3.125e-06, "loss": 0.7915, "step": 5983 }, { "epoch": 20.00216216216216, "grad_norm": 0.011990496888756752, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 5984 }, { "epoch": 20.002195945945946, "grad_norm": 0.2028229981660843, "learning_rate": 3.125e-06, "loss": 0.0036, "step": 5985 }, { "epoch": 20.00222972972973, "grad_norm": 5.241381645202637, "learning_rate": 3.125e-06, "loss": 0.3645, "step": 5986 }, { "epoch": 20.002263513513512, "grad_norm": 0.005352843087166548, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5987 }, { "epoch": 20.002297297297297, "grad_norm": 0.0023029909934848547, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5988 }, { "epoch": 20.00233108108108, "grad_norm": 0.002100689336657524, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5989 }, { "epoch": 20.002364864864866, "grad_norm": 0.003386547788977623, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5990 }, { "epoch": 20.002398648648647, "grad_norm": 0.21462340652942657, "learning_rate": 3.125e-06, "loss": 0.0009, "step": 5991 }, { "epoch": 20.002432432432432, "grad_norm": 0.1548611968755722, "learning_rate": 3.125e-06, "loss": 0.0058, "step": 5992 }, { "epoch": 20.002466216216217, "grad_norm": 0.007644061930477619, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5993 }, { "epoch": 20.0025, "grad_norm": 0.003935737069696188, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5994 }, { "epoch": 20.002533783783782, "grad_norm": 0.0017927272710949183, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5995 }, { "epoch": 20.002567567567567, "grad_norm": 29.332836151123047, "learning_rate": 3.125e-06, "loss": 0.102, "step": 5996 }, { "epoch": 20.002601351351352, "grad_norm": 0.004399003926664591, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 5997 }, { "epoch": 20.002635135135137, "grad_norm": 3.3110435009002686, "learning_rate": 3.125e-06, "loss": 0.017, "step": 5998 }, { "epoch": 20.002668918918918, "grad_norm": 0.01698857918381691, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 5999 }, { "epoch": 20.002702702702702, "grad_norm": 0.28531211614608765, "learning_rate": 3.125e-06, "loss": 0.0077, "step": 6000 }, { "epoch": 20.002736486486487, "grad_norm": 0.005111262667924166, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6001 }, { "epoch": 20.002770270270272, "grad_norm": 0.018290603533387184, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 6002 }, { "epoch": 20.002804054054053, "grad_norm": 0.0009949345840141177, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6003 }, { "epoch": 20.002837837837838, "grad_norm": 44.14219665527344, "learning_rate": 3.125e-06, "loss": 0.1369, "step": 6004 }, { "epoch": 20.002871621621622, "grad_norm": 0.0051893629133701324, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6005 }, { "epoch": 20.002905405405407, "grad_norm": 0.05689926818013191, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 6006 }, { "epoch": 20.00293918918919, "grad_norm": 0.004786766599863768, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6007 }, { "epoch": 20.002972972972973, "grad_norm": 0.08803321421146393, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 6008 }, { "epoch": 20.003006756756758, "grad_norm": 0.0038933490868657827, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6009 }, { "epoch": 20.00304054054054, "grad_norm": 0.0016426661750301719, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6010 }, { "epoch": 20.003074324324324, "grad_norm": 0.14240676164627075, "learning_rate": 3.125e-06, "loss": 0.0055, "step": 6011 }, { "epoch": 20.00310810810811, "grad_norm": 0.2620672285556793, "learning_rate": 3.125e-06, "loss": 0.0015, "step": 6012 }, { "epoch": 20.003141891891893, "grad_norm": 0.005679963156580925, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6013 }, { "epoch": 20.003175675675674, "grad_norm": 0.003552322741597891, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6014 }, { "epoch": 20.00320945945946, "grad_norm": 0.005901510361582041, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6015 }, { "epoch": 20.003243243243244, "grad_norm": 0.14972688257694244, "learning_rate": 3.125e-06, "loss": 0.0058, "step": 6016 }, { "epoch": 20.00327702702703, "grad_norm": 0.005542931146919727, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6017 }, { "epoch": 20.00331081081081, "grad_norm": 0.0030485205352306366, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6018 }, { "epoch": 20.003344594594594, "grad_norm": 0.020227570086717606, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 6019 }, { "epoch": 20.00337837837838, "grad_norm": 0.00430562999099493, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6020 }, { "epoch": 20.003412162162164, "grad_norm": 0.005051612854003906, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6021 }, { "epoch": 20.003445945945945, "grad_norm": 7.624325275421143, "learning_rate": 3.125e-06, "loss": 0.0217, "step": 6022 }, { "epoch": 20.00347972972973, "grad_norm": 0.0061437636613845825, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6023 }, { "epoch": 20.003513513513514, "grad_norm": 0.025669846683740616, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 6024 }, { "epoch": 20.0035472972973, "grad_norm": 0.017197581008076668, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6025 }, { "epoch": 20.00358108108108, "grad_norm": 0.0011383539531379938, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6026 }, { "epoch": 20.003614864864865, "grad_norm": 0.0012125609209761024, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6027 }, { "epoch": 20.00364864864865, "grad_norm": 0.30362728238105774, "learning_rate": 3.125e-06, "loss": 0.0009, "step": 6028 }, { "epoch": 20.00368243243243, "grad_norm": 0.006255457643419504, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6029 }, { "epoch": 20.003716216216215, "grad_norm": 0.026833467185497284, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 6030 }, { "epoch": 20.00375, "grad_norm": 0.023693371564149857, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6031 }, { "epoch": 20.003783783783785, "grad_norm": 0.006135558243840933, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6032 }, { "epoch": 20.003817567567566, "grad_norm": 0.004696815740317106, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6033 }, { "epoch": 20.00385135135135, "grad_norm": 0.036500945687294006, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 6034 }, { "epoch": 20.003885135135135, "grad_norm": 0.039515502750873566, "learning_rate": 3.125e-06, "loss": 0.001, "step": 6035 }, { "epoch": 20.00391891891892, "grad_norm": 0.005547904875129461, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6036 }, { "epoch": 20.0039527027027, "grad_norm": 0.009679918177425861, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6037 }, { "epoch": 20.003986486486486, "grad_norm": 0.006794276647269726, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6038 }, { "epoch": 20.00402027027027, "grad_norm": 0.0032288627699017525, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6039 }, { "epoch": 20.004054054054055, "grad_norm": 0.0019902000203728676, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6040 }, { "epoch": 20.004087837837837, "grad_norm": 0.030043436214327812, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6041 }, { "epoch": 20.00412162162162, "grad_norm": 32.9473762512207, "learning_rate": 3.125e-06, "loss": 0.0413, "step": 6042 }, { "epoch": 20.004155405405406, "grad_norm": 0.002859950065612793, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6043 }, { "epoch": 20.00418918918919, "grad_norm": 0.16144531965255737, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 6044 }, { "epoch": 20.004222972972972, "grad_norm": 0.0682632327079773, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 6045 }, { "epoch": 20.004256756756757, "grad_norm": 0.0023100273683667183, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6046 }, { "epoch": 20.00429054054054, "grad_norm": 0.5387685894966125, "learning_rate": 3.125e-06, "loss": 0.016, "step": 6047 }, { "epoch": 20.004324324324326, "grad_norm": 25.06048011779785, "learning_rate": 3.125e-06, "loss": 0.5073, "step": 6048 }, { "epoch": 20.004358108108107, "grad_norm": 0.004027441143989563, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6049 }, { "epoch": 20.004391891891892, "grad_norm": 0.005498509854078293, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6050 }, { "epoch": 20.004425675675677, "grad_norm": 0.003945914562791586, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6051 }, { "epoch": 20.004459459459458, "grad_norm": 0.4687178134918213, "learning_rate": 3.125e-06, "loss": 0.0013, "step": 6052 }, { "epoch": 20.004493243243243, "grad_norm": 39.08582305908203, "learning_rate": 3.125e-06, "loss": 0.1035, "step": 6053 }, { "epoch": 20.004527027027027, "grad_norm": 0.005812529940158129, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6054 }, { "epoch": 20.004560810810812, "grad_norm": 0.022025926038622856, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 6055 }, { "epoch": 20.004594594594593, "grad_norm": 14.347614288330078, "learning_rate": 3.125e-06, "loss": 0.0893, "step": 6056 }, { "epoch": 20.004628378378378, "grad_norm": 0.009465169161558151, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6057 }, { "epoch": 20.004662162162163, "grad_norm": 12.967144012451172, "learning_rate": 3.125e-06, "loss": 0.7984, "step": 6058 }, { "epoch": 20.004695945945947, "grad_norm": 0.0017797318287193775, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6059 }, { "epoch": 20.00472972972973, "grad_norm": 0.011171304620802402, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6060 }, { "epoch": 20.004763513513513, "grad_norm": 0.017143135890364647, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6061 }, { "epoch": 20.004797297297298, "grad_norm": 0.003928487654775381, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6062 }, { "epoch": 20.004831081081083, "grad_norm": 0.058333832770586014, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 6063 }, { "epoch": 20.004864864864864, "grad_norm": 0.0015024275053292513, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6064 }, { "epoch": 20.00489864864865, "grad_norm": 0.05109415203332901, "learning_rate": 3.125e-06, "loss": 0.0008, "step": 6065 }, { "epoch": 20.004932432432433, "grad_norm": 0.003777230391278863, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6066 }, { "epoch": 20.004966216216218, "grad_norm": 0.001519282697699964, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6067 }, { "epoch": 20.005, "grad_norm": 0.0019312157528474927, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6068 }, { "epoch": 20.005033783783784, "grad_norm": 0.0874343290925026, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 6069 }, { "epoch": 20.00506756756757, "grad_norm": 26.953006744384766, "learning_rate": 3.125e-06, "loss": 0.0844, "step": 6070 }, { "epoch": 20.00510135135135, "grad_norm": 0.04271981120109558, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 6071 }, { "epoch": 20.005135135135134, "grad_norm": 0.0016917664324864745, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6072 }, { "epoch": 20.00516891891892, "grad_norm": 0.003977862652391195, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6073 }, { "epoch": 20.005202702702704, "grad_norm": 0.0021088600624352694, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6074 }, { "epoch": 20.005236486486485, "grad_norm": 5.135536193847656, "learning_rate": 3.125e-06, "loss": 0.0631, "step": 6075 }, { "epoch": 20.00527027027027, "grad_norm": 0.004463047254830599, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6076 }, { "epoch": 20.005304054054054, "grad_norm": 0.12388350069522858, "learning_rate": 3.125e-06, "loss": 0.0032, "step": 6077 }, { "epoch": 20.00533783783784, "grad_norm": 1.7870213985443115, "learning_rate": 3.125e-06, "loss": 0.0038, "step": 6078 }, { "epoch": 20.00537162162162, "grad_norm": 0.031121106818318367, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6079 }, { "epoch": 20.005405405405405, "grad_norm": 0.004618913400918245, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6080 }, { "epoch": 20.00543918918919, "grad_norm": 0.0018099916633218527, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6081 }, { "epoch": 20.005472972972974, "grad_norm": 0.0020934545900672674, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6082 }, { "epoch": 20.005506756756755, "grad_norm": 0.002609138609841466, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6083 }, { "epoch": 20.00554054054054, "grad_norm": 0.0018990365788340569, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6084 }, { "epoch": 20.005574324324325, "grad_norm": 0.002647630637511611, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6085 }, { "epoch": 20.00560810810811, "grad_norm": 0.19621752202510834, "learning_rate": 3.125e-06, "loss": 0.0009, "step": 6086 }, { "epoch": 20.00564189189189, "grad_norm": 0.00326128164306283, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6087 }, { "epoch": 20.005675675675676, "grad_norm": 0.011591702699661255, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6088 }, { "epoch": 20.00570945945946, "grad_norm": 0.06474150717258453, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 6089 }, { "epoch": 20.005743243243245, "grad_norm": 0.04225614294409752, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 6090 }, { "epoch": 20.005777027027026, "grad_norm": 0.001490467693656683, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6091 }, { "epoch": 20.00581081081081, "grad_norm": 0.017384575679898262, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6092 }, { "epoch": 20.005844594594596, "grad_norm": 0.2555469572544098, "learning_rate": 3.125e-06, "loss": 0.0098, "step": 6093 }, { "epoch": 20.005878378378377, "grad_norm": 0.0028449532110244036, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6094 }, { "epoch": 20.00591216216216, "grad_norm": 0.009798742830753326, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6095 }, { "epoch": 20.005945945945946, "grad_norm": 6.89240837097168, "learning_rate": 3.125e-06, "loss": 0.5259, "step": 6096 }, { "epoch": 20.00597972972973, "grad_norm": 4.094933986663818, "learning_rate": 3.125e-06, "loss": 0.1953, "step": 6097 }, { "epoch": 20.006013513513512, "grad_norm": 0.002100719138979912, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6098 }, { "epoch": 20.006047297297297, "grad_norm": 0.00495041674003005, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6099 }, { "epoch": 20.00608108108108, "grad_norm": 0.013409001752734184, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6100 }, { "epoch": 20.006114864864866, "grad_norm": 0.004817471839487553, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6101 }, { "epoch": 20.006148648648647, "grad_norm": 0.01457239780575037, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6102 }, { "epoch": 20.006182432432432, "grad_norm": 0.0037036407738924026, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6103 }, { "epoch": 20.006216216216217, "grad_norm": 0.02637886069715023, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6104 }, { "epoch": 20.00625, "grad_norm": 0.013403069227933884, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6105 }, { "epoch": 20.006283783783783, "grad_norm": 1.5501497983932495, "learning_rate": 3.125e-06, "loss": 0.0332, "step": 6106 }, { "epoch": 20.006317567567567, "grad_norm": 0.001707589253783226, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6107 }, { "epoch": 20.006351351351352, "grad_norm": 0.001488235080614686, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6108 }, { "epoch": 20.006385135135137, "grad_norm": 0.04772495478391647, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 6109 }, { "epoch": 20.006418918918918, "grad_norm": 0.004277093335986137, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6110 }, { "epoch": 20.006452702702703, "grad_norm": 0.002050521783530712, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6111 }, { "epoch": 20.006486486486487, "grad_norm": 0.002148660831153393, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6112 }, { "epoch": 20.006520270270272, "grad_norm": 0.012207250110805035, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6113 }, { "epoch": 20.006554054054053, "grad_norm": 0.0033463030122220516, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6114 }, { "epoch": 20.006587837837838, "grad_norm": 0.0032296190038323402, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6115 }, { "epoch": 20.006621621621623, "grad_norm": 0.02063279040157795, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6116 }, { "epoch": 20.006655405405404, "grad_norm": 0.08057605475187302, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 6117 }, { "epoch": 20.00668918918919, "grad_norm": 0.002821562811732292, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6118 }, { "epoch": 20.006722972972973, "grad_norm": 0.13354673981666565, "learning_rate": 3.125e-06, "loss": 0.0051, "step": 6119 }, { "epoch": 20.006756756756758, "grad_norm": 0.006268047261983156, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6120 }, { "epoch": 20.00679054054054, "grad_norm": 55.92080307006836, "learning_rate": 3.125e-06, "loss": 0.7196, "step": 6121 }, { "epoch": 20.006824324324324, "grad_norm": 0.013048099353909492, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6122 }, { "epoch": 20.00685810810811, "grad_norm": 0.00567180709913373, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6123 }, { "epoch": 20.006891891891893, "grad_norm": 1.918643832206726, "learning_rate": 3.125e-06, "loss": 0.0335, "step": 6124 }, { "epoch": 20.006925675675674, "grad_norm": 0.010547625832259655, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6125 }, { "epoch": 20.00695945945946, "grad_norm": 0.0012879424029961228, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6126 }, { "epoch": 20.006993243243244, "grad_norm": 1.7642897367477417, "learning_rate": 3.125e-06, "loss": 0.0288, "step": 6127 }, { "epoch": 20.00702702702703, "grad_norm": 0.01377816777676344, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6128 }, { "epoch": 20.00706081081081, "grad_norm": 0.0017686127685010433, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6129 }, { "epoch": 20.007094594594594, "grad_norm": 0.36737099289894104, "learning_rate": 3.125e-06, "loss": 0.0105, "step": 6130 }, { "epoch": 20.00712837837838, "grad_norm": 1.0438367128372192, "learning_rate": 3.125e-06, "loss": 0.0027, "step": 6131 }, { "epoch": 20.007162162162164, "grad_norm": 8.50983715057373, "learning_rate": 3.125e-06, "loss": 0.291, "step": 6132 }, { "epoch": 20.007195945945945, "grad_norm": 0.163718581199646, "learning_rate": 3.125e-06, "loss": 0.0014, "step": 6133 }, { "epoch": 20.00722972972973, "grad_norm": 0.005946979857981205, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6134 }, { "epoch": 20.007263513513514, "grad_norm": 0.003495968645438552, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6135 }, { "epoch": 20.007297297297296, "grad_norm": 0.0015410037012770772, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6136 }, { "epoch": 20.00733108108108, "grad_norm": 0.012003994546830654, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6137 }, { "epoch": 20.007364864864865, "grad_norm": 4.011589050292969, "learning_rate": 3.125e-06, "loss": 0.0226, "step": 6138 }, { "epoch": 20.00739864864865, "grad_norm": 0.14675718545913696, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 6139 }, { "epoch": 20.00743243243243, "grad_norm": 3.5218167304992676, "learning_rate": 3.125e-06, "loss": 0.1973, "step": 6140 }, { "epoch": 20.007466216216216, "grad_norm": 7.318106651306152, "learning_rate": 3.125e-06, "loss": 0.3171, "step": 6141 }, { "epoch": 20.0075, "grad_norm": 0.03158298134803772, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 6142 }, { "epoch": 20.007533783783785, "grad_norm": 0.0036787635181099176, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6143 }, { "epoch": 20.007567567567566, "grad_norm": 0.0032567211892455816, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6144 }, { "epoch": 20.00760135135135, "grad_norm": 0.003495384007692337, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6145 }, { "epoch": 20.007635135135136, "grad_norm": 0.0030346170533448458, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6146 }, { "epoch": 20.00766891891892, "grad_norm": 0.08306758105754852, "learning_rate": 3.125e-06, "loss": 0.0007, "step": 6147 }, { "epoch": 20.0077027027027, "grad_norm": 0.8563951849937439, "learning_rate": 3.125e-06, "loss": 0.0195, "step": 6148 }, { "epoch": 20.007736486486486, "grad_norm": 0.004438839852809906, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6149 }, { "epoch": 20.00777027027027, "grad_norm": 0.07889803498983383, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 6150 }, { "epoch": 20.007804054054056, "grad_norm": 0.0024029614869505167, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6151 }, { "epoch": 20.007837837837837, "grad_norm": 0.039702944457530975, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6152 }, { "epoch": 20.00787162162162, "grad_norm": 0.002373743336647749, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6153 }, { "epoch": 20.007905405405406, "grad_norm": 0.00946226716041565, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6154 }, { "epoch": 20.00793918918919, "grad_norm": 0.13076238334178925, "learning_rate": 3.125e-06, "loss": 0.0007, "step": 6155 }, { "epoch": 20.007972972972972, "grad_norm": 0.09814740717411041, "learning_rate": 3.125e-06, "loss": 0.0026, "step": 6156 }, { "epoch": 20.008006756756757, "grad_norm": 0.002641798695549369, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6157 }, { "epoch": 20.00804054054054, "grad_norm": 0.0020396115723997355, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6158 }, { "epoch": 20.008074324324323, "grad_norm": 0.007178288418799639, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6159 }, { "epoch": 20.008108108108107, "grad_norm": 0.00827743113040924, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6160 }, { "epoch": 20.008141891891892, "grad_norm": 0.0013193782651796937, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6161 }, { "epoch": 20.008175675675677, "grad_norm": 0.0890795961022377, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 6162 }, { "epoch": 20.008209459459458, "grad_norm": 0.37480294704437256, "learning_rate": 3.125e-06, "loss": 0.0025, "step": 6163 }, { "epoch": 20.008243243243243, "grad_norm": 0.001372281345538795, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6164 }, { "epoch": 20.008277027027027, "grad_norm": 2.366615056991577, "learning_rate": 3.125e-06, "loss": 0.0532, "step": 6165 }, { "epoch": 20.008310810810812, "grad_norm": 0.8888756036758423, "learning_rate": 3.125e-06, "loss": 0.0035, "step": 6166 }, { "epoch": 20.008344594594593, "grad_norm": 1.019343614578247, "learning_rate": 3.125e-06, "loss": 0.0027, "step": 6167 }, { "epoch": 20.008378378378378, "grad_norm": 0.0023506435099989176, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6168 }, { "epoch": 20.008412162162163, "grad_norm": 0.17912745475769043, "learning_rate": 3.125e-06, "loss": 0.0016, "step": 6169 }, { "epoch": 20.008445945945947, "grad_norm": 0.004317872226238251, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6170 }, { "epoch": 20.00847972972973, "grad_norm": 0.3648601472377777, "learning_rate": 3.125e-06, "loss": 0.0014, "step": 6171 }, { "epoch": 20.008513513513513, "grad_norm": 0.007964624091982841, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6172 }, { "epoch": 20.008547297297298, "grad_norm": 0.0015865601599216461, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6173 }, { "epoch": 20.008581081081083, "grad_norm": 0.022537311539053917, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 6174 }, { "epoch": 20.008614864864864, "grad_norm": 0.12458238750696182, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 6175 }, { "epoch": 20.00864864864865, "grad_norm": 0.0033819791860878468, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6176 }, { "epoch": 20.008682432432433, "grad_norm": 18.664674758911133, "learning_rate": 3.125e-06, "loss": 0.7918, "step": 6177 }, { "epoch": 20.008716216216218, "grad_norm": 0.08858013898134232, "learning_rate": 3.125e-06, "loss": 0.0017, "step": 6178 }, { "epoch": 20.00875, "grad_norm": 0.0018463528249412775, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6179 }, { "epoch": 20.008783783783784, "grad_norm": 0.0016017996240407228, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6180 }, { "epoch": 20.00881756756757, "grad_norm": 0.003717504907399416, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6181 }, { "epoch": 20.00885135135135, "grad_norm": 0.003150715259835124, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6182 }, { "epoch": 20.008885135135134, "grad_norm": 0.02472642995417118, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6183 }, { "epoch": 20.00891891891892, "grad_norm": 0.010974645614624023, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6184 }, { "epoch": 20.008952702702704, "grad_norm": 20.503765106201172, "learning_rate": 3.125e-06, "loss": 0.4769, "step": 6185 }, { "epoch": 20.008986486486485, "grad_norm": 0.11485159397125244, "learning_rate": 3.125e-06, "loss": 0.0008, "step": 6186 }, { "epoch": 20.00902027027027, "grad_norm": 0.0018449670169502497, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6187 }, { "epoch": 20.009054054054054, "grad_norm": 0.001329201040789485, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6188 }, { "epoch": 20.00908783783784, "grad_norm": 0.009561531245708466, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6189 }, { "epoch": 20.00912162162162, "grad_norm": 0.006405401974916458, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6190 }, { "epoch": 20.009155405405405, "grad_norm": 0.0020204412285238504, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6191 }, { "epoch": 20.00918918918919, "grad_norm": 0.0018908967031165957, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6192 }, { "epoch": 20.009222972972974, "grad_norm": 0.005610856227576733, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6193 }, { "epoch": 20.009256756756756, "grad_norm": 0.004560245666652918, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6194 }, { "epoch": 20.00929054054054, "grad_norm": 0.163558229804039, "learning_rate": 3.125e-06, "loss": 0.0061, "step": 6195 }, { "epoch": 20.009324324324325, "grad_norm": 0.0040845065377652645, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6196 }, { "epoch": 20.00935810810811, "grad_norm": 56.698585510253906, "learning_rate": 3.125e-06, "loss": 0.3546, "step": 6197 }, { "epoch": 20.00939189189189, "grad_norm": 0.005862810183316469, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6198 }, { "epoch": 20.009425675675676, "grad_norm": 0.36396631598472595, "learning_rate": 3.125e-06, "loss": 0.0043, "step": 6199 }, { "epoch": 20.00945945945946, "grad_norm": 0.031472399830818176, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6200 }, { "epoch": 20.00949324324324, "grad_norm": 0.0007540663937106729, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6201 }, { "epoch": 20.009527027027026, "grad_norm": 0.26966798305511475, "learning_rate": 3.125e-06, "loss": 0.0009, "step": 6202 }, { "epoch": 20.00956081081081, "grad_norm": 0.04619956016540527, "learning_rate": 3.125e-06, "loss": 0.0007, "step": 6203 }, { "epoch": 20.009594594594596, "grad_norm": 0.008070673793554306, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6204 }, { "epoch": 20.009628378378377, "grad_norm": 0.0010000548791140318, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6205 }, { "epoch": 20.00966216216216, "grad_norm": 0.012155809439718723, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6206 }, { "epoch": 20.009695945945946, "grad_norm": 0.0014881506795063615, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6207 }, { "epoch": 20.00972972972973, "grad_norm": 0.883762001991272, "learning_rate": 3.125e-06, "loss": 0.0046, "step": 6208 }, { "epoch": 20.009763513513512, "grad_norm": 0.16451840102672577, "learning_rate": 3.125e-06, "loss": 0.0064, "step": 6209 }, { "epoch": 20.009797297297297, "grad_norm": 0.13318553566932678, "learning_rate": 3.125e-06, "loss": 0.001, "step": 6210 }, { "epoch": 20.00983108108108, "grad_norm": 0.2346818894147873, "learning_rate": 3.125e-06, "loss": 0.001, "step": 6211 }, { "epoch": 20.009864864864866, "grad_norm": 0.0027442313730716705, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6212 }, { "epoch": 20.009898648648647, "grad_norm": 0.32386884093284607, "learning_rate": 3.125e-06, "loss": 0.0013, "step": 6213 }, { "epoch": 20.009932432432432, "grad_norm": 0.0626472607254982, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 6214 }, { "epoch": 20.009966216216217, "grad_norm": 0.002004368929192424, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6215 }, { "epoch": 20.01, "grad_norm": 3.124154567718506, "learning_rate": 3.125e-06, "loss": 0.4009, "step": 6216 }, { "epoch": 20.01, "eval_accuracy": 0.8998384491114702, "eval_loss": 0.5421877503395081, "eval_runtime": 32.3998, "eval_samples_per_second": 19.105, "eval_steps_per_second": 2.407, "step": 6216 }, { "epoch": 21.000033783783785, "grad_norm": 0.0025512746069580317, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6217 }, { "epoch": 21.000067567567566, "grad_norm": 0.14403150975704193, "learning_rate": 3.125e-06, "loss": 0.0048, "step": 6218 }, { "epoch": 21.00010135135135, "grad_norm": 1.4303315877914429, "learning_rate": 3.125e-06, "loss": 0.0115, "step": 6219 }, { "epoch": 21.000135135135135, "grad_norm": 35.3403434753418, "learning_rate": 3.125e-06, "loss": 0.1875, "step": 6220 }, { "epoch": 21.00016891891892, "grad_norm": 0.08460438251495361, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6221 }, { "epoch": 21.0002027027027, "grad_norm": 4.805853366851807, "learning_rate": 3.125e-06, "loss": 0.0127, "step": 6222 }, { "epoch": 21.000236486486486, "grad_norm": 0.0035148547030985355, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6223 }, { "epoch": 21.00027027027027, "grad_norm": 0.004103355575352907, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6224 }, { "epoch": 21.000304054054055, "grad_norm": 0.016479047015309334, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6225 }, { "epoch": 21.000337837837836, "grad_norm": 0.038472384214401245, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 6226 }, { "epoch": 21.00037162162162, "grad_norm": 0.8694924116134644, "learning_rate": 3.125e-06, "loss": 0.005, "step": 6227 }, { "epoch": 21.000405405405406, "grad_norm": 0.002639228943735361, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6228 }, { "epoch": 21.00043918918919, "grad_norm": 0.002636321121826768, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6229 }, { "epoch": 21.00047297297297, "grad_norm": 0.0009935300331562757, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6230 }, { "epoch": 21.000506756756756, "grad_norm": 5.908726215362549, "learning_rate": 3.125e-06, "loss": 0.2798, "step": 6231 }, { "epoch": 21.00054054054054, "grad_norm": 0.0039406633004546165, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6232 }, { "epoch": 21.000574324324326, "grad_norm": 0.0019208381418138742, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6233 }, { "epoch": 21.000608108108107, "grad_norm": 2.261544704437256, "learning_rate": 3.125e-06, "loss": 0.0346, "step": 6234 }, { "epoch": 21.000641891891892, "grad_norm": 0.07846692949533463, "learning_rate": 3.125e-06, "loss": 0.0009, "step": 6235 }, { "epoch": 21.000675675675677, "grad_norm": 0.13554151356220245, "learning_rate": 3.125e-06, "loss": 0.0051, "step": 6236 }, { "epoch": 21.00070945945946, "grad_norm": 0.0022631254978477955, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6237 }, { "epoch": 21.000743243243242, "grad_norm": 37.36821746826172, "learning_rate": 3.125e-06, "loss": 0.2301, "step": 6238 }, { "epoch": 21.000777027027027, "grad_norm": 0.0167694091796875, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6239 }, { "epoch": 21.000810810810812, "grad_norm": 0.04659680649638176, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 6240 }, { "epoch": 21.000844594594593, "grad_norm": 0.14784392714500427, "learning_rate": 3.125e-06, "loss": 0.0008, "step": 6241 }, { "epoch": 21.000878378378378, "grad_norm": 0.184809148311615, "learning_rate": 3.125e-06, "loss": 0.0064, "step": 6242 }, { "epoch": 21.000912162162162, "grad_norm": 0.018449535593390465, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6243 }, { "epoch": 21.000945945945947, "grad_norm": 0.11217664182186127, "learning_rate": 3.125e-06, "loss": 0.0026, "step": 6244 }, { "epoch": 21.00097972972973, "grad_norm": 0.002307078568264842, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6245 }, { "epoch": 21.001013513513513, "grad_norm": 0.910245954990387, "learning_rate": 3.125e-06, "loss": 0.016, "step": 6246 }, { "epoch": 21.001047297297298, "grad_norm": 0.0016344016185030341, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6247 }, { "epoch": 21.001081081081082, "grad_norm": 19.49326515197754, "learning_rate": 3.125e-06, "loss": 0.8028, "step": 6248 }, { "epoch": 21.001114864864864, "grad_norm": 0.005725006572902203, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6249 }, { "epoch": 21.00114864864865, "grad_norm": 0.02023373357951641, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6250 }, { "epoch": 21.001182432432433, "grad_norm": 0.001234993222169578, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6251 }, { "epoch": 21.001216216216218, "grad_norm": 0.001868929946795106, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6252 }, { "epoch": 21.00125, "grad_norm": 0.023429689928889275, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6253 }, { "epoch": 21.001283783783784, "grad_norm": 1.4587218761444092, "learning_rate": 3.125e-06, "loss": 0.0225, "step": 6254 }, { "epoch": 21.00131756756757, "grad_norm": 0.1583731770515442, "learning_rate": 3.125e-06, "loss": 0.0059, "step": 6255 }, { "epoch": 21.001351351351353, "grad_norm": 0.0017599626444280148, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6256 }, { "epoch": 21.001385135135134, "grad_norm": 0.3441855311393738, "learning_rate": 3.125e-06, "loss": 0.0061, "step": 6257 }, { "epoch": 21.00141891891892, "grad_norm": 0.01903519593179226, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6258 }, { "epoch": 21.001452702702704, "grad_norm": 0.0017632127273827791, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6259 }, { "epoch": 21.001486486486485, "grad_norm": 1.8656100034713745, "learning_rate": 3.125e-06, "loss": 0.0477, "step": 6260 }, { "epoch": 21.00152027027027, "grad_norm": 0.003673405619338155, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6261 }, { "epoch": 21.001554054054054, "grad_norm": 0.16315871477127075, "learning_rate": 3.125e-06, "loss": 0.0008, "step": 6262 }, { "epoch": 21.00158783783784, "grad_norm": 0.004095831885933876, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6263 }, { "epoch": 21.00162162162162, "grad_norm": 0.04873984307050705, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6264 }, { "epoch": 21.001655405405405, "grad_norm": 0.0027706932742148638, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6265 }, { "epoch": 21.00168918918919, "grad_norm": 0.002388453111052513, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6266 }, { "epoch": 21.001722972972974, "grad_norm": 0.06911661475896835, "learning_rate": 3.125e-06, "loss": 0.0008, "step": 6267 }, { "epoch": 21.001756756756755, "grad_norm": 0.001411712379194796, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6268 }, { "epoch": 21.00179054054054, "grad_norm": 0.005651530344039202, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6269 }, { "epoch": 21.001824324324325, "grad_norm": 0.0012045524781569839, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6270 }, { "epoch": 21.00185810810811, "grad_norm": 3.345552682876587, "learning_rate": 3.125e-06, "loss": 0.3851, "step": 6271 }, { "epoch": 21.00189189189189, "grad_norm": 0.008614247664809227, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6272 }, { "epoch": 21.001925675675675, "grad_norm": 0.1535835862159729, "learning_rate": 3.125e-06, "loss": 0.0049, "step": 6273 }, { "epoch": 21.00195945945946, "grad_norm": 0.004800194408744574, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6274 }, { "epoch": 21.001993243243245, "grad_norm": 4.304607391357422, "learning_rate": 3.125e-06, "loss": 0.3236, "step": 6275 }, { "epoch": 21.002027027027026, "grad_norm": 0.1852768361568451, "learning_rate": 3.125e-06, "loss": 0.006, "step": 6276 }, { "epoch": 21.00206081081081, "grad_norm": 0.004634973127394915, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6277 }, { "epoch": 21.002094594594595, "grad_norm": 46.58440017700195, "learning_rate": 3.125e-06, "loss": 0.1609, "step": 6278 }, { "epoch": 21.00212837837838, "grad_norm": 0.01258929818868637, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6279 }, { "epoch": 21.00216216216216, "grad_norm": 36.40911865234375, "learning_rate": 3.125e-06, "loss": 0.124, "step": 6280 }, { "epoch": 21.002195945945946, "grad_norm": 0.1475944221019745, "learning_rate": 3.125e-06, "loss": 0.0017, "step": 6281 }, { "epoch": 21.00222972972973, "grad_norm": 0.42561864852905273, "learning_rate": 3.125e-06, "loss": 0.0012, "step": 6282 }, { "epoch": 21.002263513513512, "grad_norm": 3.539653778076172, "learning_rate": 3.125e-06, "loss": 0.051, "step": 6283 }, { "epoch": 21.002297297297297, "grad_norm": 1.0198109149932861, "learning_rate": 3.125e-06, "loss": 0.0087, "step": 6284 }, { "epoch": 21.00233108108108, "grad_norm": 0.0009664483950473368, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6285 }, { "epoch": 21.002364864864866, "grad_norm": 0.006020096130669117, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6286 }, { "epoch": 21.002398648648647, "grad_norm": 0.003025102661922574, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6287 }, { "epoch": 21.002432432432432, "grad_norm": 0.005618843715637922, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6288 }, { "epoch": 21.002466216216217, "grad_norm": 3.5940091609954834, "learning_rate": 3.125e-06, "loss": 0.2345, "step": 6289 }, { "epoch": 21.0025, "grad_norm": 18.36360740661621, "learning_rate": 3.125e-06, "loss": 1.0128, "step": 6290 }, { "epoch": 21.002533783783782, "grad_norm": 0.005356296431273222, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6291 }, { "epoch": 21.002567567567567, "grad_norm": 0.0028623705729842186, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6292 }, { "epoch": 21.002601351351352, "grad_norm": 0.01000396441668272, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6293 }, { "epoch": 21.002635135135137, "grad_norm": 0.007898572832345963, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6294 }, { "epoch": 21.002668918918918, "grad_norm": 0.004678857047110796, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6295 }, { "epoch": 21.002702702702702, "grad_norm": 6.50321626663208, "learning_rate": 3.125e-06, "loss": 0.4456, "step": 6296 }, { "epoch": 21.002736486486487, "grad_norm": 0.16350838541984558, "learning_rate": 3.125e-06, "loss": 0.0062, "step": 6297 }, { "epoch": 21.002770270270272, "grad_norm": 0.14635120332241058, "learning_rate": 3.125e-06, "loss": 0.0055, "step": 6298 }, { "epoch": 21.002804054054053, "grad_norm": 0.0035913961473852396, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6299 }, { "epoch": 21.002837837837838, "grad_norm": 0.5751926302909851, "learning_rate": 3.125e-06, "loss": 0.0081, "step": 6300 }, { "epoch": 21.002871621621622, "grad_norm": 0.006306190509349108, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6301 }, { "epoch": 21.002905405405407, "grad_norm": 0.0016660909168422222, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6302 }, { "epoch": 21.00293918918919, "grad_norm": 0.5396387577056885, "learning_rate": 3.125e-06, "loss": 0.0126, "step": 6303 }, { "epoch": 21.002972972972973, "grad_norm": 3.5259246826171875, "learning_rate": 3.125e-06, "loss": 0.0067, "step": 6304 }, { "epoch": 21.003006756756758, "grad_norm": 0.31211814284324646, "learning_rate": 3.125e-06, "loss": 0.0094, "step": 6305 }, { "epoch": 21.00304054054054, "grad_norm": 0.23293665051460266, "learning_rate": 3.125e-06, "loss": 0.0019, "step": 6306 }, { "epoch": 21.003074324324324, "grad_norm": 0.0021067417692393064, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6307 }, { "epoch": 21.00310810810811, "grad_norm": 0.0034417291171848774, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6308 }, { "epoch": 21.003141891891893, "grad_norm": 6.551512241363525, "learning_rate": 3.125e-06, "loss": 0.5524, "step": 6309 }, { "epoch": 21.003175675675674, "grad_norm": 0.0029895389452576637, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6310 }, { "epoch": 21.00320945945946, "grad_norm": 0.023891210556030273, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6311 }, { "epoch": 21.003243243243244, "grad_norm": 0.2666957676410675, "learning_rate": 3.125e-06, "loss": 0.0061, "step": 6312 }, { "epoch": 21.00327702702703, "grad_norm": 9.8089017868042, "learning_rate": 3.125e-06, "loss": 0.2877, "step": 6313 }, { "epoch": 21.00331081081081, "grad_norm": 0.03535982966423035, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6314 }, { "epoch": 21.003344594594594, "grad_norm": 0.005117643158882856, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6315 }, { "epoch": 21.00337837837838, "grad_norm": 0.0007017299067229033, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6316 }, { "epoch": 21.003412162162164, "grad_norm": 0.1938471496105194, "learning_rate": 3.125e-06, "loss": 0.0066, "step": 6317 }, { "epoch": 21.003445945945945, "grad_norm": 34.76237106323242, "learning_rate": 3.125e-06, "loss": 0.3799, "step": 6318 }, { "epoch": 21.00347972972973, "grad_norm": 0.0019518104381859303, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6319 }, { "epoch": 21.003513513513514, "grad_norm": 0.014572063460946083, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6320 }, { "epoch": 21.0035472972973, "grad_norm": 6.469024181365967, "learning_rate": 3.125e-06, "loss": 0.2875, "step": 6321 }, { "epoch": 21.00358108108108, "grad_norm": 0.0009912372333928943, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6322 }, { "epoch": 21.003614864864865, "grad_norm": 0.009893547743558884, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6323 }, { "epoch": 21.00364864864865, "grad_norm": 0.001900159171782434, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6324 }, { "epoch": 21.00368243243243, "grad_norm": 0.020206032320857048, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6325 }, { "epoch": 21.003716216216215, "grad_norm": 0.018712880089879036, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6326 }, { "epoch": 21.00375, "grad_norm": 0.1600453108549118, "learning_rate": 3.125e-06, "loss": 0.0012, "step": 6327 }, { "epoch": 21.003783783783785, "grad_norm": 0.0019208334852010012, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6328 }, { "epoch": 21.003817567567566, "grad_norm": 24.324886322021484, "learning_rate": 3.125e-06, "loss": 0.842, "step": 6329 }, { "epoch": 21.00385135135135, "grad_norm": 0.003488784423097968, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6330 }, { "epoch": 21.003885135135135, "grad_norm": 0.011577000841498375, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6331 }, { "epoch": 21.00391891891892, "grad_norm": 0.0015868906630203128, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6332 }, { "epoch": 21.0039527027027, "grad_norm": 0.0021320933010429144, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6333 }, { "epoch": 21.003986486486486, "grad_norm": 0.001176493358798325, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6334 }, { "epoch": 21.00402027027027, "grad_norm": 4.400754928588867, "learning_rate": 3.125e-06, "loss": 0.3255, "step": 6335 }, { "epoch": 21.004054054054055, "grad_norm": 0.001129207550548017, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6336 }, { "epoch": 21.004087837837837, "grad_norm": 1.0349366664886475, "learning_rate": 3.125e-06, "loss": 0.005, "step": 6337 }, { "epoch": 21.00412162162162, "grad_norm": 0.06888977438211441, "learning_rate": 3.125e-06, "loss": 0.0009, "step": 6338 }, { "epoch": 21.004155405405406, "grad_norm": 1.1161216497421265, "learning_rate": 3.125e-06, "loss": 0.0047, "step": 6339 }, { "epoch": 21.00418918918919, "grad_norm": 0.17674672603607178, "learning_rate": 3.125e-06, "loss": 0.0059, "step": 6340 }, { "epoch": 21.004222972972972, "grad_norm": 15.195623397827148, "learning_rate": 3.125e-06, "loss": 0.0616, "step": 6341 }, { "epoch": 21.004256756756757, "grad_norm": 3.929403781890869, "learning_rate": 3.125e-06, "loss": 0.3719, "step": 6342 }, { "epoch": 21.00429054054054, "grad_norm": 0.017807500436902046, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6343 }, { "epoch": 21.004324324324326, "grad_norm": 0.0009891858790069818, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6344 }, { "epoch": 21.004358108108107, "grad_norm": 0.004835368599742651, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6345 }, { "epoch": 21.004391891891892, "grad_norm": 0.027369854971766472, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6346 }, { "epoch": 21.004425675675677, "grad_norm": 70.3046875, "learning_rate": 3.125e-06, "loss": 0.2179, "step": 6347 }, { "epoch": 21.004459459459458, "grad_norm": 0.3470238745212555, "learning_rate": 3.125e-06, "loss": 0.0089, "step": 6348 }, { "epoch": 21.004493243243243, "grad_norm": 3.7996907234191895, "learning_rate": 3.125e-06, "loss": 0.0786, "step": 6349 }, { "epoch": 21.004527027027027, "grad_norm": 0.0020498838275671005, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6350 }, { "epoch": 21.004560810810812, "grad_norm": 0.020201630890369415, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6351 }, { "epoch": 21.004594594594593, "grad_norm": 0.15627571940422058, "learning_rate": 3.125e-06, "loss": 0.0038, "step": 6352 }, { "epoch": 21.004628378378378, "grad_norm": 0.044850729405879974, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6353 }, { "epoch": 21.004662162162163, "grad_norm": 0.1193866953253746, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 6354 }, { "epoch": 21.004695945945947, "grad_norm": 0.003888100851327181, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6355 }, { "epoch": 21.00472972972973, "grad_norm": 0.0029017699416726828, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6356 }, { "epoch": 21.004763513513513, "grad_norm": 0.23701737821102142, "learning_rate": 3.125e-06, "loss": 0.0035, "step": 6357 }, { "epoch": 21.004797297297298, "grad_norm": 0.8260611891746521, "learning_rate": 3.125e-06, "loss": 0.0022, "step": 6358 }, { "epoch": 21.004831081081083, "grad_norm": 0.0021296062041074038, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6359 }, { "epoch": 21.004864864864864, "grad_norm": 0.032943855971097946, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 6360 }, { "epoch": 21.00489864864865, "grad_norm": 0.6638562083244324, "learning_rate": 3.125e-06, "loss": 0.0023, "step": 6361 }, { "epoch": 21.004932432432433, "grad_norm": 0.0023684417828917503, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6362 }, { "epoch": 21.004966216216218, "grad_norm": 30.8080997467041, "learning_rate": 3.125e-06, "loss": 0.5136, "step": 6363 }, { "epoch": 21.005, "grad_norm": 0.031161172315478325, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6364 }, { "epoch": 21.005033783783784, "grad_norm": 7.773580074310303, "learning_rate": 3.125e-06, "loss": 0.1293, "step": 6365 }, { "epoch": 21.00506756756757, "grad_norm": 0.0009722486138343811, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6366 }, { "epoch": 21.00510135135135, "grad_norm": 19.15314483642578, "learning_rate": 3.125e-06, "loss": 0.0687, "step": 6367 }, { "epoch": 21.005135135135134, "grad_norm": 0.010238208808004856, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6368 }, { "epoch": 21.00516891891892, "grad_norm": 0.0015504260081797838, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6369 }, { "epoch": 21.005202702702704, "grad_norm": 0.011157438158988953, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6370 }, { "epoch": 21.005236486486485, "grad_norm": 0.00307928747497499, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6371 }, { "epoch": 21.00527027027027, "grad_norm": 1.6361520290374756, "learning_rate": 3.125e-06, "loss": 0.0479, "step": 6372 }, { "epoch": 21.005304054054054, "grad_norm": 0.002036349382251501, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6373 }, { "epoch": 21.00533783783784, "grad_norm": 6.252273082733154, "learning_rate": 3.125e-06, "loss": 0.421, "step": 6374 }, { "epoch": 21.00537162162162, "grad_norm": 6.344082355499268, "learning_rate": 3.125e-06, "loss": 0.0526, "step": 6375 }, { "epoch": 21.005405405405405, "grad_norm": 0.0041311951354146, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6376 }, { "epoch": 21.00543918918919, "grad_norm": 0.0015281792730093002, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6377 }, { "epoch": 21.005472972972974, "grad_norm": 0.16181696951389313, "learning_rate": 3.125e-06, "loss": 0.0061, "step": 6378 }, { "epoch": 21.005506756756755, "grad_norm": 0.0493512786924839, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6379 }, { "epoch": 21.00554054054054, "grad_norm": 0.003282907884567976, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6380 }, { "epoch": 21.005574324324325, "grad_norm": 0.001088647753931582, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6381 }, { "epoch": 21.00560810810811, "grad_norm": 0.0033003399148583412, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6382 }, { "epoch": 21.00564189189189, "grad_norm": 0.18747150897979736, "learning_rate": 3.125e-06, "loss": 0.0072, "step": 6383 }, { "epoch": 21.005675675675676, "grad_norm": 0.022629182785749435, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 6384 }, { "epoch": 21.00570945945946, "grad_norm": 0.07729567587375641, "learning_rate": 3.125e-06, "loss": 0.0016, "step": 6385 }, { "epoch": 21.005743243243245, "grad_norm": 0.0013431203551590443, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6386 }, { "epoch": 21.005777027027026, "grad_norm": 0.013447233475744724, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6387 }, { "epoch": 21.00581081081081, "grad_norm": 0.007908782921731472, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6388 }, { "epoch": 21.005844594594596, "grad_norm": 0.0016157153295353055, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6389 }, { "epoch": 21.005878378378377, "grad_norm": 9.655580520629883, "learning_rate": 3.125e-06, "loss": 0.9433, "step": 6390 }, { "epoch": 21.00591216216216, "grad_norm": 0.000927773246075958, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6391 }, { "epoch": 21.005945945945946, "grad_norm": 0.0035124209243804216, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6392 }, { "epoch": 21.00597972972973, "grad_norm": 8.224082946777344, "learning_rate": 3.125e-06, "loss": 0.0414, "step": 6393 }, { "epoch": 21.006013513513512, "grad_norm": 0.0009326684521511197, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6394 }, { "epoch": 21.006047297297297, "grad_norm": 0.8742517232894897, "learning_rate": 3.125e-06, "loss": 0.0097, "step": 6395 }, { "epoch": 21.00608108108108, "grad_norm": 0.007620309945195913, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6396 }, { "epoch": 21.006114864864866, "grad_norm": 41.249969482421875, "learning_rate": 3.125e-06, "loss": 0.2105, "step": 6397 }, { "epoch": 21.006148648648647, "grad_norm": 0.005312579218298197, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6398 }, { "epoch": 21.006182432432432, "grad_norm": 0.15189653635025024, "learning_rate": 3.125e-06, "loss": 0.0058, "step": 6399 }, { "epoch": 21.006216216216217, "grad_norm": 0.005687260068953037, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6400 }, { "epoch": 21.00625, "grad_norm": 0.09430119395256042, "learning_rate": 3.125e-06, "loss": 0.0019, "step": 6401 }, { "epoch": 21.006283783783783, "grad_norm": 14.702796936035156, "learning_rate": 3.125e-06, "loss": 0.0379, "step": 6402 }, { "epoch": 21.006317567567567, "grad_norm": 0.004261600784957409, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6403 }, { "epoch": 21.006351351351352, "grad_norm": 0.003023360623046756, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6404 }, { "epoch": 21.006385135135137, "grad_norm": 0.12318991124629974, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 6405 }, { "epoch": 21.006418918918918, "grad_norm": 0.07268359512090683, "learning_rate": 3.125e-06, "loss": 0.0009, "step": 6406 }, { "epoch": 21.006452702702703, "grad_norm": 0.06546255201101303, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 6407 }, { "epoch": 21.006486486486487, "grad_norm": 21.42632484436035, "learning_rate": 3.125e-06, "loss": 0.0549, "step": 6408 }, { "epoch": 21.006520270270272, "grad_norm": 1.6867749691009521, "learning_rate": 3.125e-06, "loss": 0.0173, "step": 6409 }, { "epoch": 21.006554054054053, "grad_norm": 0.002722121775150299, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6410 }, { "epoch": 21.006587837837838, "grad_norm": 0.0921490266919136, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 6411 }, { "epoch": 21.006621621621623, "grad_norm": 0.015065099112689495, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6412 }, { "epoch": 21.006655405405404, "grad_norm": 0.21331235766410828, "learning_rate": 3.125e-06, "loss": 0.002, "step": 6413 }, { "epoch": 21.00668918918919, "grad_norm": 0.0034301981795579195, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6414 }, { "epoch": 21.006722972972973, "grad_norm": 0.102218396961689, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 6415 }, { "epoch": 21.006756756756758, "grad_norm": 1.3112068176269531, "learning_rate": 3.125e-06, "loss": 0.0134, "step": 6416 }, { "epoch": 21.00679054054054, "grad_norm": 0.0014174834359437227, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6417 }, { "epoch": 21.006824324324324, "grad_norm": 0.0748506486415863, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 6418 }, { "epoch": 21.00685810810811, "grad_norm": 0.1481262892484665, "learning_rate": 3.125e-06, "loss": 0.0056, "step": 6419 }, { "epoch": 21.006891891891893, "grad_norm": 0.005313316825777292, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6420 }, { "epoch": 21.006925675675674, "grad_norm": 0.013276099227368832, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6421 }, { "epoch": 21.00695945945946, "grad_norm": 0.05002432316541672, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 6422 }, { "epoch": 21.006993243243244, "grad_norm": 0.007645812351256609, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6423 }, { "epoch": 21.00702702702703, "grad_norm": 0.00763802882283926, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6424 }, { "epoch": 21.00706081081081, "grad_norm": 0.0031855758279561996, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6425 }, { "epoch": 21.007094594594594, "grad_norm": 0.0013503963127732277, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6426 }, { "epoch": 21.00712837837838, "grad_norm": 0.13697972893714905, "learning_rate": 3.125e-06, "loss": 0.0048, "step": 6427 }, { "epoch": 21.007162162162164, "grad_norm": 18.693328857421875, "learning_rate": 3.125e-06, "loss": 0.7453, "step": 6428 }, { "epoch": 21.007195945945945, "grad_norm": 3.2851052284240723, "learning_rate": 3.125e-06, "loss": 0.4131, "step": 6429 }, { "epoch": 21.00722972972973, "grad_norm": 20.235342025756836, "learning_rate": 3.125e-06, "loss": 0.1638, "step": 6430 }, { "epoch": 21.007263513513514, "grad_norm": 0.0014802489895373583, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6431 }, { "epoch": 21.007297297297296, "grad_norm": 0.001290163490921259, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6432 }, { "epoch": 21.00733108108108, "grad_norm": 0.013116667978465557, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6433 }, { "epoch": 21.007364864864865, "grad_norm": 0.007638348266482353, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6434 }, { "epoch": 21.00739864864865, "grad_norm": 0.675811767578125, "learning_rate": 3.125e-06, "loss": 0.0258, "step": 6435 }, { "epoch": 21.00743243243243, "grad_norm": 0.6015931367874146, "learning_rate": 3.125e-06, "loss": 0.0167, "step": 6436 }, { "epoch": 21.007466216216216, "grad_norm": 0.0012131264666095376, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6437 }, { "epoch": 21.0075, "grad_norm": 0.01482481136918068, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6438 }, { "epoch": 21.007533783783785, "grad_norm": 0.28956517577171326, "learning_rate": 3.125e-06, "loss": 0.001, "step": 6439 }, { "epoch": 21.007567567567566, "grad_norm": 0.36916178464889526, "learning_rate": 3.125e-06, "loss": 0.0128, "step": 6440 }, { "epoch": 21.00760135135135, "grad_norm": 0.008464723825454712, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6441 }, { "epoch": 21.007635135135136, "grad_norm": 0.17638231813907623, "learning_rate": 3.125e-06, "loss": 0.0014, "step": 6442 }, { "epoch": 21.00766891891892, "grad_norm": 0.14720962941646576, "learning_rate": 3.125e-06, "loss": 0.0053, "step": 6443 }, { "epoch": 21.0077027027027, "grad_norm": 0.002487106481567025, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6444 }, { "epoch": 21.007736486486486, "grad_norm": 37.887088775634766, "learning_rate": 3.125e-06, "loss": 0.1017, "step": 6445 }, { "epoch": 21.00777027027027, "grad_norm": 17.45408058166504, "learning_rate": 3.125e-06, "loss": 0.035, "step": 6446 }, { "epoch": 21.007804054054056, "grad_norm": 0.004996642470359802, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6447 }, { "epoch": 21.007837837837837, "grad_norm": 7.3471221923828125, "learning_rate": 3.125e-06, "loss": 0.4024, "step": 6448 }, { "epoch": 21.00787162162162, "grad_norm": 0.003078129841014743, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6449 }, { "epoch": 21.007905405405406, "grad_norm": 0.1758342683315277, "learning_rate": 3.125e-06, "loss": 0.0062, "step": 6450 }, { "epoch": 21.00793918918919, "grad_norm": 0.007244691252708435, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6451 }, { "epoch": 21.007972972972972, "grad_norm": 0.0029884735122323036, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6452 }, { "epoch": 21.008006756756757, "grad_norm": 0.01308363862335682, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6453 }, { "epoch": 21.00804054054054, "grad_norm": 0.0031888033263385296, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6454 }, { "epoch": 21.008074324324323, "grad_norm": 0.15212561190128326, "learning_rate": 3.125e-06, "loss": 0.0036, "step": 6455 }, { "epoch": 21.008108108108107, "grad_norm": 0.006430570501834154, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6456 }, { "epoch": 21.008141891891892, "grad_norm": 0.001692312303930521, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6457 }, { "epoch": 21.008175675675677, "grad_norm": 0.001056741806678474, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6458 }, { "epoch": 21.008209459459458, "grad_norm": 0.0031048881355673075, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6459 }, { "epoch": 21.008243243243243, "grad_norm": 0.005914156790822744, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6460 }, { "epoch": 21.008277027027027, "grad_norm": 0.14103418588638306, "learning_rate": 3.125e-06, "loss": 0.0052, "step": 6461 }, { "epoch": 21.008310810810812, "grad_norm": 0.014157090336084366, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6462 }, { "epoch": 21.008344594594593, "grad_norm": 0.18470588326454163, "learning_rate": 3.125e-06, "loss": 0.0065, "step": 6463 }, { "epoch": 21.008378378378378, "grad_norm": 0.015713371336460114, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6464 }, { "epoch": 21.008412162162163, "grad_norm": 16.638578414916992, "learning_rate": 3.125e-06, "loss": 0.1442, "step": 6465 }, { "epoch": 21.008445945945947, "grad_norm": 0.09826502948999405, "learning_rate": 3.125e-06, "loss": 0.001, "step": 6466 }, { "epoch": 21.00847972972973, "grad_norm": 2.4514524936676025, "learning_rate": 3.125e-06, "loss": 0.0089, "step": 6467 }, { "epoch": 21.008513513513513, "grad_norm": 0.12790146470069885, "learning_rate": 3.125e-06, "loss": 0.0048, "step": 6468 }, { "epoch": 21.008547297297298, "grad_norm": 0.008363999426364899, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6469 }, { "epoch": 21.008581081081083, "grad_norm": 4.208961009979248, "learning_rate": 3.125e-06, "loss": 0.0091, "step": 6470 }, { "epoch": 21.008614864864864, "grad_norm": 0.10655983537435532, "learning_rate": 3.125e-06, "loss": 0.0039, "step": 6471 }, { "epoch": 21.00864864864865, "grad_norm": 1.6602188348770142, "learning_rate": 3.125e-06, "loss": 0.0054, "step": 6472 }, { "epoch": 21.008682432432433, "grad_norm": 0.0008620591252110898, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6473 }, { "epoch": 21.008716216216218, "grad_norm": 0.05657025799155235, "learning_rate": 3.125e-06, "loss": 0.0016, "step": 6474 }, { "epoch": 21.00875, "grad_norm": 0.050365690141916275, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6475 }, { "epoch": 21.008783783783784, "grad_norm": 0.0017084194114431739, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6476 }, { "epoch": 21.00881756756757, "grad_norm": 7.32320499420166, "learning_rate": 3.125e-06, "loss": 0.1749, "step": 6477 }, { "epoch": 21.00885135135135, "grad_norm": 0.8447900414466858, "learning_rate": 3.125e-06, "loss": 0.0173, "step": 6478 }, { "epoch": 21.008885135135134, "grad_norm": 3.3071770668029785, "learning_rate": 3.125e-06, "loss": 0.3869, "step": 6479 }, { "epoch": 21.00891891891892, "grad_norm": 0.0017867377027869225, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6480 }, { "epoch": 21.008952702702704, "grad_norm": 0.0031899320892989635, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6481 }, { "epoch": 21.008986486486485, "grad_norm": 0.00230794376693666, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6482 }, { "epoch": 21.00902027027027, "grad_norm": 0.016552487388253212, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6483 }, { "epoch": 21.009054054054054, "grad_norm": 0.23604270815849304, "learning_rate": 3.125e-06, "loss": 0.0008, "step": 6484 }, { "epoch": 21.00908783783784, "grad_norm": 0.00464843912050128, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6485 }, { "epoch": 21.00912162162162, "grad_norm": 0.001812653266824782, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6486 }, { "epoch": 21.009155405405405, "grad_norm": 18.243968963623047, "learning_rate": 3.125e-06, "loss": 0.1121, "step": 6487 }, { "epoch": 21.00918918918919, "grad_norm": 0.14154759049415588, "learning_rate": 3.125e-06, "loss": 0.0054, "step": 6488 }, { "epoch": 21.009222972972974, "grad_norm": 0.1134919822216034, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 6489 }, { "epoch": 21.009256756756756, "grad_norm": 0.12302907556295395, "learning_rate": 3.125e-06, "loss": 0.0045, "step": 6490 }, { "epoch": 21.00929054054054, "grad_norm": 0.002235167659819126, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6491 }, { "epoch": 21.009324324324325, "grad_norm": 21.105016708374023, "learning_rate": 3.125e-06, "loss": 1.0334, "step": 6492 }, { "epoch": 21.00935810810811, "grad_norm": 0.005220339167863131, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6493 }, { "epoch": 21.00939189189189, "grad_norm": 0.005436084698885679, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6494 }, { "epoch": 21.009425675675676, "grad_norm": 0.0012934934347867966, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6495 }, { "epoch": 21.00945945945946, "grad_norm": 0.0036114456597715616, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6496 }, { "epoch": 21.00949324324324, "grad_norm": 1.587662696838379, "learning_rate": 3.125e-06, "loss": 0.0032, "step": 6497 }, { "epoch": 21.009527027027026, "grad_norm": 0.0025674111675471067, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6498 }, { "epoch": 21.00956081081081, "grad_norm": 0.0009314234484918416, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6499 }, { "epoch": 21.009594594594596, "grad_norm": 2.8584506511688232, "learning_rate": 3.125e-06, "loss": 0.0689, "step": 6500 }, { "epoch": 21.009628378378377, "grad_norm": 0.003617633134126663, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6501 }, { "epoch": 21.00966216216216, "grad_norm": 6.695512294769287, "learning_rate": 3.125e-06, "loss": 0.0124, "step": 6502 }, { "epoch": 21.009695945945946, "grad_norm": 0.002497099805623293, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6503 }, { "epoch": 21.00972972972973, "grad_norm": 0.004348994232714176, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6504 }, { "epoch": 21.009763513513512, "grad_norm": 0.007222857791930437, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6505 }, { "epoch": 21.009797297297297, "grad_norm": 2.0743744373321533, "learning_rate": 3.125e-06, "loss": 0.0028, "step": 6506 }, { "epoch": 21.00983108108108, "grad_norm": 0.019896477460861206, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6507 }, { "epoch": 21.009864864864866, "grad_norm": 0.025783266872167587, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6508 }, { "epoch": 21.009898648648647, "grad_norm": 0.16604773700237274, "learning_rate": 3.125e-06, "loss": 0.0059, "step": 6509 }, { "epoch": 21.009932432432432, "grad_norm": 0.004921237472444773, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6510 }, { "epoch": 21.009966216216217, "grad_norm": 0.014011316932737827, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6511 }, { "epoch": 21.01, "grad_norm": 0.13647475838661194, "learning_rate": 3.125e-06, "loss": 0.0049, "step": 6512 }, { "epoch": 21.01, "eval_accuracy": 0.8917609046849758, "eval_loss": 0.5686879754066467, "eval_runtime": 33.964, "eval_samples_per_second": 18.225, "eval_steps_per_second": 2.297, "step": 6512 }, { "epoch": 22.000033783783785, "grad_norm": 0.001384554780088365, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6513 }, { "epoch": 22.000067567567566, "grad_norm": 0.2525486648082733, "learning_rate": 3.125e-06, "loss": 0.0011, "step": 6514 }, { "epoch": 22.00010135135135, "grad_norm": 0.0011579303536564112, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6515 }, { "epoch": 22.000135135135135, "grad_norm": 0.05042368918657303, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 6516 }, { "epoch": 22.00016891891892, "grad_norm": 10.610733032226562, "learning_rate": 3.125e-06, "loss": 0.1888, "step": 6517 }, { "epoch": 22.0002027027027, "grad_norm": 0.12068235129117966, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 6518 }, { "epoch": 22.000236486486486, "grad_norm": 0.003131556324660778, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6519 }, { "epoch": 22.00027027027027, "grad_norm": 1.3796409368515015, "learning_rate": 3.125e-06, "loss": 0.0046, "step": 6520 }, { "epoch": 22.000304054054055, "grad_norm": 0.008118923753499985, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6521 }, { "epoch": 22.000337837837836, "grad_norm": 0.002882490400224924, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6522 }, { "epoch": 22.00037162162162, "grad_norm": 0.2289019674062729, "learning_rate": 3.125e-06, "loss": 0.0046, "step": 6523 }, { "epoch": 22.000405405405406, "grad_norm": 0.03420291095972061, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 6524 }, { "epoch": 22.00043918918919, "grad_norm": 0.0012019776040688157, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6525 }, { "epoch": 22.00047297297297, "grad_norm": 0.002063262974843383, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6526 }, { "epoch": 22.000506756756756, "grad_norm": 0.1551012098789215, "learning_rate": 3.125e-06, "loss": 0.005, "step": 6527 }, { "epoch": 22.00054054054054, "grad_norm": 80.4297866821289, "learning_rate": 3.125e-06, "loss": 0.6834, "step": 6528 }, { "epoch": 22.000574324324326, "grad_norm": 0.0033801738172769547, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6529 }, { "epoch": 22.000608108108107, "grad_norm": 5.78192663192749, "learning_rate": 3.125e-06, "loss": 0.0127, "step": 6530 }, { "epoch": 22.000641891891892, "grad_norm": 0.07412607222795486, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 6531 }, { "epoch": 22.000675675675677, "grad_norm": 0.03630942851305008, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 6532 }, { "epoch": 22.00070945945946, "grad_norm": 0.02454330027103424, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 6533 }, { "epoch": 22.000743243243242, "grad_norm": 0.0016232538037002087, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6534 }, { "epoch": 22.000777027027027, "grad_norm": 0.0008925898582674563, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6535 }, { "epoch": 22.000810810810812, "grad_norm": 0.004149108659476042, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6536 }, { "epoch": 22.000844594594593, "grad_norm": 0.0056423344649374485, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6537 }, { "epoch": 22.000878378378378, "grad_norm": 0.05396914854645729, "learning_rate": 3.125e-06, "loss": 0.0017, "step": 6538 }, { "epoch": 22.000912162162162, "grad_norm": 1.7387524843215942, "learning_rate": 3.125e-06, "loss": 0.0562, "step": 6539 }, { "epoch": 22.000945945945947, "grad_norm": 0.4784165322780609, "learning_rate": 3.125e-06, "loss": 0.0016, "step": 6540 }, { "epoch": 22.00097972972973, "grad_norm": 0.0024193834979087114, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6541 }, { "epoch": 22.001013513513513, "grad_norm": 0.01041924674063921, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6542 }, { "epoch": 22.001047297297298, "grad_norm": 0.0027434397488832474, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6543 }, { "epoch": 22.001081081081082, "grad_norm": 0.0028106276877224445, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6544 }, { "epoch": 22.001114864864864, "grad_norm": 0.00259015872143209, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6545 }, { "epoch": 22.00114864864865, "grad_norm": 0.05227529630064964, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6546 }, { "epoch": 22.001182432432433, "grad_norm": 0.20657740533351898, "learning_rate": 3.125e-06, "loss": 0.0067, "step": 6547 }, { "epoch": 22.001216216216218, "grad_norm": 0.002413366921246052, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6548 }, { "epoch": 22.00125, "grad_norm": 23.11881446838379, "learning_rate": 3.125e-06, "loss": 0.3737, "step": 6549 }, { "epoch": 22.001283783783784, "grad_norm": 0.0008070184267126024, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6550 }, { "epoch": 22.00131756756757, "grad_norm": 0.024028293788433075, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6551 }, { "epoch": 22.001351351351353, "grad_norm": 0.0013027337845414877, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6552 }, { "epoch": 22.001385135135134, "grad_norm": 0.0010277486871927977, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6553 }, { "epoch": 22.00141891891892, "grad_norm": 0.016879750415682793, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6554 }, { "epoch": 22.001452702702704, "grad_norm": 0.001263920683413744, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6555 }, { "epoch": 22.001486486486485, "grad_norm": 0.0017407054547220469, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6556 }, { "epoch": 22.00152027027027, "grad_norm": 0.0015062151942402124, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6557 }, { "epoch": 22.001554054054054, "grad_norm": 0.3407342731952667, "learning_rate": 3.125e-06, "loss": 0.0013, "step": 6558 }, { "epoch": 22.00158783783784, "grad_norm": 0.001917010173201561, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6559 }, { "epoch": 22.00162162162162, "grad_norm": 3.0855183601379395, "learning_rate": 3.125e-06, "loss": 0.0107, "step": 6560 }, { "epoch": 22.001655405405405, "grad_norm": 4.231725692749023, "learning_rate": 3.125e-06, "loss": 0.1771, "step": 6561 }, { "epoch": 22.00168918918919, "grad_norm": 0.9541206955909729, "learning_rate": 3.125e-06, "loss": 0.0027, "step": 6562 }, { "epoch": 22.001722972972974, "grad_norm": 0.014725804328918457, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6563 }, { "epoch": 22.001756756756755, "grad_norm": 0.002178647555410862, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6564 }, { "epoch": 22.00179054054054, "grad_norm": 24.813114166259766, "learning_rate": 3.125e-06, "loss": 0.1682, "step": 6565 }, { "epoch": 22.001824324324325, "grad_norm": 0.04592926800251007, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 6566 }, { "epoch": 22.00185810810811, "grad_norm": 0.09469448775053024, "learning_rate": 3.125e-06, "loss": 0.0031, "step": 6567 }, { "epoch": 22.00189189189189, "grad_norm": 0.001660626963712275, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6568 }, { "epoch": 22.001925675675675, "grad_norm": 0.0023517468944191933, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6569 }, { "epoch": 22.00195945945946, "grad_norm": 44.391571044921875, "learning_rate": 3.125e-06, "loss": 0.5367, "step": 6570 }, { "epoch": 22.001993243243245, "grad_norm": 0.002691468223929405, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6571 }, { "epoch": 22.002027027027026, "grad_norm": 0.061015091836452484, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6572 }, { "epoch": 22.00206081081081, "grad_norm": 0.0020918978843837976, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6573 }, { "epoch": 22.002094594594595, "grad_norm": 0.12838146090507507, "learning_rate": 3.125e-06, "loss": 0.0049, "step": 6574 }, { "epoch": 22.00212837837838, "grad_norm": 0.19408084452152252, "learning_rate": 3.125e-06, "loss": 0.0026, "step": 6575 }, { "epoch": 22.00216216216216, "grad_norm": 0.6368032097816467, "learning_rate": 3.125e-06, "loss": 0.011, "step": 6576 }, { "epoch": 22.002195945945946, "grad_norm": 0.08854278922080994, "learning_rate": 3.125e-06, "loss": 0.0012, "step": 6577 }, { "epoch": 22.00222972972973, "grad_norm": 0.0018657060572877526, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6578 }, { "epoch": 22.002263513513512, "grad_norm": 0.12138205766677856, "learning_rate": 3.125e-06, "loss": 0.0045, "step": 6579 }, { "epoch": 22.002297297297297, "grad_norm": 0.11701923608779907, "learning_rate": 3.125e-06, "loss": 0.0044, "step": 6580 }, { "epoch": 22.00233108108108, "grad_norm": 0.005142171401530504, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6581 }, { "epoch": 22.002364864864866, "grad_norm": 0.1106816828250885, "learning_rate": 3.125e-06, "loss": 0.0037, "step": 6582 }, { "epoch": 22.002398648648647, "grad_norm": 0.0018415607046335936, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6583 }, { "epoch": 22.002432432432432, "grad_norm": 0.050942182540893555, "learning_rate": 3.125e-06, "loss": 0.001, "step": 6584 }, { "epoch": 22.002466216216217, "grad_norm": 0.008207617327570915, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6585 }, { "epoch": 22.0025, "grad_norm": 0.002026379806920886, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6586 }, { "epoch": 22.002533783783782, "grad_norm": 0.1085343137383461, "learning_rate": 3.125e-06, "loss": 0.004, "step": 6587 }, { "epoch": 22.002567567567567, "grad_norm": 0.5073386430740356, "learning_rate": 3.125e-06, "loss": 0.0063, "step": 6588 }, { "epoch": 22.002601351351352, "grad_norm": 0.11609011888504028, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 6589 }, { "epoch": 22.002635135135137, "grad_norm": 0.29298171401023865, "learning_rate": 3.125e-06, "loss": 0.0076, "step": 6590 }, { "epoch": 22.002668918918918, "grad_norm": 0.21884988248348236, "learning_rate": 3.125e-06, "loss": 0.0078, "step": 6591 }, { "epoch": 22.002702702702702, "grad_norm": 0.005286865402013063, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6592 }, { "epoch": 22.002736486486487, "grad_norm": 0.0371067076921463, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 6593 }, { "epoch": 22.002770270270272, "grad_norm": 0.002202555537223816, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6594 }, { "epoch": 22.002804054054053, "grad_norm": 6.183085918426514, "learning_rate": 3.125e-06, "loss": 0.2666, "step": 6595 }, { "epoch": 22.002837837837838, "grad_norm": 0.001417767140083015, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6596 }, { "epoch": 22.002871621621622, "grad_norm": 0.3051709830760956, "learning_rate": 3.125e-06, "loss": 0.0014, "step": 6597 }, { "epoch": 22.002905405405407, "grad_norm": 0.0019274562364444137, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6598 }, { "epoch": 22.00293918918919, "grad_norm": 0.0009971586987376213, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6599 }, { "epoch": 22.002972972972973, "grad_norm": 0.007025755476206541, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6600 }, { "epoch": 22.003006756756758, "grad_norm": 1.141969919204712, "learning_rate": 3.125e-06, "loss": 0.0157, "step": 6601 }, { "epoch": 22.00304054054054, "grad_norm": 0.0011850744485855103, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6602 }, { "epoch": 22.003074324324324, "grad_norm": 0.10693123191595078, "learning_rate": 3.125e-06, "loss": 0.0039, "step": 6603 }, { "epoch": 22.00310810810811, "grad_norm": 5.7377705574035645, "learning_rate": 3.125e-06, "loss": 0.084, "step": 6604 }, { "epoch": 22.003141891891893, "grad_norm": 0.002224995056167245, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6605 }, { "epoch": 22.003175675675674, "grad_norm": 0.00826254766434431, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6606 }, { "epoch": 22.00320945945946, "grad_norm": 2.7575037479400635, "learning_rate": 3.125e-06, "loss": 0.0138, "step": 6607 }, { "epoch": 22.003243243243244, "grad_norm": 0.0009651281870901585, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6608 }, { "epoch": 22.00327702702703, "grad_norm": 0.008833328261971474, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6609 }, { "epoch": 22.00331081081081, "grad_norm": 0.002010638127103448, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6610 }, { "epoch": 22.003344594594594, "grad_norm": 0.0020603276789188385, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6611 }, { "epoch": 22.00337837837838, "grad_norm": 0.03937558829784393, "learning_rate": 3.125e-06, "loss": 0.0005, "step": 6612 }, { "epoch": 22.003412162162164, "grad_norm": 0.003593104425817728, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6613 }, { "epoch": 22.003445945945945, "grad_norm": 0.4576609134674072, "learning_rate": 3.125e-06, "loss": 0.0029, "step": 6614 }, { "epoch": 22.00347972972973, "grad_norm": 1.7417871952056885, "learning_rate": 3.125e-06, "loss": 0.0065, "step": 6615 }, { "epoch": 22.003513513513514, "grad_norm": 0.002100571757182479, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6616 }, { "epoch": 22.0035472972973, "grad_norm": 0.002105188788846135, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6617 }, { "epoch": 22.00358108108108, "grad_norm": 0.050653740763664246, "learning_rate": 3.125e-06, "loss": 0.0014, "step": 6618 }, { "epoch": 22.003614864864865, "grad_norm": 0.07220575958490372, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 6619 }, { "epoch": 22.00364864864865, "grad_norm": 0.0009402104769833386, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6620 }, { "epoch": 22.00368243243243, "grad_norm": 0.001538801589049399, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6621 }, { "epoch": 22.003716216216215, "grad_norm": 0.002663171151652932, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6622 }, { "epoch": 22.00375, "grad_norm": 0.002604268491268158, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6623 }, { "epoch": 22.003783783783785, "grad_norm": 0.0058219339698553085, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6624 }, { "epoch": 22.003817567567566, "grad_norm": 11.47394847869873, "learning_rate": 3.125e-06, "loss": 0.0715, "step": 6625 }, { "epoch": 22.00385135135135, "grad_norm": 0.0036350879818201065, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6626 }, { "epoch": 22.003885135135135, "grad_norm": 0.004268296528607607, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6627 }, { "epoch": 22.00391891891892, "grad_norm": 42.36237716674805, "learning_rate": 3.125e-06, "loss": 0.8329, "step": 6628 }, { "epoch": 22.0039527027027, "grad_norm": 0.0024337456561625004, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6629 }, { "epoch": 22.003986486486486, "grad_norm": 0.0008479865500703454, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6630 }, { "epoch": 22.00402027027027, "grad_norm": 0.00438986299559474, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6631 }, { "epoch": 22.004054054054055, "grad_norm": 3.2322187423706055, "learning_rate": 3.125e-06, "loss": 0.1378, "step": 6632 }, { "epoch": 22.004087837837837, "grad_norm": 0.002817898290231824, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6633 }, { "epoch": 22.00412162162162, "grad_norm": 1.1394479274749756, "learning_rate": 3.125e-06, "loss": 0.0327, "step": 6634 }, { "epoch": 22.004155405405406, "grad_norm": 0.0025723916478455067, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6635 }, { "epoch": 22.00418918918919, "grad_norm": 0.007962862960994244, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6636 }, { "epoch": 22.004222972972972, "grad_norm": 3.374462842941284, "learning_rate": 3.125e-06, "loss": 0.0252, "step": 6637 }, { "epoch": 22.004256756756757, "grad_norm": 0.1426732838153839, "learning_rate": 3.125e-06, "loss": 0.004, "step": 6638 }, { "epoch": 22.00429054054054, "grad_norm": 6.771157741546631, "learning_rate": 3.125e-06, "loss": 0.4598, "step": 6639 }, { "epoch": 22.004324324324326, "grad_norm": 0.007023223210126162, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6640 }, { "epoch": 22.004358108108107, "grad_norm": 0.000984254409559071, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6641 }, { "epoch": 22.004391891891892, "grad_norm": 0.06069153547286987, "learning_rate": 3.125e-06, "loss": 0.002, "step": 6642 }, { "epoch": 22.004425675675677, "grad_norm": 0.0007559123332612216, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6643 }, { "epoch": 22.004459459459458, "grad_norm": 0.1052180752158165, "learning_rate": 3.125e-06, "loss": 0.004, "step": 6644 }, { "epoch": 22.004493243243243, "grad_norm": 0.0029628791380673647, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6645 }, { "epoch": 22.004527027027027, "grad_norm": 0.29688459634780884, "learning_rate": 3.125e-06, "loss": 0.0012, "step": 6646 }, { "epoch": 22.004560810810812, "grad_norm": 0.09092500060796738, "learning_rate": 3.125e-06, "loss": 0.0034, "step": 6647 }, { "epoch": 22.004594594594593, "grad_norm": 21.717775344848633, "learning_rate": 3.125e-06, "loss": 0.231, "step": 6648 }, { "epoch": 22.004628378378378, "grad_norm": 1.4874523878097534, "learning_rate": 3.125e-06, "loss": 0.0077, "step": 6649 }, { "epoch": 22.004662162162163, "grad_norm": 0.01699850521981716, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6650 }, { "epoch": 22.004695945945947, "grad_norm": 6.133529186248779, "learning_rate": 3.125e-06, "loss": 0.2653, "step": 6651 }, { "epoch": 22.00472972972973, "grad_norm": 0.4216693341732025, "learning_rate": 3.125e-06, "loss": 0.0018, "step": 6652 }, { "epoch": 22.004763513513513, "grad_norm": 0.011008117347955704, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6653 }, { "epoch": 22.004797297297298, "grad_norm": 0.001969808479771018, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6654 }, { "epoch": 22.004831081081083, "grad_norm": 2.1177375316619873, "learning_rate": 3.125e-06, "loss": 0.0578, "step": 6655 }, { "epoch": 22.004864864864864, "grad_norm": 0.00538245216012001, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6656 }, { "epoch": 22.00489864864865, "grad_norm": 0.003986131865531206, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6657 }, { "epoch": 22.004932432432433, "grad_norm": 0.003644858952611685, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6658 }, { "epoch": 22.004966216216218, "grad_norm": 0.0033593729604035616, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6659 }, { "epoch": 22.005, "grad_norm": 0.0023882791865617037, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6660 }, { "epoch": 22.005033783783784, "grad_norm": 0.002463611774146557, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6661 }, { "epoch": 22.00506756756757, "grad_norm": 24.262060165405273, "learning_rate": 3.125e-06, "loss": 0.8134, "step": 6662 }, { "epoch": 22.00510135135135, "grad_norm": 0.0022196683567017317, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6663 }, { "epoch": 22.005135135135134, "grad_norm": 0.09182222187519073, "learning_rate": 3.125e-06, "loss": 0.0035, "step": 6664 }, { "epoch": 22.00516891891892, "grad_norm": 1.1972476243972778, "learning_rate": 3.125e-06, "loss": 0.0097, "step": 6665 }, { "epoch": 22.005202702702704, "grad_norm": 0.17836356163024902, "learning_rate": 3.125e-06, "loss": 0.0007, "step": 6666 }, { "epoch": 22.005236486486485, "grad_norm": 0.0020625076722353697, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6667 }, { "epoch": 22.00527027027027, "grad_norm": 0.030034583061933517, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 6668 }, { "epoch": 22.005304054054054, "grad_norm": 0.0019854181446135044, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6669 }, { "epoch": 22.00533783783784, "grad_norm": 0.08048216998577118, "learning_rate": 3.125e-06, "loss": 0.0025, "step": 6670 }, { "epoch": 22.00537162162162, "grad_norm": 23.59398078918457, "learning_rate": 3.125e-06, "loss": 0.5494, "step": 6671 }, { "epoch": 22.005405405405405, "grad_norm": 0.007293723523616791, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6672 }, { "epoch": 22.00543918918919, "grad_norm": 2.4310519695281982, "learning_rate": 3.125e-06, "loss": 0.1095, "step": 6673 }, { "epoch": 22.005472972972974, "grad_norm": 0.0018874413799494505, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6674 }, { "epoch": 22.005506756756755, "grad_norm": 0.03792440518736839, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6675 }, { "epoch": 22.00554054054054, "grad_norm": 0.0034648955333977938, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6676 }, { "epoch": 22.005574324324325, "grad_norm": 0.002112236339598894, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6677 }, { "epoch": 22.00560810810811, "grad_norm": 34.446746826171875, "learning_rate": 3.125e-06, "loss": 0.8243, "step": 6678 }, { "epoch": 22.00564189189189, "grad_norm": 0.026020638644695282, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6679 }, { "epoch": 22.005675675675676, "grad_norm": 0.1882900446653366, "learning_rate": 3.125e-06, "loss": 0.003, "step": 6680 }, { "epoch": 22.00570945945946, "grad_norm": 0.0015665170503780246, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6681 }, { "epoch": 22.005743243243245, "grad_norm": 0.005088607780635357, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6682 }, { "epoch": 22.005777027027026, "grad_norm": 0.0026057776995003223, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6683 }, { "epoch": 22.00581081081081, "grad_norm": 0.579857587814331, "learning_rate": 3.125e-06, "loss": 0.0068, "step": 6684 }, { "epoch": 22.005844594594596, "grad_norm": 1.4326242208480835, "learning_rate": 3.125e-06, "loss": 0.0441, "step": 6685 }, { "epoch": 22.005878378378377, "grad_norm": 0.012740805745124817, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6686 }, { "epoch": 22.00591216216216, "grad_norm": 0.0016611181199550629, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6687 }, { "epoch": 22.005945945945946, "grad_norm": 9.130837440490723, "learning_rate": 3.125e-06, "loss": 0.5264, "step": 6688 }, { "epoch": 22.00597972972973, "grad_norm": 0.0006822063587605953, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6689 }, { "epoch": 22.006013513513512, "grad_norm": 6.389156818389893, "learning_rate": 3.125e-06, "loss": 0.0201, "step": 6690 }, { "epoch": 22.006047297297297, "grad_norm": 3.395986318588257, "learning_rate": 3.125e-06, "loss": 0.4208, "step": 6691 }, { "epoch": 22.00608108108108, "grad_norm": 0.7402713894844055, "learning_rate": 3.125e-06, "loss": 0.002, "step": 6692 }, { "epoch": 22.006114864864866, "grad_norm": 0.28872451186180115, "learning_rate": 3.125e-06, "loss": 0.0009, "step": 6693 }, { "epoch": 22.006148648648647, "grad_norm": 0.017673421651124954, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6694 }, { "epoch": 22.006182432432432, "grad_norm": 0.025863774120807648, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 6695 }, { "epoch": 22.006216216216217, "grad_norm": 0.0014591097133234143, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6696 }, { "epoch": 22.00625, "grad_norm": 0.0029533379711210728, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6697 }, { "epoch": 22.006283783783783, "grad_norm": 0.014804203994572163, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6698 }, { "epoch": 22.006317567567567, "grad_norm": 0.005850474815815687, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6699 }, { "epoch": 22.006351351351352, "grad_norm": 0.002801415277644992, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6700 }, { "epoch": 22.006385135135137, "grad_norm": 0.08159532397985458, "learning_rate": 3.125e-06, "loss": 0.001, "step": 6701 }, { "epoch": 22.006418918918918, "grad_norm": 6.534579277038574, "learning_rate": 3.125e-06, "loss": 0.0143, "step": 6702 }, { "epoch": 22.006452702702703, "grad_norm": 0.08605222404003143, "learning_rate": 3.125e-06, "loss": 0.0011, "step": 6703 }, { "epoch": 22.006486486486487, "grad_norm": 0.011644979007542133, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6704 }, { "epoch": 22.006520270270272, "grad_norm": 1.0102823972702026, "learning_rate": 3.125e-06, "loss": 0.0023, "step": 6705 }, { "epoch": 22.006554054054053, "grad_norm": 0.11972560733556747, "learning_rate": 3.125e-06, "loss": 0.0044, "step": 6706 }, { "epoch": 22.006587837837838, "grad_norm": 8.85554313659668, "learning_rate": 3.125e-06, "loss": 0.0279, "step": 6707 }, { "epoch": 22.006621621621623, "grad_norm": 3.967473030090332, "learning_rate": 3.125e-06, "loss": 0.0729, "step": 6708 }, { "epoch": 22.006655405405404, "grad_norm": 0.002540166722610593, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6709 }, { "epoch": 22.00668918918919, "grad_norm": 0.004566170275211334, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6710 }, { "epoch": 22.006722972972973, "grad_norm": 0.0008038614178076386, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6711 }, { "epoch": 22.006756756756758, "grad_norm": 1.6633782386779785, "learning_rate": 3.125e-06, "loss": 0.046, "step": 6712 }, { "epoch": 22.00679054054054, "grad_norm": 0.01103493943810463, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6713 }, { "epoch": 22.006824324324324, "grad_norm": 0.01330803707242012, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6714 }, { "epoch": 22.00685810810811, "grad_norm": 0.23358331620693207, "learning_rate": 3.125e-06, "loss": 0.001, "step": 6715 }, { "epoch": 22.006891891891893, "grad_norm": 0.6778683662414551, "learning_rate": 3.125e-06, "loss": 0.0036, "step": 6716 }, { "epoch": 22.006925675675674, "grad_norm": 0.003675404004752636, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6717 }, { "epoch": 22.00695945945946, "grad_norm": 0.41564688086509705, "learning_rate": 3.125e-06, "loss": 0.0121, "step": 6718 }, { "epoch": 22.006993243243244, "grad_norm": 0.055150795727968216, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6719 }, { "epoch": 22.00702702702703, "grad_norm": 0.1103525310754776, "learning_rate": 3.125e-06, "loss": 0.0039, "step": 6720 }, { "epoch": 22.00706081081081, "grad_norm": 0.26843512058258057, "learning_rate": 3.125e-06, "loss": 0.0015, "step": 6721 }, { "epoch": 22.007094594594594, "grad_norm": 0.031499043107032776, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6722 }, { "epoch": 22.00712837837838, "grad_norm": 0.08051349967718124, "learning_rate": 3.125e-06, "loss": 0.0014, "step": 6723 }, { "epoch": 22.007162162162164, "grad_norm": 17.411222457885742, "learning_rate": 3.125e-06, "loss": 0.2487, "step": 6724 }, { "epoch": 22.007195945945945, "grad_norm": 7.236246109008789, "learning_rate": 3.125e-06, "loss": 0.4673, "step": 6725 }, { "epoch": 22.00722972972973, "grad_norm": 0.15771178901195526, "learning_rate": 3.125e-06, "loss": 0.0007, "step": 6726 }, { "epoch": 22.007263513513514, "grad_norm": 0.0021834573708474636, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6727 }, { "epoch": 22.007297297297296, "grad_norm": 0.016145892441272736, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6728 }, { "epoch": 22.00733108108108, "grad_norm": 0.0033419965766370296, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6729 }, { "epoch": 22.007364864864865, "grad_norm": 0.20156747102737427, "learning_rate": 3.125e-06, "loss": 0.001, "step": 6730 }, { "epoch": 22.00739864864865, "grad_norm": 0.009036063216626644, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6731 }, { "epoch": 22.00743243243243, "grad_norm": 7.650689125061035, "learning_rate": 3.125e-06, "loss": 0.0158, "step": 6732 }, { "epoch": 22.007466216216216, "grad_norm": 0.004948488902300596, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6733 }, { "epoch": 22.0075, "grad_norm": 0.003460800973698497, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6734 }, { "epoch": 22.007533783783785, "grad_norm": 0.9232634902000427, "learning_rate": 3.125e-06, "loss": 0.0297, "step": 6735 }, { "epoch": 22.007567567567566, "grad_norm": 0.009783091954886913, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6736 }, { "epoch": 22.00760135135135, "grad_norm": 0.0139537388458848, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6737 }, { "epoch": 22.007635135135136, "grad_norm": 0.012172509916126728, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6738 }, { "epoch": 22.00766891891892, "grad_norm": 0.005553185008466244, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6739 }, { "epoch": 22.0077027027027, "grad_norm": 0.11686451733112335, "learning_rate": 3.125e-06, "loss": 0.0007, "step": 6740 }, { "epoch": 22.007736486486486, "grad_norm": 0.0010238183895125985, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6741 }, { "epoch": 22.00777027027027, "grad_norm": 0.29016420245170593, "learning_rate": 3.125e-06, "loss": 0.0064, "step": 6742 }, { "epoch": 22.007804054054056, "grad_norm": 0.006518810987472534, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6743 }, { "epoch": 22.007837837837837, "grad_norm": 0.10721232742071152, "learning_rate": 3.125e-06, "loss": 0.004, "step": 6744 }, { "epoch": 22.00787162162162, "grad_norm": 0.0016983061796054244, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6745 }, { "epoch": 22.007905405405406, "grad_norm": 0.00775250606238842, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6746 }, { "epoch": 22.00793918918919, "grad_norm": 0.04793117567896843, "learning_rate": 3.125e-06, "loss": 0.0004, "step": 6747 }, { "epoch": 22.007972972972972, "grad_norm": 6.311061382293701, "learning_rate": 3.125e-06, "loss": 0.2987, "step": 6748 }, { "epoch": 22.008006756756757, "grad_norm": 0.024649636819958687, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6749 }, { "epoch": 22.00804054054054, "grad_norm": 0.043851543217897415, "learning_rate": 3.125e-06, "loss": 0.0011, "step": 6750 }, { "epoch": 22.008074324324323, "grad_norm": 0.051434505730867386, "learning_rate": 3.125e-06, "loss": 0.0006, "step": 6751 }, { "epoch": 22.008108108108107, "grad_norm": 0.004825138486921787, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6752 }, { "epoch": 22.008141891891892, "grad_norm": 0.0044053224846720695, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6753 }, { "epoch": 22.008175675675677, "grad_norm": 0.0008869192679412663, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6754 }, { "epoch": 22.008209459459458, "grad_norm": 0.08135240525007248, "learning_rate": 3.125e-06, "loss": 0.0025, "step": 6755 }, { "epoch": 22.008243243243243, "grad_norm": 0.013303845189511776, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6756 }, { "epoch": 22.008277027027027, "grad_norm": 0.003764326684176922, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6757 }, { "epoch": 22.008310810810812, "grad_norm": 3.636173963546753, "learning_rate": 3.125e-06, "loss": 0.1025, "step": 6758 }, { "epoch": 22.008344594594593, "grad_norm": 0.001929501653648913, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6759 }, { "epoch": 22.008378378378378, "grad_norm": 0.003593732137233019, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6760 }, { "epoch": 22.008412162162163, "grad_norm": 0.0016287053003907204, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6761 }, { "epoch": 22.008445945945947, "grad_norm": 16.177738189697266, "learning_rate": 3.125e-06, "loss": 0.1125, "step": 6762 }, { "epoch": 22.00847972972973, "grad_norm": 0.4284057319164276, "learning_rate": 3.125e-06, "loss": 0.0021, "step": 6763 }, { "epoch": 22.008513513513513, "grad_norm": 8.440363883972168, "learning_rate": 3.125e-06, "loss": 0.6801, "step": 6764 }, { "epoch": 22.008547297297298, "grad_norm": 0.0017808366101235151, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6765 }, { "epoch": 22.008581081081083, "grad_norm": 0.004653228912502527, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6766 }, { "epoch": 22.008614864864864, "grad_norm": 0.0011414573527872562, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6767 }, { "epoch": 22.00864864864865, "grad_norm": 0.052464134991168976, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6768 }, { "epoch": 22.008682432432433, "grad_norm": 0.01998279243707657, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6769 }, { "epoch": 22.008716216216218, "grad_norm": 0.021073389798402786, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6770 }, { "epoch": 22.00875, "grad_norm": 0.0008893825579434633, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6771 }, { "epoch": 22.008783783783784, "grad_norm": 7.267614841461182, "learning_rate": 3.125e-06, "loss": 0.0705, "step": 6772 }, { "epoch": 22.00881756756757, "grad_norm": 0.18688161671161652, "learning_rate": 3.125e-06, "loss": 0.0043, "step": 6773 }, { "epoch": 22.00885135135135, "grad_norm": 3.589841365814209, "learning_rate": 3.125e-06, "loss": 0.1404, "step": 6774 }, { "epoch": 22.008885135135134, "grad_norm": 0.0028572024311870337, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6775 }, { "epoch": 22.00891891891892, "grad_norm": 0.003443689551204443, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6776 }, { "epoch": 22.008952702702704, "grad_norm": 0.003062594449147582, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6777 }, { "epoch": 22.008986486486485, "grad_norm": 0.002145450096577406, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6778 }, { "epoch": 22.00902027027027, "grad_norm": 0.0015061789890751243, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6779 }, { "epoch": 22.009054054054054, "grad_norm": 0.005956759210675955, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6780 }, { "epoch": 22.00908783783784, "grad_norm": 0.0040724691934883595, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6781 }, { "epoch": 22.00912162162162, "grad_norm": 1.373611569404602, "learning_rate": 3.125e-06, "loss": 0.0155, "step": 6782 }, { "epoch": 22.009155405405405, "grad_norm": 0.4005148708820343, "learning_rate": 3.125e-06, "loss": 0.0123, "step": 6783 }, { "epoch": 22.00918918918919, "grad_norm": 0.0032829134725034237, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6784 }, { "epoch": 22.009222972972974, "grad_norm": 0.00285028456710279, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6785 }, { "epoch": 22.009256756756756, "grad_norm": 0.003714226186275482, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6786 }, { "epoch": 22.00929054054054, "grad_norm": 0.005567365325987339, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6787 }, { "epoch": 22.009324324324325, "grad_norm": 0.0018326095305383205, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6788 }, { "epoch": 22.00935810810811, "grad_norm": 0.009605348110198975, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6789 }, { "epoch": 22.00939189189189, "grad_norm": 0.004745961166918278, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6790 }, { "epoch": 22.009425675675676, "grad_norm": 5.7314839363098145, "learning_rate": 3.125e-06, "loss": 0.0118, "step": 6791 }, { "epoch": 22.00945945945946, "grad_norm": 0.0023693193215876818, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6792 }, { "epoch": 22.00949324324324, "grad_norm": 1.0338431596755981, "learning_rate": 3.125e-06, "loss": 0.0032, "step": 6793 }, { "epoch": 22.009527027027026, "grad_norm": 0.031936731189489365, "learning_rate": 3.125e-06, "loss": 0.0007, "step": 6794 }, { "epoch": 22.00956081081081, "grad_norm": 0.04261426255106926, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6795 }, { "epoch": 22.009594594594596, "grad_norm": 0.0021410256158560514, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6796 }, { "epoch": 22.009628378378377, "grad_norm": 36.23691177368164, "learning_rate": 3.125e-06, "loss": 0.6618, "step": 6797 }, { "epoch": 22.00966216216216, "grad_norm": 0.0011790405260398984, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6798 }, { "epoch": 22.009695945945946, "grad_norm": 0.01473260298371315, "learning_rate": 3.125e-06, "loss": 0.0003, "step": 6799 }, { "epoch": 22.00972972972973, "grad_norm": 0.005161643493920565, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6800 }, { "epoch": 22.009763513513512, "grad_norm": 0.31019970774650574, "learning_rate": 3.125e-06, "loss": 0.0043, "step": 6801 }, { "epoch": 22.009797297297297, "grad_norm": 0.2623075246810913, "learning_rate": 3.125e-06, "loss": 0.007, "step": 6802 }, { "epoch": 22.00983108108108, "grad_norm": 0.004871564917266369, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6803 }, { "epoch": 22.009864864864866, "grad_norm": 0.005149267613887787, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6804 }, { "epoch": 22.009898648648647, "grad_norm": 0.012248339131474495, "learning_rate": 3.125e-06, "loss": 0.0002, "step": 6805 }, { "epoch": 22.009932432432432, "grad_norm": 0.005146098788827658, "learning_rate": 3.125e-06, "loss": 0.0001, "step": 6806 }, { "epoch": 22.009966216216217, "grad_norm": 0.000771772232837975, "learning_rate": 3.125e-06, "loss": 0.0, "step": 6807 }, { "epoch": 22.01, "grad_norm": 1.0872117280960083, "learning_rate": 3.125e-06, "loss": 0.0048, "step": 6808 }, { "epoch": 22.01, "eval_accuracy": 0.8982229402261712, "eval_loss": 0.5355215668678284, "eval_runtime": 33.7974, "eval_samples_per_second": 18.315, "eval_steps_per_second": 2.308, "step": 6808 }, { "epoch": 23.000033783783785, "grad_norm": 0.00840197317302227, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 6809 }, { "epoch": 23.000067567567566, "grad_norm": 0.009497827850282192, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 6810 }, { "epoch": 23.00010135135135, "grad_norm": 0.11314540356397629, "learning_rate": 1.5625e-06, "loss": 0.0043, "step": 6811 }, { "epoch": 23.000135135135135, "grad_norm": 0.009445914067327976, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 6812 }, { "epoch": 23.00016891891892, "grad_norm": 0.005571198184043169, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6813 }, { "epoch": 23.0002027027027, "grad_norm": 0.0021894159726798534, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6814 }, { "epoch": 23.000236486486486, "grad_norm": 0.00162205018568784, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6815 }, { "epoch": 23.00027027027027, "grad_norm": 3.4482483863830566, "learning_rate": 1.5625e-06, "loss": 0.4358, "step": 6816 }, { "epoch": 23.000304054054055, "grad_norm": 0.004733216017484665, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6817 }, { "epoch": 23.000337837837836, "grad_norm": 0.00222675665281713, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6818 }, { "epoch": 23.00037162162162, "grad_norm": 1.8159667253494263, "learning_rate": 1.5625e-06, "loss": 0.0364, "step": 6819 }, { "epoch": 23.000405405405406, "grad_norm": 1.8482115268707275, "learning_rate": 1.5625e-06, "loss": 0.0043, "step": 6820 }, { "epoch": 23.00043918918919, "grad_norm": 0.0069352807477116585, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6821 }, { "epoch": 23.00047297297297, "grad_norm": 0.006869712844491005, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6822 }, { "epoch": 23.000506756756756, "grad_norm": 1.860321283340454, "learning_rate": 1.5625e-06, "loss": 0.0038, "step": 6823 }, { "epoch": 23.00054054054054, "grad_norm": 0.009913012385368347, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 6824 }, { "epoch": 23.000574324324326, "grad_norm": 0.0011538905091583729, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6825 }, { "epoch": 23.000608108108107, "grad_norm": 0.07383878529071808, "learning_rate": 1.5625e-06, "loss": 0.0011, "step": 6826 }, { "epoch": 23.000641891891892, "grad_norm": 0.009526846930384636, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 6827 }, { "epoch": 23.000675675675677, "grad_norm": 0.0020794582087546587, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6828 }, { "epoch": 23.00070945945946, "grad_norm": 0.4563590884208679, "learning_rate": 1.5625e-06, "loss": 0.0054, "step": 6829 }, { "epoch": 23.000743243243242, "grad_norm": 29.928518295288086, "learning_rate": 1.5625e-06, "loss": 0.061, "step": 6830 }, { "epoch": 23.000777027027027, "grad_norm": 0.0008781193755567074, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6831 }, { "epoch": 23.000810810810812, "grad_norm": 0.0042227464728057384, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6832 }, { "epoch": 23.000844594594593, "grad_norm": 5.161856174468994, "learning_rate": 1.5625e-06, "loss": 0.2655, "step": 6833 }, { "epoch": 23.000878378378378, "grad_norm": 3.3926870822906494, "learning_rate": 1.5625e-06, "loss": 0.0701, "step": 6834 }, { "epoch": 23.000912162162162, "grad_norm": 0.12290091812610626, "learning_rate": 1.5625e-06, "loss": 0.0032, "step": 6835 }, { "epoch": 23.000945945945947, "grad_norm": 0.0010173008777201176, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6836 }, { "epoch": 23.00097972972973, "grad_norm": 0.06037360802292824, "learning_rate": 1.5625e-06, "loss": 0.0007, "step": 6837 }, { "epoch": 23.001013513513513, "grad_norm": 62.45518112182617, "learning_rate": 1.5625e-06, "loss": 0.3932, "step": 6838 }, { "epoch": 23.001047297297298, "grad_norm": 0.8103505373001099, "learning_rate": 1.5625e-06, "loss": 0.0139, "step": 6839 }, { "epoch": 23.001081081081082, "grad_norm": 0.14158450067043304, "learning_rate": 1.5625e-06, "loss": 0.0005, "step": 6840 }, { "epoch": 23.001114864864864, "grad_norm": 0.003274299902841449, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6841 }, { "epoch": 23.00114864864865, "grad_norm": 0.0024983910843729973, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6842 }, { "epoch": 23.001182432432433, "grad_norm": 0.0017864661058411002, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6843 }, { "epoch": 23.001216216216218, "grad_norm": 0.002762356074526906, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6844 }, { "epoch": 23.00125, "grad_norm": 0.0017840691143646836, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6845 }, { "epoch": 23.001283783783784, "grad_norm": 6.018038749694824, "learning_rate": 1.5625e-06, "loss": 0.0253, "step": 6846 }, { "epoch": 23.00131756756757, "grad_norm": 0.001460987376049161, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6847 }, { "epoch": 23.001351351351353, "grad_norm": 72.22614288330078, "learning_rate": 1.5625e-06, "loss": 0.2963, "step": 6848 }, { "epoch": 23.001385135135134, "grad_norm": 0.17446143925189972, "learning_rate": 1.5625e-06, "loss": 0.0034, "step": 6849 }, { "epoch": 23.00141891891892, "grad_norm": 0.2929675281047821, "learning_rate": 1.5625e-06, "loss": 0.0107, "step": 6850 }, { "epoch": 23.001452702702704, "grad_norm": 0.006173071451485157, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 6851 }, { "epoch": 23.001486486486485, "grad_norm": 0.03027445077896118, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 6852 }, { "epoch": 23.00152027027027, "grad_norm": 4.174325942993164, "learning_rate": 1.5625e-06, "loss": 0.2216, "step": 6853 }, { "epoch": 23.001554054054054, "grad_norm": 0.0014334027655422688, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6854 }, { "epoch": 23.00158783783784, "grad_norm": 0.4658753573894501, "learning_rate": 1.5625e-06, "loss": 0.006, "step": 6855 }, { "epoch": 23.00162162162162, "grad_norm": 0.02699052356183529, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 6856 }, { "epoch": 23.001655405405405, "grad_norm": 0.007822029292583466, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 6857 }, { "epoch": 23.00168918918919, "grad_norm": 0.00300854560919106, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6858 }, { "epoch": 23.001722972972974, "grad_norm": 0.0012780779507011175, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6859 }, { "epoch": 23.001756756756755, "grad_norm": 0.011167892254889011, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 6860 }, { "epoch": 23.00179054054054, "grad_norm": 0.003366654273122549, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6861 }, { "epoch": 23.001824324324325, "grad_norm": 6.343930721282959, "learning_rate": 1.5625e-06, "loss": 0.4631, "step": 6862 }, { "epoch": 23.00185810810811, "grad_norm": 0.0012471899390220642, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6863 }, { "epoch": 23.00189189189189, "grad_norm": 2.7699713706970215, "learning_rate": 1.5625e-06, "loss": 0.0775, "step": 6864 }, { "epoch": 23.001925675675675, "grad_norm": 0.0015254812315106392, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6865 }, { "epoch": 23.00195945945946, "grad_norm": 0.20471519231796265, "learning_rate": 1.5625e-06, "loss": 0.0019, "step": 6866 }, { "epoch": 23.001993243243245, "grad_norm": 3.7092723846435547, "learning_rate": 1.5625e-06, "loss": 0.4049, "step": 6867 }, { "epoch": 23.002027027027026, "grad_norm": 0.019840043038129807, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 6868 }, { "epoch": 23.00206081081081, "grad_norm": 0.006588790565729141, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6869 }, { "epoch": 23.002094594594595, "grad_norm": 2.9682610034942627, "learning_rate": 1.5625e-06, "loss": 0.0074, "step": 6870 }, { "epoch": 23.00212837837838, "grad_norm": 0.014322508126497269, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 6871 }, { "epoch": 23.00216216216216, "grad_norm": 51.15034484863281, "learning_rate": 1.5625e-06, "loss": 0.1858, "step": 6872 }, { "epoch": 23.002195945945946, "grad_norm": 11.385725975036621, "learning_rate": 1.5625e-06, "loss": 0.316, "step": 6873 }, { "epoch": 23.00222972972973, "grad_norm": 11.695016860961914, "learning_rate": 1.5625e-06, "loss": 0.0223, "step": 6874 }, { "epoch": 23.002263513513512, "grad_norm": 0.006582406349480152, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6875 }, { "epoch": 23.002297297297297, "grad_norm": 0.07523476332426071, "learning_rate": 1.5625e-06, "loss": 0.0017, "step": 6876 }, { "epoch": 23.00233108108108, "grad_norm": 1.601192593574524, "learning_rate": 1.5625e-06, "loss": 0.0038, "step": 6877 }, { "epoch": 23.002364864864866, "grad_norm": 0.24116598069667816, "learning_rate": 1.5625e-06, "loss": 0.0019, "step": 6878 }, { "epoch": 23.002398648648647, "grad_norm": 0.004208684898912907, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6879 }, { "epoch": 23.002432432432432, "grad_norm": 0.004018108360469341, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6880 }, { "epoch": 23.002466216216217, "grad_norm": 0.00431786198168993, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6881 }, { "epoch": 23.0025, "grad_norm": 0.06337105482816696, "learning_rate": 1.5625e-06, "loss": 0.001, "step": 6882 }, { "epoch": 23.002533783783782, "grad_norm": 0.2996576130390167, "learning_rate": 1.5625e-06, "loss": 0.0027, "step": 6883 }, { "epoch": 23.002567567567567, "grad_norm": 0.0010925616370514035, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6884 }, { "epoch": 23.002601351351352, "grad_norm": 0.005478621460497379, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6885 }, { "epoch": 23.002635135135137, "grad_norm": 3.1484735012054443, "learning_rate": 1.5625e-06, "loss": 0.0035, "step": 6886 }, { "epoch": 23.002668918918918, "grad_norm": 0.20286251604557037, "learning_rate": 1.5625e-06, "loss": 0.0029, "step": 6887 }, { "epoch": 23.002702702702702, "grad_norm": 0.009893747046589851, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6888 }, { "epoch": 23.002736486486487, "grad_norm": 0.00990226585417986, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6889 }, { "epoch": 23.002770270270272, "grad_norm": 0.003885031910613179, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6890 }, { "epoch": 23.002804054054053, "grad_norm": 0.09395169466733932, "learning_rate": 1.5625e-06, "loss": 0.0009, "step": 6891 }, { "epoch": 23.002837837837838, "grad_norm": 0.38520747423171997, "learning_rate": 1.5625e-06, "loss": 0.0026, "step": 6892 }, { "epoch": 23.002871621621622, "grad_norm": 0.030422871932387352, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 6893 }, { "epoch": 23.002905405405407, "grad_norm": 0.0008216602145694196, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6894 }, { "epoch": 23.00293918918919, "grad_norm": 0.09888206422328949, "learning_rate": 1.5625e-06, "loss": 0.0006, "step": 6895 }, { "epoch": 23.002972972972973, "grad_norm": 0.037188414484262466, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 6896 }, { "epoch": 23.003006756756758, "grad_norm": 5.286370277404785, "learning_rate": 1.5625e-06, "loss": 0.0127, "step": 6897 }, { "epoch": 23.00304054054054, "grad_norm": 0.0008434058981947601, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6898 }, { "epoch": 23.003074324324324, "grad_norm": 0.0030104939360171556, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6899 }, { "epoch": 23.00310810810811, "grad_norm": 0.0032879419159144163, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6900 }, { "epoch": 23.003141891891893, "grad_norm": 0.09692094475030899, "learning_rate": 1.5625e-06, "loss": 0.0018, "step": 6901 }, { "epoch": 23.003175675675674, "grad_norm": 0.007394249550998211, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6902 }, { "epoch": 23.00320945945946, "grad_norm": 0.1785004884004593, "learning_rate": 1.5625e-06, "loss": 0.0019, "step": 6903 }, { "epoch": 23.003243243243244, "grad_norm": 0.0011014920892193913, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6904 }, { "epoch": 23.00327702702703, "grad_norm": 28.7227725982666, "learning_rate": 1.5625e-06, "loss": 0.3104, "step": 6905 }, { "epoch": 23.00331081081081, "grad_norm": 13.152604103088379, "learning_rate": 1.5625e-06, "loss": 1.0805, "step": 6906 }, { "epoch": 23.003344594594594, "grad_norm": 0.003928035497665405, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6907 }, { "epoch": 23.00337837837838, "grad_norm": 12.63220500946045, "learning_rate": 1.5625e-06, "loss": 0.7403, "step": 6908 }, { "epoch": 23.003412162162164, "grad_norm": 0.0013842215994372964, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6909 }, { "epoch": 23.003445945945945, "grad_norm": 0.0022892700508236885, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6910 }, { "epoch": 23.00347972972973, "grad_norm": 0.000951550726313144, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6911 }, { "epoch": 23.003513513513514, "grad_norm": 7.670808792114258, "learning_rate": 1.5625e-06, "loss": 0.099, "step": 6912 }, { "epoch": 23.0035472972973, "grad_norm": 0.1648474782705307, "learning_rate": 1.5625e-06, "loss": 0.0051, "step": 6913 }, { "epoch": 23.00358108108108, "grad_norm": 0.0015952304238453507, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6914 }, { "epoch": 23.003614864864865, "grad_norm": 0.01893697679042816, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6915 }, { "epoch": 23.00364864864865, "grad_norm": 28.62482452392578, "learning_rate": 1.5625e-06, "loss": 0.0424, "step": 6916 }, { "epoch": 23.00368243243243, "grad_norm": 0.0006916038691997528, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6917 }, { "epoch": 23.003716216216215, "grad_norm": 0.4007035791873932, "learning_rate": 1.5625e-06, "loss": 0.0137, "step": 6918 }, { "epoch": 23.00375, "grad_norm": 0.0009324735729023814, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6919 }, { "epoch": 23.003783783783785, "grad_norm": 0.0014266982907429338, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6920 }, { "epoch": 23.003817567567566, "grad_norm": 0.0018485990585759282, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6921 }, { "epoch": 23.00385135135135, "grad_norm": 0.023536309599876404, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 6922 }, { "epoch": 23.003885135135135, "grad_norm": 0.13498704135417938, "learning_rate": 1.5625e-06, "loss": 0.0048, "step": 6923 }, { "epoch": 23.00391891891892, "grad_norm": 4.999200344085693, "learning_rate": 1.5625e-06, "loss": 0.0171, "step": 6924 }, { "epoch": 23.0039527027027, "grad_norm": 0.4508603811264038, "learning_rate": 1.5625e-06, "loss": 0.0019, "step": 6925 }, { "epoch": 23.003986486486486, "grad_norm": 0.0011459037195891142, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6926 }, { "epoch": 23.00402027027027, "grad_norm": 0.03366849198937416, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 6927 }, { "epoch": 23.004054054054055, "grad_norm": 0.002241283655166626, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6928 }, { "epoch": 23.004087837837837, "grad_norm": 0.0952259972691536, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 6929 }, { "epoch": 23.00412162162162, "grad_norm": 0.0011264182394370437, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6930 }, { "epoch": 23.004155405405406, "grad_norm": 0.04935641959309578, "learning_rate": 1.5625e-06, "loss": 0.0015, "step": 6931 }, { "epoch": 23.00418918918919, "grad_norm": 0.4465271532535553, "learning_rate": 1.5625e-06, "loss": 0.0074, "step": 6932 }, { "epoch": 23.004222972972972, "grad_norm": 0.12073053419589996, "learning_rate": 1.5625e-06, "loss": 0.0007, "step": 6933 }, { "epoch": 23.004256756756757, "grad_norm": 0.0013070101849734783, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6934 }, { "epoch": 23.00429054054054, "grad_norm": 0.006579746957868338, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6935 }, { "epoch": 23.004324324324326, "grad_norm": 0.004271345213055611, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6936 }, { "epoch": 23.004358108108107, "grad_norm": 0.2045755833387375, "learning_rate": 1.5625e-06, "loss": 0.004, "step": 6937 }, { "epoch": 23.004391891891892, "grad_norm": 0.0024433957878500223, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6938 }, { "epoch": 23.004425675675677, "grad_norm": 0.0012547809164971113, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6939 }, { "epoch": 23.004459459459458, "grad_norm": 3.3616535663604736, "learning_rate": 1.5625e-06, "loss": 0.4128, "step": 6940 }, { "epoch": 23.004493243243243, "grad_norm": 0.14721275866031647, "learning_rate": 1.5625e-06, "loss": 0.0054, "step": 6941 }, { "epoch": 23.004527027027027, "grad_norm": 0.00411643460392952, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6942 }, { "epoch": 23.004560810810812, "grad_norm": 8.128806114196777, "learning_rate": 1.5625e-06, "loss": 0.036, "step": 6943 }, { "epoch": 23.004594594594593, "grad_norm": 0.0010965659748762846, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6944 }, { "epoch": 23.004628378378378, "grad_norm": 0.002089529996737838, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6945 }, { "epoch": 23.004662162162163, "grad_norm": 0.15263736248016357, "learning_rate": 1.5625e-06, "loss": 0.0047, "step": 6946 }, { "epoch": 23.004695945945947, "grad_norm": 0.004756531212478876, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6947 }, { "epoch": 23.00472972972973, "grad_norm": 4.416715621948242, "learning_rate": 1.5625e-06, "loss": 0.0151, "step": 6948 }, { "epoch": 23.004763513513513, "grad_norm": 0.0013703249860554934, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6949 }, { "epoch": 23.004797297297298, "grad_norm": 3.29779052734375, "learning_rate": 1.5625e-06, "loss": 0.4228, "step": 6950 }, { "epoch": 23.004831081081083, "grad_norm": 0.0021192089188843966, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6951 }, { "epoch": 23.004864864864864, "grad_norm": 0.0020276247523725033, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6952 }, { "epoch": 23.00489864864865, "grad_norm": 0.025003889575600624, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 6953 }, { "epoch": 23.004932432432433, "grad_norm": 0.0021952991373836994, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6954 }, { "epoch": 23.004966216216218, "grad_norm": 0.0024147224612534046, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6955 }, { "epoch": 23.005, "grad_norm": 1.3533114194869995, "learning_rate": 1.5625e-06, "loss": 0.0196, "step": 6956 }, { "epoch": 23.005033783783784, "grad_norm": 0.0015966102946549654, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6957 }, { "epoch": 23.00506756756757, "grad_norm": 16.3398380279541, "learning_rate": 1.5625e-06, "loss": 0.4026, "step": 6958 }, { "epoch": 23.00510135135135, "grad_norm": 0.0016888098325580359, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6959 }, { "epoch": 23.005135135135134, "grad_norm": 0.00991672370582819, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6960 }, { "epoch": 23.00516891891892, "grad_norm": 0.001697385567240417, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6961 }, { "epoch": 23.005202702702704, "grad_norm": 0.0013535174075514078, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6962 }, { "epoch": 23.005236486486485, "grad_norm": 0.001698061591014266, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6963 }, { "epoch": 23.00527027027027, "grad_norm": 0.042078856378793716, "learning_rate": 1.5625e-06, "loss": 0.0012, "step": 6964 }, { "epoch": 23.005304054054054, "grad_norm": 0.24164621531963348, "learning_rate": 1.5625e-06, "loss": 0.0081, "step": 6965 }, { "epoch": 23.00533783783784, "grad_norm": 0.553150475025177, "learning_rate": 1.5625e-06, "loss": 0.0045, "step": 6966 }, { "epoch": 23.00537162162162, "grad_norm": 0.014947395771741867, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 6967 }, { "epoch": 23.005405405405405, "grad_norm": 0.0008558413246646523, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6968 }, { "epoch": 23.00543918918919, "grad_norm": 0.008360154926776886, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6969 }, { "epoch": 23.005472972972974, "grad_norm": 21.593013763427734, "learning_rate": 1.5625e-06, "loss": 0.0423, "step": 6970 }, { "epoch": 23.005506756756755, "grad_norm": 0.08541026711463928, "learning_rate": 1.5625e-06, "loss": 0.0008, "step": 6971 }, { "epoch": 23.00554054054054, "grad_norm": 0.005139067303389311, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6972 }, { "epoch": 23.005574324324325, "grad_norm": 0.010376408696174622, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 6973 }, { "epoch": 23.00560810810811, "grad_norm": 0.021597979590296745, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 6974 }, { "epoch": 23.00564189189189, "grad_norm": 0.020217876881361008, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 6975 }, { "epoch": 23.005675675675676, "grad_norm": 0.003650561673566699, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6976 }, { "epoch": 23.00570945945946, "grad_norm": 1.1581368446350098, "learning_rate": 1.5625e-06, "loss": 0.0042, "step": 6977 }, { "epoch": 23.005743243243245, "grad_norm": 0.0028060285840183496, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6978 }, { "epoch": 23.005777027027026, "grad_norm": 0.0027377717196941376, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6979 }, { "epoch": 23.00581081081081, "grad_norm": 40.6740608215332, "learning_rate": 1.5625e-06, "loss": 0.1956, "step": 6980 }, { "epoch": 23.005844594594596, "grad_norm": 0.0019351267255842686, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6981 }, { "epoch": 23.005878378378377, "grad_norm": 0.0914049744606018, "learning_rate": 1.5625e-06, "loss": 0.001, "step": 6982 }, { "epoch": 23.00591216216216, "grad_norm": 55.853668212890625, "learning_rate": 1.5625e-06, "loss": 0.6445, "step": 6983 }, { "epoch": 23.005945945945946, "grad_norm": 0.0018938082503154874, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6984 }, { "epoch": 23.00597972972973, "grad_norm": 0.018961617723107338, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 6985 }, { "epoch": 23.006013513513512, "grad_norm": 0.003079907037317753, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6986 }, { "epoch": 23.006047297297297, "grad_norm": 0.0022357264533638954, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6987 }, { "epoch": 23.00608108108108, "grad_norm": 0.04556691646575928, "learning_rate": 1.5625e-06, "loss": 0.0006, "step": 6988 }, { "epoch": 23.006114864864866, "grad_norm": 0.0012359886895865202, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6989 }, { "epoch": 23.006148648648647, "grad_norm": 0.04783426970243454, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 6990 }, { "epoch": 23.006182432432432, "grad_norm": 0.0073468186892569065, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 6991 }, { "epoch": 23.006216216216217, "grad_norm": 0.012992601841688156, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 6992 }, { "epoch": 23.00625, "grad_norm": 0.0034962163772433996, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6993 }, { "epoch": 23.006283783783783, "grad_norm": 0.002508495468646288, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6994 }, { "epoch": 23.006317567567567, "grad_norm": 0.0039174905978143215, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6995 }, { "epoch": 23.006351351351352, "grad_norm": 0.22541195154190063, "learning_rate": 1.5625e-06, "loss": 0.0034, "step": 6996 }, { "epoch": 23.006385135135137, "grad_norm": 0.0008187665953300893, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 6997 }, { "epoch": 23.006418918918918, "grad_norm": 62.14952087402344, "learning_rate": 1.5625e-06, "loss": 0.4605, "step": 6998 }, { "epoch": 23.006452702702703, "grad_norm": 0.0014082215493544936, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 6999 }, { "epoch": 23.006486486486487, "grad_norm": 0.0019717297982424498, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7000 }, { "epoch": 23.006520270270272, "grad_norm": 0.16064541041851044, "learning_rate": 1.5625e-06, "loss": 0.0063, "step": 7001 }, { "epoch": 23.006554054054053, "grad_norm": 0.03201502189040184, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7002 }, { "epoch": 23.006587837837838, "grad_norm": 0.11158768832683563, "learning_rate": 1.5625e-06, "loss": 0.0042, "step": 7003 }, { "epoch": 23.006621621621623, "grad_norm": 0.03369148448109627, "learning_rate": 1.5625e-06, "loss": 0.001, "step": 7004 }, { "epoch": 23.006655405405404, "grad_norm": 0.2028086632490158, "learning_rate": 1.5625e-06, "loss": 0.0012, "step": 7005 }, { "epoch": 23.00668918918919, "grad_norm": 0.004051276482641697, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7006 }, { "epoch": 23.006722972972973, "grad_norm": 0.0035721042659133673, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7007 }, { "epoch": 23.006756756756758, "grad_norm": 10.294581413269043, "learning_rate": 1.5625e-06, "loss": 0.1664, "step": 7008 }, { "epoch": 23.00679054054054, "grad_norm": 0.0012072218814864755, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7009 }, { "epoch": 23.006824324324324, "grad_norm": 0.006237087305635214, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7010 }, { "epoch": 23.00685810810811, "grad_norm": 0.1871240884065628, "learning_rate": 1.5625e-06, "loss": 0.001, "step": 7011 }, { "epoch": 23.006891891891893, "grad_norm": 2.6968679428100586, "learning_rate": 1.5625e-06, "loss": 0.0265, "step": 7012 }, { "epoch": 23.006925675675674, "grad_norm": 5.228980541229248, "learning_rate": 1.5625e-06, "loss": 0.0268, "step": 7013 }, { "epoch": 23.00695945945946, "grad_norm": 6.64918327331543, "learning_rate": 1.5625e-06, "loss": 0.0891, "step": 7014 }, { "epoch": 23.006993243243244, "grad_norm": 0.0026495177298784256, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7015 }, { "epoch": 23.00702702702703, "grad_norm": 0.4306831359863281, "learning_rate": 1.5625e-06, "loss": 0.0149, "step": 7016 }, { "epoch": 23.00706081081081, "grad_norm": 5.693422317504883, "learning_rate": 1.5625e-06, "loss": 0.0979, "step": 7017 }, { "epoch": 23.007094594594594, "grad_norm": 0.0016675045480951667, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7018 }, { "epoch": 23.00712837837838, "grad_norm": 0.009641359560191631, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7019 }, { "epoch": 23.007162162162164, "grad_norm": 0.05297214165329933, "learning_rate": 1.5625e-06, "loss": 0.001, "step": 7020 }, { "epoch": 23.007195945945945, "grad_norm": 0.0019644147250801325, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7021 }, { "epoch": 23.00722972972973, "grad_norm": 0.05631363391876221, "learning_rate": 1.5625e-06, "loss": 0.0008, "step": 7022 }, { "epoch": 23.007263513513514, "grad_norm": 0.011951176449656487, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7023 }, { "epoch": 23.007297297297296, "grad_norm": 1.0542577505111694, "learning_rate": 1.5625e-06, "loss": 0.0044, "step": 7024 }, { "epoch": 23.00733108108108, "grad_norm": 4.445441722869873, "learning_rate": 1.5625e-06, "loss": 0.5701, "step": 7025 }, { "epoch": 23.007364864864865, "grad_norm": 0.0022325932513922453, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7026 }, { "epoch": 23.00739864864865, "grad_norm": 0.0026734801940619946, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7027 }, { "epoch": 23.00743243243243, "grad_norm": 0.014268947765231133, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7028 }, { "epoch": 23.007466216216216, "grad_norm": 0.003041262971237302, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7029 }, { "epoch": 23.0075, "grad_norm": 0.01335704606026411, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7030 }, { "epoch": 23.007533783783785, "grad_norm": 0.001748191541992128, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7031 }, { "epoch": 23.007567567567566, "grad_norm": 0.2779366075992584, "learning_rate": 1.5625e-06, "loss": 0.0018, "step": 7032 }, { "epoch": 23.00760135135135, "grad_norm": 0.0010251792846247554, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7033 }, { "epoch": 23.007635135135136, "grad_norm": 0.06744075566530228, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7034 }, { "epoch": 23.00766891891892, "grad_norm": 0.0011126550380140543, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7035 }, { "epoch": 23.0077027027027, "grad_norm": 0.0031582226511090994, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7036 }, { "epoch": 23.007736486486486, "grad_norm": 0.006304494570940733, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7037 }, { "epoch": 23.00777027027027, "grad_norm": 0.012357906438410282, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7038 }, { "epoch": 23.007804054054056, "grad_norm": 0.0015849124174565077, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7039 }, { "epoch": 23.007837837837837, "grad_norm": 0.00454511446878314, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7040 }, { "epoch": 23.00787162162162, "grad_norm": 0.5146388411521912, "learning_rate": 1.5625e-06, "loss": 0.004, "step": 7041 }, { "epoch": 23.007905405405406, "grad_norm": 0.0025177549105137587, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7042 }, { "epoch": 23.00793918918919, "grad_norm": 0.00470710638910532, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7043 }, { "epoch": 23.007972972972972, "grad_norm": 45.497520446777344, "learning_rate": 1.5625e-06, "loss": 0.265, "step": 7044 }, { "epoch": 23.008006756756757, "grad_norm": 2.5747828483581543, "learning_rate": 1.5625e-06, "loss": 0.0795, "step": 7045 }, { "epoch": 23.00804054054054, "grad_norm": 0.014161363244056702, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7046 }, { "epoch": 23.008074324324323, "grad_norm": 0.003240772522985935, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7047 }, { "epoch": 23.008108108108107, "grad_norm": 0.001764332759194076, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7048 }, { "epoch": 23.008141891891892, "grad_norm": 0.1379251778125763, "learning_rate": 1.5625e-06, "loss": 0.0052, "step": 7049 }, { "epoch": 23.008175675675677, "grad_norm": 15.497603416442871, "learning_rate": 1.5625e-06, "loss": 0.4999, "step": 7050 }, { "epoch": 23.008209459459458, "grad_norm": 5.442285060882568, "learning_rate": 1.5625e-06, "loss": 0.0948, "step": 7051 }, { "epoch": 23.008243243243243, "grad_norm": 0.004009875934571028, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7052 }, { "epoch": 23.008277027027027, "grad_norm": 0.005261266138404608, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7053 }, { "epoch": 23.008310810810812, "grad_norm": 0.14155343174934387, "learning_rate": 1.5625e-06, "loss": 0.0049, "step": 7054 }, { "epoch": 23.008344594594593, "grad_norm": 0.0007942457450553775, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7055 }, { "epoch": 23.008378378378378, "grad_norm": 0.18221965432167053, "learning_rate": 1.5625e-06, "loss": 0.0031, "step": 7056 }, { "epoch": 23.008412162162163, "grad_norm": 0.008743619546294212, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7057 }, { "epoch": 23.008445945945947, "grad_norm": 0.09257908910512924, "learning_rate": 1.5625e-06, "loss": 0.0008, "step": 7058 }, { "epoch": 23.00847972972973, "grad_norm": 0.0032567332964390516, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7059 }, { "epoch": 23.008513513513513, "grad_norm": 0.03699342533946037, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7060 }, { "epoch": 23.008547297297298, "grad_norm": 0.00532534858211875, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7061 }, { "epoch": 23.008581081081083, "grad_norm": 0.006395811215043068, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7062 }, { "epoch": 23.008614864864864, "grad_norm": 0.001624660100787878, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7063 }, { "epoch": 23.00864864864865, "grad_norm": 7.268538475036621, "learning_rate": 1.5625e-06, "loss": 0.4911, "step": 7064 }, { "epoch": 23.008682432432433, "grad_norm": 31.683917999267578, "learning_rate": 1.5625e-06, "loss": 0.0705, "step": 7065 }, { "epoch": 23.008716216216218, "grad_norm": 0.0011289094109088182, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7066 }, { "epoch": 23.00875, "grad_norm": 0.004682456608861685, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7067 }, { "epoch": 23.008783783783784, "grad_norm": 0.0018554310081526637, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7068 }, { "epoch": 23.00881756756757, "grad_norm": 0.0020869793370366096, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7069 }, { "epoch": 23.00885135135135, "grad_norm": 0.0012215896276757121, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7070 }, { "epoch": 23.008885135135134, "grad_norm": 0.000977691262960434, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7071 }, { "epoch": 23.00891891891892, "grad_norm": 0.006043936125934124, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7072 }, { "epoch": 23.008952702702704, "grad_norm": 0.0169516634196043, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7073 }, { "epoch": 23.008986486486485, "grad_norm": 0.0010098471539095044, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7074 }, { "epoch": 23.00902027027027, "grad_norm": 2.9658658504486084, "learning_rate": 1.5625e-06, "loss": 0.0119, "step": 7075 }, { "epoch": 23.009054054054054, "grad_norm": 0.003923389129340649, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7076 }, { "epoch": 23.00908783783784, "grad_norm": 0.4316440224647522, "learning_rate": 1.5625e-06, "loss": 0.0036, "step": 7077 }, { "epoch": 23.00912162162162, "grad_norm": 4.237118721008301, "learning_rate": 1.5625e-06, "loss": 0.0224, "step": 7078 }, { "epoch": 23.009155405405405, "grad_norm": 0.0586266964673996, "learning_rate": 1.5625e-06, "loss": 0.0005, "step": 7079 }, { "epoch": 23.00918918918919, "grad_norm": 0.009393805637955666, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7080 }, { "epoch": 23.009222972972974, "grad_norm": 0.006700186058878899, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7081 }, { "epoch": 23.009256756756756, "grad_norm": 0.0057844785042107105, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7082 }, { "epoch": 23.00929054054054, "grad_norm": 0.008685748092830181, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7083 }, { "epoch": 23.009324324324325, "grad_norm": 0.042088642716407776, "learning_rate": 1.5625e-06, "loss": 0.0009, "step": 7084 }, { "epoch": 23.00935810810811, "grad_norm": 0.0008904978167265654, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7085 }, { "epoch": 23.00939189189189, "grad_norm": 0.007066864054650068, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7086 }, { "epoch": 23.009425675675676, "grad_norm": 0.1010899543762207, "learning_rate": 1.5625e-06, "loss": 0.0005, "step": 7087 }, { "epoch": 23.00945945945946, "grad_norm": 0.01624353975057602, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7088 }, { "epoch": 23.00949324324324, "grad_norm": 0.0026262400206178427, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7089 }, { "epoch": 23.009527027027026, "grad_norm": 0.0019434518180787563, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7090 }, { "epoch": 23.00956081081081, "grad_norm": 0.020305758342146873, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7091 }, { "epoch": 23.009594594594596, "grad_norm": 0.11431209743022919, "learning_rate": 1.5625e-06, "loss": 0.0043, "step": 7092 }, { "epoch": 23.009628378378377, "grad_norm": 0.000875229190569371, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7093 }, { "epoch": 23.00966216216216, "grad_norm": 0.004272000398486853, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7094 }, { "epoch": 23.009695945945946, "grad_norm": 0.0048661124892532825, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7095 }, { "epoch": 23.00972972972973, "grad_norm": 0.02828940562903881, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7096 }, { "epoch": 23.009763513513512, "grad_norm": 0.0009018330601975322, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7097 }, { "epoch": 23.009797297297297, "grad_norm": 0.0011619603028520942, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7098 }, { "epoch": 23.00983108108108, "grad_norm": 0.008884918875992298, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7099 }, { "epoch": 23.009864864864866, "grad_norm": 2.67928409576416, "learning_rate": 1.5625e-06, "loss": 0.0132, "step": 7100 }, { "epoch": 23.009898648648647, "grad_norm": 0.014593944884836674, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7101 }, { "epoch": 23.009932432432432, "grad_norm": 4.382077693939209, "learning_rate": 1.5625e-06, "loss": 0.2742, "step": 7102 }, { "epoch": 23.009966216216217, "grad_norm": 0.1273558884859085, "learning_rate": 1.5625e-06, "loss": 0.0048, "step": 7103 }, { "epoch": 23.01, "grad_norm": 0.0009344624122604728, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7104 }, { "epoch": 23.01, "eval_accuracy": 0.8982229402261712, "eval_loss": 0.5719950795173645, "eval_runtime": 32.8013, "eval_samples_per_second": 18.871, "eval_steps_per_second": 2.378, "step": 7104 }, { "epoch": 24.000033783783785, "grad_norm": 1.2919723987579346, "learning_rate": 1.5625e-06, "loss": 0.0166, "step": 7105 }, { "epoch": 24.000067567567566, "grad_norm": 0.7080085277557373, "learning_rate": 1.5625e-06, "loss": 0.0021, "step": 7106 }, { "epoch": 24.00010135135135, "grad_norm": 0.39471518993377686, "learning_rate": 1.5625e-06, "loss": 0.0021, "step": 7107 }, { "epoch": 24.000135135135135, "grad_norm": 0.02148064784705639, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7108 }, { "epoch": 24.00016891891892, "grad_norm": 0.022758791223168373, "learning_rate": 1.5625e-06, "loss": 0.0005, "step": 7109 }, { "epoch": 24.0002027027027, "grad_norm": 0.07225494086742401, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7110 }, { "epoch": 24.000236486486486, "grad_norm": 10.317473411560059, "learning_rate": 1.5625e-06, "loss": 0.2172, "step": 7111 }, { "epoch": 24.00027027027027, "grad_norm": 0.17956507205963135, "learning_rate": 1.5625e-06, "loss": 0.0059, "step": 7112 }, { "epoch": 24.000304054054055, "grad_norm": 0.001159237464889884, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7113 }, { "epoch": 24.000337837837836, "grad_norm": 0.0034452308900654316, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7114 }, { "epoch": 24.00037162162162, "grad_norm": 20.499300003051758, "learning_rate": 1.5625e-06, "loss": 0.0537, "step": 7115 }, { "epoch": 24.000405405405406, "grad_norm": 0.006172406952828169, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7116 }, { "epoch": 24.00043918918919, "grad_norm": 20.725313186645508, "learning_rate": 1.5625e-06, "loss": 1.1578, "step": 7117 }, { "epoch": 24.00047297297297, "grad_norm": 0.18923792243003845, "learning_rate": 1.5625e-06, "loss": 0.0006, "step": 7118 }, { "epoch": 24.000506756756756, "grad_norm": 0.007250877562910318, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7119 }, { "epoch": 24.00054054054054, "grad_norm": 0.0028433019760996103, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7120 }, { "epoch": 24.000574324324326, "grad_norm": 0.0029158638790249825, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7121 }, { "epoch": 24.000608108108107, "grad_norm": 0.0012588303070515394, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7122 }, { "epoch": 24.000641891891892, "grad_norm": 0.09886391460895538, "learning_rate": 1.5625e-06, "loss": 0.0023, "step": 7123 }, { "epoch": 24.000675675675677, "grad_norm": 0.0023482630494982004, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7124 }, { "epoch": 24.00070945945946, "grad_norm": 0.003622679738327861, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7125 }, { "epoch": 24.000743243243242, "grad_norm": 0.10831271857023239, "learning_rate": 1.5625e-06, "loss": 0.0014, "step": 7126 }, { "epoch": 24.000777027027027, "grad_norm": 0.025807393714785576, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7127 }, { "epoch": 24.000810810810812, "grad_norm": 0.009641862474381924, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7128 }, { "epoch": 24.000844594594593, "grad_norm": 0.018616437911987305, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7129 }, { "epoch": 24.000878378378378, "grad_norm": 0.10603579878807068, "learning_rate": 1.5625e-06, "loss": 0.004, "step": 7130 }, { "epoch": 24.000912162162162, "grad_norm": 0.007885312661528587, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7131 }, { "epoch": 24.000945945945947, "grad_norm": 4.988998889923096, "learning_rate": 1.5625e-06, "loss": 0.0082, "step": 7132 }, { "epoch": 24.00097972972973, "grad_norm": 0.000945146894082427, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7133 }, { "epoch": 24.001013513513513, "grad_norm": 0.002371353330090642, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7134 }, { "epoch": 24.001047297297298, "grad_norm": 0.40373098850250244, "learning_rate": 1.5625e-06, "loss": 0.0105, "step": 7135 }, { "epoch": 24.001081081081082, "grad_norm": 0.0041757505387067795, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7136 }, { "epoch": 24.001114864864864, "grad_norm": 0.7093147039413452, "learning_rate": 1.5625e-06, "loss": 0.0161, "step": 7137 }, { "epoch": 24.00114864864865, "grad_norm": 0.003593351924791932, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7138 }, { "epoch": 24.001182432432433, "grad_norm": 0.1315435767173767, "learning_rate": 1.5625e-06, "loss": 0.0006, "step": 7139 }, { "epoch": 24.001216216216218, "grad_norm": 0.10449022054672241, "learning_rate": 1.5625e-06, "loss": 0.0038, "step": 7140 }, { "epoch": 24.00125, "grad_norm": 0.0018575044814497232, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7141 }, { "epoch": 24.001283783783784, "grad_norm": 0.005217425059527159, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7142 }, { "epoch": 24.00131756756757, "grad_norm": 2.1816391944885254, "learning_rate": 1.5625e-06, "loss": 0.0164, "step": 7143 }, { "epoch": 24.001351351351353, "grad_norm": 0.02312873676419258, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7144 }, { "epoch": 24.001385135135134, "grad_norm": 0.0041829370893538, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7145 }, { "epoch": 24.00141891891892, "grad_norm": 0.00502891605719924, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7146 }, { "epoch": 24.001452702702704, "grad_norm": 0.0019646615255624056, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7147 }, { "epoch": 24.001486486486485, "grad_norm": 6.062179088592529, "learning_rate": 1.5625e-06, "loss": 0.2359, "step": 7148 }, { "epoch": 24.00152027027027, "grad_norm": 2.5668413639068604, "learning_rate": 1.5625e-06, "loss": 0.0169, "step": 7149 }, { "epoch": 24.001554054054054, "grad_norm": 0.002718643518164754, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7150 }, { "epoch": 24.00158783783784, "grad_norm": 0.5494487881660461, "learning_rate": 1.5625e-06, "loss": 0.0014, "step": 7151 }, { "epoch": 24.00162162162162, "grad_norm": 0.0015466029290109873, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7152 }, { "epoch": 24.001655405405405, "grad_norm": 0.004224523901939392, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7153 }, { "epoch": 24.00168918918919, "grad_norm": 0.00328079704195261, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7154 }, { "epoch": 24.001722972972974, "grad_norm": 0.0020783296786248684, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7155 }, { "epoch": 24.001756756756755, "grad_norm": 0.003908942453563213, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7156 }, { "epoch": 24.00179054054054, "grad_norm": 0.11681052297353745, "learning_rate": 1.5625e-06, "loss": 0.0044, "step": 7157 }, { "epoch": 24.001824324324325, "grad_norm": 0.12669460475444794, "learning_rate": 1.5625e-06, "loss": 0.0028, "step": 7158 }, { "epoch": 24.00185810810811, "grad_norm": 0.10083774477243423, "learning_rate": 1.5625e-06, "loss": 0.0038, "step": 7159 }, { "epoch": 24.00189189189189, "grad_norm": 0.0018291755113750696, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7160 }, { "epoch": 24.001925675675675, "grad_norm": 0.08242703229188919, "learning_rate": 1.5625e-06, "loss": 0.0008, "step": 7161 }, { "epoch": 24.00195945945946, "grad_norm": 30.216859817504883, "learning_rate": 1.5625e-06, "loss": 0.4048, "step": 7162 }, { "epoch": 24.001993243243245, "grad_norm": 0.005849782377481461, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7163 }, { "epoch": 24.002027027027026, "grad_norm": 0.012906510382890701, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7164 }, { "epoch": 24.00206081081081, "grad_norm": 5.2569756507873535, "learning_rate": 1.5625e-06, "loss": 0.2217, "step": 7165 }, { "epoch": 24.002094594594595, "grad_norm": 0.16149747371673584, "learning_rate": 1.5625e-06, "loss": 0.0009, "step": 7166 }, { "epoch": 24.00212837837838, "grad_norm": 0.0031751578208059072, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7167 }, { "epoch": 24.00216216216216, "grad_norm": 13.22815227508545, "learning_rate": 1.5625e-06, "loss": 0.0949, "step": 7168 }, { "epoch": 24.002195945945946, "grad_norm": 0.0023640671279281378, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7169 }, { "epoch": 24.00222972972973, "grad_norm": 0.05455688387155533, "learning_rate": 1.5625e-06, "loss": 0.0005, "step": 7170 }, { "epoch": 24.002263513513512, "grad_norm": 0.003712030127644539, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7171 }, { "epoch": 24.002297297297297, "grad_norm": 0.8548080921173096, "learning_rate": 1.5625e-06, "loss": 0.0188, "step": 7172 }, { "epoch": 24.00233108108108, "grad_norm": 0.03760972246527672, "learning_rate": 1.5625e-06, "loss": 0.0006, "step": 7173 }, { "epoch": 24.002364864864866, "grad_norm": 0.0014287186786532402, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7174 }, { "epoch": 24.002398648648647, "grad_norm": 0.004626743029803038, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7175 }, { "epoch": 24.002432432432432, "grad_norm": 0.0041860248893499374, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7176 }, { "epoch": 24.002466216216217, "grad_norm": 69.95449829101562, "learning_rate": 1.5625e-06, "loss": 0.3173, "step": 7177 }, { "epoch": 24.0025, "grad_norm": 0.0021667256951332092, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7178 }, { "epoch": 24.002533783783782, "grad_norm": 4.46220588684082, "learning_rate": 1.5625e-06, "loss": 0.0109, "step": 7179 }, { "epoch": 24.002567567567567, "grad_norm": 1.2851853370666504, "learning_rate": 1.5625e-06, "loss": 0.0275, "step": 7180 }, { "epoch": 24.002601351351352, "grad_norm": 3.2644639015197754, "learning_rate": 1.5625e-06, "loss": 0.1083, "step": 7181 }, { "epoch": 24.002635135135137, "grad_norm": 0.0014504914870485663, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7182 }, { "epoch": 24.002668918918918, "grad_norm": 0.03952248767018318, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7183 }, { "epoch": 24.002702702702702, "grad_norm": 0.2859582304954529, "learning_rate": 1.5625e-06, "loss": 0.0013, "step": 7184 }, { "epoch": 24.002736486486487, "grad_norm": 0.002588104223832488, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7185 }, { "epoch": 24.002770270270272, "grad_norm": 0.004440180025994778, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7186 }, { "epoch": 24.002804054054053, "grad_norm": 0.0035730975214391947, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7187 }, { "epoch": 24.002837837837838, "grad_norm": 0.005426137242466211, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7188 }, { "epoch": 24.002871621621622, "grad_norm": 0.0011307100066915154, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7189 }, { "epoch": 24.002905405405407, "grad_norm": 0.22785931825637817, "learning_rate": 1.5625e-06, "loss": 0.0087, "step": 7190 }, { "epoch": 24.00293918918919, "grad_norm": 0.003991888370364904, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7191 }, { "epoch": 24.002972972972973, "grad_norm": 0.4550575315952301, "learning_rate": 1.5625e-06, "loss": 0.0114, "step": 7192 }, { "epoch": 24.003006756756758, "grad_norm": 0.005397496744990349, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7193 }, { "epoch": 24.00304054054054, "grad_norm": 0.009887776337563992, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7194 }, { "epoch": 24.003074324324324, "grad_norm": 0.024468325078487396, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7195 }, { "epoch": 24.00310810810811, "grad_norm": 0.03273610770702362, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7196 }, { "epoch": 24.003141891891893, "grad_norm": 0.012750121764838696, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7197 }, { "epoch": 24.003175675675674, "grad_norm": 0.0017883406253531575, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7198 }, { "epoch": 24.00320945945946, "grad_norm": 0.012203062884509563, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7199 }, { "epoch": 24.003243243243244, "grad_norm": 0.012592067942023277, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7200 }, { "epoch": 24.00327702702703, "grad_norm": 0.0031032690312713385, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7201 }, { "epoch": 24.00331081081081, "grad_norm": 0.0017660305602476, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7202 }, { "epoch": 24.003344594594594, "grad_norm": 0.054788146167993546, "learning_rate": 1.5625e-06, "loss": 0.0008, "step": 7203 }, { "epoch": 24.00337837837838, "grad_norm": 0.0010276291286572814, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7204 }, { "epoch": 24.003412162162164, "grad_norm": 0.11519038677215576, "learning_rate": 1.5625e-06, "loss": 0.0034, "step": 7205 }, { "epoch": 24.003445945945945, "grad_norm": 0.7324994802474976, "learning_rate": 1.5625e-06, "loss": 0.0074, "step": 7206 }, { "epoch": 24.00347972972973, "grad_norm": 0.18676674365997314, "learning_rate": 1.5625e-06, "loss": 0.0039, "step": 7207 }, { "epoch": 24.003513513513514, "grad_norm": 0.08954635262489319, "learning_rate": 1.5625e-06, "loss": 0.0033, "step": 7208 }, { "epoch": 24.0035472972973, "grad_norm": 3.5424342155456543, "learning_rate": 1.5625e-06, "loss": 0.4058, "step": 7209 }, { "epoch": 24.00358108108108, "grad_norm": 1.0330145359039307, "learning_rate": 1.5625e-06, "loss": 0.0016, "step": 7210 }, { "epoch": 24.003614864864865, "grad_norm": 0.0006701017846353352, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7211 }, { "epoch": 24.00364864864865, "grad_norm": 0.11170396953821182, "learning_rate": 1.5625e-06, "loss": 0.0015, "step": 7212 }, { "epoch": 24.00368243243243, "grad_norm": 0.001171104609966278, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7213 }, { "epoch": 24.003716216216215, "grad_norm": 0.005741463042795658, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7214 }, { "epoch": 24.00375, "grad_norm": 6.730586528778076, "learning_rate": 1.5625e-06, "loss": 0.0192, "step": 7215 }, { "epoch": 24.003783783783785, "grad_norm": 0.08044105023145676, "learning_rate": 1.5625e-06, "loss": 0.0019, "step": 7216 }, { "epoch": 24.003817567567566, "grad_norm": 0.0024740947410464287, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7217 }, { "epoch": 24.00385135135135, "grad_norm": 0.009525307454168797, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7218 }, { "epoch": 24.003885135135135, "grad_norm": 80.45225524902344, "learning_rate": 1.5625e-06, "loss": 0.4706, "step": 7219 }, { "epoch": 24.00391891891892, "grad_norm": 0.001740464591421187, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7220 }, { "epoch": 24.0039527027027, "grad_norm": 0.06201224774122238, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7221 }, { "epoch": 24.003986486486486, "grad_norm": 3.316606044769287, "learning_rate": 1.5625e-06, "loss": 0.4372, "step": 7222 }, { "epoch": 24.00402027027027, "grad_norm": 0.0035576592199504375, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7223 }, { "epoch": 24.004054054054055, "grad_norm": 4.552047252655029, "learning_rate": 1.5625e-06, "loss": 0.4468, "step": 7224 }, { "epoch": 24.004087837837837, "grad_norm": 0.47018519043922424, "learning_rate": 1.5625e-06, "loss": 0.0013, "step": 7225 }, { "epoch": 24.00412162162162, "grad_norm": 0.1325910985469818, "learning_rate": 1.5625e-06, "loss": 0.0048, "step": 7226 }, { "epoch": 24.004155405405406, "grad_norm": 1.2455428838729858, "learning_rate": 1.5625e-06, "loss": 0.013, "step": 7227 }, { "epoch": 24.00418918918919, "grad_norm": 0.010463004000484943, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7228 }, { "epoch": 24.004222972972972, "grad_norm": 17.800947189331055, "learning_rate": 1.5625e-06, "loss": 0.0292, "step": 7229 }, { "epoch": 24.004256756756757, "grad_norm": 0.10489440709352493, "learning_rate": 1.5625e-06, "loss": 0.0017, "step": 7230 }, { "epoch": 24.00429054054054, "grad_norm": 0.0021202589850872755, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7231 }, { "epoch": 24.004324324324326, "grad_norm": 0.002949140267446637, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7232 }, { "epoch": 24.004358108108107, "grad_norm": 0.0044381264597177505, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7233 }, { "epoch": 24.004391891891892, "grad_norm": 0.0062997471541166306, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7234 }, { "epoch": 24.004425675675677, "grad_norm": 0.0015168224927037954, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7235 }, { "epoch": 24.004459459459458, "grad_norm": 0.02262754738330841, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7236 }, { "epoch": 24.004493243243243, "grad_norm": 30.313331604003906, "learning_rate": 1.5625e-06, "loss": 0.074, "step": 7237 }, { "epoch": 24.004527027027027, "grad_norm": 3.3964080810546875, "learning_rate": 1.5625e-06, "loss": 0.3549, "step": 7238 }, { "epoch": 24.004560810810812, "grad_norm": 0.019019415602087975, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7239 }, { "epoch": 24.004594594594593, "grad_norm": 0.005610255058854818, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7240 }, { "epoch": 24.004628378378378, "grad_norm": 0.09027335792779922, "learning_rate": 1.5625e-06, "loss": 0.0028, "step": 7241 }, { "epoch": 24.004662162162163, "grad_norm": 0.0013428993988782167, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7242 }, { "epoch": 24.004695945945947, "grad_norm": 0.14783118665218353, "learning_rate": 1.5625e-06, "loss": 0.0053, "step": 7243 }, { "epoch": 24.00472972972973, "grad_norm": 0.0009488274226896465, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7244 }, { "epoch": 24.004763513513513, "grad_norm": 0.01680668629705906, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7245 }, { "epoch": 24.004797297297298, "grad_norm": 0.1464010328054428, "learning_rate": 1.5625e-06, "loss": 0.0007, "step": 7246 }, { "epoch": 24.004831081081083, "grad_norm": 0.002677204553037882, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7247 }, { "epoch": 24.004864864864864, "grad_norm": 0.005653777159750462, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7248 }, { "epoch": 24.00489864864865, "grad_norm": 0.175065815448761, "learning_rate": 1.5625e-06, "loss": 0.0036, "step": 7249 }, { "epoch": 24.004932432432433, "grad_norm": 2.075930118560791, "learning_rate": 1.5625e-06, "loss": 0.0045, "step": 7250 }, { "epoch": 24.004966216216218, "grad_norm": 0.06649671494960785, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7251 }, { "epoch": 24.005, "grad_norm": 0.0059421481564641, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7252 }, { "epoch": 24.005033783783784, "grad_norm": 0.8950714468955994, "learning_rate": 1.5625e-06, "loss": 0.0022, "step": 7253 }, { "epoch": 24.00506756756757, "grad_norm": 0.0011117985704913735, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7254 }, { "epoch": 24.00510135135135, "grad_norm": 0.0020210850052535534, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7255 }, { "epoch": 24.005135135135134, "grad_norm": 1.140472412109375, "learning_rate": 1.5625e-06, "loss": 0.0111, "step": 7256 }, { "epoch": 24.00516891891892, "grad_norm": 0.0030093034729361534, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7257 }, { "epoch": 24.005202702702704, "grad_norm": 27.646657943725586, "learning_rate": 1.5625e-06, "loss": 0.3164, "step": 7258 }, { "epoch": 24.005236486486485, "grad_norm": 0.002507592085748911, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7259 }, { "epoch": 24.00527027027027, "grad_norm": 0.010007179342210293, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7260 }, { "epoch": 24.005304054054054, "grad_norm": 0.014758720993995667, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7261 }, { "epoch": 24.00533783783784, "grad_norm": 80.52372741699219, "learning_rate": 1.5625e-06, "loss": 0.524, "step": 7262 }, { "epoch": 24.00537162162162, "grad_norm": 0.0017303548520430923, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7263 }, { "epoch": 24.005405405405405, "grad_norm": 0.011298527009785175, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7264 }, { "epoch": 24.00543918918919, "grad_norm": 0.016555791720747948, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7265 }, { "epoch": 24.005472972972974, "grad_norm": 0.001206211163662374, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7266 }, { "epoch": 24.005506756756755, "grad_norm": 0.0008775627356953919, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7267 }, { "epoch": 24.00554054054054, "grad_norm": 0.0020289469975978136, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7268 }, { "epoch": 24.005574324324325, "grad_norm": 0.14355604350566864, "learning_rate": 1.5625e-06, "loss": 0.0029, "step": 7269 }, { "epoch": 24.00560810810811, "grad_norm": 0.0007583594415336847, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7270 }, { "epoch": 24.00564189189189, "grad_norm": 0.18204745650291443, "learning_rate": 1.5625e-06, "loss": 0.0062, "step": 7271 }, { "epoch": 24.005675675675676, "grad_norm": 0.0037320104893296957, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7272 }, { "epoch": 24.00570945945946, "grad_norm": 0.2927471399307251, "learning_rate": 1.5625e-06, "loss": 0.0091, "step": 7273 }, { "epoch": 24.005743243243245, "grad_norm": 0.003267481457442045, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7274 }, { "epoch": 24.005777027027026, "grad_norm": 0.002228738972917199, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7275 }, { "epoch": 24.00581081081081, "grad_norm": 0.0013139878865331411, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7276 }, { "epoch": 24.005844594594596, "grad_norm": 0.006270355079323053, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7277 }, { "epoch": 24.005878378378377, "grad_norm": 0.01393047347664833, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7278 }, { "epoch": 24.00591216216216, "grad_norm": 0.016929589211940765, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7279 }, { "epoch": 24.005945945945946, "grad_norm": 0.006109065841883421, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7280 }, { "epoch": 24.00597972972973, "grad_norm": 0.0011190128279849887, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7281 }, { "epoch": 24.006013513513512, "grad_norm": 0.0007958413334563375, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7282 }, { "epoch": 24.006047297297297, "grad_norm": 0.0008064048597589135, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7283 }, { "epoch": 24.00608108108108, "grad_norm": 0.022547999396920204, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7284 }, { "epoch": 24.006114864864866, "grad_norm": 0.0028803120367228985, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7285 }, { "epoch": 24.006148648648647, "grad_norm": 0.002720326418057084, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7286 }, { "epoch": 24.006182432432432, "grad_norm": 0.11759955435991287, "learning_rate": 1.5625e-06, "loss": 0.0042, "step": 7287 }, { "epoch": 24.006216216216217, "grad_norm": 0.0019742727745324373, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7288 }, { "epoch": 24.00625, "grad_norm": 4.940985679626465, "learning_rate": 1.5625e-06, "loss": 0.0086, "step": 7289 }, { "epoch": 24.006283783783783, "grad_norm": 0.02597218006849289, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7290 }, { "epoch": 24.006317567567567, "grad_norm": 0.0058280122466385365, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7291 }, { "epoch": 24.006351351351352, "grad_norm": 0.001382192480377853, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7292 }, { "epoch": 24.006385135135137, "grad_norm": 0.0021155732683837414, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7293 }, { "epoch": 24.006418918918918, "grad_norm": 0.001508608809672296, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7294 }, { "epoch": 24.006452702702703, "grad_norm": 1.000815510749817, "learning_rate": 1.5625e-06, "loss": 0.0112, "step": 7295 }, { "epoch": 24.006486486486487, "grad_norm": 0.26071491837501526, "learning_rate": 1.5625e-06, "loss": 0.0073, "step": 7296 }, { "epoch": 24.006520270270272, "grad_norm": 0.15781953930854797, "learning_rate": 1.5625e-06, "loss": 0.0058, "step": 7297 }, { "epoch": 24.006554054054053, "grad_norm": 0.0008092352654784918, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7298 }, { "epoch": 24.006587837837838, "grad_norm": 4.155658721923828, "learning_rate": 1.5625e-06, "loss": 0.4062, "step": 7299 }, { "epoch": 24.006621621621623, "grad_norm": 0.013432430103421211, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7300 }, { "epoch": 24.006655405405404, "grad_norm": 0.000919026555493474, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7301 }, { "epoch": 24.00668918918919, "grad_norm": 0.0018529891967773438, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7302 }, { "epoch": 24.006722972972973, "grad_norm": 0.0062677012756466866, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7303 }, { "epoch": 24.006756756756758, "grad_norm": 0.0027351684402674437, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7304 }, { "epoch": 24.00679054054054, "grad_norm": 0.8382761478424072, "learning_rate": 1.5625e-06, "loss": 0.0105, "step": 7305 }, { "epoch": 24.006824324324324, "grad_norm": 0.055133793503046036, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7306 }, { "epoch": 24.00685810810811, "grad_norm": 37.55160140991211, "learning_rate": 1.5625e-06, "loss": 0.6934, "step": 7307 }, { "epoch": 24.006891891891893, "grad_norm": 0.03879997506737709, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7308 }, { "epoch": 24.006925675675674, "grad_norm": 0.05657343193888664, "learning_rate": 1.5625e-06, "loss": 0.0009, "step": 7309 }, { "epoch": 24.00695945945946, "grad_norm": 0.0041801766492426395, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7310 }, { "epoch": 24.006993243243244, "grad_norm": 0.0010622936533764005, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7311 }, { "epoch": 24.00702702702703, "grad_norm": 7.840466499328613, "learning_rate": 1.5625e-06, "loss": 0.9706, "step": 7312 }, { "epoch": 24.00706081081081, "grad_norm": 3.656078577041626, "learning_rate": 1.5625e-06, "loss": 0.102, "step": 7313 }, { "epoch": 24.007094594594594, "grad_norm": 0.1523713916540146, "learning_rate": 1.5625e-06, "loss": 0.0012, "step": 7314 }, { "epoch": 24.00712837837838, "grad_norm": 0.0009622747311368585, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7315 }, { "epoch": 24.007162162162164, "grad_norm": 0.21726873517036438, "learning_rate": 1.5625e-06, "loss": 0.0056, "step": 7316 }, { "epoch": 24.007195945945945, "grad_norm": 0.12715163826942444, "learning_rate": 1.5625e-06, "loss": 0.0047, "step": 7317 }, { "epoch": 24.00722972972973, "grad_norm": 4.646751403808594, "learning_rate": 1.5625e-06, "loss": 0.0496, "step": 7318 }, { "epoch": 24.007263513513514, "grad_norm": 0.7838619947433472, "learning_rate": 1.5625e-06, "loss": 0.0064, "step": 7319 }, { "epoch": 24.007297297297296, "grad_norm": 0.5551577806472778, "learning_rate": 1.5625e-06, "loss": 0.01, "step": 7320 }, { "epoch": 24.00733108108108, "grad_norm": 0.0910237580537796, "learning_rate": 1.5625e-06, "loss": 0.0005, "step": 7321 }, { "epoch": 24.007364864864865, "grad_norm": 0.000906089786440134, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7322 }, { "epoch": 24.00739864864865, "grad_norm": 0.004034427460283041, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7323 }, { "epoch": 24.00743243243243, "grad_norm": 0.11848705261945724, "learning_rate": 1.5625e-06, "loss": 0.0043, "step": 7324 }, { "epoch": 24.007466216216216, "grad_norm": 0.5899952054023743, "learning_rate": 1.5625e-06, "loss": 0.0056, "step": 7325 }, { "epoch": 24.0075, "grad_norm": 0.0010411370312795043, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7326 }, { "epoch": 24.007533783783785, "grad_norm": 0.0019117455231025815, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7327 }, { "epoch": 24.007567567567566, "grad_norm": 0.003768920199945569, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7328 }, { "epoch": 24.00760135135135, "grad_norm": 0.01911277510225773, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7329 }, { "epoch": 24.007635135135136, "grad_norm": 0.00844330433756113, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7330 }, { "epoch": 24.00766891891892, "grad_norm": 0.14720715582370758, "learning_rate": 1.5625e-06, "loss": 0.0039, "step": 7331 }, { "epoch": 24.0077027027027, "grad_norm": 0.001992444507777691, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7332 }, { "epoch": 24.007736486486486, "grad_norm": 0.10364358872175217, "learning_rate": 1.5625e-06, "loss": 0.0019, "step": 7333 }, { "epoch": 24.00777027027027, "grad_norm": 26.809621810913086, "learning_rate": 1.5625e-06, "loss": 0.1175, "step": 7334 }, { "epoch": 24.007804054054056, "grad_norm": 28.95667266845703, "learning_rate": 1.5625e-06, "loss": 0.056, "step": 7335 }, { "epoch": 24.007837837837837, "grad_norm": 0.32552942633628845, "learning_rate": 1.5625e-06, "loss": 0.0054, "step": 7336 }, { "epoch": 24.00787162162162, "grad_norm": 0.3521330952644348, "learning_rate": 1.5625e-06, "loss": 0.0013, "step": 7337 }, { "epoch": 24.007905405405406, "grad_norm": 0.004410546738654375, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7338 }, { "epoch": 24.00793918918919, "grad_norm": 22.71560287475586, "learning_rate": 1.5625e-06, "loss": 0.2019, "step": 7339 }, { "epoch": 24.007972972972972, "grad_norm": 13.992262840270996, "learning_rate": 1.5625e-06, "loss": 0.054, "step": 7340 }, { "epoch": 24.008006756756757, "grad_norm": 0.0030907606706023216, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7341 }, { "epoch": 24.00804054054054, "grad_norm": 0.39486801624298096, "learning_rate": 1.5625e-06, "loss": 0.0025, "step": 7342 }, { "epoch": 24.008074324324323, "grad_norm": 0.11977224797010422, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7343 }, { "epoch": 24.008108108108107, "grad_norm": 0.002985132159665227, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7344 }, { "epoch": 24.008141891891892, "grad_norm": 0.00131929328199476, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7345 }, { "epoch": 24.008175675675677, "grad_norm": 0.002701301360502839, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7346 }, { "epoch": 24.008209459459458, "grad_norm": 0.0021882946603000164, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7347 }, { "epoch": 24.008243243243243, "grad_norm": 0.0023511911276727915, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7348 }, { "epoch": 24.008277027027027, "grad_norm": 0.003243495477363467, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7349 }, { "epoch": 24.008310810810812, "grad_norm": 0.7017614841461182, "learning_rate": 1.5625e-06, "loss": 0.0053, "step": 7350 }, { "epoch": 24.008344594594593, "grad_norm": 0.11174246668815613, "learning_rate": 1.5625e-06, "loss": 0.0032, "step": 7351 }, { "epoch": 24.008378378378378, "grad_norm": 0.005360968876630068, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7352 }, { "epoch": 24.008412162162163, "grad_norm": 0.001405666465871036, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7353 }, { "epoch": 24.008445945945947, "grad_norm": 0.675469696521759, "learning_rate": 1.5625e-06, "loss": 0.0029, "step": 7354 }, { "epoch": 24.00847972972973, "grad_norm": 0.04634421318769455, "learning_rate": 1.5625e-06, "loss": 0.0005, "step": 7355 }, { "epoch": 24.008513513513513, "grad_norm": 0.0019048915710300207, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7356 }, { "epoch": 24.008547297297298, "grad_norm": 0.06635270267724991, "learning_rate": 1.5625e-06, "loss": 0.0006, "step": 7357 }, { "epoch": 24.008581081081083, "grad_norm": 0.002435441827401519, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7358 }, { "epoch": 24.008614864864864, "grad_norm": 0.39082032442092896, "learning_rate": 1.5625e-06, "loss": 0.0084, "step": 7359 }, { "epoch": 24.00864864864865, "grad_norm": 0.003128060605376959, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7360 }, { "epoch": 24.008682432432433, "grad_norm": 0.018464144319295883, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7361 }, { "epoch": 24.008716216216218, "grad_norm": 0.0022922144271433353, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7362 }, { "epoch": 24.00875, "grad_norm": 0.0008572257356718183, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7363 }, { "epoch": 24.008783783783784, "grad_norm": 0.0028401613235473633, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7364 }, { "epoch": 24.00881756756757, "grad_norm": 3.53556227684021, "learning_rate": 1.5625e-06, "loss": 0.3938, "step": 7365 }, { "epoch": 24.00885135135135, "grad_norm": 0.0011617294512689114, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7366 }, { "epoch": 24.008885135135134, "grad_norm": 0.0029014998581260443, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7367 }, { "epoch": 24.00891891891892, "grad_norm": 7.4609456062316895, "learning_rate": 1.5625e-06, "loss": 0.1359, "step": 7368 }, { "epoch": 24.008952702702704, "grad_norm": 0.0035573027562350035, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7369 }, { "epoch": 24.008986486486485, "grad_norm": 0.016273977234959602, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7370 }, { "epoch": 24.00902027027027, "grad_norm": 0.0021987156942486763, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7371 }, { "epoch": 24.009054054054054, "grad_norm": 0.0030841040425002575, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7372 }, { "epoch": 24.00908783783784, "grad_norm": 0.004480176605284214, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7373 }, { "epoch": 24.00912162162162, "grad_norm": 0.0008434472256340086, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7374 }, { "epoch": 24.009155405405405, "grad_norm": 0.14835314452648163, "learning_rate": 1.5625e-06, "loss": 0.0053, "step": 7375 }, { "epoch": 24.00918918918919, "grad_norm": 0.0025416407734155655, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7376 }, { "epoch": 24.009222972972974, "grad_norm": 0.0665520429611206, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7377 }, { "epoch": 24.009256756756756, "grad_norm": 0.002580169588327408, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7378 }, { "epoch": 24.00929054054054, "grad_norm": 2.1521475315093994, "learning_rate": 1.5625e-06, "loss": 0.0062, "step": 7379 }, { "epoch": 24.009324324324325, "grad_norm": 0.002197148045524955, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7380 }, { "epoch": 24.00935810810811, "grad_norm": 23.808429718017578, "learning_rate": 1.5625e-06, "loss": 0.6199, "step": 7381 }, { "epoch": 24.00939189189189, "grad_norm": 0.0016821399331092834, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7382 }, { "epoch": 24.009425675675676, "grad_norm": 0.0011172929080203176, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7383 }, { "epoch": 24.00945945945946, "grad_norm": 13.910767555236816, "learning_rate": 1.5625e-06, "loss": 0.5527, "step": 7384 }, { "epoch": 24.00949324324324, "grad_norm": 0.0008570401696488261, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7385 }, { "epoch": 24.009527027027026, "grad_norm": 0.06064176931977272, "learning_rate": 1.5625e-06, "loss": 0.0005, "step": 7386 }, { "epoch": 24.00956081081081, "grad_norm": 0.2370493859052658, "learning_rate": 1.5625e-06, "loss": 0.0011, "step": 7387 }, { "epoch": 24.009594594594596, "grad_norm": 36.25212097167969, "learning_rate": 1.5625e-06, "loss": 0.1059, "step": 7388 }, { "epoch": 24.009628378378377, "grad_norm": 0.266668438911438, "learning_rate": 1.5625e-06, "loss": 0.0016, "step": 7389 }, { "epoch": 24.00966216216216, "grad_norm": 0.005061822477728128, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7390 }, { "epoch": 24.009695945945946, "grad_norm": 0.0027564377523958683, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7391 }, { "epoch": 24.00972972972973, "grad_norm": 0.0023249925579875708, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7392 }, { "epoch": 24.009763513513512, "grad_norm": 0.21131037175655365, "learning_rate": 1.5625e-06, "loss": 0.0055, "step": 7393 }, { "epoch": 24.009797297297297, "grad_norm": 10.51416015625, "learning_rate": 1.5625e-06, "loss": 0.1522, "step": 7394 }, { "epoch": 24.00983108108108, "grad_norm": 0.002428306033834815, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7395 }, { "epoch": 24.009864864864866, "grad_norm": 0.005939406342804432, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7396 }, { "epoch": 24.009898648648647, "grad_norm": 0.001047294121235609, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7397 }, { "epoch": 24.009932432432432, "grad_norm": 0.0017249657539650798, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7398 }, { "epoch": 24.009966216216217, "grad_norm": 0.13063599169254303, "learning_rate": 1.5625e-06, "loss": 0.0049, "step": 7399 }, { "epoch": 24.01, "grad_norm": 0.12133465707302094, "learning_rate": 1.5625e-06, "loss": 0.0038, "step": 7400 }, { "epoch": 24.01, "eval_accuracy": 0.9046849757673667, "eval_loss": 0.5421075224876404, "eval_runtime": 32.7328, "eval_samples_per_second": 18.911, "eval_steps_per_second": 2.383, "step": 7400 }, { "epoch": 25.000033783783785, "grad_norm": 0.16264250874519348, "learning_rate": 1.5625e-06, "loss": 0.0055, "step": 7401 }, { "epoch": 25.000067567567566, "grad_norm": 20.409006118774414, "learning_rate": 1.5625e-06, "loss": 0.0513, "step": 7402 }, { "epoch": 25.00010135135135, "grad_norm": 7.90665340423584, "learning_rate": 1.5625e-06, "loss": 0.0434, "step": 7403 }, { "epoch": 25.000135135135135, "grad_norm": 0.03508947789669037, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7404 }, { "epoch": 25.00016891891892, "grad_norm": 0.0043688006699085236, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7405 }, { "epoch": 25.0002027027027, "grad_norm": 0.002021470572799444, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7406 }, { "epoch": 25.000236486486486, "grad_norm": 0.003953432198613882, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7407 }, { "epoch": 25.00027027027027, "grad_norm": 0.16582828760147095, "learning_rate": 1.5625e-06, "loss": 0.003, "step": 7408 }, { "epoch": 25.000304054054055, "grad_norm": 0.0013677170500159264, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7409 }, { "epoch": 25.000337837837836, "grad_norm": 0.006728089414536953, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7410 }, { "epoch": 25.00037162162162, "grad_norm": 34.88395309448242, "learning_rate": 1.5625e-06, "loss": 0.2525, "step": 7411 }, { "epoch": 25.000405405405406, "grad_norm": 0.08149212598800659, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7412 }, { "epoch": 25.00043918918919, "grad_norm": 0.0015148011734709144, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7413 }, { "epoch": 25.00047297297297, "grad_norm": 0.6031744480133057, "learning_rate": 1.5625e-06, "loss": 0.0006, "step": 7414 }, { "epoch": 25.000506756756756, "grad_norm": 0.004594899248331785, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7415 }, { "epoch": 25.00054054054054, "grad_norm": 0.0034369025379419327, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7416 }, { "epoch": 25.000574324324326, "grad_norm": 0.029998673126101494, "learning_rate": 1.5625e-06, "loss": 0.0005, "step": 7417 }, { "epoch": 25.000608108108107, "grad_norm": 0.0035712202079594135, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7418 }, { "epoch": 25.000641891891892, "grad_norm": 0.0018918360583484173, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7419 }, { "epoch": 25.000675675675677, "grad_norm": 0.0016934152226895094, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7420 }, { "epoch": 25.00070945945946, "grad_norm": 0.11085747182369232, "learning_rate": 1.5625e-06, "loss": 0.0041, "step": 7421 }, { "epoch": 25.000743243243242, "grad_norm": 0.0028789611533284187, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7422 }, { "epoch": 25.000777027027027, "grad_norm": 4.71401834487915, "learning_rate": 1.5625e-06, "loss": 0.0165, "step": 7423 }, { "epoch": 25.000810810810812, "grad_norm": 0.008101820014417171, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7424 }, { "epoch": 25.000844594594593, "grad_norm": 0.02060890384018421, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7425 }, { "epoch": 25.000878378378378, "grad_norm": 0.0025632348842918873, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7426 }, { "epoch": 25.000912162162162, "grad_norm": 0.0024472493678331375, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7427 }, { "epoch": 25.000945945945947, "grad_norm": 0.0513974092900753, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7428 }, { "epoch": 25.00097972972973, "grad_norm": 0.0014175361720845103, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7429 }, { "epoch": 25.001013513513513, "grad_norm": 0.026986053213477135, "learning_rate": 1.5625e-06, "loss": 0.0005, "step": 7430 }, { "epoch": 25.001047297297298, "grad_norm": 0.027416927739977837, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7431 }, { "epoch": 25.001081081081082, "grad_norm": 1.0699279308319092, "learning_rate": 1.5625e-06, "loss": 0.0177, "step": 7432 }, { "epoch": 25.001114864864864, "grad_norm": 0.17772312462329865, "learning_rate": 1.5625e-06, "loss": 0.0064, "step": 7433 }, { "epoch": 25.00114864864865, "grad_norm": 0.07296465337276459, "learning_rate": 1.5625e-06, "loss": 0.0021, "step": 7434 }, { "epoch": 25.001182432432433, "grad_norm": 0.003564476268365979, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7435 }, { "epoch": 25.001216216216218, "grad_norm": 0.19551759958267212, "learning_rate": 1.5625e-06, "loss": 0.0069, "step": 7436 }, { "epoch": 25.00125, "grad_norm": 0.15986615419387817, "learning_rate": 1.5625e-06, "loss": 0.004, "step": 7437 }, { "epoch": 25.001283783783784, "grad_norm": 0.0185440331697464, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7438 }, { "epoch": 25.00131756756757, "grad_norm": 0.021000323817133904, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7439 }, { "epoch": 25.001351351351353, "grad_norm": 0.16960014402866364, "learning_rate": 1.5625e-06, "loss": 0.0072, "step": 7440 }, { "epoch": 25.001385135135134, "grad_norm": 0.0014944308204576373, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7441 }, { "epoch": 25.00141891891892, "grad_norm": 0.05629821494221687, "learning_rate": 1.5625e-06, "loss": 0.0009, "step": 7442 }, { "epoch": 25.001452702702704, "grad_norm": 3.583272933959961, "learning_rate": 1.5625e-06, "loss": 0.0108, "step": 7443 }, { "epoch": 25.001486486486485, "grad_norm": 0.001646104734390974, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7444 }, { "epoch": 25.00152027027027, "grad_norm": 89.86502838134766, "learning_rate": 1.5625e-06, "loss": 0.2341, "step": 7445 }, { "epoch": 25.001554054054054, "grad_norm": 0.002421623794361949, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7446 }, { "epoch": 25.00158783783784, "grad_norm": 0.03760408237576485, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7447 }, { "epoch": 25.00162162162162, "grad_norm": 0.001191393006592989, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7448 }, { "epoch": 25.001655405405405, "grad_norm": 0.1569383442401886, "learning_rate": 1.5625e-06, "loss": 0.0016, "step": 7449 }, { "epoch": 25.00168918918919, "grad_norm": 0.005918628070503473, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7450 }, { "epoch": 25.001722972972974, "grad_norm": 0.018187740817666054, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7451 }, { "epoch": 25.001756756756755, "grad_norm": 0.0008013732149265707, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7452 }, { "epoch": 25.00179054054054, "grad_norm": 10.374528884887695, "learning_rate": 1.5625e-06, "loss": 0.0228, "step": 7453 }, { "epoch": 25.001824324324325, "grad_norm": 5.195580959320068, "learning_rate": 1.5625e-06, "loss": 0.1994, "step": 7454 }, { "epoch": 25.00185810810811, "grad_norm": 0.0026257422287017107, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7455 }, { "epoch": 25.00189189189189, "grad_norm": 0.0011856104247272015, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7456 }, { "epoch": 25.001925675675675, "grad_norm": 0.00786273367702961, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7457 }, { "epoch": 25.00195945945946, "grad_norm": 0.0021155369468033314, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7458 }, { "epoch": 25.001993243243245, "grad_norm": 0.0025492317508906126, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7459 }, { "epoch": 25.002027027027026, "grad_norm": 10.307982444763184, "learning_rate": 1.5625e-06, "loss": 0.1349, "step": 7460 }, { "epoch": 25.00206081081081, "grad_norm": 0.0013864197535440326, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7461 }, { "epoch": 25.002094594594595, "grad_norm": 0.042732808738946915, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7462 }, { "epoch": 25.00212837837838, "grad_norm": 0.0011516822269186378, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7463 }, { "epoch": 25.00216216216216, "grad_norm": 48.238365173339844, "learning_rate": 1.5625e-06, "loss": 0.3119, "step": 7464 }, { "epoch": 25.002195945945946, "grad_norm": 0.017307192087173462, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7465 }, { "epoch": 25.00222972972973, "grad_norm": 0.0008081222767941654, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7466 }, { "epoch": 25.002263513513512, "grad_norm": 0.11618702858686447, "learning_rate": 1.5625e-06, "loss": 0.0008, "step": 7467 }, { "epoch": 25.002297297297297, "grad_norm": 0.0010460361372679472, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7468 }, { "epoch": 25.00233108108108, "grad_norm": 0.009630582295358181, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7469 }, { "epoch": 25.002364864864866, "grad_norm": 0.000972459209151566, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7470 }, { "epoch": 25.002398648648647, "grad_norm": 0.0020899304654449224, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7471 }, { "epoch": 25.002432432432432, "grad_norm": 0.0031980641651898623, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7472 }, { "epoch": 25.002466216216217, "grad_norm": 0.00730145862326026, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7473 }, { "epoch": 25.0025, "grad_norm": 0.10155636817216873, "learning_rate": 1.5625e-06, "loss": 0.0038, "step": 7474 }, { "epoch": 25.002533783783782, "grad_norm": 0.8179550170898438, "learning_rate": 1.5625e-06, "loss": 0.0053, "step": 7475 }, { "epoch": 25.002567567567567, "grad_norm": 0.0014629641082137823, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7476 }, { "epoch": 25.002601351351352, "grad_norm": 0.006326448172330856, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7477 }, { "epoch": 25.002635135135137, "grad_norm": 0.0016084644012153149, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7478 }, { "epoch": 25.002668918918918, "grad_norm": 0.005623758304864168, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7479 }, { "epoch": 25.002702702702702, "grad_norm": 0.0006613357691094279, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7480 }, { "epoch": 25.002736486486487, "grad_norm": 0.03066127561032772, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7481 }, { "epoch": 25.002770270270272, "grad_norm": 0.024295223876833916, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7482 }, { "epoch": 25.002804054054053, "grad_norm": 3.298546314239502, "learning_rate": 1.5625e-06, "loss": 0.4281, "step": 7483 }, { "epoch": 25.002837837837838, "grad_norm": 5.143866539001465, "learning_rate": 1.5625e-06, "loss": 0.0205, "step": 7484 }, { "epoch": 25.002871621621622, "grad_norm": 13.395004272460938, "learning_rate": 1.5625e-06, "loss": 0.022, "step": 7485 }, { "epoch": 25.002905405405407, "grad_norm": 0.1475042700767517, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7486 }, { "epoch": 25.00293918918919, "grad_norm": 0.0013413710985332727, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7487 }, { "epoch": 25.002972972972973, "grad_norm": 0.0018700641812756658, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7488 }, { "epoch": 25.003006756756758, "grad_norm": 0.13020378351211548, "learning_rate": 1.5625e-06, "loss": 0.0048, "step": 7489 }, { "epoch": 25.00304054054054, "grad_norm": 0.003879898227751255, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7490 }, { "epoch": 25.003074324324324, "grad_norm": 0.004910505376756191, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7491 }, { "epoch": 25.00310810810811, "grad_norm": 17.955631256103516, "learning_rate": 1.5625e-06, "loss": 0.8578, "step": 7492 }, { "epoch": 25.003141891891893, "grad_norm": 3.7945916652679443, "learning_rate": 1.5625e-06, "loss": 0.0111, "step": 7493 }, { "epoch": 25.003175675675674, "grad_norm": 0.15683722496032715, "learning_rate": 1.5625e-06, "loss": 0.0012, "step": 7494 }, { "epoch": 25.00320945945946, "grad_norm": 9.975762367248535, "learning_rate": 1.5625e-06, "loss": 0.0139, "step": 7495 }, { "epoch": 25.003243243243244, "grad_norm": 0.0006707512075081468, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7496 }, { "epoch": 25.00327702702703, "grad_norm": 0.003180521307513118, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7497 }, { "epoch": 25.00331081081081, "grad_norm": 0.05989162251353264, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7498 }, { "epoch": 25.003344594594594, "grad_norm": 0.00633852556347847, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7499 }, { "epoch": 25.00337837837838, "grad_norm": 0.0029245372861623764, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7500 }, { "epoch": 25.003412162162164, "grad_norm": 0.005334710236638784, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7501 }, { "epoch": 25.003445945945945, "grad_norm": 0.07379399985074997, "learning_rate": 1.5625e-06, "loss": 0.0009, "step": 7502 }, { "epoch": 25.00347972972973, "grad_norm": 0.11954125761985779, "learning_rate": 1.5625e-06, "loss": 0.0009, "step": 7503 }, { "epoch": 25.003513513513514, "grad_norm": 0.03079069033265114, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7504 }, { "epoch": 25.0035472972973, "grad_norm": 0.022133324295282364, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7505 }, { "epoch": 25.00358108108108, "grad_norm": 0.053491976112127304, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7506 }, { "epoch": 25.003614864864865, "grad_norm": 0.23977535963058472, "learning_rate": 1.5625e-06, "loss": 0.007, "step": 7507 }, { "epoch": 25.00364864864865, "grad_norm": 1.3407092094421387, "learning_rate": 1.5625e-06, "loss": 0.0306, "step": 7508 }, { "epoch": 25.00368243243243, "grad_norm": 0.01253268402069807, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7509 }, { "epoch": 25.003716216216215, "grad_norm": 0.00260983407497406, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7510 }, { "epoch": 25.00375, "grad_norm": 0.06815788149833679, "learning_rate": 1.5625e-06, "loss": 0.0007, "step": 7511 }, { "epoch": 25.003783783783785, "grad_norm": 0.1660364419221878, "learning_rate": 1.5625e-06, "loss": 0.006, "step": 7512 }, { "epoch": 25.003817567567566, "grad_norm": 1.8277628421783447, "learning_rate": 1.5625e-06, "loss": 0.0305, "step": 7513 }, { "epoch": 25.00385135135135, "grad_norm": 0.00968545489013195, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7514 }, { "epoch": 25.003885135135135, "grad_norm": 0.013877162709832191, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7515 }, { "epoch": 25.00391891891892, "grad_norm": 0.0018792067421600223, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7516 }, { "epoch": 25.0039527027027, "grad_norm": 0.010581159964203835, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7517 }, { "epoch": 25.003986486486486, "grad_norm": 9.867837905883789, "learning_rate": 1.5625e-06, "loss": 0.3592, "step": 7518 }, { "epoch": 25.00402027027027, "grad_norm": 0.03813839703798294, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7519 }, { "epoch": 25.004054054054055, "grad_norm": 0.002373539377003908, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7520 }, { "epoch": 25.004087837837837, "grad_norm": 0.0012643926311284304, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7521 }, { "epoch": 25.00412162162162, "grad_norm": 17.88892936706543, "learning_rate": 1.5625e-06, "loss": 0.853, "step": 7522 }, { "epoch": 25.004155405405406, "grad_norm": 0.0014278569724410772, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7523 }, { "epoch": 25.00418918918919, "grad_norm": 0.0028316457755863667, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7524 }, { "epoch": 25.004222972972972, "grad_norm": 0.007287387270480394, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7525 }, { "epoch": 25.004256756756757, "grad_norm": 0.0077012889087200165, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7526 }, { "epoch": 25.00429054054054, "grad_norm": 0.0027324731927365065, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7527 }, { "epoch": 25.004324324324326, "grad_norm": 0.09750697761774063, "learning_rate": 1.5625e-06, "loss": 0.0018, "step": 7528 }, { "epoch": 25.004358108108107, "grad_norm": 0.0013642560224980116, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7529 }, { "epoch": 25.004391891891892, "grad_norm": 0.0030113901011645794, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7530 }, { "epoch": 25.004425675675677, "grad_norm": 0.0014387262053787708, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7531 }, { "epoch": 25.004459459459458, "grad_norm": 9.008569717407227, "learning_rate": 1.5625e-06, "loss": 0.2041, "step": 7532 }, { "epoch": 25.004493243243243, "grad_norm": 0.00265720346942544, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7533 }, { "epoch": 25.004527027027027, "grad_norm": 0.08151258528232574, "learning_rate": 1.5625e-06, "loss": 0.0005, "step": 7534 }, { "epoch": 25.004560810810812, "grad_norm": 0.0011847803834825754, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7535 }, { "epoch": 25.004594594594593, "grad_norm": 0.001542643061839044, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7536 }, { "epoch": 25.004628378378378, "grad_norm": 0.004803255666047335, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7537 }, { "epoch": 25.004662162162163, "grad_norm": 0.17271815240383148, "learning_rate": 1.5625e-06, "loss": 0.0007, "step": 7538 }, { "epoch": 25.004695945945947, "grad_norm": 0.058581627905368805, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7539 }, { "epoch": 25.00472972972973, "grad_norm": 0.0016537345945835114, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7540 }, { "epoch": 25.004763513513513, "grad_norm": 36.934593200683594, "learning_rate": 1.5625e-06, "loss": 0.3253, "step": 7541 }, { "epoch": 25.004797297297298, "grad_norm": 0.14066766202449799, "learning_rate": 1.5625e-06, "loss": 0.0009, "step": 7542 }, { "epoch": 25.004831081081083, "grad_norm": 0.0010343901813030243, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7543 }, { "epoch": 25.004864864864864, "grad_norm": 0.002581337234005332, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7544 }, { "epoch": 25.00489864864865, "grad_norm": 0.0006088687223382294, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7545 }, { "epoch": 25.004932432432433, "grad_norm": 34.20881271362305, "learning_rate": 1.5625e-06, "loss": 0.4052, "step": 7546 }, { "epoch": 25.004966216216218, "grad_norm": 0.004699041601270437, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7547 }, { "epoch": 25.005, "grad_norm": 0.0021058786660432816, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7548 }, { "epoch": 25.005033783783784, "grad_norm": 0.013965478166937828, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7549 }, { "epoch": 25.00506756756757, "grad_norm": 0.006389801390469074, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7550 }, { "epoch": 25.00510135135135, "grad_norm": 0.007034037262201309, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7551 }, { "epoch": 25.005135135135134, "grad_norm": 0.14378321170806885, "learning_rate": 1.5625e-06, "loss": 0.0053, "step": 7552 }, { "epoch": 25.00516891891892, "grad_norm": 0.1411689817905426, "learning_rate": 1.5625e-06, "loss": 0.0052, "step": 7553 }, { "epoch": 25.005202702702704, "grad_norm": 0.12675070762634277, "learning_rate": 1.5625e-06, "loss": 0.0044, "step": 7554 }, { "epoch": 25.005236486486485, "grad_norm": 0.004253044258803129, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7555 }, { "epoch": 25.00527027027027, "grad_norm": 0.0030452560167759657, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7556 }, { "epoch": 25.005304054054054, "grad_norm": 0.001160693122074008, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7557 }, { "epoch": 25.00533783783784, "grad_norm": 0.013927936553955078, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7558 }, { "epoch": 25.00537162162162, "grad_norm": 0.010612732730805874, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7559 }, { "epoch": 25.005405405405405, "grad_norm": 0.0019944021478295326, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7560 }, { "epoch": 25.00543918918919, "grad_norm": 0.002915196120738983, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7561 }, { "epoch": 25.005472972972974, "grad_norm": 23.24915313720703, "learning_rate": 1.5625e-06, "loss": 0.3703, "step": 7562 }, { "epoch": 25.005506756756755, "grad_norm": 2.964966058731079, "learning_rate": 1.5625e-06, "loss": 0.0188, "step": 7563 }, { "epoch": 25.00554054054054, "grad_norm": 0.0017730121035128832, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7564 }, { "epoch": 25.005574324324325, "grad_norm": 0.0008639486040920019, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7565 }, { "epoch": 25.00560810810811, "grad_norm": 36.020992279052734, "learning_rate": 1.5625e-06, "loss": 0.0753, "step": 7566 }, { "epoch": 25.00564189189189, "grad_norm": 0.00593156460672617, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7567 }, { "epoch": 25.005675675675676, "grad_norm": 0.0051126182079315186, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7568 }, { "epoch": 25.00570945945946, "grad_norm": 0.042176343500614166, "learning_rate": 1.5625e-06, "loss": 0.0005, "step": 7569 }, { "epoch": 25.005743243243245, "grad_norm": 0.001362944021821022, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7570 }, { "epoch": 25.005777027027026, "grad_norm": 0.0007546119741164148, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7571 }, { "epoch": 25.00581081081081, "grad_norm": 23.434032440185547, "learning_rate": 1.5625e-06, "loss": 0.0641, "step": 7572 }, { "epoch": 25.005844594594596, "grad_norm": 0.12137677520513535, "learning_rate": 1.5625e-06, "loss": 0.0045, "step": 7573 }, { "epoch": 25.005878378378377, "grad_norm": 0.003252401016652584, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7574 }, { "epoch": 25.00591216216216, "grad_norm": 0.0035182491410523653, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7575 }, { "epoch": 25.005945945945946, "grad_norm": 0.002222254406660795, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7576 }, { "epoch": 25.00597972972973, "grad_norm": 3.915309429168701, "learning_rate": 1.5625e-06, "loss": 0.1499, "step": 7577 }, { "epoch": 25.006013513513512, "grad_norm": 0.004408285021781921, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7578 }, { "epoch": 25.006047297297297, "grad_norm": 0.008703759871423244, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7579 }, { "epoch": 25.00608108108108, "grad_norm": 1.059969425201416, "learning_rate": 1.5625e-06, "loss": 0.0108, "step": 7580 }, { "epoch": 25.006114864864866, "grad_norm": 0.0033844730351120234, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7581 }, { "epoch": 25.006148648648647, "grad_norm": 0.01978972740471363, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7582 }, { "epoch": 25.006182432432432, "grad_norm": 0.0009998716413974762, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7583 }, { "epoch": 25.006216216216217, "grad_norm": 0.0033040810376405716, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7584 }, { "epoch": 25.00625, "grad_norm": 0.1717895269393921, "learning_rate": 1.5625e-06, "loss": 0.0028, "step": 7585 }, { "epoch": 25.006283783783783, "grad_norm": 1.056628704071045, "learning_rate": 1.5625e-06, "loss": 0.0198, "step": 7586 }, { "epoch": 25.006317567567567, "grad_norm": 0.03072369284927845, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7587 }, { "epoch": 25.006351351351352, "grad_norm": 0.0017957729287445545, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7588 }, { "epoch": 25.006385135135137, "grad_norm": 0.00695592537522316, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7589 }, { "epoch": 25.006418918918918, "grad_norm": 0.0011598909040912986, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7590 }, { "epoch": 25.006452702702703, "grad_norm": 13.594551086425781, "learning_rate": 1.5625e-06, "loss": 0.1615, "step": 7591 }, { "epoch": 25.006486486486487, "grad_norm": 0.0068550389260053635, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7592 }, { "epoch": 25.006520270270272, "grad_norm": 0.005742600187659264, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7593 }, { "epoch": 25.006554054054053, "grad_norm": 0.0017657529097050428, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7594 }, { "epoch": 25.006587837837838, "grad_norm": 0.001893254928290844, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7595 }, { "epoch": 25.006621621621623, "grad_norm": 0.0012620919151231647, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7596 }, { "epoch": 25.006655405405404, "grad_norm": 0.025764618068933487, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7597 }, { "epoch": 25.00668918918919, "grad_norm": 3.413508176803589, "learning_rate": 1.5625e-06, "loss": 0.4347, "step": 7598 }, { "epoch": 25.006722972972973, "grad_norm": 0.0020463380496948957, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7599 }, { "epoch": 25.006756756756758, "grad_norm": 0.0026072801556438208, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7600 }, { "epoch": 25.00679054054054, "grad_norm": 0.08251169323921204, "learning_rate": 1.5625e-06, "loss": 0.0013, "step": 7601 }, { "epoch": 25.006824324324324, "grad_norm": 0.0025178405921906233, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7602 }, { "epoch": 25.00685810810811, "grad_norm": 0.0021384002175182104, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7603 }, { "epoch": 25.006891891891893, "grad_norm": 0.000755324203055352, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7604 }, { "epoch": 25.006925675675674, "grad_norm": 0.16239915788173676, "learning_rate": 1.5625e-06, "loss": 0.0047, "step": 7605 }, { "epoch": 25.00695945945946, "grad_norm": 0.006066232919692993, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7606 }, { "epoch": 25.006993243243244, "grad_norm": 0.006980368867516518, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7607 }, { "epoch": 25.00702702702703, "grad_norm": 0.1152351051568985, "learning_rate": 1.5625e-06, "loss": 0.0042, "step": 7608 }, { "epoch": 25.00706081081081, "grad_norm": 0.0009896293049678206, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7609 }, { "epoch": 25.007094594594594, "grad_norm": 0.0013063482474535704, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7610 }, { "epoch": 25.00712837837838, "grad_norm": 55.82343292236328, "learning_rate": 1.5625e-06, "loss": 0.1323, "step": 7611 }, { "epoch": 25.007162162162164, "grad_norm": 0.0024220533668994904, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7612 }, { "epoch": 25.007195945945945, "grad_norm": 0.01079231221228838, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7613 }, { "epoch": 25.00722972972973, "grad_norm": 0.009979834780097008, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7614 }, { "epoch": 25.007263513513514, "grad_norm": 3.9141409397125244, "learning_rate": 1.5625e-06, "loss": 0.0138, "step": 7615 }, { "epoch": 25.007297297297296, "grad_norm": 0.17674537003040314, "learning_rate": 1.5625e-06, "loss": 0.0063, "step": 7616 }, { "epoch": 25.00733108108108, "grad_norm": 0.010815060697495937, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7617 }, { "epoch": 25.007364864864865, "grad_norm": 0.0038398485630750656, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7618 }, { "epoch": 25.00739864864865, "grad_norm": 0.007351795211434364, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7619 }, { "epoch": 25.00743243243243, "grad_norm": 11.916329383850098, "learning_rate": 1.5625e-06, "loss": 0.2548, "step": 7620 }, { "epoch": 25.007466216216216, "grad_norm": 0.00645094458013773, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7621 }, { "epoch": 25.0075, "grad_norm": 0.0023158746771514416, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7622 }, { "epoch": 25.007533783783785, "grad_norm": 0.0033643627539277077, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7623 }, { "epoch": 25.007567567567566, "grad_norm": 0.25055086612701416, "learning_rate": 1.5625e-06, "loss": 0.0007, "step": 7624 }, { "epoch": 25.00760135135135, "grad_norm": 0.0005648484220728278, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7625 }, { "epoch": 25.007635135135136, "grad_norm": 47.8184700012207, "learning_rate": 1.5625e-06, "loss": 0.2761, "step": 7626 }, { "epoch": 25.00766891891892, "grad_norm": 0.0021716482006013393, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7627 }, { "epoch": 25.0077027027027, "grad_norm": 0.5253636837005615, "learning_rate": 1.5625e-06, "loss": 0.0121, "step": 7628 }, { "epoch": 25.007736486486486, "grad_norm": 0.0010717492550611496, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7629 }, { "epoch": 25.00777027027027, "grad_norm": 0.0012397038517519832, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7630 }, { "epoch": 25.007804054054056, "grad_norm": 0.0036555188708007336, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7631 }, { "epoch": 25.007837837837837, "grad_norm": 0.020608654245734215, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7632 }, { "epoch": 25.00787162162162, "grad_norm": 58.91350173950195, "learning_rate": 1.5625e-06, "loss": 0.4093, "step": 7633 }, { "epoch": 25.007905405405406, "grad_norm": 14.049966812133789, "learning_rate": 1.5625e-06, "loss": 0.0201, "step": 7634 }, { "epoch": 25.00793918918919, "grad_norm": 0.001026835641823709, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7635 }, { "epoch": 25.007972972972972, "grad_norm": 0.12073894590139389, "learning_rate": 1.5625e-06, "loss": 0.0045, "step": 7636 }, { "epoch": 25.008006756756757, "grad_norm": 0.003919045440852642, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7637 }, { "epoch": 25.00804054054054, "grad_norm": 0.002789657097309828, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7638 }, { "epoch": 25.008074324324323, "grad_norm": 0.002559213899075985, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7639 }, { "epoch": 25.008108108108107, "grad_norm": 0.0013352972455322742, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7640 }, { "epoch": 25.008141891891892, "grad_norm": 0.0018101480090990663, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7641 }, { "epoch": 25.008175675675677, "grad_norm": 0.029508540406823158, "learning_rate": 1.5625e-06, "loss": 0.0006, "step": 7642 }, { "epoch": 25.008209459459458, "grad_norm": 0.04015905782580376, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7643 }, { "epoch": 25.008243243243243, "grad_norm": 0.052509158849716187, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7644 }, { "epoch": 25.008277027027027, "grad_norm": 0.016080565750598907, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7645 }, { "epoch": 25.008310810810812, "grad_norm": 0.20571434497833252, "learning_rate": 1.5625e-06, "loss": 0.0056, "step": 7646 }, { "epoch": 25.008344594594593, "grad_norm": 0.003959855064749718, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7647 }, { "epoch": 25.008378378378378, "grad_norm": 0.09503883123397827, "learning_rate": 1.5625e-06, "loss": 0.0026, "step": 7648 }, { "epoch": 25.008412162162163, "grad_norm": 0.002823163755238056, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7649 }, { "epoch": 25.008445945945947, "grad_norm": 0.15335245430469513, "learning_rate": 1.5625e-06, "loss": 0.0053, "step": 7650 }, { "epoch": 25.00847972972973, "grad_norm": 0.00391249218955636, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7651 }, { "epoch": 25.008513513513513, "grad_norm": 0.33621054887771606, "learning_rate": 1.5625e-06, "loss": 0.0032, "step": 7652 }, { "epoch": 25.008547297297298, "grad_norm": 0.004026274662464857, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7653 }, { "epoch": 25.008581081081083, "grad_norm": 0.001542606158182025, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7654 }, { "epoch": 25.008614864864864, "grad_norm": 0.00204191985540092, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7655 }, { "epoch": 25.00864864864865, "grad_norm": 0.002895141951739788, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7656 }, { "epoch": 25.008682432432433, "grad_norm": 0.2537311017513275, "learning_rate": 1.5625e-06, "loss": 0.0042, "step": 7657 }, { "epoch": 25.008716216216218, "grad_norm": 0.646649956703186, "learning_rate": 1.5625e-06, "loss": 0.0201, "step": 7658 }, { "epoch": 25.00875, "grad_norm": 0.0009857096010819077, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7659 }, { "epoch": 25.008783783783784, "grad_norm": 0.0010672720381990075, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7660 }, { "epoch": 25.00881756756757, "grad_norm": 0.0008639677544124424, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7661 }, { "epoch": 25.00885135135135, "grad_norm": 10.671547889709473, "learning_rate": 1.5625e-06, "loss": 1.0377, "step": 7662 }, { "epoch": 25.008885135135134, "grad_norm": 0.0008649486117064953, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7663 }, { "epoch": 25.00891891891892, "grad_norm": 0.0017965469742193818, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7664 }, { "epoch": 25.008952702702704, "grad_norm": 0.15017366409301758, "learning_rate": 1.5625e-06, "loss": 0.0039, "step": 7665 }, { "epoch": 25.008986486486485, "grad_norm": 0.0020726523362100124, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7666 }, { "epoch": 25.00902027027027, "grad_norm": 0.000882225576788187, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7667 }, { "epoch": 25.009054054054054, "grad_norm": 0.00304634147323668, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7668 }, { "epoch": 25.00908783783784, "grad_norm": 0.9944436550140381, "learning_rate": 1.5625e-06, "loss": 0.0029, "step": 7669 }, { "epoch": 25.00912162162162, "grad_norm": 0.0019010606920346618, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7670 }, { "epoch": 25.009155405405405, "grad_norm": 0.0059871370904147625, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7671 }, { "epoch": 25.00918918918919, "grad_norm": 0.13085763156414032, "learning_rate": 1.5625e-06, "loss": 0.0048, "step": 7672 }, { "epoch": 25.009222972972974, "grad_norm": 45.5644416809082, "learning_rate": 1.5625e-06, "loss": 0.093, "step": 7673 }, { "epoch": 25.009256756756756, "grad_norm": 0.16159656643867493, "learning_rate": 1.5625e-06, "loss": 0.0035, "step": 7674 }, { "epoch": 25.00929054054054, "grad_norm": 0.0012988890521228313, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7675 }, { "epoch": 25.009324324324325, "grad_norm": 0.0011150944046676159, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7676 }, { "epoch": 25.00935810810811, "grad_norm": 0.1288936734199524, "learning_rate": 1.5625e-06, "loss": 0.0046, "step": 7677 }, { "epoch": 25.00939189189189, "grad_norm": 0.00441330298781395, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7678 }, { "epoch": 25.009425675675676, "grad_norm": 0.010359457693994045, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7679 }, { "epoch": 25.00945945945946, "grad_norm": 0.0022715909872204065, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7680 }, { "epoch": 25.00949324324324, "grad_norm": 0.0021692768204957247, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7681 }, { "epoch": 25.009527027027026, "grad_norm": 3.5621793270111084, "learning_rate": 1.5625e-06, "loss": 0.3626, "step": 7682 }, { "epoch": 25.00956081081081, "grad_norm": 26.053070068359375, "learning_rate": 1.5625e-06, "loss": 0.4209, "step": 7683 }, { "epoch": 25.009594594594596, "grad_norm": 0.018090184777975082, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7684 }, { "epoch": 25.009628378378377, "grad_norm": 0.0012139558093622327, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7685 }, { "epoch": 25.00966216216216, "grad_norm": 4.178176403045654, "learning_rate": 1.5625e-06, "loss": 0.0064, "step": 7686 }, { "epoch": 25.009695945945946, "grad_norm": 0.0007491983706131577, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7687 }, { "epoch": 25.00972972972973, "grad_norm": 0.05271591618657112, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7688 }, { "epoch": 25.009763513513512, "grad_norm": 0.013702686876058578, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7689 }, { "epoch": 25.009797297297297, "grad_norm": 0.12321484833955765, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7690 }, { "epoch": 25.00983108108108, "grad_norm": 0.29030126333236694, "learning_rate": 1.5625e-06, "loss": 0.0009, "step": 7691 }, { "epoch": 25.009864864864866, "grad_norm": 0.0207776241004467, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7692 }, { "epoch": 25.009898648648647, "grad_norm": 6.719357013702393, "learning_rate": 1.5625e-06, "loss": 0.4561, "step": 7693 }, { "epoch": 25.009932432432432, "grad_norm": 0.2984963655471802, "learning_rate": 1.5625e-06, "loss": 0.0025, "step": 7694 }, { "epoch": 25.009966216216217, "grad_norm": 0.003580309683457017, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7695 }, { "epoch": 25.01, "grad_norm": 0.02788037434220314, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7696 }, { "epoch": 25.01, "eval_accuracy": 0.8949919224555735, "eval_loss": 0.6254146695137024, "eval_runtime": 32.8583, "eval_samples_per_second": 18.838, "eval_steps_per_second": 2.374, "step": 7696 }, { "epoch": 26.000033783783785, "grad_norm": 0.003314614063128829, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7697 }, { "epoch": 26.000067567567566, "grad_norm": 0.000694854708854109, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7698 }, { "epoch": 26.00010135135135, "grad_norm": 2.2957746982574463, "learning_rate": 1.5625e-06, "loss": 0.0296, "step": 7699 }, { "epoch": 26.000135135135135, "grad_norm": 0.005739177577197552, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7700 }, { "epoch": 26.00016891891892, "grad_norm": 0.10414387285709381, "learning_rate": 1.5625e-06, "loss": 0.0039, "step": 7701 }, { "epoch": 26.0002027027027, "grad_norm": 19.027576446533203, "learning_rate": 1.5625e-06, "loss": 0.6712, "step": 7702 }, { "epoch": 26.000236486486486, "grad_norm": 0.005889474414288998, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7703 }, { "epoch": 26.00027027027027, "grad_norm": 0.004418067168444395, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7704 }, { "epoch": 26.000304054054055, "grad_norm": 0.006696212571114302, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7705 }, { "epoch": 26.000337837837836, "grad_norm": 0.005181087646633387, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7706 }, { "epoch": 26.00037162162162, "grad_norm": 0.001924303243868053, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7707 }, { "epoch": 26.000405405405406, "grad_norm": 9.071057319641113, "learning_rate": 1.5625e-06, "loss": 0.1103, "step": 7708 }, { "epoch": 26.00043918918919, "grad_norm": 0.012245599180459976, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7709 }, { "epoch": 26.00047297297297, "grad_norm": 0.0012919851578772068, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7710 }, { "epoch": 26.000506756756756, "grad_norm": 0.04637492075562477, "learning_rate": 1.5625e-06, "loss": 0.0005, "step": 7711 }, { "epoch": 26.00054054054054, "grad_norm": 0.004887523129582405, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7712 }, { "epoch": 26.000574324324326, "grad_norm": 1.0036269426345825, "learning_rate": 1.5625e-06, "loss": 0.0028, "step": 7713 }, { "epoch": 26.000608108108107, "grad_norm": 0.004178083501756191, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7714 }, { "epoch": 26.000641891891892, "grad_norm": 0.003127508796751499, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7715 }, { "epoch": 26.000675675675677, "grad_norm": 0.10622894763946533, "learning_rate": 1.5625e-06, "loss": 0.0031, "step": 7716 }, { "epoch": 26.00070945945946, "grad_norm": 0.002738369395956397, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7717 }, { "epoch": 26.000743243243242, "grad_norm": 0.003740401240065694, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7718 }, { "epoch": 26.000777027027027, "grad_norm": 0.5109612345695496, "learning_rate": 1.5625e-06, "loss": 0.0094, "step": 7719 }, { "epoch": 26.000810810810812, "grad_norm": 0.009310135617852211, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7720 }, { "epoch": 26.000844594594593, "grad_norm": 0.047356247901916504, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7721 }, { "epoch": 26.000878378378378, "grad_norm": 0.3994661867618561, "learning_rate": 1.5625e-06, "loss": 0.007, "step": 7722 }, { "epoch": 26.000912162162162, "grad_norm": 0.031210647895932198, "learning_rate": 1.5625e-06, "loss": 0.0009, "step": 7723 }, { "epoch": 26.000945945945947, "grad_norm": 8.226017951965332, "learning_rate": 1.5625e-06, "loss": 0.2984, "step": 7724 }, { "epoch": 26.00097972972973, "grad_norm": 0.005801036953926086, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7725 }, { "epoch": 26.001013513513513, "grad_norm": 0.005372208077460527, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7726 }, { "epoch": 26.001047297297298, "grad_norm": 0.00489626731723547, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7727 }, { "epoch": 26.001081081081082, "grad_norm": 0.001632585539482534, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7728 }, { "epoch": 26.001114864864864, "grad_norm": 0.06207063049077988, "learning_rate": 1.5625e-06, "loss": 0.0006, "step": 7729 }, { "epoch": 26.00114864864865, "grad_norm": 0.39741647243499756, "learning_rate": 1.5625e-06, "loss": 0.0017, "step": 7730 }, { "epoch": 26.001182432432433, "grad_norm": 0.3184298872947693, "learning_rate": 1.5625e-06, "loss": 0.0074, "step": 7731 }, { "epoch": 26.001216216216218, "grad_norm": 0.019409941509366035, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7732 }, { "epoch": 26.00125, "grad_norm": 1.0771903991699219, "learning_rate": 1.5625e-06, "loss": 0.0225, "step": 7733 }, { "epoch": 26.001283783783784, "grad_norm": 3.1671340465545654, "learning_rate": 1.5625e-06, "loss": 0.0317, "step": 7734 }, { "epoch": 26.00131756756757, "grad_norm": 0.0022417553700506687, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7735 }, { "epoch": 26.001351351351353, "grad_norm": 0.018503714352846146, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7736 }, { "epoch": 26.001385135135134, "grad_norm": 0.0026024971157312393, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7737 }, { "epoch": 26.00141891891892, "grad_norm": 0.06442664563655853, "learning_rate": 1.5625e-06, "loss": 0.0005, "step": 7738 }, { "epoch": 26.001452702702704, "grad_norm": 3.814211845397949, "learning_rate": 1.5625e-06, "loss": 0.0315, "step": 7739 }, { "epoch": 26.001486486486485, "grad_norm": 0.001171578885987401, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7740 }, { "epoch": 26.00152027027027, "grad_norm": 0.002389283152297139, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7741 }, { "epoch": 26.001554054054054, "grad_norm": 0.0019341118168085814, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7742 }, { "epoch": 26.00158783783784, "grad_norm": 9.963788986206055, "learning_rate": 1.5625e-06, "loss": 0.848, "step": 7743 }, { "epoch": 26.00162162162162, "grad_norm": 0.0012049018405377865, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7744 }, { "epoch": 26.001655405405405, "grad_norm": 3.8574581146240234, "learning_rate": 1.5625e-06, "loss": 0.0934, "step": 7745 }, { "epoch": 26.00168918918919, "grad_norm": 0.0012556841829791665, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7746 }, { "epoch": 26.001722972972974, "grad_norm": 0.00431707501411438, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7747 }, { "epoch": 26.001756756756755, "grad_norm": 5.694555759429932, "learning_rate": 1.5625e-06, "loss": 0.0472, "step": 7748 }, { "epoch": 26.00179054054054, "grad_norm": 0.004101846367120743, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7749 }, { "epoch": 26.001824324324325, "grad_norm": 0.009859352372586727, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7750 }, { "epoch": 26.00185810810811, "grad_norm": 0.002423257566988468, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7751 }, { "epoch": 26.00189189189189, "grad_norm": 0.2684817910194397, "learning_rate": 1.5625e-06, "loss": 0.0056, "step": 7752 }, { "epoch": 26.001925675675675, "grad_norm": 0.0016284233424812555, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7753 }, { "epoch": 26.00195945945946, "grad_norm": 83.52666473388672, "learning_rate": 1.5625e-06, "loss": 0.56, "step": 7754 }, { "epoch": 26.001993243243245, "grad_norm": 0.001155184698291123, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7755 }, { "epoch": 26.002027027027026, "grad_norm": 0.0019904179498553276, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7756 }, { "epoch": 26.00206081081081, "grad_norm": 0.26622703671455383, "learning_rate": 1.5625e-06, "loss": 0.0042, "step": 7757 }, { "epoch": 26.002094594594595, "grad_norm": 0.0007562605896964669, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7758 }, { "epoch": 26.00212837837838, "grad_norm": 0.0007745208567939699, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7759 }, { "epoch": 26.00216216216216, "grad_norm": 0.006142564117908478, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7760 }, { "epoch": 26.002195945945946, "grad_norm": 0.003195851342752576, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7761 }, { "epoch": 26.00222972972973, "grad_norm": 0.008435758762061596, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7762 }, { "epoch": 26.002263513513512, "grad_norm": 0.001439390704035759, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7763 }, { "epoch": 26.002297297297297, "grad_norm": 0.13082289695739746, "learning_rate": 1.5625e-06, "loss": 0.0046, "step": 7764 }, { "epoch": 26.00233108108108, "grad_norm": 0.5694289207458496, "learning_rate": 1.5625e-06, "loss": 0.0135, "step": 7765 }, { "epoch": 26.002364864864866, "grad_norm": 0.0010917572071775794, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7766 }, { "epoch": 26.002398648648647, "grad_norm": 0.002733516041189432, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7767 }, { "epoch": 26.002432432432432, "grad_norm": 0.008540971204638481, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7768 }, { "epoch": 26.002466216216217, "grad_norm": 0.24900341033935547, "learning_rate": 1.5625e-06, "loss": 0.005, "step": 7769 }, { "epoch": 26.0025, "grad_norm": 0.0032419089693576097, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7770 }, { "epoch": 26.002533783783782, "grad_norm": 0.0013969196006655693, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7771 }, { "epoch": 26.002567567567567, "grad_norm": 0.7005584239959717, "learning_rate": 1.5625e-06, "loss": 0.0018, "step": 7772 }, { "epoch": 26.002601351351352, "grad_norm": 0.02398522198200226, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7773 }, { "epoch": 26.002635135135137, "grad_norm": 0.0031329726334661245, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7774 }, { "epoch": 26.002668918918918, "grad_norm": 0.034729499369859695, "learning_rate": 1.5625e-06, "loss": 0.0005, "step": 7775 }, { "epoch": 26.002702702702702, "grad_norm": 8.557807922363281, "learning_rate": 1.5625e-06, "loss": 0.2843, "step": 7776 }, { "epoch": 26.002736486486487, "grad_norm": 0.0044899191707372665, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7777 }, { "epoch": 26.002770270270272, "grad_norm": 0.08048127591609955, "learning_rate": 1.5625e-06, "loss": 0.002, "step": 7778 }, { "epoch": 26.002804054054053, "grad_norm": 5.850514888763428, "learning_rate": 1.5625e-06, "loss": 0.462, "step": 7779 }, { "epoch": 26.002837837837838, "grad_norm": 0.0008781183278188109, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7780 }, { "epoch": 26.002871621621622, "grad_norm": 0.10645589232444763, "learning_rate": 1.5625e-06, "loss": 0.004, "step": 7781 }, { "epoch": 26.002905405405407, "grad_norm": 0.0009319151868112385, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7782 }, { "epoch": 26.00293918918919, "grad_norm": 0.003883959725499153, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7783 }, { "epoch": 26.002972972972973, "grad_norm": 4.416014671325684, "learning_rate": 1.5625e-06, "loss": 0.431, "step": 7784 }, { "epoch": 26.003006756756758, "grad_norm": 0.0011332925641909242, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7785 }, { "epoch": 26.00304054054054, "grad_norm": 6.45823860168457, "learning_rate": 1.5625e-06, "loss": 0.0739, "step": 7786 }, { "epoch": 26.003074324324324, "grad_norm": 0.005404265597462654, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7787 }, { "epoch": 26.00310810810811, "grad_norm": 0.011923564597964287, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7788 }, { "epoch": 26.003141891891893, "grad_norm": 53.076629638671875, "learning_rate": 1.5625e-06, "loss": 0.6685, "step": 7789 }, { "epoch": 26.003175675675674, "grad_norm": 0.003838473232463002, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7790 }, { "epoch": 26.00320945945946, "grad_norm": 0.15848402678966522, "learning_rate": 1.5625e-06, "loss": 0.0059, "step": 7791 }, { "epoch": 26.003243243243244, "grad_norm": 0.009132708422839642, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7792 }, { "epoch": 26.00327702702703, "grad_norm": 0.0013570523587986827, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7793 }, { "epoch": 26.00331081081081, "grad_norm": 30.660816192626953, "learning_rate": 1.5625e-06, "loss": 0.6741, "step": 7794 }, { "epoch": 26.003344594594594, "grad_norm": 0.0018158953171223402, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7795 }, { "epoch": 26.00337837837838, "grad_norm": 91.90265655517578, "learning_rate": 1.5625e-06, "loss": 0.2494, "step": 7796 }, { "epoch": 26.003412162162164, "grad_norm": 0.0016258558025583625, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7797 }, { "epoch": 26.003445945945945, "grad_norm": 0.0052342601120471954, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7798 }, { "epoch": 26.00347972972973, "grad_norm": 0.0024162582121789455, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7799 }, { "epoch": 26.003513513513514, "grad_norm": 0.0016670512268319726, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7800 }, { "epoch": 26.0035472972973, "grad_norm": 0.0007527150446549058, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7801 }, { "epoch": 26.00358108108108, "grad_norm": 0.00122841855045408, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7802 }, { "epoch": 26.003614864864865, "grad_norm": 0.006882851477712393, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7803 }, { "epoch": 26.00364864864865, "grad_norm": 0.001650018966756761, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7804 }, { "epoch": 26.00368243243243, "grad_norm": 39.69349670410156, "learning_rate": 1.5625e-06, "loss": 0.0787, "step": 7805 }, { "epoch": 26.003716216216215, "grad_norm": 1.144284725189209, "learning_rate": 1.5625e-06, "loss": 0.0041, "step": 7806 }, { "epoch": 26.00375, "grad_norm": 24.78870964050293, "learning_rate": 1.5625e-06, "loss": 0.3987, "step": 7807 }, { "epoch": 26.003783783783785, "grad_norm": 0.0019797328859567642, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7808 }, { "epoch": 26.003817567567566, "grad_norm": 0.0018903446616604924, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7809 }, { "epoch": 26.00385135135135, "grad_norm": 0.45227137207984924, "learning_rate": 1.5625e-06, "loss": 0.0064, "step": 7810 }, { "epoch": 26.003885135135135, "grad_norm": 14.47948932647705, "learning_rate": 1.5625e-06, "loss": 0.5334, "step": 7811 }, { "epoch": 26.00391891891892, "grad_norm": 0.0012304249685257673, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7812 }, { "epoch": 26.0039527027027, "grad_norm": 0.009339507669210434, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7813 }, { "epoch": 26.003986486486486, "grad_norm": 0.0050346143543720245, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7814 }, { "epoch": 26.00402027027027, "grad_norm": 0.001974454615265131, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7815 }, { "epoch": 26.004054054054055, "grad_norm": 0.0033911331556737423, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7816 }, { "epoch": 26.004087837837837, "grad_norm": 0.0015841645654290915, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7817 }, { "epoch": 26.00412162162162, "grad_norm": 4.524786472320557, "learning_rate": 1.5625e-06, "loss": 0.3216, "step": 7818 }, { "epoch": 26.004155405405406, "grad_norm": 0.0007166299619711936, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7819 }, { "epoch": 26.00418918918919, "grad_norm": 7.345637321472168, "learning_rate": 1.5625e-06, "loss": 0.748, "step": 7820 }, { "epoch": 26.004222972972972, "grad_norm": 13.937634468078613, "learning_rate": 1.5625e-06, "loss": 0.0446, "step": 7821 }, { "epoch": 26.004256756756757, "grad_norm": 0.1269475519657135, "learning_rate": 1.5625e-06, "loss": 0.0047, "step": 7822 }, { "epoch": 26.00429054054054, "grad_norm": 0.0015241996152326465, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7823 }, { "epoch": 26.004324324324326, "grad_norm": 0.11333199590444565, "learning_rate": 1.5625e-06, "loss": 0.0007, "step": 7824 }, { "epoch": 26.004358108108107, "grad_norm": 0.07198135554790497, "learning_rate": 1.5625e-06, "loss": 0.0016, "step": 7825 }, { "epoch": 26.004391891891892, "grad_norm": 0.02658362127840519, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7826 }, { "epoch": 26.004425675675677, "grad_norm": 0.10584738850593567, "learning_rate": 1.5625e-06, "loss": 0.0039, "step": 7827 }, { "epoch": 26.004459459459458, "grad_norm": 13.141958236694336, "learning_rate": 1.5625e-06, "loss": 0.1838, "step": 7828 }, { "epoch": 26.004493243243243, "grad_norm": 0.001733716344460845, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7829 }, { "epoch": 26.004527027027027, "grad_norm": 0.3734102249145508, "learning_rate": 1.5625e-06, "loss": 0.0019, "step": 7830 }, { "epoch": 26.004560810810812, "grad_norm": 0.11263879388570786, "learning_rate": 1.5625e-06, "loss": 0.0043, "step": 7831 }, { "epoch": 26.004594594594593, "grad_norm": 0.007966962642967701, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7832 }, { "epoch": 26.004628378378378, "grad_norm": 55.40110778808594, "learning_rate": 1.5625e-06, "loss": 0.6512, "step": 7833 }, { "epoch": 26.004662162162163, "grad_norm": 0.151551753282547, "learning_rate": 1.5625e-06, "loss": 0.0044, "step": 7834 }, { "epoch": 26.004695945945947, "grad_norm": 0.0015934890834614635, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7835 }, { "epoch": 26.00472972972973, "grad_norm": 0.01480830553919077, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7836 }, { "epoch": 26.004763513513513, "grad_norm": 0.0030677958857268095, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7837 }, { "epoch": 26.004797297297298, "grad_norm": 0.04110421985387802, "learning_rate": 1.5625e-06, "loss": 0.0004, "step": 7838 }, { "epoch": 26.004831081081083, "grad_norm": 1.7156091928482056, "learning_rate": 1.5625e-06, "loss": 0.0083, "step": 7839 }, { "epoch": 26.004864864864864, "grad_norm": 0.0023687335196882486, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7840 }, { "epoch": 26.00489864864865, "grad_norm": 0.01493825577199459, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7841 }, { "epoch": 26.004932432432433, "grad_norm": 0.4872598648071289, "learning_rate": 1.5625e-06, "loss": 0.0054, "step": 7842 }, { "epoch": 26.004966216216218, "grad_norm": 0.0017122449353337288, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7843 }, { "epoch": 26.005, "grad_norm": 0.04017915949225426, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7844 }, { "epoch": 26.005033783783784, "grad_norm": 0.0021958439610898495, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7845 }, { "epoch": 26.00506756756757, "grad_norm": 0.000983062433078885, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7846 }, { "epoch": 26.00510135135135, "grad_norm": 0.005916132591664791, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7847 }, { "epoch": 26.005135135135134, "grad_norm": 0.04782981052994728, "learning_rate": 1.5625e-06, "loss": 0.0008, "step": 7848 }, { "epoch": 26.00516891891892, "grad_norm": 0.03506757318973541, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7849 }, { "epoch": 26.005202702702704, "grad_norm": 0.01965901255607605, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7850 }, { "epoch": 26.005236486486485, "grad_norm": 0.01576235517859459, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7851 }, { "epoch": 26.00527027027027, "grad_norm": 11.039694786071777, "learning_rate": 1.5625e-06, "loss": 0.173, "step": 7852 }, { "epoch": 26.005304054054054, "grad_norm": 3.458944320678711, "learning_rate": 1.5625e-06, "loss": 0.4144, "step": 7853 }, { "epoch": 26.00533783783784, "grad_norm": 2.398773431777954, "learning_rate": 1.5625e-06, "loss": 0.0059, "step": 7854 }, { "epoch": 26.00537162162162, "grad_norm": 0.010063785128295422, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7855 }, { "epoch": 26.005405405405405, "grad_norm": 0.002431352622807026, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7856 }, { "epoch": 26.00543918918919, "grad_norm": 0.12205351889133453, "learning_rate": 1.5625e-06, "loss": 0.0045, "step": 7857 }, { "epoch": 26.005472972972974, "grad_norm": 0.000968644511885941, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7858 }, { "epoch": 26.005506756756755, "grad_norm": 0.005152016878128052, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7859 }, { "epoch": 26.00554054054054, "grad_norm": 0.4493078887462616, "learning_rate": 1.5625e-06, "loss": 0.0045, "step": 7860 }, { "epoch": 26.005574324324325, "grad_norm": 0.004118372220546007, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7861 }, { "epoch": 26.00560810810811, "grad_norm": 0.0017928496235981584, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7862 }, { "epoch": 26.00564189189189, "grad_norm": 4.42856502532959, "learning_rate": 1.5625e-06, "loss": 0.008, "step": 7863 }, { "epoch": 26.005675675675676, "grad_norm": 0.0898965522646904, "learning_rate": 1.5625e-06, "loss": 0.0015, "step": 7864 }, { "epoch": 26.00570945945946, "grad_norm": 13.560452461242676, "learning_rate": 1.5625e-06, "loss": 0.2641, "step": 7865 }, { "epoch": 26.005743243243245, "grad_norm": 0.005094944965094328, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7866 }, { "epoch": 26.005777027027026, "grad_norm": 0.07800896465778351, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7867 }, { "epoch": 26.00581081081081, "grad_norm": 0.0013241750421002507, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7868 }, { "epoch": 26.005844594594596, "grad_norm": 0.006459478754550219, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7869 }, { "epoch": 26.005878378378377, "grad_norm": 4.883833885192871, "learning_rate": 1.5625e-06, "loss": 0.4144, "step": 7870 }, { "epoch": 26.00591216216216, "grad_norm": 0.08973876386880875, "learning_rate": 1.5625e-06, "loss": 0.0022, "step": 7871 }, { "epoch": 26.005945945945946, "grad_norm": 0.004227593541145325, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7872 }, { "epoch": 26.00597972972973, "grad_norm": 0.0005796018522232771, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7873 }, { "epoch": 26.006013513513512, "grad_norm": 0.049549687653779984, "learning_rate": 1.5625e-06, "loss": 0.0015, "step": 7874 }, { "epoch": 26.006047297297297, "grad_norm": 0.0036041566636413336, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7875 }, { "epoch": 26.00608108108108, "grad_norm": 0.007952132262289524, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7876 }, { "epoch": 26.006114864864866, "grad_norm": 0.007569614332169294, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7877 }, { "epoch": 26.006148648648647, "grad_norm": 0.06519801169633865, "learning_rate": 1.5625e-06, "loss": 0.0015, "step": 7878 }, { "epoch": 26.006182432432432, "grad_norm": 21.47467803955078, "learning_rate": 1.5625e-06, "loss": 0.9618, "step": 7879 }, { "epoch": 26.006216216216217, "grad_norm": 0.004676496144384146, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7880 }, { "epoch": 26.00625, "grad_norm": 6.71839714050293, "learning_rate": 1.5625e-06, "loss": 0.0319, "step": 7881 }, { "epoch": 26.006283783783783, "grad_norm": 0.0020528167951852083, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7882 }, { "epoch": 26.006317567567567, "grad_norm": 0.11435361206531525, "learning_rate": 1.5625e-06, "loss": 0.0041, "step": 7883 }, { "epoch": 26.006351351351352, "grad_norm": 0.003924419172108173, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7884 }, { "epoch": 26.006385135135137, "grad_norm": 0.11013030260801315, "learning_rate": 1.5625e-06, "loss": 0.0029, "step": 7885 }, { "epoch": 26.006418918918918, "grad_norm": 0.0032245891634374857, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7886 }, { "epoch": 26.006452702702703, "grad_norm": 13.876998901367188, "learning_rate": 1.5625e-06, "loss": 0.7228, "step": 7887 }, { "epoch": 26.006486486486487, "grad_norm": 0.1448516845703125, "learning_rate": 1.5625e-06, "loss": 0.0017, "step": 7888 }, { "epoch": 26.006520270270272, "grad_norm": 17.644437789916992, "learning_rate": 1.5625e-06, "loss": 0.4962, "step": 7889 }, { "epoch": 26.006554054054053, "grad_norm": 0.005198687314987183, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7890 }, { "epoch": 26.006587837837838, "grad_norm": 0.0065385629422962666, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7891 }, { "epoch": 26.006621621621623, "grad_norm": 0.001095944899134338, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7892 }, { "epoch": 26.006655405405404, "grad_norm": 0.0014868819853290915, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7893 }, { "epoch": 26.00668918918919, "grad_norm": 0.0061093587428331375, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7894 }, { "epoch": 26.006722972972973, "grad_norm": 0.002389485016465187, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7895 }, { "epoch": 26.006756756756758, "grad_norm": 0.1146652027964592, "learning_rate": 1.5625e-06, "loss": 0.0042, "step": 7896 }, { "epoch": 26.00679054054054, "grad_norm": 3.7319886684417725, "learning_rate": 1.5625e-06, "loss": 0.0245, "step": 7897 }, { "epoch": 26.006824324324324, "grad_norm": 0.0024659193586558104, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7898 }, { "epoch": 26.00685810810811, "grad_norm": 0.0020086828153580427, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7899 }, { "epoch": 26.006891891891893, "grad_norm": 0.0016648423625156283, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7900 }, { "epoch": 26.006925675675674, "grad_norm": 0.014887231402099133, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7901 }, { "epoch": 26.00695945945946, "grad_norm": 0.0012055521365255117, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7902 }, { "epoch": 26.006993243243244, "grad_norm": 3.6679413318634033, "learning_rate": 1.5625e-06, "loss": 0.369, "step": 7903 }, { "epoch": 26.00702702702703, "grad_norm": 2.3481831550598145, "learning_rate": 1.5625e-06, "loss": 0.0065, "step": 7904 }, { "epoch": 26.00706081081081, "grad_norm": 0.021890809759497643, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7905 }, { "epoch": 26.007094594594594, "grad_norm": 15.401456832885742, "learning_rate": 1.5625e-06, "loss": 0.0342, "step": 7906 }, { "epoch": 26.00712837837838, "grad_norm": 11.367450714111328, "learning_rate": 1.5625e-06, "loss": 0.0694, "step": 7907 }, { "epoch": 26.007162162162164, "grad_norm": 0.0029291093815118074, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7908 }, { "epoch": 26.007195945945945, "grad_norm": 0.002059083664789796, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7909 }, { "epoch": 26.00722972972973, "grad_norm": 32.08045959472656, "learning_rate": 1.5625e-06, "loss": 0.1065, "step": 7910 }, { "epoch": 26.007263513513514, "grad_norm": 0.001664106035605073, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7911 }, { "epoch": 26.007297297297296, "grad_norm": 4.685143947601318, "learning_rate": 1.5625e-06, "loss": 0.2971, "step": 7912 }, { "epoch": 26.00733108108108, "grad_norm": 0.0019310700008645654, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7913 }, { "epoch": 26.007364864864865, "grad_norm": 0.003217624733224511, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7914 }, { "epoch": 26.00739864864865, "grad_norm": 0.0013315153773874044, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7915 }, { "epoch": 26.00743243243243, "grad_norm": 3.399223804473877, "learning_rate": 1.5625e-06, "loss": 0.391, "step": 7916 }, { "epoch": 26.007466216216216, "grad_norm": 3.139688014984131, "learning_rate": 1.5625e-06, "loss": 0.0105, "step": 7917 }, { "epoch": 26.0075, "grad_norm": 0.12311340123414993, "learning_rate": 1.5625e-06, "loss": 0.0046, "step": 7918 }, { "epoch": 26.007533783783785, "grad_norm": 0.000877607089933008, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7919 }, { "epoch": 26.007567567567566, "grad_norm": 0.07155340909957886, "learning_rate": 1.5625e-06, "loss": 0.0005, "step": 7920 }, { "epoch": 26.00760135135135, "grad_norm": 0.012010288424789906, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7921 }, { "epoch": 26.007635135135136, "grad_norm": 0.0017583612352609634, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7922 }, { "epoch": 26.00766891891892, "grad_norm": 0.0024105000775307417, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7923 }, { "epoch": 26.0077027027027, "grad_norm": 0.0016348065109923482, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7924 }, { "epoch": 26.007736486486486, "grad_norm": 2.4487626552581787, "learning_rate": 1.5625e-06, "loss": 0.0525, "step": 7925 }, { "epoch": 26.00777027027027, "grad_norm": 2.4261996746063232, "learning_rate": 1.5625e-06, "loss": 0.0073, "step": 7926 }, { "epoch": 26.007804054054056, "grad_norm": 0.008456957526504993, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7927 }, { "epoch": 26.007837837837837, "grad_norm": 0.281935453414917, "learning_rate": 1.5625e-06, "loss": 0.0016, "step": 7928 }, { "epoch": 26.00787162162162, "grad_norm": 0.005547198932617903, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7929 }, { "epoch": 26.007905405405406, "grad_norm": 7.287838459014893, "learning_rate": 1.5625e-06, "loss": 0.0738, "step": 7930 }, { "epoch": 26.00793918918919, "grad_norm": 0.0012488109059631824, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7931 }, { "epoch": 26.007972972972972, "grad_norm": 0.009968617931008339, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7932 }, { "epoch": 26.008006756756757, "grad_norm": 0.0017338166944682598, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7933 }, { "epoch": 26.00804054054054, "grad_norm": 0.0006085460190661252, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7934 }, { "epoch": 26.008074324324323, "grad_norm": 0.013131292536854744, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7935 }, { "epoch": 26.008108108108107, "grad_norm": 0.009965041652321815, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7936 }, { "epoch": 26.008141891891892, "grad_norm": 0.005896378308534622, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7937 }, { "epoch": 26.008175675675677, "grad_norm": 3.4929544925689697, "learning_rate": 1.5625e-06, "loss": 0.0077, "step": 7938 }, { "epoch": 26.008209459459458, "grad_norm": 0.00421398039907217, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7939 }, { "epoch": 26.008243243243243, "grad_norm": 7.888923645019531, "learning_rate": 1.5625e-06, "loss": 0.0213, "step": 7940 }, { "epoch": 26.008277027027027, "grad_norm": 1.3866759538650513, "learning_rate": 1.5625e-06, "loss": 0.0023, "step": 7941 }, { "epoch": 26.008310810810812, "grad_norm": 1.8262827396392822, "learning_rate": 1.5625e-06, "loss": 0.0241, "step": 7942 }, { "epoch": 26.008344594594593, "grad_norm": 0.0034216458443552256, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7943 }, { "epoch": 26.008378378378378, "grad_norm": 0.0007241448038257658, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7944 }, { "epoch": 26.008412162162163, "grad_norm": 0.07668124884366989, "learning_rate": 1.5625e-06, "loss": 0.0006, "step": 7945 }, { "epoch": 26.008445945945947, "grad_norm": 0.12264957278966904, "learning_rate": 1.5625e-06, "loss": 0.0045, "step": 7946 }, { "epoch": 26.00847972972973, "grad_norm": 0.08506279438734055, "learning_rate": 1.5625e-06, "loss": 0.002, "step": 7947 }, { "epoch": 26.008513513513513, "grad_norm": 0.003697403008118272, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7948 }, { "epoch": 26.008547297297298, "grad_norm": 0.0021853933576494455, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7949 }, { "epoch": 26.008581081081083, "grad_norm": 1.9731436967849731, "learning_rate": 1.5625e-06, "loss": 0.0066, "step": 7950 }, { "epoch": 26.008614864864864, "grad_norm": 0.012631135992705822, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7951 }, { "epoch": 26.00864864864865, "grad_norm": 8.92180061340332, "learning_rate": 1.5625e-06, "loss": 0.1259, "step": 7952 }, { "epoch": 26.008682432432433, "grad_norm": 7.947693824768066, "learning_rate": 1.5625e-06, "loss": 0.0132, "step": 7953 }, { "epoch": 26.008716216216218, "grad_norm": 1.691179871559143, "learning_rate": 1.5625e-06, "loss": 0.0405, "step": 7954 }, { "epoch": 26.00875, "grad_norm": 0.05561258643865585, "learning_rate": 1.5625e-06, "loss": 0.0006, "step": 7955 }, { "epoch": 26.008783783783784, "grad_norm": 0.003575169714167714, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7956 }, { "epoch": 26.00881756756757, "grad_norm": 0.007822101935744286, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7957 }, { "epoch": 26.00885135135135, "grad_norm": 0.0015434350352734327, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7958 }, { "epoch": 26.008885135135134, "grad_norm": 2.832014322280884, "learning_rate": 1.5625e-06, "loss": 0.0573, "step": 7959 }, { "epoch": 26.00891891891892, "grad_norm": 0.004841501358896494, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7960 }, { "epoch": 26.008952702702704, "grad_norm": 0.8521828055381775, "learning_rate": 1.5625e-06, "loss": 0.0244, "step": 7961 }, { "epoch": 26.008986486486485, "grad_norm": 0.002403997117653489, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7962 }, { "epoch": 26.00902027027027, "grad_norm": 0.003658958012238145, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7963 }, { "epoch": 26.009054054054054, "grad_norm": 8.421103477478027, "learning_rate": 1.5625e-06, "loss": 0.3143, "step": 7964 }, { "epoch": 26.00908783783784, "grad_norm": 0.0012192976428195834, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7965 }, { "epoch": 26.00912162162162, "grad_norm": 0.003897901624441147, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7966 }, { "epoch": 26.009155405405405, "grad_norm": 57.08354949951172, "learning_rate": 1.5625e-06, "loss": 0.3262, "step": 7967 }, { "epoch": 26.00918918918919, "grad_norm": 0.00510881282389164, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7968 }, { "epoch": 26.009222972972974, "grad_norm": 0.0021641452331095934, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7969 }, { "epoch": 26.009256756756756, "grad_norm": 1.758786678314209, "learning_rate": 1.5625e-06, "loss": 0.0066, "step": 7970 }, { "epoch": 26.00929054054054, "grad_norm": 4.153988838195801, "learning_rate": 1.5625e-06, "loss": 0.4382, "step": 7971 }, { "epoch": 26.009324324324325, "grad_norm": 0.06181499361991882, "learning_rate": 1.5625e-06, "loss": 0.0006, "step": 7972 }, { "epoch": 26.00935810810811, "grad_norm": 0.002745786914601922, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7973 }, { "epoch": 26.00939189189189, "grad_norm": 0.001481806393712759, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7974 }, { "epoch": 26.009425675675676, "grad_norm": 0.3735367953777313, "learning_rate": 1.5625e-06, "loss": 0.0046, "step": 7975 }, { "epoch": 26.00945945945946, "grad_norm": 0.00619215564802289, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7976 }, { "epoch": 26.00949324324324, "grad_norm": 0.007299461867660284, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7977 }, { "epoch": 26.009527027027026, "grad_norm": 0.02911984547972679, "learning_rate": 1.5625e-06, "loss": 0.0006, "step": 7978 }, { "epoch": 26.00956081081081, "grad_norm": 0.3983052670955658, "learning_rate": 1.5625e-06, "loss": 0.0019, "step": 7979 }, { "epoch": 26.009594594594596, "grad_norm": 0.006499618757516146, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7980 }, { "epoch": 26.009628378378377, "grad_norm": 0.06807614117860794, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7981 }, { "epoch": 26.00966216216216, "grad_norm": 0.0070889792405068874, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7982 }, { "epoch": 26.009695945945946, "grad_norm": 0.8428572416305542, "learning_rate": 1.5625e-06, "loss": 0.018, "step": 7983 }, { "epoch": 26.00972972972973, "grad_norm": 1.8043915033340454, "learning_rate": 1.5625e-06, "loss": 0.0032, "step": 7984 }, { "epoch": 26.009763513513512, "grad_norm": 0.035830143839120865, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7985 }, { "epoch": 26.009797297297297, "grad_norm": 0.050644055008888245, "learning_rate": 1.5625e-06, "loss": 0.0003, "step": 7986 }, { "epoch": 26.00983108108108, "grad_norm": 0.004336270038038492, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7987 }, { "epoch": 26.009864864864866, "grad_norm": 4.559478759765625, "learning_rate": 1.5625e-06, "loss": 0.0103, "step": 7988 }, { "epoch": 26.009898648648647, "grad_norm": 0.0012266725534573197, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7989 }, { "epoch": 26.009932432432432, "grad_norm": 0.0015958676813170314, "learning_rate": 1.5625e-06, "loss": 0.0, "step": 7990 }, { "epoch": 26.009966216216217, "grad_norm": 0.003875533351674676, "learning_rate": 1.5625e-06, "loss": 0.0001, "step": 7991 }, { "epoch": 26.01, "grad_norm": 0.012601383030414581, "learning_rate": 1.5625e-06, "loss": 0.0002, "step": 7992 }, { "epoch": 26.01, "eval_accuracy": 0.8998384491114702, "eval_loss": 0.5808717608451843, "eval_runtime": 34.0541, "eval_samples_per_second": 18.177, "eval_steps_per_second": 2.29, "step": 7992 }, { "epoch": 27.000033783783785, "grad_norm": 4.95831823348999, "learning_rate": 7.8125e-07, "loss": 0.4373, "step": 7993 }, { "epoch": 27.000067567567566, "grad_norm": 0.0027436341624706984, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 7994 }, { "epoch": 27.00010135135135, "grad_norm": 0.00996605958789587, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 7995 }, { "epoch": 27.000135135135135, "grad_norm": 0.001531496411189437, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 7996 }, { "epoch": 27.00016891891892, "grad_norm": 11.257230758666992, "learning_rate": 7.8125e-07, "loss": 0.402, "step": 7997 }, { "epoch": 27.0002027027027, "grad_norm": 0.0008498905808664858, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 7998 }, { "epoch": 27.000236486486486, "grad_norm": 0.002811991609632969, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 7999 }, { "epoch": 27.00027027027027, "grad_norm": 0.0027453803922981024, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8000 }, { "epoch": 27.000304054054055, "grad_norm": 0.7094612121582031, "learning_rate": 7.8125e-07, "loss": 0.0046, "step": 8001 }, { "epoch": 27.000337837837836, "grad_norm": 0.012386166490614414, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8002 }, { "epoch": 27.00037162162162, "grad_norm": 0.0008588716154918075, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8003 }, { "epoch": 27.000405405405406, "grad_norm": 3.2342000007629395, "learning_rate": 7.8125e-07, "loss": 0.3925, "step": 8004 }, { "epoch": 27.00043918918919, "grad_norm": 0.12975329160690308, "learning_rate": 7.8125e-07, "loss": 0.0048, "step": 8005 }, { "epoch": 27.00047297297297, "grad_norm": 0.05553199723362923, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8006 }, { "epoch": 27.000506756756756, "grad_norm": 0.020567452535033226, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8007 }, { "epoch": 27.00054054054054, "grad_norm": 10.400177001953125, "learning_rate": 7.8125e-07, "loss": 0.0456, "step": 8008 }, { "epoch": 27.000574324324326, "grad_norm": 0.0007760778535157442, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8009 }, { "epoch": 27.000608108108107, "grad_norm": 0.001297122915275395, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8010 }, { "epoch": 27.000641891891892, "grad_norm": 0.010583166033029556, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8011 }, { "epoch": 27.000675675675677, "grad_norm": 0.19142115116119385, "learning_rate": 7.8125e-07, "loss": 0.0051, "step": 8012 }, { "epoch": 27.00070945945946, "grad_norm": 0.14380468428134918, "learning_rate": 7.8125e-07, "loss": 0.0055, "step": 8013 }, { "epoch": 27.000743243243242, "grad_norm": 0.052020758390426636, "learning_rate": 7.8125e-07, "loss": 0.0006, "step": 8014 }, { "epoch": 27.000777027027027, "grad_norm": 0.030479557812213898, "learning_rate": 7.8125e-07, "loss": 0.0006, "step": 8015 }, { "epoch": 27.000810810810812, "grad_norm": 0.01525325607508421, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8016 }, { "epoch": 27.000844594594593, "grad_norm": 0.0007296857656911016, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8017 }, { "epoch": 27.000878378378378, "grad_norm": 0.008112887851893902, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8018 }, { "epoch": 27.000912162162162, "grad_norm": 0.1356368064880371, "learning_rate": 7.8125e-07, "loss": 0.0052, "step": 8019 }, { "epoch": 27.000945945945947, "grad_norm": 0.0013219547690823674, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8020 }, { "epoch": 27.00097972972973, "grad_norm": 0.0009406657773070037, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8021 }, { "epoch": 27.001013513513513, "grad_norm": 0.12096104025840759, "learning_rate": 7.8125e-07, "loss": 0.0042, "step": 8022 }, { "epoch": 27.001047297297298, "grad_norm": 0.018251361325383186, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8023 }, { "epoch": 27.001081081081082, "grad_norm": 0.5481263995170593, "learning_rate": 7.8125e-07, "loss": 0.0023, "step": 8024 }, { "epoch": 27.001114864864864, "grad_norm": 0.0038513210602104664, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8025 }, { "epoch": 27.00114864864865, "grad_norm": 0.017858130857348442, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8026 }, { "epoch": 27.001182432432433, "grad_norm": 0.001257450901903212, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8027 }, { "epoch": 27.001216216216218, "grad_norm": 0.0007348029175773263, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8028 }, { "epoch": 27.00125, "grad_norm": 0.011886049062013626, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8029 }, { "epoch": 27.001283783783784, "grad_norm": 0.0010087838163599372, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8030 }, { "epoch": 27.00131756756757, "grad_norm": 0.0011193715035915375, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8031 }, { "epoch": 27.001351351351353, "grad_norm": 0.00143164221663028, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8032 }, { "epoch": 27.001385135135134, "grad_norm": 0.004333157557994127, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8033 }, { "epoch": 27.00141891891892, "grad_norm": 69.11419677734375, "learning_rate": 7.8125e-07, "loss": 0.9464, "step": 8034 }, { "epoch": 27.001452702702704, "grad_norm": 0.00425761379301548, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8035 }, { "epoch": 27.001486486486485, "grad_norm": 3.3076155185699463, "learning_rate": 7.8125e-07, "loss": 0.3851, "step": 8036 }, { "epoch": 27.00152027027027, "grad_norm": 0.007740946486592293, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8037 }, { "epoch": 27.001554054054054, "grad_norm": 0.0008613129612058401, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8038 }, { "epoch": 27.00158783783784, "grad_norm": 0.008918289095163345, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8039 }, { "epoch": 27.00162162162162, "grad_norm": 0.02224554307758808, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8040 }, { "epoch": 27.001655405405405, "grad_norm": 0.001976126804947853, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8041 }, { "epoch": 27.00168918918919, "grad_norm": 1.7103971242904663, "learning_rate": 7.8125e-07, "loss": 0.0047, "step": 8042 }, { "epoch": 27.001722972972974, "grad_norm": 0.023866046220064163, "learning_rate": 7.8125e-07, "loss": 0.0005, "step": 8043 }, { "epoch": 27.001756756756755, "grad_norm": 0.04767349734902382, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8044 }, { "epoch": 27.00179054054054, "grad_norm": 11.446744918823242, "learning_rate": 7.8125e-07, "loss": 0.1374, "step": 8045 }, { "epoch": 27.001824324324325, "grad_norm": 0.12895351648330688, "learning_rate": 7.8125e-07, "loss": 0.0017, "step": 8046 }, { "epoch": 27.00185810810811, "grad_norm": 0.0015632044523954391, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8047 }, { "epoch": 27.00189189189189, "grad_norm": 0.07780846208333969, "learning_rate": 7.8125e-07, "loss": 0.0023, "step": 8048 }, { "epoch": 27.001925675675675, "grad_norm": 0.006043533328920603, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8049 }, { "epoch": 27.00195945945946, "grad_norm": 0.34862473607063293, "learning_rate": 7.8125e-07, "loss": 0.0014, "step": 8050 }, { "epoch": 27.001993243243245, "grad_norm": 0.0008024520357139409, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8051 }, { "epoch": 27.002027027027026, "grad_norm": 0.002007013652473688, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8052 }, { "epoch": 27.00206081081081, "grad_norm": 0.030682047829031944, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8053 }, { "epoch": 27.002094594594595, "grad_norm": 0.006109616253525019, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8054 }, { "epoch": 27.00212837837838, "grad_norm": 0.001173807424493134, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8055 }, { "epoch": 27.00216216216216, "grad_norm": 0.001256516552530229, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8056 }, { "epoch": 27.002195945945946, "grad_norm": 17.68021583557129, "learning_rate": 7.8125e-07, "loss": 0.2254, "step": 8057 }, { "epoch": 27.00222972972973, "grad_norm": 0.0011371137807145715, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8058 }, { "epoch": 27.002263513513512, "grad_norm": 0.15702402591705322, "learning_rate": 7.8125e-07, "loss": 0.0007, "step": 8059 }, { "epoch": 27.002297297297297, "grad_norm": 0.0011920398101210594, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8060 }, { "epoch": 27.00233108108108, "grad_norm": 0.17974498867988586, "learning_rate": 7.8125e-07, "loss": 0.0066, "step": 8061 }, { "epoch": 27.002364864864866, "grad_norm": 0.005175303667783737, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8062 }, { "epoch": 27.002398648648647, "grad_norm": 0.0070770131424069405, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8063 }, { "epoch": 27.002432432432432, "grad_norm": 0.006029477808624506, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8064 }, { "epoch": 27.002466216216217, "grad_norm": 0.0033068915363401175, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8065 }, { "epoch": 27.0025, "grad_norm": 0.007264968007802963, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8066 }, { "epoch": 27.002533783783782, "grad_norm": 0.0017090673791244626, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8067 }, { "epoch": 27.002567567567567, "grad_norm": 0.0013961246004328132, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8068 }, { "epoch": 27.002601351351352, "grad_norm": 0.5279844999313354, "learning_rate": 7.8125e-07, "loss": 0.0103, "step": 8069 }, { "epoch": 27.002635135135137, "grad_norm": 0.0008461606921628118, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8070 }, { "epoch": 27.002668918918918, "grad_norm": 44.405479431152344, "learning_rate": 7.8125e-07, "loss": 0.1203, "step": 8071 }, { "epoch": 27.002702702702702, "grad_norm": 0.16805966198444366, "learning_rate": 7.8125e-07, "loss": 0.0063, "step": 8072 }, { "epoch": 27.002736486486487, "grad_norm": 0.004874128848314285, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8073 }, { "epoch": 27.002770270270272, "grad_norm": 0.002891726093366742, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8074 }, { "epoch": 27.002804054054053, "grad_norm": 0.04847292974591255, "learning_rate": 7.8125e-07, "loss": 0.0007, "step": 8075 }, { "epoch": 27.002837837837838, "grad_norm": 0.007118455599993467, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8076 }, { "epoch": 27.002871621621622, "grad_norm": 0.003611467545852065, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8077 }, { "epoch": 27.002905405405407, "grad_norm": 16.120603561401367, "learning_rate": 7.8125e-07, "loss": 0.0308, "step": 8078 }, { "epoch": 27.00293918918919, "grad_norm": 0.18626050651073456, "learning_rate": 7.8125e-07, "loss": 0.0046, "step": 8079 }, { "epoch": 27.002972972972973, "grad_norm": 8.532062530517578, "learning_rate": 7.8125e-07, "loss": 0.5154, "step": 8080 }, { "epoch": 27.003006756756758, "grad_norm": 0.033240195363759995, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8081 }, { "epoch": 27.00304054054054, "grad_norm": 0.17876505851745605, "learning_rate": 7.8125e-07, "loss": 0.0068, "step": 8082 }, { "epoch": 27.003074324324324, "grad_norm": 0.005148987751454115, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8083 }, { "epoch": 27.00310810810811, "grad_norm": 0.0014655282720923424, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8084 }, { "epoch": 27.003141891891893, "grad_norm": 0.061902571469545364, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8085 }, { "epoch": 27.003175675675674, "grad_norm": 1.0587987899780273, "learning_rate": 7.8125e-07, "loss": 0.0048, "step": 8086 }, { "epoch": 27.00320945945946, "grad_norm": 0.0027379486709833145, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8087 }, { "epoch": 27.003243243243244, "grad_norm": 0.07120426744222641, "learning_rate": 7.8125e-07, "loss": 0.0008, "step": 8088 }, { "epoch": 27.00327702702703, "grad_norm": 0.16953273117542267, "learning_rate": 7.8125e-07, "loss": 0.0035, "step": 8089 }, { "epoch": 27.00331081081081, "grad_norm": 0.001278493320569396, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8090 }, { "epoch": 27.003344594594594, "grad_norm": 0.007638989016413689, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8091 }, { "epoch": 27.00337837837838, "grad_norm": 0.0016540359938517213, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8092 }, { "epoch": 27.003412162162164, "grad_norm": 0.33785945177078247, "learning_rate": 7.8125e-07, "loss": 0.0145, "step": 8093 }, { "epoch": 27.003445945945945, "grad_norm": 0.002175846602767706, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8094 }, { "epoch": 27.00347972972973, "grad_norm": 0.002314617857336998, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8095 }, { "epoch": 27.003513513513514, "grad_norm": 0.011146518401801586, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8096 }, { "epoch": 27.0035472972973, "grad_norm": 0.013878906145691872, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8097 }, { "epoch": 27.00358108108108, "grad_norm": 0.0016935189487412572, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8098 }, { "epoch": 27.003614864864865, "grad_norm": 0.005586853250861168, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8099 }, { "epoch": 27.00364864864865, "grad_norm": 0.8160308003425598, "learning_rate": 7.8125e-07, "loss": 0.0126, "step": 8100 }, { "epoch": 27.00368243243243, "grad_norm": 0.0013053020229563117, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8101 }, { "epoch": 27.003716216216215, "grad_norm": 1.2159030437469482, "learning_rate": 7.8125e-07, "loss": 0.0026, "step": 8102 }, { "epoch": 27.00375, "grad_norm": 0.002121070632711053, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8103 }, { "epoch": 27.003783783783785, "grad_norm": 0.0019389302469789982, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8104 }, { "epoch": 27.003817567567566, "grad_norm": 4.63238000869751, "learning_rate": 7.8125e-07, "loss": 0.4306, "step": 8105 }, { "epoch": 27.00385135135135, "grad_norm": 0.09372426569461823, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8106 }, { "epoch": 27.003885135135135, "grad_norm": 0.00422033853828907, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8107 }, { "epoch": 27.00391891891892, "grad_norm": 10.346502304077148, "learning_rate": 7.8125e-07, "loss": 0.2444, "step": 8108 }, { "epoch": 27.0039527027027, "grad_norm": 0.0010534831089898944, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8109 }, { "epoch": 27.003986486486486, "grad_norm": 0.0021680165082216263, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8110 }, { "epoch": 27.00402027027027, "grad_norm": 0.003123793052509427, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8111 }, { "epoch": 27.004054054054055, "grad_norm": 0.10119820386171341, "learning_rate": 7.8125e-07, "loss": 0.0011, "step": 8112 }, { "epoch": 27.004087837837837, "grad_norm": 0.6372517943382263, "learning_rate": 7.8125e-07, "loss": 0.0049, "step": 8113 }, { "epoch": 27.00412162162162, "grad_norm": 0.005029190797358751, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8114 }, { "epoch": 27.004155405405406, "grad_norm": 0.005415298510342836, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8115 }, { "epoch": 27.00418918918919, "grad_norm": 0.004048728384077549, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8116 }, { "epoch": 27.004222972972972, "grad_norm": 0.42844197154045105, "learning_rate": 7.8125e-07, "loss": 0.003, "step": 8117 }, { "epoch": 27.004256756756757, "grad_norm": 0.0007737193373031914, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8118 }, { "epoch": 27.00429054054054, "grad_norm": 0.0505598820745945, "learning_rate": 7.8125e-07, "loss": 0.0006, "step": 8119 }, { "epoch": 27.004324324324326, "grad_norm": 0.0014954510843381286, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8120 }, { "epoch": 27.004358108108107, "grad_norm": 0.0022486981470137835, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8121 }, { "epoch": 27.004391891891892, "grad_norm": 0.00462738610804081, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8122 }, { "epoch": 27.004425675675677, "grad_norm": 0.002065502107143402, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8123 }, { "epoch": 27.004459459459458, "grad_norm": 0.0017883742693811655, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8124 }, { "epoch": 27.004493243243243, "grad_norm": 12.815977096557617, "learning_rate": 7.8125e-07, "loss": 1.0209, "step": 8125 }, { "epoch": 27.004527027027027, "grad_norm": 0.0031396597623825073, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8126 }, { "epoch": 27.004560810810812, "grad_norm": 0.006799632683396339, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8127 }, { "epoch": 27.004594594594593, "grad_norm": 3.4859719276428223, "learning_rate": 7.8125e-07, "loss": 0.0303, "step": 8128 }, { "epoch": 27.004628378378378, "grad_norm": 17.78800392150879, "learning_rate": 7.8125e-07, "loss": 0.1455, "step": 8129 }, { "epoch": 27.004662162162163, "grad_norm": 3.956531524658203, "learning_rate": 7.8125e-07, "loss": 0.0069, "step": 8130 }, { "epoch": 27.004695945945947, "grad_norm": 0.0012388868490234017, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8131 }, { "epoch": 27.00472972972973, "grad_norm": 0.021779382601380348, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8132 }, { "epoch": 27.004763513513513, "grad_norm": 0.004463553428649902, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8133 }, { "epoch": 27.004797297297298, "grad_norm": 37.98836135864258, "learning_rate": 7.8125e-07, "loss": 0.1128, "step": 8134 }, { "epoch": 27.004831081081083, "grad_norm": 0.005919934716075659, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8135 }, { "epoch": 27.004864864864864, "grad_norm": 1.2060214281082153, "learning_rate": 7.8125e-07, "loss": 0.0082, "step": 8136 }, { "epoch": 27.00489864864865, "grad_norm": 0.17101001739501953, "learning_rate": 7.8125e-07, "loss": 0.0011, "step": 8137 }, { "epoch": 27.004932432432433, "grad_norm": 0.003472774988040328, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8138 }, { "epoch": 27.004966216216218, "grad_norm": 44.02445983886719, "learning_rate": 7.8125e-07, "loss": 0.099, "step": 8139 }, { "epoch": 27.005, "grad_norm": 0.0059242951683700085, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8140 }, { "epoch": 27.005033783783784, "grad_norm": 0.031576912850141525, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8141 }, { "epoch": 27.00506756756757, "grad_norm": 0.0028106726240366697, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8142 }, { "epoch": 27.00510135135135, "grad_norm": 39.93797302246094, "learning_rate": 7.8125e-07, "loss": 0.2251, "step": 8143 }, { "epoch": 27.005135135135134, "grad_norm": 0.10348507016897202, "learning_rate": 7.8125e-07, "loss": 0.0008, "step": 8144 }, { "epoch": 27.00516891891892, "grad_norm": 0.0057381922379136086, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8145 }, { "epoch": 27.005202702702704, "grad_norm": 0.008873422630131245, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8146 }, { "epoch": 27.005236486486485, "grad_norm": 15.946704864501953, "learning_rate": 7.8125e-07, "loss": 0.0256, "step": 8147 }, { "epoch": 27.00527027027027, "grad_norm": 0.0007089879945851862, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8148 }, { "epoch": 27.005304054054054, "grad_norm": 0.09650364518165588, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 8149 }, { "epoch": 27.00533783783784, "grad_norm": 0.018023336306214333, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8150 }, { "epoch": 27.00537162162162, "grad_norm": 0.16161775588989258, "learning_rate": 7.8125e-07, "loss": 0.0016, "step": 8151 }, { "epoch": 27.005405405405405, "grad_norm": 0.1369716078042984, "learning_rate": 7.8125e-07, "loss": 0.0011, "step": 8152 }, { "epoch": 27.00543918918919, "grad_norm": 0.16819755733013153, "learning_rate": 7.8125e-07, "loss": 0.0061, "step": 8153 }, { "epoch": 27.005472972972974, "grad_norm": 0.09713166952133179, "learning_rate": 7.8125e-07, "loss": 0.0016, "step": 8154 }, { "epoch": 27.005506756756755, "grad_norm": 0.1497369408607483, "learning_rate": 7.8125e-07, "loss": 0.0011, "step": 8155 }, { "epoch": 27.00554054054054, "grad_norm": 0.0022339792922139168, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8156 }, { "epoch": 27.005574324324325, "grad_norm": 4.895373821258545, "learning_rate": 7.8125e-07, "loss": 0.3091, "step": 8157 }, { "epoch": 27.00560810810811, "grad_norm": 0.0045626782812178135, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8158 }, { "epoch": 27.00564189189189, "grad_norm": 0.38703665137290955, "learning_rate": 7.8125e-07, "loss": 0.0019, "step": 8159 }, { "epoch": 27.005675675675676, "grad_norm": 0.0006757454248145223, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8160 }, { "epoch": 27.00570945945946, "grad_norm": 0.003363413969054818, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8161 }, { "epoch": 27.005743243243245, "grad_norm": 0.0032096095383167267, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8162 }, { "epoch": 27.005777027027026, "grad_norm": 0.5904726386070251, "learning_rate": 7.8125e-07, "loss": 0.0064, "step": 8163 }, { "epoch": 27.00581081081081, "grad_norm": 0.0035452835727483034, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8164 }, { "epoch": 27.005844594594596, "grad_norm": 0.01609012298285961, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8165 }, { "epoch": 27.005878378378377, "grad_norm": 0.001905524404719472, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8166 }, { "epoch": 27.00591216216216, "grad_norm": 0.0035154849756509066, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8167 }, { "epoch": 27.005945945945946, "grad_norm": 0.002637613797560334, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8168 }, { "epoch": 27.00597972972973, "grad_norm": 0.09039553999900818, "learning_rate": 7.8125e-07, "loss": 0.0005, "step": 8169 }, { "epoch": 27.006013513513512, "grad_norm": 34.78951644897461, "learning_rate": 7.8125e-07, "loss": 0.4235, "step": 8170 }, { "epoch": 27.006047297297297, "grad_norm": 11.242767333984375, "learning_rate": 7.8125e-07, "loss": 0.0146, "step": 8171 }, { "epoch": 27.00608108108108, "grad_norm": 2.7234747409820557, "learning_rate": 7.8125e-07, "loss": 0.0863, "step": 8172 }, { "epoch": 27.006114864864866, "grad_norm": 0.002622476778924465, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8173 }, { "epoch": 27.006148648648647, "grad_norm": 4.062800407409668, "learning_rate": 7.8125e-07, "loss": 0.4067, "step": 8174 }, { "epoch": 27.006182432432432, "grad_norm": 54.673126220703125, "learning_rate": 7.8125e-07, "loss": 0.2228, "step": 8175 }, { "epoch": 27.006216216216217, "grad_norm": 0.0019132117740809917, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8176 }, { "epoch": 27.00625, "grad_norm": 0.1211434155702591, "learning_rate": 7.8125e-07, "loss": 0.0039, "step": 8177 }, { "epoch": 27.006283783783783, "grad_norm": 0.09516709297895432, "learning_rate": 7.8125e-07, "loss": 0.0012, "step": 8178 }, { "epoch": 27.006317567567567, "grad_norm": 0.0007494444726034999, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8179 }, { "epoch": 27.006351351351352, "grad_norm": 0.15888327360153198, "learning_rate": 7.8125e-07, "loss": 0.0056, "step": 8180 }, { "epoch": 27.006385135135137, "grad_norm": 0.2661021053791046, "learning_rate": 7.8125e-07, "loss": 0.0063, "step": 8181 }, { "epoch": 27.006418918918918, "grad_norm": 0.1516118049621582, "learning_rate": 7.8125e-07, "loss": 0.0056, "step": 8182 }, { "epoch": 27.006452702702703, "grad_norm": 0.0032451599836349487, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8183 }, { "epoch": 27.006486486486487, "grad_norm": 0.41899189352989197, "learning_rate": 7.8125e-07, "loss": 0.0012, "step": 8184 }, { "epoch": 27.006520270270272, "grad_norm": 25.49235725402832, "learning_rate": 7.8125e-07, "loss": 0.2071, "step": 8185 }, { "epoch": 27.006554054054053, "grad_norm": 0.0024888417683541775, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8186 }, { "epoch": 27.006587837837838, "grad_norm": 0.033837173134088516, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 8187 }, { "epoch": 27.006621621621623, "grad_norm": 0.001886169658973813, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8188 }, { "epoch": 27.006655405405404, "grad_norm": 0.0016964609967544675, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8189 }, { "epoch": 27.00668918918919, "grad_norm": 0.0008328907424584031, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8190 }, { "epoch": 27.006722972972973, "grad_norm": 0.0009738213266246021, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8191 }, { "epoch": 27.006756756756758, "grad_norm": 0.009175922721624374, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8192 }, { "epoch": 27.00679054054054, "grad_norm": 0.0009830892086029053, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8193 }, { "epoch": 27.006824324324324, "grad_norm": 0.9143563508987427, "learning_rate": 7.8125e-07, "loss": 0.0247, "step": 8194 }, { "epoch": 27.00685810810811, "grad_norm": 4.1500139236450195, "learning_rate": 7.8125e-07, "loss": 0.4779, "step": 8195 }, { "epoch": 27.006891891891893, "grad_norm": 0.030624479055404663, "learning_rate": 7.8125e-07, "loss": 0.0005, "step": 8196 }, { "epoch": 27.006925675675674, "grad_norm": 0.0008384949178434908, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8197 }, { "epoch": 27.00695945945946, "grad_norm": 0.1636333465576172, "learning_rate": 7.8125e-07, "loss": 0.0059, "step": 8198 }, { "epoch": 27.006993243243244, "grad_norm": 0.028249965980648994, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 8199 }, { "epoch": 27.00702702702703, "grad_norm": 0.005245206877589226, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8200 }, { "epoch": 27.00706081081081, "grad_norm": 0.0021541935857385397, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8201 }, { "epoch": 27.007094594594594, "grad_norm": 0.012173685245215893, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8202 }, { "epoch": 27.00712837837838, "grad_norm": 0.028013894334435463, "learning_rate": 7.8125e-07, "loss": 0.0006, "step": 8203 }, { "epoch": 27.007162162162164, "grad_norm": 10.985392570495605, "learning_rate": 7.8125e-07, "loss": 0.5746, "step": 8204 }, { "epoch": 27.007195945945945, "grad_norm": 0.0019463705830276012, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8205 }, { "epoch": 27.00722972972973, "grad_norm": 0.0009295706986449659, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8206 }, { "epoch": 27.007263513513514, "grad_norm": 0.0017408767016604543, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8207 }, { "epoch": 27.007297297297296, "grad_norm": 0.011006105691194534, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8208 }, { "epoch": 27.00733108108108, "grad_norm": 0.011709999293088913, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8209 }, { "epoch": 27.007364864864865, "grad_norm": 0.0026054871268570423, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8210 }, { "epoch": 27.00739864864865, "grad_norm": 0.0014188840286806226, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8211 }, { "epoch": 27.00743243243243, "grad_norm": 2.4485580921173096, "learning_rate": 7.8125e-07, "loss": 0.0061, "step": 8212 }, { "epoch": 27.007466216216216, "grad_norm": 0.0024122116155922413, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8213 }, { "epoch": 27.0075, "grad_norm": 0.0026014726608991623, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8214 }, { "epoch": 27.007533783783785, "grad_norm": 0.0027518533170223236, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8215 }, { "epoch": 27.007567567567566, "grad_norm": 0.0017520105466246605, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8216 }, { "epoch": 27.00760135135135, "grad_norm": 0.021556802093982697, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8217 }, { "epoch": 27.007635135135136, "grad_norm": 0.4227984547615051, "learning_rate": 7.8125e-07, "loss": 0.0069, "step": 8218 }, { "epoch": 27.00766891891892, "grad_norm": 22.657169342041016, "learning_rate": 7.8125e-07, "loss": 0.3237, "step": 8219 }, { "epoch": 27.0077027027027, "grad_norm": 0.35185015201568604, "learning_rate": 7.8125e-07, "loss": 0.001, "step": 8220 }, { "epoch": 27.007736486486486, "grad_norm": 0.021747887134552002, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8221 }, { "epoch": 27.00777027027027, "grad_norm": 0.692866325378418, "learning_rate": 7.8125e-07, "loss": 0.0139, "step": 8222 }, { "epoch": 27.007804054054056, "grad_norm": 0.03128596395254135, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8223 }, { "epoch": 27.007837837837837, "grad_norm": 0.0008781814249232411, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8224 }, { "epoch": 27.00787162162162, "grad_norm": 0.1676606982946396, "learning_rate": 7.8125e-07, "loss": 0.0006, "step": 8225 }, { "epoch": 27.007905405405406, "grad_norm": 0.1321205049753189, "learning_rate": 7.8125e-07, "loss": 0.005, "step": 8226 }, { "epoch": 27.00793918918919, "grad_norm": 0.010061067529022694, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8227 }, { "epoch": 27.007972972972972, "grad_norm": 0.0036851565819233656, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8228 }, { "epoch": 27.008006756756757, "grad_norm": 0.003040301613509655, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8229 }, { "epoch": 27.00804054054054, "grad_norm": 2.9688897132873535, "learning_rate": 7.8125e-07, "loss": 0.0727, "step": 8230 }, { "epoch": 27.008074324324323, "grad_norm": 0.06495403498411179, "learning_rate": 7.8125e-07, "loss": 0.0006, "step": 8231 }, { "epoch": 27.008108108108107, "grad_norm": 0.001269817934371531, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8232 }, { "epoch": 27.008141891891892, "grad_norm": 0.0018158338498324156, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8233 }, { "epoch": 27.008175675675677, "grad_norm": 0.0015085241757333279, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8234 }, { "epoch": 27.008209459459458, "grad_norm": 0.0017586580943316221, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8235 }, { "epoch": 27.008243243243243, "grad_norm": 0.002158177550882101, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8236 }, { "epoch": 27.008277027027027, "grad_norm": 0.14350561797618866, "learning_rate": 7.8125e-07, "loss": 0.0016, "step": 8237 }, { "epoch": 27.008310810810812, "grad_norm": 0.0026066889986395836, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8238 }, { "epoch": 27.008344594594593, "grad_norm": 0.12309684604406357, "learning_rate": 7.8125e-07, "loss": 0.0046, "step": 8239 }, { "epoch": 27.008378378378378, "grad_norm": 0.0028816075064241886, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8240 }, { "epoch": 27.008412162162163, "grad_norm": 0.0007388028898276389, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8241 }, { "epoch": 27.008445945945947, "grad_norm": 0.11444553732872009, "learning_rate": 7.8125e-07, "loss": 0.001, "step": 8242 }, { "epoch": 27.00847972972973, "grad_norm": 5.558946132659912, "learning_rate": 7.8125e-07, "loss": 0.0375, "step": 8243 }, { "epoch": 27.008513513513513, "grad_norm": 0.004226096905767918, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8244 }, { "epoch": 27.008547297297298, "grad_norm": 0.214463472366333, "learning_rate": 7.8125e-07, "loss": 0.0064, "step": 8245 }, { "epoch": 27.008581081081083, "grad_norm": 0.0025074949953705072, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8246 }, { "epoch": 27.008614864864864, "grad_norm": 0.02057541161775589, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8247 }, { "epoch": 27.00864864864865, "grad_norm": 0.0018538562580943108, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8248 }, { "epoch": 27.008682432432433, "grad_norm": 0.0022887527011334896, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8249 }, { "epoch": 27.008716216216218, "grad_norm": 0.0015530905220657587, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8250 }, { "epoch": 27.00875, "grad_norm": 0.0029444131068885326, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8251 }, { "epoch": 27.008783783783784, "grad_norm": 0.0038890126161277294, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8252 }, { "epoch": 27.00881756756757, "grad_norm": 0.0011190949007868767, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8253 }, { "epoch": 27.00885135135135, "grad_norm": 0.2507287263870239, "learning_rate": 7.8125e-07, "loss": 0.0015, "step": 8254 }, { "epoch": 27.008885135135134, "grad_norm": 0.006060863845050335, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8255 }, { "epoch": 27.00891891891892, "grad_norm": 0.7013164758682251, "learning_rate": 7.8125e-07, "loss": 0.0225, "step": 8256 }, { "epoch": 27.008952702702704, "grad_norm": 0.1315649300813675, "learning_rate": 7.8125e-07, "loss": 0.005, "step": 8257 }, { "epoch": 27.008986486486485, "grad_norm": 0.0011245415080338717, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8258 }, { "epoch": 27.00902027027027, "grad_norm": 0.003367838216945529, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8259 }, { "epoch": 27.009054054054054, "grad_norm": 4.091954231262207, "learning_rate": 7.8125e-07, "loss": 0.0364, "step": 8260 }, { "epoch": 27.00908783783784, "grad_norm": 0.012043853290379047, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8261 }, { "epoch": 27.00912162162162, "grad_norm": 3.3727166652679443, "learning_rate": 7.8125e-07, "loss": 0.0396, "step": 8262 }, { "epoch": 27.009155405405405, "grad_norm": 60.96499252319336, "learning_rate": 7.8125e-07, "loss": 0.3272, "step": 8263 }, { "epoch": 27.00918918918919, "grad_norm": 0.0009319810196757317, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8264 }, { "epoch": 27.009222972972974, "grad_norm": 0.004913634154945612, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8265 }, { "epoch": 27.009256756756756, "grad_norm": 0.03302360326051712, "learning_rate": 7.8125e-07, "loss": 0.001, "step": 8266 }, { "epoch": 27.00929054054054, "grad_norm": 0.11869248002767563, "learning_rate": 7.8125e-07, "loss": 0.0044, "step": 8267 }, { "epoch": 27.009324324324325, "grad_norm": 0.20326343178749084, "learning_rate": 7.8125e-07, "loss": 0.0043, "step": 8268 }, { "epoch": 27.00935810810811, "grad_norm": 0.1516885757446289, "learning_rate": 7.8125e-07, "loss": 0.0055, "step": 8269 }, { "epoch": 27.00939189189189, "grad_norm": 0.18158625066280365, "learning_rate": 7.8125e-07, "loss": 0.0063, "step": 8270 }, { "epoch": 27.009425675675676, "grad_norm": 0.011556042358279228, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8271 }, { "epoch": 27.00945945945946, "grad_norm": 0.0013928984990343451, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8272 }, { "epoch": 27.00949324324324, "grad_norm": 23.086902618408203, "learning_rate": 7.8125e-07, "loss": 0.0352, "step": 8273 }, { "epoch": 27.009527027027026, "grad_norm": 37.90145492553711, "learning_rate": 7.8125e-07, "loss": 0.7611, "step": 8274 }, { "epoch": 27.00956081081081, "grad_norm": 0.45391255617141724, "learning_rate": 7.8125e-07, "loss": 0.0008, "step": 8275 }, { "epoch": 27.009594594594596, "grad_norm": 0.0953538790345192, "learning_rate": 7.8125e-07, "loss": 0.0007, "step": 8276 }, { "epoch": 27.009628378378377, "grad_norm": 0.0016252751229330897, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8277 }, { "epoch": 27.00966216216216, "grad_norm": 0.0009719978552311659, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8278 }, { "epoch": 27.009695945945946, "grad_norm": 0.13046346604824066, "learning_rate": 7.8125e-07, "loss": 0.0048, "step": 8279 }, { "epoch": 27.00972972972973, "grad_norm": 0.018592528998851776, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8280 }, { "epoch": 27.009763513513512, "grad_norm": 6.405369281768799, "learning_rate": 7.8125e-07, "loss": 0.3139, "step": 8281 }, { "epoch": 27.009797297297297, "grad_norm": 5.295281410217285, "learning_rate": 7.8125e-07, "loss": 0.3912, "step": 8282 }, { "epoch": 27.00983108108108, "grad_norm": 0.00271977367810905, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8283 }, { "epoch": 27.009864864864866, "grad_norm": 1.2808681726455688, "learning_rate": 7.8125e-07, "loss": 0.0096, "step": 8284 }, { "epoch": 27.009898648648647, "grad_norm": 0.45274075865745544, "learning_rate": 7.8125e-07, "loss": 0.0097, "step": 8285 }, { "epoch": 27.009932432432432, "grad_norm": 0.0009635603055357933, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8286 }, { "epoch": 27.009966216216217, "grad_norm": 7.281394004821777, "learning_rate": 7.8125e-07, "loss": 0.0122, "step": 8287 }, { "epoch": 27.01, "grad_norm": 1.0600816011428833, "learning_rate": 7.8125e-07, "loss": 0.0031, "step": 8288 }, { "epoch": 27.01, "eval_accuracy": 0.8966074313408724, "eval_loss": 0.5868592262268066, "eval_runtime": 33.1739, "eval_samples_per_second": 18.659, "eval_steps_per_second": 2.351, "step": 8288 }, { "epoch": 28.000033783783785, "grad_norm": 0.00340086012147367, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8289 }, { "epoch": 28.000067567567566, "grad_norm": 0.002092257607728243, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8290 }, { "epoch": 28.00010135135135, "grad_norm": 0.0028425047639757395, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8291 }, { "epoch": 28.000135135135135, "grad_norm": 0.23601338267326355, "learning_rate": 7.8125e-07, "loss": 0.0071, "step": 8292 }, { "epoch": 28.00016891891892, "grad_norm": 0.0010806795908138156, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8293 }, { "epoch": 28.0002027027027, "grad_norm": 36.42216873168945, "learning_rate": 7.8125e-07, "loss": 0.7847, "step": 8294 }, { "epoch": 28.000236486486486, "grad_norm": 1.370995283126831, "learning_rate": 7.8125e-07, "loss": 0.0111, "step": 8295 }, { "epoch": 28.00027027027027, "grad_norm": 0.0011266848305240273, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8296 }, { "epoch": 28.000304054054055, "grad_norm": 0.3204402029514313, "learning_rate": 7.8125e-07, "loss": 0.0013, "step": 8297 }, { "epoch": 28.000337837837836, "grad_norm": 0.0008936551166698337, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8298 }, { "epoch": 28.00037162162162, "grad_norm": 0.5703086256980896, "learning_rate": 7.8125e-07, "loss": 0.0154, "step": 8299 }, { "epoch": 28.000405405405406, "grad_norm": 0.0009683124953880906, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8300 }, { "epoch": 28.00043918918919, "grad_norm": 0.0014713449636474252, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8301 }, { "epoch": 28.00047297297297, "grad_norm": 0.0009736586362123489, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8302 }, { "epoch": 28.000506756756756, "grad_norm": 0.0014907974982634187, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8303 }, { "epoch": 28.00054054054054, "grad_norm": 0.6364137530326843, "learning_rate": 7.8125e-07, "loss": 0.0113, "step": 8304 }, { "epoch": 28.000574324324326, "grad_norm": 0.00930754654109478, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8305 }, { "epoch": 28.000608108108107, "grad_norm": 0.30892691016197205, "learning_rate": 7.8125e-07, "loss": 0.0116, "step": 8306 }, { "epoch": 28.000641891891892, "grad_norm": 0.006034085061401129, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8307 }, { "epoch": 28.000675675675677, "grad_norm": 0.020178381353616714, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8308 }, { "epoch": 28.00070945945946, "grad_norm": 1.562537670135498, "learning_rate": 7.8125e-07, "loss": 0.0144, "step": 8309 }, { "epoch": 28.000743243243242, "grad_norm": 0.1452075093984604, "learning_rate": 7.8125e-07, "loss": 0.0054, "step": 8310 }, { "epoch": 28.000777027027027, "grad_norm": 5.531789779663086, "learning_rate": 7.8125e-07, "loss": 0.0486, "step": 8311 }, { "epoch": 28.000810810810812, "grad_norm": 0.0009424805175513029, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8312 }, { "epoch": 28.000844594594593, "grad_norm": 0.002348676323890686, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8313 }, { "epoch": 28.000878378378378, "grad_norm": 1.3926153182983398, "learning_rate": 7.8125e-07, "loss": 0.0543, "step": 8314 }, { "epoch": 28.000912162162162, "grad_norm": 0.020972840487957, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8315 }, { "epoch": 28.000945945945947, "grad_norm": 112.00663757324219, "learning_rate": 7.8125e-07, "loss": 0.3094, "step": 8316 }, { "epoch": 28.00097972972973, "grad_norm": 0.021291060373187065, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8317 }, { "epoch": 28.001013513513513, "grad_norm": 0.01672186888754368, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8318 }, { "epoch": 28.001047297297298, "grad_norm": 0.004967955406755209, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8319 }, { "epoch": 28.001081081081082, "grad_norm": 0.803968608379364, "learning_rate": 7.8125e-07, "loss": 0.0059, "step": 8320 }, { "epoch": 28.001114864864864, "grad_norm": 0.007178883533924818, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8321 }, { "epoch": 28.00114864864865, "grad_norm": 0.8400903940200806, "learning_rate": 7.8125e-07, "loss": 0.0208, "step": 8322 }, { "epoch": 28.001182432432433, "grad_norm": 5.707606792449951, "learning_rate": 7.8125e-07, "loss": 0.4552, "step": 8323 }, { "epoch": 28.001216216216218, "grad_norm": 0.001559488708153367, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8324 }, { "epoch": 28.00125, "grad_norm": 1.1703823804855347, "learning_rate": 7.8125e-07, "loss": 0.035, "step": 8325 }, { "epoch": 28.001283783783784, "grad_norm": 0.007374299690127373, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8326 }, { "epoch": 28.00131756756757, "grad_norm": 0.11057895421981812, "learning_rate": 7.8125e-07, "loss": 0.0015, "step": 8327 }, { "epoch": 28.001351351351353, "grad_norm": 0.5000165700912476, "learning_rate": 7.8125e-07, "loss": 0.0154, "step": 8328 }, { "epoch": 28.001385135135134, "grad_norm": 0.11118942499160767, "learning_rate": 7.8125e-07, "loss": 0.0037, "step": 8329 }, { "epoch": 28.00141891891892, "grad_norm": 0.0012577160960063338, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8330 }, { "epoch": 28.001452702702704, "grad_norm": 0.015439619310200214, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8331 }, { "epoch": 28.001486486486485, "grad_norm": 0.004029779229313135, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8332 }, { "epoch": 28.00152027027027, "grad_norm": 8.917696952819824, "learning_rate": 7.8125e-07, "loss": 0.0706, "step": 8333 }, { "epoch": 28.001554054054054, "grad_norm": 1.3735876083374023, "learning_rate": 7.8125e-07, "loss": 0.017, "step": 8334 }, { "epoch": 28.00158783783784, "grad_norm": 0.007868713699281216, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8335 }, { "epoch": 28.00162162162162, "grad_norm": 0.036771971732378006, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8336 }, { "epoch": 28.001655405405405, "grad_norm": 0.0007201190455816686, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8337 }, { "epoch": 28.00168918918919, "grad_norm": 0.0030211356934159994, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8338 }, { "epoch": 28.001722972972974, "grad_norm": 3.644386053085327, "learning_rate": 7.8125e-07, "loss": 0.3512, "step": 8339 }, { "epoch": 28.001756756756755, "grad_norm": 0.0034906009677797556, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8340 }, { "epoch": 28.00179054054054, "grad_norm": 0.005320399068295956, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8341 }, { "epoch": 28.001824324324325, "grad_norm": 0.0010974102187901735, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8342 }, { "epoch": 28.00185810810811, "grad_norm": 35.983924865722656, "learning_rate": 7.8125e-07, "loss": 0.0406, "step": 8343 }, { "epoch": 28.00189189189189, "grad_norm": 0.015949079766869545, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8344 }, { "epoch": 28.001925675675675, "grad_norm": 0.002481668023392558, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8345 }, { "epoch": 28.00195945945946, "grad_norm": 0.002843540394678712, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8346 }, { "epoch": 28.001993243243245, "grad_norm": 33.48015213012695, "learning_rate": 7.8125e-07, "loss": 0.6172, "step": 8347 }, { "epoch": 28.002027027027026, "grad_norm": 0.019276022911071777, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 8348 }, { "epoch": 28.00206081081081, "grad_norm": 66.87788391113281, "learning_rate": 7.8125e-07, "loss": 0.2212, "step": 8349 }, { "epoch": 28.002094594594595, "grad_norm": 0.003704481292515993, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8350 }, { "epoch": 28.00212837837838, "grad_norm": 18.796367645263672, "learning_rate": 7.8125e-07, "loss": 0.0387, "step": 8351 }, { "epoch": 28.00216216216216, "grad_norm": 0.003501342609524727, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8352 }, { "epoch": 28.002195945945946, "grad_norm": 0.0033888209145516157, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8353 }, { "epoch": 28.00222972972973, "grad_norm": 0.22518157958984375, "learning_rate": 7.8125e-07, "loss": 0.0052, "step": 8354 }, { "epoch": 28.002263513513512, "grad_norm": 0.006139649078249931, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8355 }, { "epoch": 28.002297297297297, "grad_norm": 0.007255043368786573, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8356 }, { "epoch": 28.00233108108108, "grad_norm": 0.002338584279641509, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8357 }, { "epoch": 28.002364864864866, "grad_norm": 0.01198616623878479, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8358 }, { "epoch": 28.002398648648647, "grad_norm": 0.00472590746358037, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8359 }, { "epoch": 28.002432432432432, "grad_norm": 0.0009514699340797961, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8360 }, { "epoch": 28.002466216216217, "grad_norm": 2.7826218605041504, "learning_rate": 7.8125e-07, "loss": 0.0231, "step": 8361 }, { "epoch": 28.0025, "grad_norm": 0.0011684155324473977, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8362 }, { "epoch": 28.002533783783782, "grad_norm": 0.039587412029504776, "learning_rate": 7.8125e-07, "loss": 0.0008, "step": 8363 }, { "epoch": 28.002567567567567, "grad_norm": 0.04260649532079697, "learning_rate": 7.8125e-07, "loss": 0.0005, "step": 8364 }, { "epoch": 28.002601351351352, "grad_norm": 0.010495368391275406, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8365 }, { "epoch": 28.002635135135137, "grad_norm": 0.0008896237704902887, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8366 }, { "epoch": 28.002668918918918, "grad_norm": 0.0024645971134305, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8367 }, { "epoch": 28.002702702702702, "grad_norm": 2.9299862384796143, "learning_rate": 7.8125e-07, "loss": 0.0227, "step": 8368 }, { "epoch": 28.002736486486487, "grad_norm": 0.005367951933294535, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8369 }, { "epoch": 28.002770270270272, "grad_norm": 0.004914774559438229, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8370 }, { "epoch": 28.002804054054053, "grad_norm": 0.12719778716564178, "learning_rate": 7.8125e-07, "loss": 0.0047, "step": 8371 }, { "epoch": 28.002837837837838, "grad_norm": 0.0008868110016919672, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8372 }, { "epoch": 28.002871621621622, "grad_norm": 0.0018220681231468916, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8373 }, { "epoch": 28.002905405405407, "grad_norm": 0.003918208181858063, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8374 }, { "epoch": 28.00293918918919, "grad_norm": 0.0012395937228575349, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8375 }, { "epoch": 28.002972972972973, "grad_norm": 0.01723792776465416, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8376 }, { "epoch": 28.003006756756758, "grad_norm": 0.17105184495449066, "learning_rate": 7.8125e-07, "loss": 0.0065, "step": 8377 }, { "epoch": 28.00304054054054, "grad_norm": 8.073269844055176, "learning_rate": 7.8125e-07, "loss": 0.7315, "step": 8378 }, { "epoch": 28.003074324324324, "grad_norm": 0.004184032790362835, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8379 }, { "epoch": 28.00310810810811, "grad_norm": 0.17878566682338715, "learning_rate": 7.8125e-07, "loss": 0.0006, "step": 8380 }, { "epoch": 28.003141891891893, "grad_norm": 37.07905960083008, "learning_rate": 7.8125e-07, "loss": 0.0972, "step": 8381 }, { "epoch": 28.003175675675674, "grad_norm": 0.13942170143127441, "learning_rate": 7.8125e-07, "loss": 0.0051, "step": 8382 }, { "epoch": 28.00320945945946, "grad_norm": 0.1828567385673523, "learning_rate": 7.8125e-07, "loss": 0.0056, "step": 8383 }, { "epoch": 28.003243243243244, "grad_norm": 0.12356201559305191, "learning_rate": 7.8125e-07, "loss": 0.0046, "step": 8384 }, { "epoch": 28.00327702702703, "grad_norm": 0.0008429958252236247, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8385 }, { "epoch": 28.00331081081081, "grad_norm": 0.0011050804750993848, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8386 }, { "epoch": 28.003344594594594, "grad_norm": 0.0037372438237071037, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8387 }, { "epoch": 28.00337837837838, "grad_norm": 0.004739843308925629, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8388 }, { "epoch": 28.003412162162164, "grad_norm": 0.0006889391806907952, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8389 }, { "epoch": 28.003445945945945, "grad_norm": 0.009284751489758492, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8390 }, { "epoch": 28.00347972972973, "grad_norm": 0.0013536842307075858, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8391 }, { "epoch": 28.003513513513514, "grad_norm": 31.236299514770508, "learning_rate": 7.8125e-07, "loss": 0.1755, "step": 8392 }, { "epoch": 28.0035472972973, "grad_norm": 0.014269347302615643, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8393 }, { "epoch": 28.00358108108108, "grad_norm": 0.003176901489496231, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8394 }, { "epoch": 28.003614864864865, "grad_norm": 0.0601007305085659, "learning_rate": 7.8125e-07, "loss": 0.0018, "step": 8395 }, { "epoch": 28.00364864864865, "grad_norm": 0.08803780376911163, "learning_rate": 7.8125e-07, "loss": 0.0025, "step": 8396 }, { "epoch": 28.00368243243243, "grad_norm": 0.2897071838378906, "learning_rate": 7.8125e-07, "loss": 0.0019, "step": 8397 }, { "epoch": 28.003716216216215, "grad_norm": 0.0031322648283094168, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8398 }, { "epoch": 28.00375, "grad_norm": 8.6005220413208, "learning_rate": 7.8125e-07, "loss": 0.2312, "step": 8399 }, { "epoch": 28.003783783783785, "grad_norm": 0.0005882697878405452, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8400 }, { "epoch": 28.003817567567566, "grad_norm": 0.0010388526134192944, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8401 }, { "epoch": 28.00385135135135, "grad_norm": 0.3642047643661499, "learning_rate": 7.8125e-07, "loss": 0.0034, "step": 8402 }, { "epoch": 28.003885135135135, "grad_norm": 0.2518408000469208, "learning_rate": 7.8125e-07, "loss": 0.0062, "step": 8403 }, { "epoch": 28.00391891891892, "grad_norm": 0.017129629850387573, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8404 }, { "epoch": 28.0039527027027, "grad_norm": 3.841083288192749, "learning_rate": 7.8125e-07, "loss": 0.4021, "step": 8405 }, { "epoch": 28.003986486486486, "grad_norm": 0.05229582637548447, "learning_rate": 7.8125e-07, "loss": 0.0014, "step": 8406 }, { "epoch": 28.00402027027027, "grad_norm": 0.005722260568290949, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8407 }, { "epoch": 28.004054054054055, "grad_norm": 1.2046135663986206, "learning_rate": 7.8125e-07, "loss": 0.02, "step": 8408 }, { "epoch": 28.004087837837837, "grad_norm": 0.002466722624376416, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8409 }, { "epoch": 28.00412162162162, "grad_norm": 0.0008004486444406211, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8410 }, { "epoch": 28.004155405405406, "grad_norm": 0.03588251397013664, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 8411 }, { "epoch": 28.00418918918919, "grad_norm": 0.0025408354122191668, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8412 }, { "epoch": 28.004222972972972, "grad_norm": 0.006476054899394512, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8413 }, { "epoch": 28.004256756756757, "grad_norm": 0.3126285970211029, "learning_rate": 7.8125e-07, "loss": 0.0051, "step": 8414 }, { "epoch": 28.00429054054054, "grad_norm": 0.14863713085651398, "learning_rate": 7.8125e-07, "loss": 0.005, "step": 8415 }, { "epoch": 28.004324324324326, "grad_norm": 0.008848407305777073, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8416 }, { "epoch": 28.004358108108107, "grad_norm": 0.0009634565794840455, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8417 }, { "epoch": 28.004391891891892, "grad_norm": 0.2931361496448517, "learning_rate": 7.8125e-07, "loss": 0.0022, "step": 8418 }, { "epoch": 28.004425675675677, "grad_norm": 0.003953043837100267, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8419 }, { "epoch": 28.004459459459458, "grad_norm": 0.0015087543288245797, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8420 }, { "epoch": 28.004493243243243, "grad_norm": 0.0021361690014600754, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8421 }, { "epoch": 28.004527027027027, "grad_norm": 0.0007487615803256631, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8422 }, { "epoch": 28.004560810810812, "grad_norm": 0.3039061725139618, "learning_rate": 7.8125e-07, "loss": 0.008, "step": 8423 }, { "epoch": 28.004594594594593, "grad_norm": 0.0033462492283433676, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8424 }, { "epoch": 28.004628378378378, "grad_norm": 0.0045998976565897465, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8425 }, { "epoch": 28.004662162162163, "grad_norm": 0.005731277167797089, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8426 }, { "epoch": 28.004695945945947, "grad_norm": 0.0032961645629256964, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8427 }, { "epoch": 28.00472972972973, "grad_norm": 0.021839657798409462, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8428 }, { "epoch": 28.004763513513513, "grad_norm": 0.14029793441295624, "learning_rate": 7.8125e-07, "loss": 0.0053, "step": 8429 }, { "epoch": 28.004797297297298, "grad_norm": 0.11605380475521088, "learning_rate": 7.8125e-07, "loss": 0.0043, "step": 8430 }, { "epoch": 28.004831081081083, "grad_norm": 0.005651853047311306, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8431 }, { "epoch": 28.004864864864864, "grad_norm": 0.0006479778094217181, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8432 }, { "epoch": 28.00489864864865, "grad_norm": 0.21877631545066833, "learning_rate": 7.8125e-07, "loss": 0.0048, "step": 8433 }, { "epoch": 28.004932432432433, "grad_norm": 0.0012241986114531755, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8434 }, { "epoch": 28.004966216216218, "grad_norm": 0.0009930564556270838, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8435 }, { "epoch": 28.005, "grad_norm": 0.002239328809082508, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8436 }, { "epoch": 28.005033783783784, "grad_norm": 0.002317192265763879, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8437 }, { "epoch": 28.00506756756757, "grad_norm": 0.00064865592867136, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8438 }, { "epoch": 28.00510135135135, "grad_norm": 16.36377716064453, "learning_rate": 7.8125e-07, "loss": 0.139, "step": 8439 }, { "epoch": 28.005135135135134, "grad_norm": 0.03031117282807827, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8440 }, { "epoch": 28.00516891891892, "grad_norm": 0.00152117433026433, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8441 }, { "epoch": 28.005202702702704, "grad_norm": 0.002699910430237651, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8442 }, { "epoch": 28.005236486486485, "grad_norm": 0.37474068999290466, "learning_rate": 7.8125e-07, "loss": 0.0115, "step": 8443 }, { "epoch": 28.00527027027027, "grad_norm": 0.0016622529365122318, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8444 }, { "epoch": 28.005304054054054, "grad_norm": 0.13509710133075714, "learning_rate": 7.8125e-07, "loss": 0.0046, "step": 8445 }, { "epoch": 28.00533783783784, "grad_norm": 0.7390922904014587, "learning_rate": 7.8125e-07, "loss": 0.0064, "step": 8446 }, { "epoch": 28.00537162162162, "grad_norm": 0.0009560480248183012, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8447 }, { "epoch": 28.005405405405405, "grad_norm": 0.005923242308199406, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8448 }, { "epoch": 28.00543918918919, "grad_norm": 0.6308783888816833, "learning_rate": 7.8125e-07, "loss": 0.013, "step": 8449 }, { "epoch": 28.005472972972974, "grad_norm": 0.16837771236896515, "learning_rate": 7.8125e-07, "loss": 0.0036, "step": 8450 }, { "epoch": 28.005506756756755, "grad_norm": 0.422556608915329, "learning_rate": 7.8125e-07, "loss": 0.0031, "step": 8451 }, { "epoch": 28.00554054054054, "grad_norm": 9.451645851135254, "learning_rate": 7.8125e-07, "loss": 0.4432, "step": 8452 }, { "epoch": 28.005574324324325, "grad_norm": 0.05296691134572029, "learning_rate": 7.8125e-07, "loss": 0.0005, "step": 8453 }, { "epoch": 28.00560810810811, "grad_norm": 0.020122317597270012, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8454 }, { "epoch": 28.00564189189189, "grad_norm": 0.06580137461423874, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 8455 }, { "epoch": 28.005675675675676, "grad_norm": 0.0007870321278460324, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8456 }, { "epoch": 28.00570945945946, "grad_norm": 0.05264138802886009, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8457 }, { "epoch": 28.005743243243245, "grad_norm": 0.2845778465270996, "learning_rate": 7.8125e-07, "loss": 0.0084, "step": 8458 }, { "epoch": 28.005777027027026, "grad_norm": 0.30209028720855713, "learning_rate": 7.8125e-07, "loss": 0.0071, "step": 8459 }, { "epoch": 28.00581081081081, "grad_norm": 0.004329037386924028, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8460 }, { "epoch": 28.005844594594596, "grad_norm": 0.0012201661011204123, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8461 }, { "epoch": 28.005878378378377, "grad_norm": 0.0013849640963599086, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8462 }, { "epoch": 28.00591216216216, "grad_norm": 7.906026840209961, "learning_rate": 7.8125e-07, "loss": 0.4169, "step": 8463 }, { "epoch": 28.005945945945946, "grad_norm": 3.7098188400268555, "learning_rate": 7.8125e-07, "loss": 0.1312, "step": 8464 }, { "epoch": 28.00597972972973, "grad_norm": 0.002068914473056793, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8465 }, { "epoch": 28.006013513513512, "grad_norm": 0.008832469582557678, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8466 }, { "epoch": 28.006047297297297, "grad_norm": 0.0011728698154911399, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8467 }, { "epoch": 28.00608108108108, "grad_norm": 0.0024661023635417223, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8468 }, { "epoch": 28.006114864864866, "grad_norm": 50.95520782470703, "learning_rate": 7.8125e-07, "loss": 0.6206, "step": 8469 }, { "epoch": 28.006148648648647, "grad_norm": 0.002830022480338812, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8470 }, { "epoch": 28.006182432432432, "grad_norm": 0.010518433526158333, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8471 }, { "epoch": 28.006216216216217, "grad_norm": 0.0010454300791025162, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8472 }, { "epoch": 28.00625, "grad_norm": 0.009100688621401787, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8473 }, { "epoch": 28.006283783783783, "grad_norm": 0.00855301320552826, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8474 }, { "epoch": 28.006317567567567, "grad_norm": 0.002083134138956666, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8475 }, { "epoch": 28.006351351351352, "grad_norm": 0.14180950820446014, "learning_rate": 7.8125e-07, "loss": 0.0052, "step": 8476 }, { "epoch": 28.006385135135137, "grad_norm": 0.0017625144682824612, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8477 }, { "epoch": 28.006418918918918, "grad_norm": 0.0017559006810188293, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8478 }, { "epoch": 28.006452702702703, "grad_norm": 0.007938199676573277, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8479 }, { "epoch": 28.006486486486487, "grad_norm": 0.007734097074717283, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8480 }, { "epoch": 28.006520270270272, "grad_norm": 0.20083223283290863, "learning_rate": 7.8125e-07, "loss": 0.0055, "step": 8481 }, { "epoch": 28.006554054054053, "grad_norm": 0.1347382813692093, "learning_rate": 7.8125e-07, "loss": 0.0049, "step": 8482 }, { "epoch": 28.006587837837838, "grad_norm": 0.0016036276938393712, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8483 }, { "epoch": 28.006621621621623, "grad_norm": 0.004053843207657337, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8484 }, { "epoch": 28.006655405405404, "grad_norm": 41.77423858642578, "learning_rate": 7.8125e-07, "loss": 0.4359, "step": 8485 }, { "epoch": 28.00668918918919, "grad_norm": 3.2557437419891357, "learning_rate": 7.8125e-07, "loss": 0.4031, "step": 8486 }, { "epoch": 28.006722972972973, "grad_norm": 0.0029552981723099947, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8487 }, { "epoch": 28.006756756756758, "grad_norm": 5.646690368652344, "learning_rate": 7.8125e-07, "loss": 0.2821, "step": 8488 }, { "epoch": 28.00679054054054, "grad_norm": 0.001947529730387032, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8489 }, { "epoch": 28.006824324324324, "grad_norm": 0.019676262512803078, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8490 }, { "epoch": 28.00685810810811, "grad_norm": 0.006912867538630962, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8491 }, { "epoch": 28.006891891891893, "grad_norm": 0.0012677958002313972, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8492 }, { "epoch": 28.006925675675674, "grad_norm": 0.0014428893337026238, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8493 }, { "epoch": 28.00695945945946, "grad_norm": 0.014053263701498508, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8494 }, { "epoch": 28.006993243243244, "grad_norm": 0.057883162051439285, "learning_rate": 7.8125e-07, "loss": 0.0015, "step": 8495 }, { "epoch": 28.00702702702703, "grad_norm": 0.006877507083117962, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8496 }, { "epoch": 28.00706081081081, "grad_norm": 0.002646523294970393, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8497 }, { "epoch": 28.007094594594594, "grad_norm": 0.007805699482560158, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8498 }, { "epoch": 28.00712837837838, "grad_norm": 42.79267501831055, "learning_rate": 7.8125e-07, "loss": 0.231, "step": 8499 }, { "epoch": 28.007162162162164, "grad_norm": 0.007402139250189066, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8500 }, { "epoch": 28.007195945945945, "grad_norm": 0.1536017209291458, "learning_rate": 7.8125e-07, "loss": 0.0057, "step": 8501 }, { "epoch": 28.00722972972973, "grad_norm": 0.0018579249735921621, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8502 }, { "epoch": 28.007263513513514, "grad_norm": 0.002692699432373047, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8503 }, { "epoch": 28.007297297297296, "grad_norm": 0.00920582003891468, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8504 }, { "epoch": 28.00733108108108, "grad_norm": 0.0018905408214777708, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8505 }, { "epoch": 28.007364864864865, "grad_norm": 0.08805083483457565, "learning_rate": 7.8125e-07, "loss": 0.0006, "step": 8506 }, { "epoch": 28.00739864864865, "grad_norm": 0.11514333635568619, "learning_rate": 7.8125e-07, "loss": 0.004, "step": 8507 }, { "epoch": 28.00743243243243, "grad_norm": 4.357701778411865, "learning_rate": 7.8125e-07, "loss": 0.1133, "step": 8508 }, { "epoch": 28.007466216216216, "grad_norm": 0.12013384699821472, "learning_rate": 7.8125e-07, "loss": 0.0006, "step": 8509 }, { "epoch": 28.0075, "grad_norm": 0.05552184581756592, "learning_rate": 7.8125e-07, "loss": 0.0005, "step": 8510 }, { "epoch": 28.007533783783785, "grad_norm": 0.18156182765960693, "learning_rate": 7.8125e-07, "loss": 0.0059, "step": 8511 }, { "epoch": 28.007567567567566, "grad_norm": 0.002296820282936096, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8512 }, { "epoch": 28.00760135135135, "grad_norm": 0.012538573704659939, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8513 }, { "epoch": 28.007635135135136, "grad_norm": 0.010027393698692322, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8514 }, { "epoch": 28.00766891891892, "grad_norm": 47.496559143066406, "learning_rate": 7.8125e-07, "loss": 1.541, "step": 8515 }, { "epoch": 28.0077027027027, "grad_norm": 0.7877855896949768, "learning_rate": 7.8125e-07, "loss": 0.0063, "step": 8516 }, { "epoch": 28.007736486486486, "grad_norm": 0.0006316508515737951, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8517 }, { "epoch": 28.00777027027027, "grad_norm": 0.004330527037382126, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8518 }, { "epoch": 28.007804054054056, "grad_norm": 0.014455757103860378, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8519 }, { "epoch": 28.007837837837837, "grad_norm": 0.0011006243294104934, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8520 }, { "epoch": 28.00787162162162, "grad_norm": 1.4128421545028687, "learning_rate": 7.8125e-07, "loss": 0.0024, "step": 8521 }, { "epoch": 28.007905405405406, "grad_norm": 0.002007666975259781, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8522 }, { "epoch": 28.00793918918919, "grad_norm": 0.01008178573101759, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8523 }, { "epoch": 28.007972972972972, "grad_norm": 0.006837924942374229, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8524 }, { "epoch": 28.008006756756757, "grad_norm": 0.04710254445672035, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 8525 }, { "epoch": 28.00804054054054, "grad_norm": 10.472094535827637, "learning_rate": 7.8125e-07, "loss": 0.4836, "step": 8526 }, { "epoch": 28.008074324324323, "grad_norm": 0.1543464958667755, "learning_rate": 7.8125e-07, "loss": 0.0005, "step": 8527 }, { "epoch": 28.008108108108107, "grad_norm": 0.011719238944351673, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8528 }, { "epoch": 28.008141891891892, "grad_norm": 1.4529646635055542, "learning_rate": 7.8125e-07, "loss": 0.0133, "step": 8529 }, { "epoch": 28.008175675675677, "grad_norm": 0.14416027069091797, "learning_rate": 7.8125e-07, "loss": 0.0052, "step": 8530 }, { "epoch": 28.008209459459458, "grad_norm": 0.014806671999394894, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8531 }, { "epoch": 28.008243243243243, "grad_norm": 0.11054547876119614, "learning_rate": 7.8125e-07, "loss": 0.0005, "step": 8532 }, { "epoch": 28.008277027027027, "grad_norm": 0.007833501324057579, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8533 }, { "epoch": 28.008310810810812, "grad_norm": 0.008547317236661911, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8534 }, { "epoch": 28.008344594594593, "grad_norm": 0.005485148634761572, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8535 }, { "epoch": 28.008378378378378, "grad_norm": 47.15354919433594, "learning_rate": 7.8125e-07, "loss": 0.1501, "step": 8536 }, { "epoch": 28.008412162162163, "grad_norm": 11.002585411071777, "learning_rate": 7.8125e-07, "loss": 0.0723, "step": 8537 }, { "epoch": 28.008445945945947, "grad_norm": 0.0025518869515508413, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8538 }, { "epoch": 28.00847972972973, "grad_norm": 0.0033809475135058165, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8539 }, { "epoch": 28.008513513513513, "grad_norm": 0.011767112649977207, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8540 }, { "epoch": 28.008547297297298, "grad_norm": 3.310579776763916, "learning_rate": 7.8125e-07, "loss": 0.4127, "step": 8541 }, { "epoch": 28.008581081081083, "grad_norm": 0.004554498940706253, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8542 }, { "epoch": 28.008614864864864, "grad_norm": 0.004677222575992346, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8543 }, { "epoch": 28.00864864864865, "grad_norm": 0.0008719081524759531, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8544 }, { "epoch": 28.008682432432433, "grad_norm": 0.0020953945349901915, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8545 }, { "epoch": 28.008716216216218, "grad_norm": 0.12126360088586807, "learning_rate": 7.8125e-07, "loss": 0.0044, "step": 8546 }, { "epoch": 28.00875, "grad_norm": 0.0069239321164786816, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8547 }, { "epoch": 28.008783783783784, "grad_norm": 0.02670557238161564, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8548 }, { "epoch": 28.00881756756757, "grad_norm": 0.0968591570854187, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 8549 }, { "epoch": 28.00885135135135, "grad_norm": 0.008858502842485905, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8550 }, { "epoch": 28.008885135135134, "grad_norm": 6.296270847320557, "learning_rate": 7.8125e-07, "loss": 0.1959, "step": 8551 }, { "epoch": 28.00891891891892, "grad_norm": 0.02154659293591976, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8552 }, { "epoch": 28.008952702702704, "grad_norm": 0.0014758476754650474, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8553 }, { "epoch": 28.008986486486485, "grad_norm": 0.0034745442681014538, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8554 }, { "epoch": 28.00902027027027, "grad_norm": 0.0055147637613117695, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8555 }, { "epoch": 28.009054054054054, "grad_norm": 0.4606894850730896, "learning_rate": 7.8125e-07, "loss": 0.0036, "step": 8556 }, { "epoch": 28.00908783783784, "grad_norm": 0.012756820768117905, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8557 }, { "epoch": 28.00912162162162, "grad_norm": 0.0022835296113044024, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8558 }, { "epoch": 28.009155405405405, "grad_norm": 16.121292114257812, "learning_rate": 7.8125e-07, "loss": 0.5134, "step": 8559 }, { "epoch": 28.00918918918919, "grad_norm": 0.0734255388379097, "learning_rate": 7.8125e-07, "loss": 0.0005, "step": 8560 }, { "epoch": 28.009222972972974, "grad_norm": 0.004431241191923618, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8561 }, { "epoch": 28.009256756756756, "grad_norm": 0.006020131520926952, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8562 }, { "epoch": 28.00929054054054, "grad_norm": 0.5033639669418335, "learning_rate": 7.8125e-07, "loss": 0.0093, "step": 8563 }, { "epoch": 28.009324324324325, "grad_norm": 0.00553864473477006, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8564 }, { "epoch": 28.00935810810811, "grad_norm": 0.0019506963435560465, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8565 }, { "epoch": 28.00939189189189, "grad_norm": 1.6931679248809814, "learning_rate": 7.8125e-07, "loss": 0.0441, "step": 8566 }, { "epoch": 28.009425675675676, "grad_norm": 0.004064077045768499, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8567 }, { "epoch": 28.00945945945946, "grad_norm": 0.001297761220484972, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8568 }, { "epoch": 28.00949324324324, "grad_norm": 0.005806118715554476, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8569 }, { "epoch": 28.009527027027026, "grad_norm": 0.018981527537107468, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8570 }, { "epoch": 28.00956081081081, "grad_norm": 0.20995180308818817, "learning_rate": 7.8125e-07, "loss": 0.0036, "step": 8571 }, { "epoch": 28.009594594594596, "grad_norm": 0.005014164373278618, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8572 }, { "epoch": 28.009628378378377, "grad_norm": 3.301248788833618, "learning_rate": 7.8125e-07, "loss": 0.0352, "step": 8573 }, { "epoch": 28.00966216216216, "grad_norm": 1.4566088914871216, "learning_rate": 7.8125e-07, "loss": 0.0077, "step": 8574 }, { "epoch": 28.009695945945946, "grad_norm": 7.382636070251465, "learning_rate": 7.8125e-07, "loss": 0.453, "step": 8575 }, { "epoch": 28.00972972972973, "grad_norm": 0.004735040944069624, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8576 }, { "epoch": 28.009763513513512, "grad_norm": 0.004393813665956259, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8577 }, { "epoch": 28.009797297297297, "grad_norm": 0.001895047607831657, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8578 }, { "epoch": 28.00983108108108, "grad_norm": 0.0016394297126680613, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8579 }, { "epoch": 28.009864864864866, "grad_norm": 0.13433682918548584, "learning_rate": 7.8125e-07, "loss": 0.0048, "step": 8580 }, { "epoch": 28.009898648648647, "grad_norm": 0.0023220465518534184, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8581 }, { "epoch": 28.009932432432432, "grad_norm": 4.165064334869385, "learning_rate": 7.8125e-07, "loss": 0.4075, "step": 8582 }, { "epoch": 28.009966216216217, "grad_norm": 0.40906694531440735, "learning_rate": 7.8125e-07, "loss": 0.002, "step": 8583 }, { "epoch": 28.01, "grad_norm": 0.06830812990665436, "learning_rate": 7.8125e-07, "loss": 0.0011, "step": 8584 }, { "epoch": 28.01, "eval_accuracy": 0.8949919224555735, "eval_loss": 0.6073256134986877, "eval_runtime": 32.997, "eval_samples_per_second": 18.759, "eval_steps_per_second": 2.364, "step": 8584 }, { "epoch": 29.000033783783785, "grad_norm": 0.7414525747299194, "learning_rate": 7.8125e-07, "loss": 0.002, "step": 8585 }, { "epoch": 29.000067567567566, "grad_norm": 7.367420196533203, "learning_rate": 7.8125e-07, "loss": 0.0113, "step": 8586 }, { "epoch": 29.00010135135135, "grad_norm": 0.0011637526331469417, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8587 }, { "epoch": 29.000135135135135, "grad_norm": 0.0011065811850130558, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8588 }, { "epoch": 29.00016891891892, "grad_norm": 0.0013437076704576612, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8589 }, { "epoch": 29.0002027027027, "grad_norm": 0.0030754513572901487, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8590 }, { "epoch": 29.000236486486486, "grad_norm": 0.011387808248400688, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8591 }, { "epoch": 29.00027027027027, "grad_norm": 0.009718391112983227, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8592 }, { "epoch": 29.000304054054055, "grad_norm": 0.0008658950682729483, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8593 }, { "epoch": 29.000337837837836, "grad_norm": 0.0016110740834847093, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8594 }, { "epoch": 29.00037162162162, "grad_norm": 0.0011864467523992062, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8595 }, { "epoch": 29.000405405405406, "grad_norm": 0.008441155776381493, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8596 }, { "epoch": 29.00043918918919, "grad_norm": 0.128867506980896, "learning_rate": 7.8125e-07, "loss": 0.0031, "step": 8597 }, { "epoch": 29.00047297297297, "grad_norm": 0.005305056925863028, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8598 }, { "epoch": 29.000506756756756, "grad_norm": 0.2465343177318573, "learning_rate": 7.8125e-07, "loss": 0.0094, "step": 8599 }, { "epoch": 29.00054054054054, "grad_norm": 0.04202989861369133, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8600 }, { "epoch": 29.000574324324326, "grad_norm": 0.0073852138593792915, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8601 }, { "epoch": 29.000608108108107, "grad_norm": 0.005794881843030453, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8602 }, { "epoch": 29.000641891891892, "grad_norm": 0.19827723503112793, "learning_rate": 7.8125e-07, "loss": 0.0037, "step": 8603 }, { "epoch": 29.000675675675677, "grad_norm": 3.6245481967926025, "learning_rate": 7.8125e-07, "loss": 0.0065, "step": 8604 }, { "epoch": 29.00070945945946, "grad_norm": 0.14362965524196625, "learning_rate": 7.8125e-07, "loss": 0.0013, "step": 8605 }, { "epoch": 29.000743243243242, "grad_norm": 0.0016229806933552027, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8606 }, { "epoch": 29.000777027027027, "grad_norm": 0.1777687817811966, "learning_rate": 7.8125e-07, "loss": 0.0057, "step": 8607 }, { "epoch": 29.000810810810812, "grad_norm": 45.144718170166016, "learning_rate": 7.8125e-07, "loss": 0.0691, "step": 8608 }, { "epoch": 29.000844594594593, "grad_norm": 2.8782503604888916, "learning_rate": 7.8125e-07, "loss": 0.0104, "step": 8609 }, { "epoch": 29.000878378378378, "grad_norm": 0.0016607738798484206, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8610 }, { "epoch": 29.000912162162162, "grad_norm": 0.129135400056839, "learning_rate": 7.8125e-07, "loss": 0.0048, "step": 8611 }, { "epoch": 29.000945945945947, "grad_norm": 0.006322484463453293, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8612 }, { "epoch": 29.00097972972973, "grad_norm": 6.521575450897217, "learning_rate": 7.8125e-07, "loss": 0.2223, "step": 8613 }, { "epoch": 29.001013513513513, "grad_norm": 0.04199511185288429, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 8614 }, { "epoch": 29.001047297297298, "grad_norm": 0.0009624658268876374, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8615 }, { "epoch": 29.001081081081082, "grad_norm": 0.008133566938340664, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8616 }, { "epoch": 29.001114864864864, "grad_norm": 0.0342969112098217, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 8617 }, { "epoch": 29.00114864864865, "grad_norm": 0.0011637702118605375, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8618 }, { "epoch": 29.001182432432433, "grad_norm": 0.27218636870384216, "learning_rate": 7.8125e-07, "loss": 0.0076, "step": 8619 }, { "epoch": 29.001216216216218, "grad_norm": 2.5812647342681885, "learning_rate": 7.8125e-07, "loss": 0.0049, "step": 8620 }, { "epoch": 29.00125, "grad_norm": 0.003739970037713647, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8621 }, { "epoch": 29.001283783783784, "grad_norm": 0.0018880268326029181, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8622 }, { "epoch": 29.00131756756757, "grad_norm": 0.6210820078849792, "learning_rate": 7.8125e-07, "loss": 0.0016, "step": 8623 }, { "epoch": 29.001351351351353, "grad_norm": 0.0025292281061410904, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8624 }, { "epoch": 29.001385135135134, "grad_norm": 0.04978496953845024, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 8625 }, { "epoch": 29.00141891891892, "grad_norm": 0.0011842080857604742, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8626 }, { "epoch": 29.001452702702704, "grad_norm": 11.73704719543457, "learning_rate": 7.8125e-07, "loss": 0.2245, "step": 8627 }, { "epoch": 29.001486486486485, "grad_norm": 0.002490701386705041, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8628 }, { "epoch": 29.00152027027027, "grad_norm": 0.005202935077250004, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8629 }, { "epoch": 29.001554054054054, "grad_norm": 0.04242812842130661, "learning_rate": 7.8125e-07, "loss": 0.0006, "step": 8630 }, { "epoch": 29.00158783783784, "grad_norm": 0.014181150123476982, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8631 }, { "epoch": 29.00162162162162, "grad_norm": 0.0023885027039796114, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8632 }, { "epoch": 29.001655405405405, "grad_norm": 0.0008348795818164945, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8633 }, { "epoch": 29.00168918918919, "grad_norm": 0.27168354392051697, "learning_rate": 7.8125e-07, "loss": 0.0087, "step": 8634 }, { "epoch": 29.001722972972974, "grad_norm": 4.077956199645996, "learning_rate": 7.8125e-07, "loss": 0.0391, "step": 8635 }, { "epoch": 29.001756756756755, "grad_norm": 0.07971204817295074, "learning_rate": 7.8125e-07, "loss": 0.0007, "step": 8636 }, { "epoch": 29.00179054054054, "grad_norm": 0.0012608402175828815, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8637 }, { "epoch": 29.001824324324325, "grad_norm": 0.020817529410123825, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8638 }, { "epoch": 29.00185810810811, "grad_norm": 0.010959610342979431, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8639 }, { "epoch": 29.00189189189189, "grad_norm": 0.014564606361091137, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8640 }, { "epoch": 29.001925675675675, "grad_norm": 0.00192610255908221, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8641 }, { "epoch": 29.00195945945946, "grad_norm": 0.18443293869495392, "learning_rate": 7.8125e-07, "loss": 0.0066, "step": 8642 }, { "epoch": 29.001993243243245, "grad_norm": 0.0015799769898876548, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8643 }, { "epoch": 29.002027027027026, "grad_norm": 0.002275770530104637, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8644 }, { "epoch": 29.00206081081081, "grad_norm": 0.0016526752151548862, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8645 }, { "epoch": 29.002094594594595, "grad_norm": 0.00064635812304914, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8646 }, { "epoch": 29.00212837837838, "grad_norm": 0.00153452274389565, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8647 }, { "epoch": 29.00216216216216, "grad_norm": 0.043092526495456696, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8648 }, { "epoch": 29.002195945945946, "grad_norm": 0.001118356711231172, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8649 }, { "epoch": 29.00222972972973, "grad_norm": 0.0019491425482556224, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8650 }, { "epoch": 29.002263513513512, "grad_norm": 0.0013861518818885088, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8651 }, { "epoch": 29.002297297297297, "grad_norm": 0.19801361858844757, "learning_rate": 7.8125e-07, "loss": 0.0068, "step": 8652 }, { "epoch": 29.00233108108108, "grad_norm": 0.0026344372890889645, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8653 }, { "epoch": 29.002364864864866, "grad_norm": 0.0014178096316754818, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8654 }, { "epoch": 29.002398648648647, "grad_norm": 0.03671438246965408, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 8655 }, { "epoch": 29.002432432432432, "grad_norm": 0.0032626446336507797, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8656 }, { "epoch": 29.002466216216217, "grad_norm": 0.003505521221086383, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8657 }, { "epoch": 29.0025, "grad_norm": 0.04296968877315521, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 8658 }, { "epoch": 29.002533783783782, "grad_norm": 0.09038123488426208, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8659 }, { "epoch": 29.002567567567567, "grad_norm": 0.0013808695366606116, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8660 }, { "epoch": 29.002601351351352, "grad_norm": 0.004071106668561697, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8661 }, { "epoch": 29.002635135135137, "grad_norm": 0.0008942227577790618, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8662 }, { "epoch": 29.002668918918918, "grad_norm": 0.0008706890512257814, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8663 }, { "epoch": 29.002702702702702, "grad_norm": 0.009594114497303963, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8664 }, { "epoch": 29.002736486486487, "grad_norm": 0.006596802733838558, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8665 }, { "epoch": 29.002770270270272, "grad_norm": 0.16592401266098022, "learning_rate": 7.8125e-07, "loss": 0.0039, "step": 8666 }, { "epoch": 29.002804054054053, "grad_norm": 8.100064277648926, "learning_rate": 7.8125e-07, "loss": 0.2478, "step": 8667 }, { "epoch": 29.002837837837838, "grad_norm": 0.002173299202695489, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8668 }, { "epoch": 29.002871621621622, "grad_norm": 0.10942937433719635, "learning_rate": 7.8125e-07, "loss": 0.0041, "step": 8669 }, { "epoch": 29.002905405405407, "grad_norm": 0.003544433508068323, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8670 }, { "epoch": 29.00293918918919, "grad_norm": 0.08701322227716446, "learning_rate": 7.8125e-07, "loss": 0.0025, "step": 8671 }, { "epoch": 29.002972972972973, "grad_norm": 0.038713619112968445, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 8672 }, { "epoch": 29.003006756756758, "grad_norm": 0.001052571926265955, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8673 }, { "epoch": 29.00304054054054, "grad_norm": 0.1241416409611702, "learning_rate": 7.8125e-07, "loss": 0.0044, "step": 8674 }, { "epoch": 29.003074324324324, "grad_norm": 68.88957214355469, "learning_rate": 7.8125e-07, "loss": 1.1853, "step": 8675 }, { "epoch": 29.00310810810811, "grad_norm": 0.6215308904647827, "learning_rate": 7.8125e-07, "loss": 0.0047, "step": 8676 }, { "epoch": 29.003141891891893, "grad_norm": 0.0029537701047956944, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8677 }, { "epoch": 29.003175675675674, "grad_norm": 0.008178376592695713, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8678 }, { "epoch": 29.00320945945946, "grad_norm": 18.029132843017578, "learning_rate": 7.8125e-07, "loss": 0.3953, "step": 8679 }, { "epoch": 29.003243243243244, "grad_norm": 0.0009539674501866102, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8680 }, { "epoch": 29.00327702702703, "grad_norm": 32.60860061645508, "learning_rate": 7.8125e-07, "loss": 0.1567, "step": 8681 }, { "epoch": 29.00331081081081, "grad_norm": 0.0007468628464266658, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8682 }, { "epoch": 29.003344594594594, "grad_norm": 3.6687028408050537, "learning_rate": 7.8125e-07, "loss": 0.0068, "step": 8683 }, { "epoch": 29.00337837837838, "grad_norm": 3.5384678840637207, "learning_rate": 7.8125e-07, "loss": 0.396, "step": 8684 }, { "epoch": 29.003412162162164, "grad_norm": 0.4524347186088562, "learning_rate": 7.8125e-07, "loss": 0.0024, "step": 8685 }, { "epoch": 29.003445945945945, "grad_norm": 0.0015982768964022398, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8686 }, { "epoch": 29.00347972972973, "grad_norm": 0.005337391514331102, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8687 }, { "epoch": 29.003513513513514, "grad_norm": 0.0021214792504906654, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8688 }, { "epoch": 29.0035472972973, "grad_norm": 0.0014526302693411708, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8689 }, { "epoch": 29.00358108108108, "grad_norm": 0.0039014893118292093, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8690 }, { "epoch": 29.003614864864865, "grad_norm": 0.15526671707630157, "learning_rate": 7.8125e-07, "loss": 0.0009, "step": 8691 }, { "epoch": 29.00364864864865, "grad_norm": 0.0249142087996006, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 8692 }, { "epoch": 29.00368243243243, "grad_norm": 0.04255172610282898, "learning_rate": 7.8125e-07, "loss": 0.0013, "step": 8693 }, { "epoch": 29.003716216216215, "grad_norm": 14.621478080749512, "learning_rate": 7.8125e-07, "loss": 0.2582, "step": 8694 }, { "epoch": 29.00375, "grad_norm": 0.06415095925331116, "learning_rate": 7.8125e-07, "loss": 0.0006, "step": 8695 }, { "epoch": 29.003783783783785, "grad_norm": 0.005315362475812435, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8696 }, { "epoch": 29.003817567567566, "grad_norm": 0.0012031677179038525, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8697 }, { "epoch": 29.00385135135135, "grad_norm": 0.001621541567146778, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8698 }, { "epoch": 29.003885135135135, "grad_norm": 0.017460837960243225, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8699 }, { "epoch": 29.00391891891892, "grad_norm": 0.293194442987442, "learning_rate": 7.8125e-07, "loss": 0.0072, "step": 8700 }, { "epoch": 29.0039527027027, "grad_norm": 0.0016693813959136605, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8701 }, { "epoch": 29.003986486486486, "grad_norm": 1.1020803451538086, "learning_rate": 7.8125e-07, "loss": 0.0073, "step": 8702 }, { "epoch": 29.00402027027027, "grad_norm": 0.05513536185026169, "learning_rate": 7.8125e-07, "loss": 0.0007, "step": 8703 }, { "epoch": 29.004054054054055, "grad_norm": 0.136013001203537, "learning_rate": 7.8125e-07, "loss": 0.0048, "step": 8704 }, { "epoch": 29.004087837837837, "grad_norm": 16.974273681640625, "learning_rate": 7.8125e-07, "loss": 0.1047, "step": 8705 }, { "epoch": 29.00412162162162, "grad_norm": 0.0009494674159213901, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8706 }, { "epoch": 29.004155405405406, "grad_norm": 0.14789019525051117, "learning_rate": 7.8125e-07, "loss": 0.0007, "step": 8707 }, { "epoch": 29.00418918918919, "grad_norm": 0.24591787159442902, "learning_rate": 7.8125e-07, "loss": 0.0042, "step": 8708 }, { "epoch": 29.004222972972972, "grad_norm": 0.010392296127974987, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8709 }, { "epoch": 29.004256756756757, "grad_norm": 0.019634684547781944, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8710 }, { "epoch": 29.00429054054054, "grad_norm": 0.0012152109993621707, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8711 }, { "epoch": 29.004324324324326, "grad_norm": 0.05935799330472946, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 8712 }, { "epoch": 29.004358108108107, "grad_norm": 0.09450932592153549, "learning_rate": 7.8125e-07, "loss": 0.0021, "step": 8713 }, { "epoch": 29.004391891891892, "grad_norm": 0.0014073115307837725, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8714 }, { "epoch": 29.004425675675677, "grad_norm": 0.007122599054127932, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8715 }, { "epoch": 29.004459459459458, "grad_norm": 0.0025158121716231108, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8716 }, { "epoch": 29.004493243243243, "grad_norm": 0.016018839552998543, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8717 }, { "epoch": 29.004527027027027, "grad_norm": 0.0022757824044674635, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8718 }, { "epoch": 29.004560810810812, "grad_norm": 0.0017117056995630264, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8719 }, { "epoch": 29.004594594594593, "grad_norm": 0.0009510913514532149, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8720 }, { "epoch": 29.004628378378378, "grad_norm": 0.008931300602853298, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8721 }, { "epoch": 29.004662162162163, "grad_norm": 0.011102238669991493, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8722 }, { "epoch": 29.004695945945947, "grad_norm": 0.0014810689026489854, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8723 }, { "epoch": 29.00472972972973, "grad_norm": 0.00688448129221797, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8724 }, { "epoch": 29.004763513513513, "grad_norm": 0.0008140450809150934, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8725 }, { "epoch": 29.004797297297298, "grad_norm": 0.3295915722846985, "learning_rate": 7.8125e-07, "loss": 0.002, "step": 8726 }, { "epoch": 29.004831081081083, "grad_norm": 20.259984970092773, "learning_rate": 7.8125e-07, "loss": 0.0437, "step": 8727 }, { "epoch": 29.004864864864864, "grad_norm": 0.0010085339890792966, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8728 }, { "epoch": 29.00489864864865, "grad_norm": 0.006076316349208355, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8729 }, { "epoch": 29.004932432432433, "grad_norm": 0.0011158848647028208, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8730 }, { "epoch": 29.004966216216218, "grad_norm": 14.08969783782959, "learning_rate": 7.8125e-07, "loss": 0.0975, "step": 8731 }, { "epoch": 29.005, "grad_norm": 0.0012043295428156853, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8732 }, { "epoch": 29.005033783783784, "grad_norm": 0.009871949441730976, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8733 }, { "epoch": 29.00506756756757, "grad_norm": 0.00184466817881912, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8734 }, { "epoch": 29.00510135135135, "grad_norm": 0.13682821393013, "learning_rate": 7.8125e-07, "loss": 0.0049, "step": 8735 }, { "epoch": 29.005135135135134, "grad_norm": 0.011326729319989681, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8736 }, { "epoch": 29.00516891891892, "grad_norm": 0.14922942221164703, "learning_rate": 7.8125e-07, "loss": 0.0055, "step": 8737 }, { "epoch": 29.005202702702704, "grad_norm": 0.08609567582607269, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 8738 }, { "epoch": 29.005236486486485, "grad_norm": 0.3931020498275757, "learning_rate": 7.8125e-07, "loss": 0.0016, "step": 8739 }, { "epoch": 29.00527027027027, "grad_norm": 0.0030375593341886997, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8740 }, { "epoch": 29.005304054054054, "grad_norm": 0.0024632220156490803, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8741 }, { "epoch": 29.00533783783784, "grad_norm": 0.0020691759418696165, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8742 }, { "epoch": 29.00537162162162, "grad_norm": 0.004571300465613604, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8743 }, { "epoch": 29.005405405405405, "grad_norm": 0.0009515316342003644, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8744 }, { "epoch": 29.00543918918919, "grad_norm": 0.0027897574473172426, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8745 }, { "epoch": 29.005472972972974, "grad_norm": 0.0038691218942403793, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8746 }, { "epoch": 29.005506756756755, "grad_norm": 0.014873187988996506, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8747 }, { "epoch": 29.00554054054054, "grad_norm": 70.18292236328125, "learning_rate": 7.8125e-07, "loss": 0.6607, "step": 8748 }, { "epoch": 29.005574324324325, "grad_norm": 0.001509578083641827, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8749 }, { "epoch": 29.00560810810811, "grad_norm": 0.0022036356385797262, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8750 }, { "epoch": 29.00564189189189, "grad_norm": 0.044118721038103104, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8751 }, { "epoch": 29.005675675675676, "grad_norm": 4.012655735015869, "learning_rate": 7.8125e-07, "loss": 0.0084, "step": 8752 }, { "epoch": 29.00570945945946, "grad_norm": 0.11344680190086365, "learning_rate": 7.8125e-07, "loss": 0.0013, "step": 8753 }, { "epoch": 29.005743243243245, "grad_norm": 0.010300317779183388, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8754 }, { "epoch": 29.005777027027026, "grad_norm": 0.0025658225640654564, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8755 }, { "epoch": 29.00581081081081, "grad_norm": 0.006540094967931509, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8756 }, { "epoch": 29.005844594594596, "grad_norm": 0.02191382646560669, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8757 }, { "epoch": 29.005878378378377, "grad_norm": 0.003889580490067601, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8758 }, { "epoch": 29.00591216216216, "grad_norm": 0.3591156303882599, "learning_rate": 7.8125e-07, "loss": 0.0029, "step": 8759 }, { "epoch": 29.005945945945946, "grad_norm": 14.22138786315918, "learning_rate": 7.8125e-07, "loss": 0.3226, "step": 8760 }, { "epoch": 29.00597972972973, "grad_norm": 0.0010732177179306746, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8761 }, { "epoch": 29.006013513513512, "grad_norm": 0.0016926747048273683, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8762 }, { "epoch": 29.006047297297297, "grad_norm": 0.0011945563601329923, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8763 }, { "epoch": 29.00608108108108, "grad_norm": 0.004817912820726633, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8764 }, { "epoch": 29.006114864864866, "grad_norm": 0.027664564549922943, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 8765 }, { "epoch": 29.006148648648647, "grad_norm": 0.0008696202421560884, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8766 }, { "epoch": 29.006182432432432, "grad_norm": 0.0007838770397938788, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8767 }, { "epoch": 29.006216216216217, "grad_norm": 0.3703670799732208, "learning_rate": 7.8125e-07, "loss": 0.0009, "step": 8768 }, { "epoch": 29.00625, "grad_norm": 0.12069863080978394, "learning_rate": 7.8125e-07, "loss": 0.0045, "step": 8769 }, { "epoch": 29.006283783783783, "grad_norm": 0.0008822876261547208, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8770 }, { "epoch": 29.006317567567567, "grad_norm": 0.0036644036881625652, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8771 }, { "epoch": 29.006351351351352, "grad_norm": 0.0009319233940914273, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8772 }, { "epoch": 29.006385135135137, "grad_norm": 0.1453811377286911, "learning_rate": 7.8125e-07, "loss": 0.0007, "step": 8773 }, { "epoch": 29.006418918918918, "grad_norm": 7.808815002441406, "learning_rate": 7.8125e-07, "loss": 0.0915, "step": 8774 }, { "epoch": 29.006452702702703, "grad_norm": 0.10244657099246979, "learning_rate": 7.8125e-07, "loss": 0.001, "step": 8775 }, { "epoch": 29.006486486486487, "grad_norm": 0.003113735932856798, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8776 }, { "epoch": 29.006520270270272, "grad_norm": 0.1361645609140396, "learning_rate": 7.8125e-07, "loss": 0.0047, "step": 8777 }, { "epoch": 29.006554054054053, "grad_norm": 0.001614991924725473, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8778 }, { "epoch": 29.006587837837838, "grad_norm": 0.0010661801788955927, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8779 }, { "epoch": 29.006621621621623, "grad_norm": 18.375343322753906, "learning_rate": 7.8125e-07, "loss": 0.0581, "step": 8780 }, { "epoch": 29.006655405405404, "grad_norm": 0.0014152014628052711, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8781 }, { "epoch": 29.00668918918919, "grad_norm": 3.5281856060028076, "learning_rate": 7.8125e-07, "loss": 0.3849, "step": 8782 }, { "epoch": 29.006722972972973, "grad_norm": 0.0038975460920482874, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8783 }, { "epoch": 29.006756756756758, "grad_norm": 0.0018400378758087754, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8784 }, { "epoch": 29.00679054054054, "grad_norm": 0.0013852627016603947, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8785 }, { "epoch": 29.006824324324324, "grad_norm": 2.376912832260132, "learning_rate": 7.8125e-07, "loss": 0.0096, "step": 8786 }, { "epoch": 29.00685810810811, "grad_norm": 0.12930968403816223, "learning_rate": 7.8125e-07, "loss": 0.0047, "step": 8787 }, { "epoch": 29.006891891891893, "grad_norm": 0.0009358958923257887, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8788 }, { "epoch": 29.006925675675674, "grad_norm": 0.0016324506141245365, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8789 }, { "epoch": 29.00695945945946, "grad_norm": 0.00205925852060318, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8790 }, { "epoch": 29.006993243243244, "grad_norm": 0.25336921215057373, "learning_rate": 7.8125e-07, "loss": 0.0085, "step": 8791 }, { "epoch": 29.00702702702703, "grad_norm": 0.002145722508430481, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8792 }, { "epoch": 29.00706081081081, "grad_norm": 0.031104424968361855, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8793 }, { "epoch": 29.007094594594594, "grad_norm": 0.004103053826838732, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8794 }, { "epoch": 29.00712837837838, "grad_norm": 0.9224175810813904, "learning_rate": 7.8125e-07, "loss": 0.012, "step": 8795 }, { "epoch": 29.007162162162164, "grad_norm": 0.0009915687842294574, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8796 }, { "epoch": 29.007195945945945, "grad_norm": 0.010267216712236404, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8797 }, { "epoch": 29.00722972972973, "grad_norm": 0.18911831080913544, "learning_rate": 7.8125e-07, "loss": 0.0036, "step": 8798 }, { "epoch": 29.007263513513514, "grad_norm": 0.002101664897054434, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8799 }, { "epoch": 29.007297297297296, "grad_norm": 0.016726570203900337, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8800 }, { "epoch": 29.00733108108108, "grad_norm": 0.0037010209634900093, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8801 }, { "epoch": 29.007364864864865, "grad_norm": 0.001412852667272091, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8802 }, { "epoch": 29.00739864864865, "grad_norm": 0.002136199502274394, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8803 }, { "epoch": 29.00743243243243, "grad_norm": 0.0018556403229013085, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8804 }, { "epoch": 29.007466216216216, "grad_norm": 0.04346000403165817, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8805 }, { "epoch": 29.0075, "grad_norm": 0.008645899593830109, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8806 }, { "epoch": 29.007533783783785, "grad_norm": 0.015757668763399124, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8807 }, { "epoch": 29.007567567567566, "grad_norm": 0.00923615787178278, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8808 }, { "epoch": 29.00760135135135, "grad_norm": 0.0026303164195269346, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8809 }, { "epoch": 29.007635135135136, "grad_norm": 0.004775978159159422, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8810 }, { "epoch": 29.00766891891892, "grad_norm": 0.002607321599498391, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8811 }, { "epoch": 29.0077027027027, "grad_norm": 0.00792191457003355, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8812 }, { "epoch": 29.007736486486486, "grad_norm": 0.13617374002933502, "learning_rate": 7.8125e-07, "loss": 0.0013, "step": 8813 }, { "epoch": 29.00777027027027, "grad_norm": 12.099456787109375, "learning_rate": 7.8125e-07, "loss": 0.1038, "step": 8814 }, { "epoch": 29.007804054054056, "grad_norm": 0.4330049455165863, "learning_rate": 7.8125e-07, "loss": 0.0083, "step": 8815 }, { "epoch": 29.007837837837837, "grad_norm": 0.15764330327510834, "learning_rate": 7.8125e-07, "loss": 0.0056, "step": 8816 }, { "epoch": 29.00787162162162, "grad_norm": 0.0013649252941831946, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8817 }, { "epoch": 29.007905405405406, "grad_norm": 2.871548891067505, "learning_rate": 7.8125e-07, "loss": 0.0599, "step": 8818 }, { "epoch": 29.00793918918919, "grad_norm": 0.12325932085514069, "learning_rate": 7.8125e-07, "loss": 0.0045, "step": 8819 }, { "epoch": 29.007972972972972, "grad_norm": 0.13034309446811676, "learning_rate": 7.8125e-07, "loss": 0.0044, "step": 8820 }, { "epoch": 29.008006756756757, "grad_norm": 0.0070016756653785706, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8821 }, { "epoch": 29.00804054054054, "grad_norm": 0.004396664910018444, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8822 }, { "epoch": 29.008074324324323, "grad_norm": 0.0028537146281450987, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8823 }, { "epoch": 29.008108108108107, "grad_norm": 0.3549564480781555, "learning_rate": 7.8125e-07, "loss": 0.0112, "step": 8824 }, { "epoch": 29.008141891891892, "grad_norm": 0.0019585846457630396, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8825 }, { "epoch": 29.008175675675677, "grad_norm": 0.001017373171634972, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8826 }, { "epoch": 29.008209459459458, "grad_norm": 0.007125292904675007, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8827 }, { "epoch": 29.008243243243243, "grad_norm": 0.04889748618006706, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 8828 }, { "epoch": 29.008277027027027, "grad_norm": 0.21139222383499146, "learning_rate": 7.8125e-07, "loss": 0.0066, "step": 8829 }, { "epoch": 29.008310810810812, "grad_norm": 0.6813983917236328, "learning_rate": 7.8125e-07, "loss": 0.005, "step": 8830 }, { "epoch": 29.008344594594593, "grad_norm": 0.1987380087375641, "learning_rate": 7.8125e-07, "loss": 0.0012, "step": 8831 }, { "epoch": 29.008378378378378, "grad_norm": 0.1319996416568756, "learning_rate": 7.8125e-07, "loss": 0.0049, "step": 8832 }, { "epoch": 29.008412162162163, "grad_norm": 0.002963179722428322, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8833 }, { "epoch": 29.008445945945947, "grad_norm": 0.45344996452331543, "learning_rate": 7.8125e-07, "loss": 0.0114, "step": 8834 }, { "epoch": 29.00847972972973, "grad_norm": 0.0009032138041220605, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8835 }, { "epoch": 29.008513513513513, "grad_norm": 4.237473964691162, "learning_rate": 7.8125e-07, "loss": 0.5205, "step": 8836 }, { "epoch": 29.008547297297298, "grad_norm": 0.1675468236207962, "learning_rate": 7.8125e-07, "loss": 0.004, "step": 8837 }, { "epoch": 29.008581081081083, "grad_norm": 0.04343968629837036, "learning_rate": 7.8125e-07, "loss": 0.0006, "step": 8838 }, { "epoch": 29.008614864864864, "grad_norm": 0.12034527957439423, "learning_rate": 7.8125e-07, "loss": 0.0045, "step": 8839 }, { "epoch": 29.00864864864865, "grad_norm": 0.007632861379534006, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8840 }, { "epoch": 29.008682432432433, "grad_norm": 0.023452192544937134, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8841 }, { "epoch": 29.008716216216218, "grad_norm": 0.0006201069918461144, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8842 }, { "epoch": 29.00875, "grad_norm": 0.1540721356868744, "learning_rate": 7.8125e-07, "loss": 0.0011, "step": 8843 }, { "epoch": 29.008783783783784, "grad_norm": 0.002080389065667987, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8844 }, { "epoch": 29.00881756756757, "grad_norm": 0.0013534851605072618, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8845 }, { "epoch": 29.00885135135135, "grad_norm": 0.0006615127786062658, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8846 }, { "epoch": 29.008885135135134, "grad_norm": 0.004462329670786858, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8847 }, { "epoch": 29.00891891891892, "grad_norm": 0.004906976595520973, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8848 }, { "epoch": 29.008952702702704, "grad_norm": 0.2224990874528885, "learning_rate": 7.8125e-07, "loss": 0.0079, "step": 8849 }, { "epoch": 29.008986486486485, "grad_norm": 0.1871788650751114, "learning_rate": 7.8125e-07, "loss": 0.0012, "step": 8850 }, { "epoch": 29.00902027027027, "grad_norm": 0.12441925704479218, "learning_rate": 7.8125e-07, "loss": 0.0045, "step": 8851 }, { "epoch": 29.009054054054054, "grad_norm": 0.0017727646045386791, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8852 }, { "epoch": 29.00908783783784, "grad_norm": 0.0008497036760672927, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8853 }, { "epoch": 29.00912162162162, "grad_norm": 0.00156661425717175, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8854 }, { "epoch": 29.009155405405405, "grad_norm": 0.0009348933235742152, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8855 }, { "epoch": 29.00918918918919, "grad_norm": 0.007473450619727373, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8856 }, { "epoch": 29.009222972972974, "grad_norm": 0.5449880957603455, "learning_rate": 7.8125e-07, "loss": 0.0038, "step": 8857 }, { "epoch": 29.009256756756756, "grad_norm": 1.9662785530090332, "learning_rate": 7.8125e-07, "loss": 0.0136, "step": 8858 }, { "epoch": 29.00929054054054, "grad_norm": 0.00239905109629035, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8859 }, { "epoch": 29.009324324324325, "grad_norm": 0.12583456933498383, "learning_rate": 7.8125e-07, "loss": 0.0048, "step": 8860 }, { "epoch": 29.00935810810811, "grad_norm": 0.006356438621878624, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8861 }, { "epoch": 29.00939189189189, "grad_norm": 0.0015460816211998463, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8862 }, { "epoch": 29.009425675675676, "grad_norm": 0.003412870457395911, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8863 }, { "epoch": 29.00945945945946, "grad_norm": 0.002174953231588006, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8864 }, { "epoch": 29.00949324324324, "grad_norm": 0.002346651628613472, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8865 }, { "epoch": 29.009527027027026, "grad_norm": 0.004554536659270525, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8866 }, { "epoch": 29.00956081081081, "grad_norm": 0.0024696956388652325, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8867 }, { "epoch": 29.009594594594596, "grad_norm": 0.015657970681786537, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8868 }, { "epoch": 29.009628378378377, "grad_norm": 0.0011477216612547636, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8869 }, { "epoch": 29.00966216216216, "grad_norm": 0.06457428634166718, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8870 }, { "epoch": 29.009695945945946, "grad_norm": 0.010948272421956062, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8871 }, { "epoch": 29.00972972972973, "grad_norm": 0.0027089861687272787, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8872 }, { "epoch": 29.009763513513512, "grad_norm": 0.7790197730064392, "learning_rate": 7.8125e-07, "loss": 0.0031, "step": 8873 }, { "epoch": 29.009797297297297, "grad_norm": 0.0036705327220261097, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8874 }, { "epoch": 29.00983108108108, "grad_norm": 0.007773890625685453, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8875 }, { "epoch": 29.009864864864866, "grad_norm": 37.36239242553711, "learning_rate": 7.8125e-07, "loss": 0.7523, "step": 8876 }, { "epoch": 29.009898648648647, "grad_norm": 0.021436605602502823, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8877 }, { "epoch": 29.009932432432432, "grad_norm": 0.0009431355865672231, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8878 }, { "epoch": 29.009966216216217, "grad_norm": 0.029284926131367683, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8879 }, { "epoch": 29.01, "grad_norm": 0.002850862918421626, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8880 }, { "epoch": 29.01, "eval_accuracy": 0.901453957996769, "eval_loss": 0.5942434072494507, "eval_runtime": 34.5788, "eval_samples_per_second": 17.901, "eval_steps_per_second": 2.256, "step": 8880 }, { "epoch": 30.000033783783785, "grad_norm": 0.006786246318370104, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8881 }, { "epoch": 30.000067567567566, "grad_norm": 0.011612938717007637, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8882 }, { "epoch": 30.00010135135135, "grad_norm": 0.001563405618071556, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8883 }, { "epoch": 30.000135135135135, "grad_norm": 0.09492279589176178, "learning_rate": 7.8125e-07, "loss": 0.0009, "step": 8884 }, { "epoch": 30.00016891891892, "grad_norm": 0.006375856697559357, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8885 }, { "epoch": 30.0002027027027, "grad_norm": 18.881921768188477, "learning_rate": 7.8125e-07, "loss": 0.6573, "step": 8886 }, { "epoch": 30.000236486486486, "grad_norm": 0.1560693383216858, "learning_rate": 7.8125e-07, "loss": 0.0058, "step": 8887 }, { "epoch": 30.00027027027027, "grad_norm": 0.10703373700380325, "learning_rate": 7.8125e-07, "loss": 0.004, "step": 8888 }, { "epoch": 30.000304054054055, "grad_norm": 0.48599621653556824, "learning_rate": 7.8125e-07, "loss": 0.002, "step": 8889 }, { "epoch": 30.000337837837836, "grad_norm": 0.06861506402492523, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 8890 }, { "epoch": 30.00037162162162, "grad_norm": 0.0019534965977072716, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8891 }, { "epoch": 30.000405405405406, "grad_norm": 0.02862156741321087, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 8892 }, { "epoch": 30.00043918918919, "grad_norm": 0.009811261668801308, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8893 }, { "epoch": 30.00047297297297, "grad_norm": 0.0157626923173666, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8894 }, { "epoch": 30.000506756756756, "grad_norm": 0.12114350497722626, "learning_rate": 7.8125e-07, "loss": 0.0031, "step": 8895 }, { "epoch": 30.00054054054054, "grad_norm": 0.0018914449028670788, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8896 }, { "epoch": 30.000574324324326, "grad_norm": 0.10764766484498978, "learning_rate": 7.8125e-07, "loss": 0.0033, "step": 8897 }, { "epoch": 30.000608108108107, "grad_norm": 24.43532943725586, "learning_rate": 7.8125e-07, "loss": 0.076, "step": 8898 }, { "epoch": 30.000641891891892, "grad_norm": 0.009678773581981659, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8899 }, { "epoch": 30.000675675675677, "grad_norm": 15.306991577148438, "learning_rate": 7.8125e-07, "loss": 0.2276, "step": 8900 }, { "epoch": 30.00070945945946, "grad_norm": 0.0040407064370810986, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8901 }, { "epoch": 30.000743243243242, "grad_norm": 5.057543754577637, "learning_rate": 7.8125e-07, "loss": 0.2498, "step": 8902 }, { "epoch": 30.000777027027027, "grad_norm": 0.0012808318715542555, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8903 }, { "epoch": 30.000810810810812, "grad_norm": 5.3371405601501465, "learning_rate": 7.8125e-07, "loss": 0.0982, "step": 8904 }, { "epoch": 30.000844594594593, "grad_norm": 0.00475545646622777, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8905 }, { "epoch": 30.000878378378378, "grad_norm": 0.0037763502914458513, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8906 }, { "epoch": 30.000912162162162, "grad_norm": 0.032408881932497025, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 8907 }, { "epoch": 30.000945945945947, "grad_norm": 3.6511945724487305, "learning_rate": 7.8125e-07, "loss": 0.3546, "step": 8908 }, { "epoch": 30.00097972972973, "grad_norm": 0.007994496263563633, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8909 }, { "epoch": 30.001013513513513, "grad_norm": 0.2724188268184662, "learning_rate": 7.8125e-07, "loss": 0.0012, "step": 8910 }, { "epoch": 30.001047297297298, "grad_norm": 0.013318491168320179, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8911 }, { "epoch": 30.001081081081082, "grad_norm": 0.004989852663129568, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8912 }, { "epoch": 30.001114864864864, "grad_norm": 4.469759941101074, "learning_rate": 7.8125e-07, "loss": 0.2864, "step": 8913 }, { "epoch": 30.00114864864865, "grad_norm": 49.540531158447266, "learning_rate": 7.8125e-07, "loss": 0.1711, "step": 8914 }, { "epoch": 30.001182432432433, "grad_norm": 0.0007087650010362267, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8915 }, { "epoch": 30.001216216216218, "grad_norm": 0.0013892545830458403, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8916 }, { "epoch": 30.00125, "grad_norm": 0.4311167299747467, "learning_rate": 7.8125e-07, "loss": 0.0026, "step": 8917 }, { "epoch": 30.001283783783784, "grad_norm": 0.012509595602750778, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8918 }, { "epoch": 30.00131756756757, "grad_norm": 4.994117259979248, "learning_rate": 7.8125e-07, "loss": 0.3441, "step": 8919 }, { "epoch": 30.001351351351353, "grad_norm": 0.0012796737719327211, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8920 }, { "epoch": 30.001385135135134, "grad_norm": 5.1230597496032715, "learning_rate": 7.8125e-07, "loss": 0.2338, "step": 8921 }, { "epoch": 30.00141891891892, "grad_norm": 0.001293662004172802, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8922 }, { "epoch": 30.001452702702704, "grad_norm": 0.9723377227783203, "learning_rate": 7.8125e-07, "loss": 0.0023, "step": 8923 }, { "epoch": 30.001486486486485, "grad_norm": 0.0014249728992581367, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8924 }, { "epoch": 30.00152027027027, "grad_norm": 0.0023101866245269775, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8925 }, { "epoch": 30.001554054054054, "grad_norm": 0.0013952674344182014, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8926 }, { "epoch": 30.00158783783784, "grad_norm": 8.908059120178223, "learning_rate": 7.8125e-07, "loss": 0.1133, "step": 8927 }, { "epoch": 30.00162162162162, "grad_norm": 3.349524974822998, "learning_rate": 7.8125e-07, "loss": 0.4324, "step": 8928 }, { "epoch": 30.001655405405405, "grad_norm": 0.19721364974975586, "learning_rate": 7.8125e-07, "loss": 0.0056, "step": 8929 }, { "epoch": 30.00168918918919, "grad_norm": 17.520999908447266, "learning_rate": 7.8125e-07, "loss": 0.9607, "step": 8930 }, { "epoch": 30.001722972972974, "grad_norm": 0.11691158264875412, "learning_rate": 7.8125e-07, "loss": 0.0044, "step": 8931 }, { "epoch": 30.001756756756755, "grad_norm": 0.11098417639732361, "learning_rate": 7.8125e-07, "loss": 0.0041, "step": 8932 }, { "epoch": 30.00179054054054, "grad_norm": 0.0016047091921791434, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8933 }, { "epoch": 30.001824324324325, "grad_norm": 0.002137649804353714, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8934 }, { "epoch": 30.00185810810811, "grad_norm": 0.0017335880547761917, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8935 }, { "epoch": 30.00189189189189, "grad_norm": 0.09642414003610611, "learning_rate": 7.8125e-07, "loss": 0.0014, "step": 8936 }, { "epoch": 30.001925675675675, "grad_norm": 0.00608995882794261, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8937 }, { "epoch": 30.00195945945946, "grad_norm": 9.215092658996582, "learning_rate": 7.8125e-07, "loss": 0.0649, "step": 8938 }, { "epoch": 30.001993243243245, "grad_norm": 0.001751350937411189, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8939 }, { "epoch": 30.002027027027026, "grad_norm": 0.6431359648704529, "learning_rate": 7.8125e-07, "loss": 0.0167, "step": 8940 }, { "epoch": 30.00206081081081, "grad_norm": 0.00638628052547574, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8941 }, { "epoch": 30.002094594594595, "grad_norm": 0.0008719247416593134, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8942 }, { "epoch": 30.00212837837838, "grad_norm": 23.685161590576172, "learning_rate": 7.8125e-07, "loss": 0.0588, "step": 8943 }, { "epoch": 30.00216216216216, "grad_norm": 0.007997420616447926, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8944 }, { "epoch": 30.002195945945946, "grad_norm": 0.005021108314394951, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8945 }, { "epoch": 30.00222972972973, "grad_norm": 0.10726003348827362, "learning_rate": 7.8125e-07, "loss": 0.0011, "step": 8946 }, { "epoch": 30.002263513513512, "grad_norm": 3.947640895843506, "learning_rate": 7.8125e-07, "loss": 0.0679, "step": 8947 }, { "epoch": 30.002297297297297, "grad_norm": 0.21124176681041718, "learning_rate": 7.8125e-07, "loss": 0.0009, "step": 8948 }, { "epoch": 30.00233108108108, "grad_norm": 0.04416617378592491, "learning_rate": 7.8125e-07, "loss": 0.0005, "step": 8949 }, { "epoch": 30.002364864864866, "grad_norm": 0.0030770564917474985, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8950 }, { "epoch": 30.002398648648647, "grad_norm": 0.0012883513700217009, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8951 }, { "epoch": 30.002432432432432, "grad_norm": 0.0015404440928250551, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8952 }, { "epoch": 30.002466216216217, "grad_norm": 0.001692183897830546, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8953 }, { "epoch": 30.0025, "grad_norm": 0.007830392569303513, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8954 }, { "epoch": 30.002533783783782, "grad_norm": 0.001348957885056734, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8955 }, { "epoch": 30.002567567567567, "grad_norm": 0.000995473237708211, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8956 }, { "epoch": 30.002601351351352, "grad_norm": 0.006773407571017742, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8957 }, { "epoch": 30.002635135135137, "grad_norm": 0.029139431193470955, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8958 }, { "epoch": 30.002668918918918, "grad_norm": 0.0010730455396696925, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8959 }, { "epoch": 30.002702702702702, "grad_norm": 0.0020818861667066813, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8960 }, { "epoch": 30.002736486486487, "grad_norm": 0.23031575977802277, "learning_rate": 7.8125e-07, "loss": 0.0099, "step": 8961 }, { "epoch": 30.002770270270272, "grad_norm": 0.0030570225790143013, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8962 }, { "epoch": 30.002804054054053, "grad_norm": 0.004240776412189007, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8963 }, { "epoch": 30.002837837837838, "grad_norm": 0.9357881546020508, "learning_rate": 7.8125e-07, "loss": 0.0118, "step": 8964 }, { "epoch": 30.002871621621622, "grad_norm": 0.048310574144124985, "learning_rate": 7.8125e-07, "loss": 0.0009, "step": 8965 }, { "epoch": 30.002905405405407, "grad_norm": 0.002949504181742668, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8966 }, { "epoch": 30.00293918918919, "grad_norm": 1.5898343324661255, "learning_rate": 7.8125e-07, "loss": 0.043, "step": 8967 }, { "epoch": 30.002972972972973, "grad_norm": 0.16072379052639008, "learning_rate": 7.8125e-07, "loss": 0.0053, "step": 8968 }, { "epoch": 30.003006756756758, "grad_norm": 0.09867978096008301, "learning_rate": 7.8125e-07, "loss": 0.0014, "step": 8969 }, { "epoch": 30.00304054054054, "grad_norm": 0.00155662652105093, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8970 }, { "epoch": 30.003074324324324, "grad_norm": 7.421422481536865, "learning_rate": 7.8125e-07, "loss": 0.0702, "step": 8971 }, { "epoch": 30.00310810810811, "grad_norm": 0.0019681963603943586, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8972 }, { "epoch": 30.003141891891893, "grad_norm": 0.12216244637966156, "learning_rate": 7.8125e-07, "loss": 0.0007, "step": 8973 }, { "epoch": 30.003175675675674, "grad_norm": 0.013353339396417141, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8974 }, { "epoch": 30.00320945945946, "grad_norm": 0.11126960813999176, "learning_rate": 7.8125e-07, "loss": 0.0041, "step": 8975 }, { "epoch": 30.003243243243244, "grad_norm": 0.001725925481878221, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8976 }, { "epoch": 30.00327702702703, "grad_norm": 0.3997044265270233, "learning_rate": 7.8125e-07, "loss": 0.0008, "step": 8977 }, { "epoch": 30.00331081081081, "grad_norm": 0.539266049861908, "learning_rate": 7.8125e-07, "loss": 0.0023, "step": 8978 }, { "epoch": 30.003344594594594, "grad_norm": 27.989301681518555, "learning_rate": 7.8125e-07, "loss": 0.0894, "step": 8979 }, { "epoch": 30.00337837837838, "grad_norm": 0.0013184286653995514, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8980 }, { "epoch": 30.003412162162164, "grad_norm": 0.011330152861773968, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8981 }, { "epoch": 30.003445945945945, "grad_norm": 0.0030590849928557873, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8982 }, { "epoch": 30.00347972972973, "grad_norm": 12.395489692687988, "learning_rate": 7.8125e-07, "loss": 0.1591, "step": 8983 }, { "epoch": 30.003513513513514, "grad_norm": 3.2057273387908936, "learning_rate": 7.8125e-07, "loss": 0.0881, "step": 8984 }, { "epoch": 30.0035472972973, "grad_norm": 0.14771731197834015, "learning_rate": 7.8125e-07, "loss": 0.0056, "step": 8985 }, { "epoch": 30.00358108108108, "grad_norm": 3.180426836013794, "learning_rate": 7.8125e-07, "loss": 0.0635, "step": 8986 }, { "epoch": 30.003614864864865, "grad_norm": 14.631515502929688, "learning_rate": 7.8125e-07, "loss": 0.0563, "step": 8987 }, { "epoch": 30.00364864864865, "grad_norm": 0.011535072699189186, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8988 }, { "epoch": 30.00368243243243, "grad_norm": 0.02170451544225216, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 8989 }, { "epoch": 30.003716216216215, "grad_norm": 0.12472274899482727, "learning_rate": 7.8125e-07, "loss": 0.0016, "step": 8990 }, { "epoch": 30.00375, "grad_norm": 0.09131761640310287, "learning_rate": 7.8125e-07, "loss": 0.0022, "step": 8991 }, { "epoch": 30.003783783783785, "grad_norm": 0.010216386057436466, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8992 }, { "epoch": 30.003817567567566, "grad_norm": 4.659182071685791, "learning_rate": 7.8125e-07, "loss": 0.1887, "step": 8993 }, { "epoch": 30.00385135135135, "grad_norm": 0.0007099840440787375, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 8994 }, { "epoch": 30.003885135135135, "grad_norm": 0.09527289122343063, "learning_rate": 7.8125e-07, "loss": 0.0005, "step": 8995 }, { "epoch": 30.00391891891892, "grad_norm": 0.0036982293240725994, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8996 }, { "epoch": 30.0039527027027, "grad_norm": 1.2337909936904907, "learning_rate": 7.8125e-07, "loss": 0.022, "step": 8997 }, { "epoch": 30.003986486486486, "grad_norm": 0.0025896099396049976, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 8998 }, { "epoch": 30.00402027027027, "grad_norm": 0.08260107785463333, "learning_rate": 7.8125e-07, "loss": 0.0005, "step": 8999 }, { "epoch": 30.004054054054055, "grad_norm": 0.02659009024500847, "learning_rate": 7.8125e-07, "loss": 0.0005, "step": 9000 }, { "epoch": 30.004087837837837, "grad_norm": 2.237499475479126, "learning_rate": 7.8125e-07, "loss": 0.0216, "step": 9001 }, { "epoch": 30.00412162162162, "grad_norm": 0.007022547535598278, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9002 }, { "epoch": 30.004155405405406, "grad_norm": 0.09814346581697464, "learning_rate": 7.8125e-07, "loss": 0.001, "step": 9003 }, { "epoch": 30.00418918918919, "grad_norm": 0.008292722515761852, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 9004 }, { "epoch": 30.004222972972972, "grad_norm": 0.151437908411026, "learning_rate": 7.8125e-07, "loss": 0.0052, "step": 9005 }, { "epoch": 30.004256756756757, "grad_norm": 0.004947040230035782, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9006 }, { "epoch": 30.00429054054054, "grad_norm": 0.0012547530932351947, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9007 }, { "epoch": 30.004324324324326, "grad_norm": 0.0023656270932406187, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9008 }, { "epoch": 30.004358108108107, "grad_norm": 0.01977682299911976, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 9009 }, { "epoch": 30.004391891891892, "grad_norm": 0.0009954592678695917, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9010 }, { "epoch": 30.004425675675677, "grad_norm": 0.004941461607813835, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9011 }, { "epoch": 30.004459459459458, "grad_norm": 0.004850449040532112, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9012 }, { "epoch": 30.004493243243243, "grad_norm": 3.8375391960144043, "learning_rate": 7.8125e-07, "loss": 0.4724, "step": 9013 }, { "epoch": 30.004527027027027, "grad_norm": 0.0009946373756974936, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9014 }, { "epoch": 30.004560810810812, "grad_norm": 0.2682979702949524, "learning_rate": 7.8125e-07, "loss": 0.003, "step": 9015 }, { "epoch": 30.004594594594593, "grad_norm": 0.0012223230442032218, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9016 }, { "epoch": 30.004628378378378, "grad_norm": 0.010892807506024837, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9017 }, { "epoch": 30.004662162162163, "grad_norm": 0.03597109392285347, "learning_rate": 7.8125e-07, "loss": 0.0005, "step": 9018 }, { "epoch": 30.004695945945947, "grad_norm": 0.0032227197661995888, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9019 }, { "epoch": 30.00472972972973, "grad_norm": 0.001556630595587194, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9020 }, { "epoch": 30.004763513513513, "grad_norm": 0.061009157449007034, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 9021 }, { "epoch": 30.004797297297298, "grad_norm": 2.9163246154785156, "learning_rate": 7.8125e-07, "loss": 0.066, "step": 9022 }, { "epoch": 30.004831081081083, "grad_norm": 1.2339385747909546, "learning_rate": 7.8125e-07, "loss": 0.033, "step": 9023 }, { "epoch": 30.004864864864864, "grad_norm": 0.0824265331029892, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 9024 }, { "epoch": 30.00489864864865, "grad_norm": 5.414673328399658, "learning_rate": 7.8125e-07, "loss": 0.3599, "step": 9025 }, { "epoch": 30.004932432432433, "grad_norm": 0.030324522405862808, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 9026 }, { "epoch": 30.004966216216218, "grad_norm": 0.003106855321675539, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9027 }, { "epoch": 30.005, "grad_norm": 0.10565663874149323, "learning_rate": 7.8125e-07, "loss": 0.0033, "step": 9028 }, { "epoch": 30.005033783783784, "grad_norm": 0.002044800901785493, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9029 }, { "epoch": 30.00506756756757, "grad_norm": 0.18159109354019165, "learning_rate": 7.8125e-07, "loss": 0.0059, "step": 9030 }, { "epoch": 30.00510135135135, "grad_norm": 69.82379150390625, "learning_rate": 7.8125e-07, "loss": 0.363, "step": 9031 }, { "epoch": 30.005135135135134, "grad_norm": 0.0010406128130853176, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9032 }, { "epoch": 30.00516891891892, "grad_norm": 0.005806934088468552, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9033 }, { "epoch": 30.005202702702704, "grad_norm": 33.4088134765625, "learning_rate": 7.8125e-07, "loss": 0.5957, "step": 9034 }, { "epoch": 30.005236486486485, "grad_norm": 0.14249378442764282, "learning_rate": 7.8125e-07, "loss": 0.0051, "step": 9035 }, { "epoch": 30.00527027027027, "grad_norm": 0.10759196430444717, "learning_rate": 7.8125e-07, "loss": 0.004, "step": 9036 }, { "epoch": 30.005304054054054, "grad_norm": 0.317628413438797, "learning_rate": 7.8125e-07, "loss": 0.0008, "step": 9037 }, { "epoch": 30.00533783783784, "grad_norm": 0.0006678920472040772, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9038 }, { "epoch": 30.00537162162162, "grad_norm": 0.0022314987145364285, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9039 }, { "epoch": 30.005405405405405, "grad_norm": 6.684687614440918, "learning_rate": 7.8125e-07, "loss": 0.0691, "step": 9040 }, { "epoch": 30.00543918918919, "grad_norm": 0.9772579669952393, "learning_rate": 7.8125e-07, "loss": 0.0049, "step": 9041 }, { "epoch": 30.005472972972974, "grad_norm": 0.001722075161524117, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9042 }, { "epoch": 30.005506756756755, "grad_norm": 0.0012763958657160401, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9043 }, { "epoch": 30.00554054054054, "grad_norm": 0.00429837079718709, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9044 }, { "epoch": 30.005574324324325, "grad_norm": 0.001711905701085925, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9045 }, { "epoch": 30.00560810810811, "grad_norm": 4.316672325134277, "learning_rate": 7.8125e-07, "loss": 0.3156, "step": 9046 }, { "epoch": 30.00564189189189, "grad_norm": 0.0013757796259596944, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9047 }, { "epoch": 30.005675675675676, "grad_norm": 0.005110639613121748, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9048 }, { "epoch": 30.00570945945946, "grad_norm": 0.006429455243051052, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9049 }, { "epoch": 30.005743243243245, "grad_norm": 0.0010816100984811783, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9050 }, { "epoch": 30.005777027027026, "grad_norm": 2.4394779205322266, "learning_rate": 7.8125e-07, "loss": 0.0121, "step": 9051 }, { "epoch": 30.00581081081081, "grad_norm": 0.019679713994264603, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9052 }, { "epoch": 30.005844594594596, "grad_norm": 0.0016437125159427524, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9053 }, { "epoch": 30.005878378378377, "grad_norm": 0.0009549455135129392, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9054 }, { "epoch": 30.00591216216216, "grad_norm": 0.01925615593791008, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 9055 }, { "epoch": 30.005945945945946, "grad_norm": 0.10946860909461975, "learning_rate": 7.8125e-07, "loss": 0.004, "step": 9056 }, { "epoch": 30.00597972972973, "grad_norm": 0.009085891768336296, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9057 }, { "epoch": 30.006013513513512, "grad_norm": 68.65164947509766, "learning_rate": 7.8125e-07, "loss": 0.5287, "step": 9058 }, { "epoch": 30.006047297297297, "grad_norm": 0.0011270091636106372, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9059 }, { "epoch": 30.00608108108108, "grad_norm": 0.0017425833502784371, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9060 }, { "epoch": 30.006114864864866, "grad_norm": 0.012984851375222206, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 9061 }, { "epoch": 30.006148648648647, "grad_norm": 0.0011795159662142396, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9062 }, { "epoch": 30.006182432432432, "grad_norm": 0.06564789265394211, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 9063 }, { "epoch": 30.006216216216217, "grad_norm": 0.0029932756442576647, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9064 }, { "epoch": 30.00625, "grad_norm": 0.003696278901770711, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9065 }, { "epoch": 30.006283783783783, "grad_norm": 0.014284064061939716, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9066 }, { "epoch": 30.006317567567567, "grad_norm": 0.288346529006958, "learning_rate": 7.8125e-07, "loss": 0.0053, "step": 9067 }, { "epoch": 30.006351351351352, "grad_norm": 12.539471626281738, "learning_rate": 7.8125e-07, "loss": 0.2806, "step": 9068 }, { "epoch": 30.006385135135137, "grad_norm": 0.0021490168292075396, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9069 }, { "epoch": 30.006418918918918, "grad_norm": 0.06308476626873016, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 9070 }, { "epoch": 30.006452702702703, "grad_norm": 5.188417911529541, "learning_rate": 7.8125e-07, "loss": 0.2227, "step": 9071 }, { "epoch": 30.006486486486487, "grad_norm": 0.0032905240077525377, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9072 }, { "epoch": 30.006520270270272, "grad_norm": 0.006134359166026115, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9073 }, { "epoch": 30.006554054054053, "grad_norm": 13.17232894897461, "learning_rate": 7.8125e-07, "loss": 0.4814, "step": 9074 }, { "epoch": 30.006587837837838, "grad_norm": 0.001231303089298308, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9075 }, { "epoch": 30.006621621621623, "grad_norm": 0.0035022434312850237, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9076 }, { "epoch": 30.006655405405404, "grad_norm": 0.05679036304354668, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 9077 }, { "epoch": 30.00668918918919, "grad_norm": 0.021502679213881493, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 9078 }, { "epoch": 30.006722972972973, "grad_norm": 0.09223093837499619, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 9079 }, { "epoch": 30.006756756756758, "grad_norm": 0.001885513891465962, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9080 }, { "epoch": 30.00679054054054, "grad_norm": 0.0033530036453157663, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9081 }, { "epoch": 30.006824324324324, "grad_norm": 0.008407571353018284, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 9082 }, { "epoch": 30.00685810810811, "grad_norm": 0.0007708074408583343, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9083 }, { "epoch": 30.006891891891893, "grad_norm": 0.06387311965227127, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 9084 }, { "epoch": 30.006925675675674, "grad_norm": 0.0010207723826169968, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9085 }, { "epoch": 30.00695945945946, "grad_norm": 0.0016833713743835688, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9086 }, { "epoch": 30.006993243243244, "grad_norm": 0.021638628095388412, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 9087 }, { "epoch": 30.00702702702703, "grad_norm": 0.005397400353103876, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9088 }, { "epoch": 30.00706081081081, "grad_norm": 0.05612632632255554, "learning_rate": 7.8125e-07, "loss": 0.0006, "step": 9089 }, { "epoch": 30.007094594594594, "grad_norm": 0.009246242232620716, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9090 }, { "epoch": 30.00712837837838, "grad_norm": 45.74506759643555, "learning_rate": 7.8125e-07, "loss": 0.6012, "step": 9091 }, { "epoch": 30.007162162162164, "grad_norm": 0.0023956450168043375, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9092 }, { "epoch": 30.007195945945945, "grad_norm": 0.22265340387821198, "learning_rate": 7.8125e-07, "loss": 0.0016, "step": 9093 }, { "epoch": 30.00722972972973, "grad_norm": 0.017957136034965515, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 9094 }, { "epoch": 30.007263513513514, "grad_norm": 0.010187658481299877, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 9095 }, { "epoch": 30.007297297297296, "grad_norm": 0.11813904345035553, "learning_rate": 7.8125e-07, "loss": 0.0045, "step": 9096 }, { "epoch": 30.00733108108108, "grad_norm": 0.006071476731449366, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9097 }, { "epoch": 30.007364864864865, "grad_norm": 0.10426709055900574, "learning_rate": 7.8125e-07, "loss": 0.0039, "step": 9098 }, { "epoch": 30.00739864864865, "grad_norm": 0.11496708542108536, "learning_rate": 7.8125e-07, "loss": 0.0042, "step": 9099 }, { "epoch": 30.00743243243243, "grad_norm": 0.004429804161190987, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9100 }, { "epoch": 30.007466216216216, "grad_norm": 0.009348667226731777, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 9101 }, { "epoch": 30.0075, "grad_norm": 0.013394529931247234, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 9102 }, { "epoch": 30.007533783783785, "grad_norm": 0.0026820856146514416, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9103 }, { "epoch": 30.007567567567566, "grad_norm": 0.046078070998191833, "learning_rate": 7.8125e-07, "loss": 0.0007, "step": 9104 }, { "epoch": 30.00760135135135, "grad_norm": 0.0032173674553632736, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9105 }, { "epoch": 30.007635135135136, "grad_norm": 0.03135086968541145, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 9106 }, { "epoch": 30.00766891891892, "grad_norm": 0.000705726386513561, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9107 }, { "epoch": 30.0077027027027, "grad_norm": 0.002655017888173461, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9108 }, { "epoch": 30.007736486486486, "grad_norm": 0.012120711617171764, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 9109 }, { "epoch": 30.00777027027027, "grad_norm": 0.12264855206012726, "learning_rate": 7.8125e-07, "loss": 0.0025, "step": 9110 }, { "epoch": 30.007804054054056, "grad_norm": 0.017101317644119263, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 9111 }, { "epoch": 30.007837837837837, "grad_norm": 0.05622002109885216, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 9112 }, { "epoch": 30.00787162162162, "grad_norm": 63.422733306884766, "learning_rate": 7.8125e-07, "loss": 0.3284, "step": 9113 }, { "epoch": 30.007905405405406, "grad_norm": 0.35137858986854553, "learning_rate": 7.8125e-07, "loss": 0.0052, "step": 9114 }, { "epoch": 30.00793918918919, "grad_norm": 0.0020317428279668093, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9115 }, { "epoch": 30.007972972972972, "grad_norm": 0.006224033422768116, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9116 }, { "epoch": 30.008006756756757, "grad_norm": 1.3970184326171875, "learning_rate": 7.8125e-07, "loss": 0.0557, "step": 9117 }, { "epoch": 30.00804054054054, "grad_norm": 0.0009788793977349997, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9118 }, { "epoch": 30.008074324324323, "grad_norm": 0.1433604210615158, "learning_rate": 7.8125e-07, "loss": 0.001, "step": 9119 }, { "epoch": 30.008108108108107, "grad_norm": 0.005727655719965696, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9120 }, { "epoch": 30.008141891891892, "grad_norm": 0.0015073056565597653, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9121 }, { "epoch": 30.008175675675677, "grad_norm": 0.5958597660064697, "learning_rate": 7.8125e-07, "loss": 0.0073, "step": 9122 }, { "epoch": 30.008209459459458, "grad_norm": 0.0021061678417026997, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9123 }, { "epoch": 30.008243243243243, "grad_norm": 0.003646045457571745, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9124 }, { "epoch": 30.008277027027027, "grad_norm": 0.5455254912376404, "learning_rate": 7.8125e-07, "loss": 0.0051, "step": 9125 }, { "epoch": 30.008310810810812, "grad_norm": 41.75301742553711, "learning_rate": 7.8125e-07, "loss": 0.4738, "step": 9126 }, { "epoch": 30.008344594594593, "grad_norm": 0.0027943821623921394, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9127 }, { "epoch": 30.008378378378378, "grad_norm": 0.17947296798229218, "learning_rate": 7.8125e-07, "loss": 0.0005, "step": 9128 }, { "epoch": 30.008412162162163, "grad_norm": 0.005665949080139399, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9129 }, { "epoch": 30.008445945945947, "grad_norm": 18.966333389282227, "learning_rate": 7.8125e-07, "loss": 0.3433, "step": 9130 }, { "epoch": 30.00847972972973, "grad_norm": 0.0038549918681383133, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9131 }, { "epoch": 30.008513513513513, "grad_norm": 0.2934558689594269, "learning_rate": 7.8125e-07, "loss": 0.0072, "step": 9132 }, { "epoch": 30.008547297297298, "grad_norm": 0.00399030139669776, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9133 }, { "epoch": 30.008581081081083, "grad_norm": 0.00464828684926033, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9134 }, { "epoch": 30.008614864864864, "grad_norm": 0.10463131219148636, "learning_rate": 7.8125e-07, "loss": 0.0026, "step": 9135 }, { "epoch": 30.00864864864865, "grad_norm": 1.6280782222747803, "learning_rate": 7.8125e-07, "loss": 0.0067, "step": 9136 }, { "epoch": 30.008682432432433, "grad_norm": 0.002036353573203087, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9137 }, { "epoch": 30.008716216216218, "grad_norm": 0.15853947401046753, "learning_rate": 7.8125e-07, "loss": 0.0008, "step": 9138 }, { "epoch": 30.00875, "grad_norm": 20.484167098999023, "learning_rate": 7.8125e-07, "loss": 0.0561, "step": 9139 }, { "epoch": 30.008783783783784, "grad_norm": 0.028297292068600655, "learning_rate": 7.8125e-07, "loss": 0.0004, "step": 9140 }, { "epoch": 30.00881756756757, "grad_norm": 0.0033843915443867445, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9141 }, { "epoch": 30.00885135135135, "grad_norm": 0.0008640704327262938, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9142 }, { "epoch": 30.008885135135134, "grad_norm": 0.004747065715491772, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9143 }, { "epoch": 30.00891891891892, "grad_norm": 0.0028081119526177645, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9144 }, { "epoch": 30.008952702702704, "grad_norm": 0.01134367287158966, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9145 }, { "epoch": 30.008986486486485, "grad_norm": 0.0424593985080719, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 9146 }, { "epoch": 30.00902027027027, "grad_norm": 0.004332043696194887, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9147 }, { "epoch": 30.009054054054054, "grad_norm": 0.006238369736820459, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9148 }, { "epoch": 30.00908783783784, "grad_norm": 0.3049147129058838, "learning_rate": 7.8125e-07, "loss": 0.0038, "step": 9149 }, { "epoch": 30.00912162162162, "grad_norm": 0.0013622755650430918, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9150 }, { "epoch": 30.009155405405405, "grad_norm": 0.11500300467014313, "learning_rate": 7.8125e-07, "loss": 0.0041, "step": 9151 }, { "epoch": 30.00918918918919, "grad_norm": 5.492821216583252, "learning_rate": 7.8125e-07, "loss": 0.2686, "step": 9152 }, { "epoch": 30.009222972972974, "grad_norm": 0.00194693508092314, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9153 }, { "epoch": 30.009256756756756, "grad_norm": 0.001736170845106244, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9154 }, { "epoch": 30.00929054054054, "grad_norm": 0.12071294337511063, "learning_rate": 7.8125e-07, "loss": 0.0044, "step": 9155 }, { "epoch": 30.009324324324325, "grad_norm": 0.00104804546572268, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9156 }, { "epoch": 30.00935810810811, "grad_norm": 0.0026402482762932777, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9157 }, { "epoch": 30.00939189189189, "grad_norm": 0.002412491012364626, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9158 }, { "epoch": 30.009425675675676, "grad_norm": 0.0017977781826630235, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9159 }, { "epoch": 30.00945945945946, "grad_norm": 0.009026645682752132, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 9160 }, { "epoch": 30.00949324324324, "grad_norm": 0.1872081607580185, "learning_rate": 7.8125e-07, "loss": 0.0065, "step": 9161 }, { "epoch": 30.009527027027026, "grad_norm": 0.003908566664904356, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9162 }, { "epoch": 30.00956081081081, "grad_norm": 0.014997514896094799, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 9163 }, { "epoch": 30.009594594594596, "grad_norm": 0.0014515905641019344, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9164 }, { "epoch": 30.009628378378377, "grad_norm": 0.015535300597548485, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 9165 }, { "epoch": 30.00966216216216, "grad_norm": 64.78173065185547, "learning_rate": 7.8125e-07, "loss": 0.3941, "step": 9166 }, { "epoch": 30.009695945945946, "grad_norm": 0.00102890282869339, "learning_rate": 7.8125e-07, "loss": 0.0, "step": 9167 }, { "epoch": 30.00972972972973, "grad_norm": 3.4072513580322266, "learning_rate": 7.8125e-07, "loss": 0.4405, "step": 9168 }, { "epoch": 30.009763513513512, "grad_norm": 4.125339508056641, "learning_rate": 7.8125e-07, "loss": 0.4217, "step": 9169 }, { "epoch": 30.009797297297297, "grad_norm": 0.002074006712064147, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9170 }, { "epoch": 30.00983108108108, "grad_norm": 0.025476137176156044, "learning_rate": 7.8125e-07, "loss": 0.0003, "step": 9171 }, { "epoch": 30.009864864864866, "grad_norm": 0.0016071780119091272, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9172 }, { "epoch": 30.009898648648647, "grad_norm": 0.002657695673406124, "learning_rate": 7.8125e-07, "loss": 0.0001, "step": 9173 }, { "epoch": 30.009932432432432, "grad_norm": 2.1693663597106934, "learning_rate": 7.8125e-07, "loss": 0.0114, "step": 9174 }, { "epoch": 30.009966216216217, "grad_norm": 0.015064729377627373, "learning_rate": 7.8125e-07, "loss": 0.0002, "step": 9175 }, { "epoch": 30.01, "grad_norm": 7.497888088226318, "learning_rate": 7.8125e-07, "loss": 0.5734, "step": 9176 }, { "epoch": 30.01, "eval_accuracy": 0.8998384491114702, "eval_loss": 0.5723937153816223, "eval_runtime": 33.7841, "eval_samples_per_second": 18.322, "eval_steps_per_second": 2.309, "step": 9176 }, { "epoch": 31.000033783783785, "grad_norm": 0.12062273919582367, "learning_rate": 3.90625e-07, "loss": 0.0044, "step": 9177 }, { "epoch": 31.000067567567566, "grad_norm": 0.0704926997423172, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9178 }, { "epoch": 31.00010135135135, "grad_norm": 1.290972113609314, "learning_rate": 3.90625e-07, "loss": 0.01, "step": 9179 }, { "epoch": 31.000135135135135, "grad_norm": 21.46822738647461, "learning_rate": 3.90625e-07, "loss": 0.1697, "step": 9180 }, { "epoch": 31.00016891891892, "grad_norm": 5.510542869567871, "learning_rate": 3.90625e-07, "loss": 0.3044, "step": 9181 }, { "epoch": 31.0002027027027, "grad_norm": 0.2782764136791229, "learning_rate": 3.90625e-07, "loss": 0.0015, "step": 9182 }, { "epoch": 31.000236486486486, "grad_norm": 0.0016059011686593294, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9183 }, { "epoch": 31.00027027027027, "grad_norm": 0.11473336070775986, "learning_rate": 3.90625e-07, "loss": 0.0036, "step": 9184 }, { "epoch": 31.000304054054055, "grad_norm": 0.016155777499079704, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9185 }, { "epoch": 31.000337837837836, "grad_norm": 0.0021501213777810335, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9186 }, { "epoch": 31.00037162162162, "grad_norm": 0.00281824660487473, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9187 }, { "epoch": 31.000405405405406, "grad_norm": 0.09886414557695389, "learning_rate": 3.90625e-07, "loss": 0.0011, "step": 9188 }, { "epoch": 31.00043918918919, "grad_norm": 0.0008662266773171723, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9189 }, { "epoch": 31.00047297297297, "grad_norm": 0.15696710348129272, "learning_rate": 3.90625e-07, "loss": 0.0057, "step": 9190 }, { "epoch": 31.000506756756756, "grad_norm": 0.0683402493596077, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9191 }, { "epoch": 31.00054054054054, "grad_norm": 0.0775066688656807, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9192 }, { "epoch": 31.000574324324326, "grad_norm": 0.0011195873375982046, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9193 }, { "epoch": 31.000608108108107, "grad_norm": 4.9952898025512695, "learning_rate": 3.90625e-07, "loss": 0.4264, "step": 9194 }, { "epoch": 31.000641891891892, "grad_norm": 0.001199772348627448, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9195 }, { "epoch": 31.000675675675677, "grad_norm": 0.001008966937661171, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9196 }, { "epoch": 31.00070945945946, "grad_norm": 7.0443925857543945, "learning_rate": 3.90625e-07, "loss": 0.1063, "step": 9197 }, { "epoch": 31.000743243243242, "grad_norm": 0.0032396975439041853, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9198 }, { "epoch": 31.000777027027027, "grad_norm": 0.007005321327596903, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9199 }, { "epoch": 31.000810810810812, "grad_norm": 0.014267923310399055, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9200 }, { "epoch": 31.000844594594593, "grad_norm": 0.07803310453891754, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9201 }, { "epoch": 31.000878378378378, "grad_norm": 0.0007240683771669865, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9202 }, { "epoch": 31.000912162162162, "grad_norm": 0.0020248673390597105, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9203 }, { "epoch": 31.000945945945947, "grad_norm": 0.3023335635662079, "learning_rate": 3.90625e-07, "loss": 0.0014, "step": 9204 }, { "epoch": 31.00097972972973, "grad_norm": 0.0006591816782020032, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9205 }, { "epoch": 31.001013513513513, "grad_norm": 0.02237439900636673, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9206 }, { "epoch": 31.001047297297298, "grad_norm": 0.0020327670499682426, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9207 }, { "epoch": 31.001081081081082, "grad_norm": 0.00617239298298955, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9208 }, { "epoch": 31.001114864864864, "grad_norm": 0.008259447291493416, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9209 }, { "epoch": 31.00114864864865, "grad_norm": 0.02137022651731968, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9210 }, { "epoch": 31.001182432432433, "grad_norm": 1.7186874151229858, "learning_rate": 3.90625e-07, "loss": 0.0406, "step": 9211 }, { "epoch": 31.001216216216218, "grad_norm": 0.016918519511818886, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9212 }, { "epoch": 31.00125, "grad_norm": 0.0009832114446908236, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9213 }, { "epoch": 31.001283783783784, "grad_norm": 0.0013444466749206185, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9214 }, { "epoch": 31.00131756756757, "grad_norm": 0.14324185252189636, "learning_rate": 3.90625e-07, "loss": 0.0048, "step": 9215 }, { "epoch": 31.001351351351353, "grad_norm": 3.671318531036377, "learning_rate": 3.90625e-07, "loss": 0.397, "step": 9216 }, { "epoch": 31.001385135135134, "grad_norm": 0.010924817062914371, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9217 }, { "epoch": 31.00141891891892, "grad_norm": 0.0021504417527467012, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9218 }, { "epoch": 31.001452702702704, "grad_norm": 0.0011558446567505598, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9219 }, { "epoch": 31.001486486486485, "grad_norm": 0.006982241291552782, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9220 }, { "epoch": 31.00152027027027, "grad_norm": 0.0015494015533477068, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9221 }, { "epoch": 31.001554054054054, "grad_norm": 0.0014352311845868826, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9222 }, { "epoch": 31.00158783783784, "grad_norm": 12.446813583374023, "learning_rate": 3.90625e-07, "loss": 0.0276, "step": 9223 }, { "epoch": 31.00162162162162, "grad_norm": 0.005796482786536217, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9224 }, { "epoch": 31.001655405405405, "grad_norm": 0.002677489537745714, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9225 }, { "epoch": 31.00168918918919, "grad_norm": 0.004516758024692535, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9226 }, { "epoch": 31.001722972972974, "grad_norm": 0.11692984402179718, "learning_rate": 3.90625e-07, "loss": 0.0028, "step": 9227 }, { "epoch": 31.001756756756755, "grad_norm": 4.213637828826904, "learning_rate": 3.90625e-07, "loss": 0.0093, "step": 9228 }, { "epoch": 31.00179054054054, "grad_norm": 0.003504224354401231, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9229 }, { "epoch": 31.001824324324325, "grad_norm": 0.019071463495492935, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9230 }, { "epoch": 31.00185810810811, "grad_norm": 5.786520481109619, "learning_rate": 3.90625e-07, "loss": 0.0294, "step": 9231 }, { "epoch": 31.00189189189189, "grad_norm": 1.2649413347244263, "learning_rate": 3.90625e-07, "loss": 0.0083, "step": 9232 }, { "epoch": 31.001925675675675, "grad_norm": 102.94598388671875, "learning_rate": 3.90625e-07, "loss": 0.6994, "step": 9233 }, { "epoch": 31.00195945945946, "grad_norm": 0.002943481085821986, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9234 }, { "epoch": 31.001993243243245, "grad_norm": 9.69936752319336, "learning_rate": 3.90625e-07, "loss": 0.1356, "step": 9235 }, { "epoch": 31.002027027027026, "grad_norm": 0.0009880631696432829, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9236 }, { "epoch": 31.00206081081081, "grad_norm": 0.019801709800958633, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9237 }, { "epoch": 31.002094594594595, "grad_norm": 0.014161431230604649, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9238 }, { "epoch": 31.00212837837838, "grad_norm": 0.0012866872129961848, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9239 }, { "epoch": 31.00216216216216, "grad_norm": 0.0016118354396894574, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9240 }, { "epoch": 31.002195945945946, "grad_norm": 0.008074803277850151, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9241 }, { "epoch": 31.00222972972973, "grad_norm": 3.625267505645752, "learning_rate": 3.90625e-07, "loss": 0.2399, "step": 9242 }, { "epoch": 31.002263513513512, "grad_norm": 1.704896092414856, "learning_rate": 3.90625e-07, "loss": 0.0435, "step": 9243 }, { "epoch": 31.002297297297297, "grad_norm": 0.0015668263658881187, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9244 }, { "epoch": 31.00233108108108, "grad_norm": 0.001216657692566514, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9245 }, { "epoch": 31.002364864864866, "grad_norm": 0.0028517236933112144, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9246 }, { "epoch": 31.002398648648647, "grad_norm": 0.0051203761249780655, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9247 }, { "epoch": 31.002432432432432, "grad_norm": 0.5414369702339172, "learning_rate": 3.90625e-07, "loss": 0.0077, "step": 9248 }, { "epoch": 31.002466216216217, "grad_norm": 0.0011465783463791013, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9249 }, { "epoch": 31.0025, "grad_norm": 32.16463851928711, "learning_rate": 3.90625e-07, "loss": 0.0676, "step": 9250 }, { "epoch": 31.002533783783782, "grad_norm": 0.0034626310225576162, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9251 }, { "epoch": 31.002567567567567, "grad_norm": 0.001614019856788218, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9252 }, { "epoch": 31.002601351351352, "grad_norm": 0.0008910192991606891, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9253 }, { "epoch": 31.002635135135137, "grad_norm": 0.1097269132733345, "learning_rate": 3.90625e-07, "loss": 0.0041, "step": 9254 }, { "epoch": 31.002668918918918, "grad_norm": 0.0016641177935525775, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9255 }, { "epoch": 31.002702702702702, "grad_norm": 0.12306687980890274, "learning_rate": 3.90625e-07, "loss": 0.0046, "step": 9256 }, { "epoch": 31.002736486486487, "grad_norm": 0.0019316032994538546, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9257 }, { "epoch": 31.002770270270272, "grad_norm": 5.105460166931152, "learning_rate": 3.90625e-07, "loss": 0.2276, "step": 9258 }, { "epoch": 31.002804054054053, "grad_norm": 0.0200805626809597, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9259 }, { "epoch": 31.002837837837838, "grad_norm": 0.009830418974161148, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9260 }, { "epoch": 31.002871621621622, "grad_norm": 0.04089904949069023, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9261 }, { "epoch": 31.002905405405407, "grad_norm": 0.0660836398601532, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9262 }, { "epoch": 31.00293918918919, "grad_norm": 0.0019294078228995204, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9263 }, { "epoch": 31.002972972972973, "grad_norm": 0.0020439319778233767, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9264 }, { "epoch": 31.003006756756758, "grad_norm": 0.005475515499711037, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9265 }, { "epoch": 31.00304054054054, "grad_norm": 4.389070987701416, "learning_rate": 3.90625e-07, "loss": 0.3001, "step": 9266 }, { "epoch": 31.003074324324324, "grad_norm": 0.0014450278831645846, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9267 }, { "epoch": 31.00310810810811, "grad_norm": 0.0012159050675109029, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9268 }, { "epoch": 31.003141891891893, "grad_norm": 1.1046342849731445, "learning_rate": 3.90625e-07, "loss": 0.0088, "step": 9269 }, { "epoch": 31.003175675675674, "grad_norm": 0.0007012280402705073, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9270 }, { "epoch": 31.00320945945946, "grad_norm": 0.006510869599878788, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9271 }, { "epoch": 31.003243243243244, "grad_norm": 0.0011295778676867485, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9272 }, { "epoch": 31.00327702702703, "grad_norm": 0.003423062153160572, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9273 }, { "epoch": 31.00331081081081, "grad_norm": 0.12285082042217255, "learning_rate": 3.90625e-07, "loss": 0.003, "step": 9274 }, { "epoch": 31.003344594594594, "grad_norm": 0.1714467704296112, "learning_rate": 3.90625e-07, "loss": 0.0048, "step": 9275 }, { "epoch": 31.00337837837838, "grad_norm": 0.028042593970894814, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9276 }, { "epoch": 31.003412162162164, "grad_norm": 0.0035552787594497204, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9277 }, { "epoch": 31.003445945945945, "grad_norm": 0.1392991989850998, "learning_rate": 3.90625e-07, "loss": 0.005, "step": 9278 }, { "epoch": 31.00347972972973, "grad_norm": 0.0058606774546206, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9279 }, { "epoch": 31.003513513513514, "grad_norm": 0.001485844375565648, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9280 }, { "epoch": 31.0035472972973, "grad_norm": 0.0008838606881909072, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9281 }, { "epoch": 31.00358108108108, "grad_norm": 0.006729734595865011, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9282 }, { "epoch": 31.003614864864865, "grad_norm": 0.13441291451454163, "learning_rate": 3.90625e-07, "loss": 0.001, "step": 9283 }, { "epoch": 31.00364864864865, "grad_norm": 0.3090386390686035, "learning_rate": 3.90625e-07, "loss": 0.0078, "step": 9284 }, { "epoch": 31.00368243243243, "grad_norm": 0.039324987679719925, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9285 }, { "epoch": 31.003716216216215, "grad_norm": 0.847335696220398, "learning_rate": 3.90625e-07, "loss": 0.0145, "step": 9286 }, { "epoch": 31.00375, "grad_norm": 22.550504684448242, "learning_rate": 3.90625e-07, "loss": 0.409, "step": 9287 }, { "epoch": 31.003783783783785, "grad_norm": 0.002039635321125388, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9288 }, { "epoch": 31.003817567567566, "grad_norm": 0.024556446820497513, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9289 }, { "epoch": 31.00385135135135, "grad_norm": 0.9418559670448303, "learning_rate": 3.90625e-07, "loss": 0.0256, "step": 9290 }, { "epoch": 31.003885135135135, "grad_norm": 0.11331257969141006, "learning_rate": 3.90625e-07, "loss": 0.0042, "step": 9291 }, { "epoch": 31.00391891891892, "grad_norm": 0.009340167976915836, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9292 }, { "epoch": 31.0039527027027, "grad_norm": 0.23227491974830627, "learning_rate": 3.90625e-07, "loss": 0.0087, "step": 9293 }, { "epoch": 31.003986486486486, "grad_norm": 0.6921828389167786, "learning_rate": 3.90625e-07, "loss": 0.0081, "step": 9294 }, { "epoch": 31.00402027027027, "grad_norm": 0.0037881960161030293, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9295 }, { "epoch": 31.004054054054055, "grad_norm": 0.0013588782167062163, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9296 }, { "epoch": 31.004087837837837, "grad_norm": 0.0028355007525533438, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9297 }, { "epoch": 31.00412162162162, "grad_norm": 0.033847875893116, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9298 }, { "epoch": 31.004155405405406, "grad_norm": 0.017551742494106293, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9299 }, { "epoch": 31.00418918918919, "grad_norm": 0.0015750096645206213, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9300 }, { "epoch": 31.004222972972972, "grad_norm": 0.003187932539731264, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9301 }, { "epoch": 31.004256756756757, "grad_norm": 0.006936998572200537, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9302 }, { "epoch": 31.00429054054054, "grad_norm": 0.14129064977169037, "learning_rate": 3.90625e-07, "loss": 0.0019, "step": 9303 }, { "epoch": 31.004324324324326, "grad_norm": 0.10773032903671265, "learning_rate": 3.90625e-07, "loss": 0.0039, "step": 9304 }, { "epoch": 31.004358108108107, "grad_norm": 0.003042197786271572, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9305 }, { "epoch": 31.004391891891892, "grad_norm": 0.0016100388020277023, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9306 }, { "epoch": 31.004425675675677, "grad_norm": 0.002607988193631172, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9307 }, { "epoch": 31.004459459459458, "grad_norm": 0.00506186718121171, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9308 }, { "epoch": 31.004493243243243, "grad_norm": 0.0006651509320363402, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9309 }, { "epoch": 31.004527027027027, "grad_norm": 0.9566026926040649, "learning_rate": 3.90625e-07, "loss": 0.0097, "step": 9310 }, { "epoch": 31.004560810810812, "grad_norm": 0.08696091175079346, "learning_rate": 3.90625e-07, "loss": 0.0023, "step": 9311 }, { "epoch": 31.004594594594593, "grad_norm": 5.168580532073975, "learning_rate": 3.90625e-07, "loss": 0.0294, "step": 9312 }, { "epoch": 31.004628378378378, "grad_norm": 0.3842124938964844, "learning_rate": 3.90625e-07, "loss": 0.0014, "step": 9313 }, { "epoch": 31.004662162162163, "grad_norm": 0.0016404411289840937, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9314 }, { "epoch": 31.004695945945947, "grad_norm": 0.08906096965074539, "learning_rate": 3.90625e-07, "loss": 0.0007, "step": 9315 }, { "epoch": 31.00472972972973, "grad_norm": 0.0029870893340557814, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9316 }, { "epoch": 31.004763513513513, "grad_norm": 0.04422498121857643, "learning_rate": 3.90625e-07, "loss": 0.0006, "step": 9317 }, { "epoch": 31.004797297297298, "grad_norm": 0.0013349694199860096, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9318 }, { "epoch": 31.004831081081083, "grad_norm": 0.0031533841975033283, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9319 }, { "epoch": 31.004864864864864, "grad_norm": 0.005823161453008652, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9320 }, { "epoch": 31.00489864864865, "grad_norm": 0.06334375590085983, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9321 }, { "epoch": 31.004932432432433, "grad_norm": 0.0033029646147042513, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9322 }, { "epoch": 31.004966216216218, "grad_norm": 0.0032323470804840326, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9323 }, { "epoch": 31.005, "grad_norm": 0.003157257568091154, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9324 }, { "epoch": 31.005033783783784, "grad_norm": 0.20321938395500183, "learning_rate": 3.90625e-07, "loss": 0.0076, "step": 9325 }, { "epoch": 31.00506756756757, "grad_norm": 0.023793229833245277, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9326 }, { "epoch": 31.00510135135135, "grad_norm": 0.17602458596229553, "learning_rate": 3.90625e-07, "loss": 0.0009, "step": 9327 }, { "epoch": 31.005135135135134, "grad_norm": 0.03464645892381668, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9328 }, { "epoch": 31.00516891891892, "grad_norm": 0.0017866560956463218, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9329 }, { "epoch": 31.005202702702704, "grad_norm": 0.005592228379100561, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9330 }, { "epoch": 31.005236486486485, "grad_norm": 0.0068532428704202175, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9331 }, { "epoch": 31.00527027027027, "grad_norm": 0.006568668410181999, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9332 }, { "epoch": 31.005304054054054, "grad_norm": 0.0011773144360631704, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9333 }, { "epoch": 31.00533783783784, "grad_norm": 0.0031445897184312344, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9334 }, { "epoch": 31.00537162162162, "grad_norm": 0.0021737401839345694, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9335 }, { "epoch": 31.005405405405405, "grad_norm": 0.009354839101433754, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9336 }, { "epoch": 31.00543918918919, "grad_norm": 0.22150616347789764, "learning_rate": 3.90625e-07, "loss": 0.001, "step": 9337 }, { "epoch": 31.005472972972974, "grad_norm": 0.009372963570058346, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9338 }, { "epoch": 31.005506756756755, "grad_norm": 0.0036765034310519695, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9339 }, { "epoch": 31.00554054054054, "grad_norm": 0.001567663042806089, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9340 }, { "epoch": 31.005574324324325, "grad_norm": 0.00835026428103447, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9341 }, { "epoch": 31.00560810810811, "grad_norm": 0.2580430507659912, "learning_rate": 3.90625e-07, "loss": 0.0011, "step": 9342 }, { "epoch": 31.00564189189189, "grad_norm": 0.0033934484235942364, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9343 }, { "epoch": 31.005675675675676, "grad_norm": 0.09612954407930374, "learning_rate": 3.90625e-07, "loss": 0.0014, "step": 9344 }, { "epoch": 31.00570945945946, "grad_norm": 0.01306427177041769, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9345 }, { "epoch": 31.005743243243245, "grad_norm": 58.749454498291016, "learning_rate": 3.90625e-07, "loss": 0.605, "step": 9346 }, { "epoch": 31.005777027027026, "grad_norm": 0.022769786417484283, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9347 }, { "epoch": 31.00581081081081, "grad_norm": 0.4459139406681061, "learning_rate": 3.90625e-07, "loss": 0.0091, "step": 9348 }, { "epoch": 31.005844594594596, "grad_norm": 0.11036156117916107, "learning_rate": 3.90625e-07, "loss": 0.0035, "step": 9349 }, { "epoch": 31.005878378378377, "grad_norm": 0.00396339688450098, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9350 }, { "epoch": 31.00591216216216, "grad_norm": 0.06781303137540817, "learning_rate": 3.90625e-07, "loss": 0.0007, "step": 9351 }, { "epoch": 31.005945945945946, "grad_norm": 0.0007296741241589189, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9352 }, { "epoch": 31.00597972972973, "grad_norm": 0.10401620715856552, "learning_rate": 3.90625e-07, "loss": 0.0006, "step": 9353 }, { "epoch": 31.006013513513512, "grad_norm": 0.008407593704760075, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9354 }, { "epoch": 31.006047297297297, "grad_norm": 0.0033261198550462723, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9355 }, { "epoch": 31.00608108108108, "grad_norm": 0.008202418684959412, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9356 }, { "epoch": 31.006114864864866, "grad_norm": 0.14407777786254883, "learning_rate": 3.90625e-07, "loss": 0.0052, "step": 9357 }, { "epoch": 31.006148648648647, "grad_norm": 0.002076597884297371, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9358 }, { "epoch": 31.006182432432432, "grad_norm": 0.01476961374282837, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9359 }, { "epoch": 31.006216216216217, "grad_norm": 0.000723213714081794, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9360 }, { "epoch": 31.00625, "grad_norm": 0.0036701106000691652, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9361 }, { "epoch": 31.006283783783783, "grad_norm": 0.061991192400455475, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9362 }, { "epoch": 31.006317567567567, "grad_norm": 0.0015978720039129257, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9363 }, { "epoch": 31.006351351351352, "grad_norm": 0.015487201511859894, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9364 }, { "epoch": 31.006385135135137, "grad_norm": 0.0028769359923899174, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9365 }, { "epoch": 31.006418918918918, "grad_norm": 7.16277551651001, "learning_rate": 3.90625e-07, "loss": 0.1002, "step": 9366 }, { "epoch": 31.006452702702703, "grad_norm": 0.00622670678421855, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9367 }, { "epoch": 31.006486486486487, "grad_norm": 0.0013847614172846079, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9368 }, { "epoch": 31.006520270270272, "grad_norm": 0.001516428543254733, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9369 }, { "epoch": 31.006554054054053, "grad_norm": 0.003981363959610462, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9370 }, { "epoch": 31.006587837837838, "grad_norm": 3.552330493927002, "learning_rate": 3.90625e-07, "loss": 0.0168, "step": 9371 }, { "epoch": 31.006621621621623, "grad_norm": 0.0009265934349969029, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9372 }, { "epoch": 31.006655405405404, "grad_norm": 0.14912985265254974, "learning_rate": 3.90625e-07, "loss": 0.0012, "step": 9373 }, { "epoch": 31.00668918918919, "grad_norm": 1.4464222192764282, "learning_rate": 3.90625e-07, "loss": 0.0339, "step": 9374 }, { "epoch": 31.006722972972973, "grad_norm": 0.04382207244634628, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9375 }, { "epoch": 31.006756756756758, "grad_norm": 0.01420971006155014, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9376 }, { "epoch": 31.00679054054054, "grad_norm": 0.0032965538557618856, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9377 }, { "epoch": 31.006824324324324, "grad_norm": 0.03144202008843422, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9378 }, { "epoch": 31.00685810810811, "grad_norm": 0.009471242316067219, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9379 }, { "epoch": 31.006891891891893, "grad_norm": 18.538738250732422, "learning_rate": 3.90625e-07, "loss": 0.0559, "step": 9380 }, { "epoch": 31.006925675675674, "grad_norm": 0.007945306599140167, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9381 }, { "epoch": 31.00695945945946, "grad_norm": 0.4279113709926605, "learning_rate": 3.90625e-07, "loss": 0.0093, "step": 9382 }, { "epoch": 31.006993243243244, "grad_norm": 0.17385785281658173, "learning_rate": 3.90625e-07, "loss": 0.0061, "step": 9383 }, { "epoch": 31.00702702702703, "grad_norm": 0.030541501939296722, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9384 }, { "epoch": 31.00706081081081, "grad_norm": 36.61445236206055, "learning_rate": 3.90625e-07, "loss": 0.1405, "step": 9385 }, { "epoch": 31.007094594594594, "grad_norm": 0.0035646874457597733, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9386 }, { "epoch": 31.00712837837838, "grad_norm": 10.994304656982422, "learning_rate": 3.90625e-07, "loss": 0.3022, "step": 9387 }, { "epoch": 31.007162162162164, "grad_norm": 0.002948685549199581, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9388 }, { "epoch": 31.007195945945945, "grad_norm": 0.0014900354435667396, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9389 }, { "epoch": 31.00722972972973, "grad_norm": 1.0572102069854736, "learning_rate": 3.90625e-07, "loss": 0.009, "step": 9390 }, { "epoch": 31.007263513513514, "grad_norm": 0.003301712218672037, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9391 }, { "epoch": 31.007297297297296, "grad_norm": 0.002151878084987402, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9392 }, { "epoch": 31.00733108108108, "grad_norm": 0.2982064187526703, "learning_rate": 3.90625e-07, "loss": 0.005, "step": 9393 }, { "epoch": 31.007364864864865, "grad_norm": 0.0009510366362519562, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9394 }, { "epoch": 31.00739864864865, "grad_norm": 0.020483853295445442, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9395 }, { "epoch": 31.00743243243243, "grad_norm": 0.11359057575464249, "learning_rate": 3.90625e-07, "loss": 0.0033, "step": 9396 }, { "epoch": 31.007466216216216, "grad_norm": 0.001675840700045228, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9397 }, { "epoch": 31.0075, "grad_norm": 0.10770375281572342, "learning_rate": 3.90625e-07, "loss": 0.0018, "step": 9398 }, { "epoch": 31.007533783783785, "grad_norm": 0.1766415387392044, "learning_rate": 3.90625e-07, "loss": 0.0048, "step": 9399 }, { "epoch": 31.007567567567566, "grad_norm": 68.52694702148438, "learning_rate": 3.90625e-07, "loss": 0.4847, "step": 9400 }, { "epoch": 31.00760135135135, "grad_norm": 0.0023491745814681053, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9401 }, { "epoch": 31.007635135135136, "grad_norm": 0.0032700232695788145, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9402 }, { "epoch": 31.00766891891892, "grad_norm": 0.5790817737579346, "learning_rate": 3.90625e-07, "loss": 0.0118, "step": 9403 }, { "epoch": 31.0077027027027, "grad_norm": 0.042526621371507645, "learning_rate": 3.90625e-07, "loss": 0.0005, "step": 9404 }, { "epoch": 31.007736486486486, "grad_norm": 0.0016388599760830402, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9405 }, { "epoch": 31.00777027027027, "grad_norm": 0.0827367827296257, "learning_rate": 3.90625e-07, "loss": 0.0008, "step": 9406 }, { "epoch": 31.007804054054056, "grad_norm": 0.001966262934729457, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9407 }, { "epoch": 31.007837837837837, "grad_norm": 22.94204330444336, "learning_rate": 3.90625e-07, "loss": 0.4152, "step": 9408 }, { "epoch": 31.00787162162162, "grad_norm": 0.30803602933883667, "learning_rate": 3.90625e-07, "loss": 0.0011, "step": 9409 }, { "epoch": 31.007905405405406, "grad_norm": 0.010737047530710697, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9410 }, { "epoch": 31.00793918918919, "grad_norm": 0.026248600333929062, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9411 }, { "epoch": 31.007972972972972, "grad_norm": 0.0059824674390256405, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9412 }, { "epoch": 31.008006756756757, "grad_norm": 0.628627359867096, "learning_rate": 3.90625e-07, "loss": 0.0051, "step": 9413 }, { "epoch": 31.00804054054054, "grad_norm": 0.002364882966503501, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9414 }, { "epoch": 31.008074324324323, "grad_norm": 0.03952853009104729, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9415 }, { "epoch": 31.008108108108107, "grad_norm": 3.6591498851776123, "learning_rate": 3.90625e-07, "loss": 0.1056, "step": 9416 }, { "epoch": 31.008141891891892, "grad_norm": 0.0016080152709037066, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9417 }, { "epoch": 31.008175675675677, "grad_norm": 0.003682411275804043, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9418 }, { "epoch": 31.008209459459458, "grad_norm": 0.009641002863645554, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9419 }, { "epoch": 31.008243243243243, "grad_norm": 1.0185256004333496, "learning_rate": 3.90625e-07, "loss": 0.0048, "step": 9420 }, { "epoch": 31.008277027027027, "grad_norm": 0.0010373081313446164, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9421 }, { "epoch": 31.008310810810812, "grad_norm": 0.002689441666007042, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9422 }, { "epoch": 31.008344594594593, "grad_norm": 0.002849054289981723, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9423 }, { "epoch": 31.008378378378378, "grad_norm": 0.004261093679815531, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9424 }, { "epoch": 31.008412162162163, "grad_norm": 0.005542641039937735, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9425 }, { "epoch": 31.008445945945947, "grad_norm": 0.010722734965384007, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9426 }, { "epoch": 31.00847972972973, "grad_norm": 0.0008336888859048486, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9427 }, { "epoch": 31.008513513513513, "grad_norm": 0.0014935669023543596, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9428 }, { "epoch": 31.008547297297298, "grad_norm": 0.0007980698719620705, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9429 }, { "epoch": 31.008581081081083, "grad_norm": 12.712112426757812, "learning_rate": 3.90625e-07, "loss": 0.2197, "step": 9430 }, { "epoch": 31.008614864864864, "grad_norm": 0.008685282431542873, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9431 }, { "epoch": 31.00864864864865, "grad_norm": 20.013111114501953, "learning_rate": 3.90625e-07, "loss": 0.0408, "step": 9432 }, { "epoch": 31.008682432432433, "grad_norm": 0.008797504007816315, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9433 }, { "epoch": 31.008716216216218, "grad_norm": 0.11590151488780975, "learning_rate": 3.90625e-07, "loss": 0.004, "step": 9434 }, { "epoch": 31.00875, "grad_norm": 0.07053329050540924, "learning_rate": 3.90625e-07, "loss": 0.0007, "step": 9435 }, { "epoch": 31.008783783783784, "grad_norm": 0.03188053146004677, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9436 }, { "epoch": 31.00881756756757, "grad_norm": 0.00116692204028368, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9437 }, { "epoch": 31.00885135135135, "grad_norm": 0.002915727673098445, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9438 }, { "epoch": 31.008885135135134, "grad_norm": 0.13329270482063293, "learning_rate": 3.90625e-07, "loss": 0.0048, "step": 9439 }, { "epoch": 31.00891891891892, "grad_norm": 1.9855473041534424, "learning_rate": 3.90625e-07, "loss": 0.0052, "step": 9440 }, { "epoch": 31.008952702702704, "grad_norm": 0.02093510702252388, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9441 }, { "epoch": 31.008986486486485, "grad_norm": 1.4995094537734985, "learning_rate": 3.90625e-07, "loss": 0.0428, "step": 9442 }, { "epoch": 31.00902027027027, "grad_norm": 0.0127288568764925, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9443 }, { "epoch": 31.009054054054054, "grad_norm": 0.12733259797096252, "learning_rate": 3.90625e-07, "loss": 0.0029, "step": 9444 }, { "epoch": 31.00908783783784, "grad_norm": 0.08926000446081161, "learning_rate": 3.90625e-07, "loss": 0.0019, "step": 9445 }, { "epoch": 31.00912162162162, "grad_norm": 0.04660738259553909, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9446 }, { "epoch": 31.009155405405405, "grad_norm": 0.0012849875492975116, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9447 }, { "epoch": 31.00918918918919, "grad_norm": 0.010670413263142109, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9448 }, { "epoch": 31.009222972972974, "grad_norm": 0.017401572316884995, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9449 }, { "epoch": 31.009256756756756, "grad_norm": 0.012739088386297226, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9450 }, { "epoch": 31.00929054054054, "grad_norm": 0.0010227601742371917, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9451 }, { "epoch": 31.009324324324325, "grad_norm": 0.005544479936361313, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9452 }, { "epoch": 31.00935810810811, "grad_norm": 0.11610601097345352, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9453 }, { "epoch": 31.00939189189189, "grad_norm": 1.216054081916809, "learning_rate": 3.90625e-07, "loss": 0.0241, "step": 9454 }, { "epoch": 31.009425675675676, "grad_norm": 0.10897663980722427, "learning_rate": 3.90625e-07, "loss": 0.0036, "step": 9455 }, { "epoch": 31.00945945945946, "grad_norm": 0.0030950712971389294, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9456 }, { "epoch": 31.00949324324324, "grad_norm": 0.05648837611079216, "learning_rate": 3.90625e-07, "loss": 0.0007, "step": 9457 }, { "epoch": 31.009527027027026, "grad_norm": 0.059358853846788406, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9458 }, { "epoch": 31.00956081081081, "grad_norm": 0.042693156749010086, "learning_rate": 3.90625e-07, "loss": 0.0005, "step": 9459 }, { "epoch": 31.009594594594596, "grad_norm": 0.0018028862541541457, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9460 }, { "epoch": 31.009628378378377, "grad_norm": 0.0006075300625525415, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9461 }, { "epoch": 31.00966216216216, "grad_norm": 0.004811047110706568, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9462 }, { "epoch": 31.009695945945946, "grad_norm": 0.08320144563913345, "learning_rate": 3.90625e-07, "loss": 0.0005, "step": 9463 }, { "epoch": 31.00972972972973, "grad_norm": 0.006850205361843109, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9464 }, { "epoch": 31.009763513513512, "grad_norm": 0.04076218232512474, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9465 }, { "epoch": 31.009797297297297, "grad_norm": 0.10387708246707916, "learning_rate": 3.90625e-07, "loss": 0.0017, "step": 9466 }, { "epoch": 31.00983108108108, "grad_norm": 4.3336873054504395, "learning_rate": 3.90625e-07, "loss": 0.2102, "step": 9467 }, { "epoch": 31.009864864864866, "grad_norm": 0.052482929080724716, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9468 }, { "epoch": 31.009898648648647, "grad_norm": 0.4456363022327423, "learning_rate": 3.90625e-07, "loss": 0.0008, "step": 9469 }, { "epoch": 31.009932432432432, "grad_norm": 0.05603805556893349, "learning_rate": 3.90625e-07, "loss": 0.0012, "step": 9470 }, { "epoch": 31.009966216216217, "grad_norm": 0.0033961765002459288, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9471 }, { "epoch": 31.01, "grad_norm": 0.0038130637258291245, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9472 }, { "epoch": 31.01, "eval_accuracy": 0.9046849757673667, "eval_loss": 0.5558223128318787, "eval_runtime": 33.5632, "eval_samples_per_second": 18.443, "eval_steps_per_second": 2.324, "step": 9472 }, { "epoch": 32.000033783783785, "grad_norm": 0.005827654153108597, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9473 }, { "epoch": 32.00006756756757, "grad_norm": 0.000868618197273463, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9474 }, { "epoch": 32.000101351351354, "grad_norm": 0.0020961621776223183, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9475 }, { "epoch": 32.00013513513513, "grad_norm": 1.0977612733840942, "learning_rate": 3.90625e-07, "loss": 0.0102, "step": 9476 }, { "epoch": 32.00016891891892, "grad_norm": 0.12741446495056152, "learning_rate": 3.90625e-07, "loss": 0.0044, "step": 9477 }, { "epoch": 32.0002027027027, "grad_norm": 0.18835854530334473, "learning_rate": 3.90625e-07, "loss": 0.0012, "step": 9478 }, { "epoch": 32.000236486486486, "grad_norm": 0.003088991856202483, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9479 }, { "epoch": 32.00027027027027, "grad_norm": 0.0044685592874884605, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9480 }, { "epoch": 32.000304054054055, "grad_norm": 2.4425547122955322, "learning_rate": 3.90625e-07, "loss": 0.0044, "step": 9481 }, { "epoch": 32.00033783783784, "grad_norm": 0.005827763117849827, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9482 }, { "epoch": 32.000371621621625, "grad_norm": 0.11457671970129013, "learning_rate": 3.90625e-07, "loss": 0.0042, "step": 9483 }, { "epoch": 32.0004054054054, "grad_norm": 0.0020029954612255096, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9484 }, { "epoch": 32.00043918918919, "grad_norm": 0.0031156742479652166, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9485 }, { "epoch": 32.00047297297297, "grad_norm": 0.28048861026763916, "learning_rate": 3.90625e-07, "loss": 0.0015, "step": 9486 }, { "epoch": 32.00050675675676, "grad_norm": 0.0034608428832143545, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9487 }, { "epoch": 32.00054054054054, "grad_norm": 0.002261185320094228, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9488 }, { "epoch": 32.000574324324326, "grad_norm": 4.599878311157227, "learning_rate": 3.90625e-07, "loss": 0.1863, "step": 9489 }, { "epoch": 32.00060810810811, "grad_norm": 0.2617418169975281, "learning_rate": 3.90625e-07, "loss": 0.0008, "step": 9490 }, { "epoch": 32.000641891891895, "grad_norm": 0.10211143642663956, "learning_rate": 3.90625e-07, "loss": 0.0039, "step": 9491 }, { "epoch": 32.00067567567567, "grad_norm": 0.046957727521657944, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9492 }, { "epoch": 32.00070945945946, "grad_norm": 8.934744834899902, "learning_rate": 3.90625e-07, "loss": 0.7678, "step": 9493 }, { "epoch": 32.00074324324324, "grad_norm": 0.02179615944623947, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9494 }, { "epoch": 32.00077702702703, "grad_norm": 0.0034687744919210672, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9495 }, { "epoch": 32.00081081081081, "grad_norm": 6.922950267791748, "learning_rate": 3.90625e-07, "loss": 0.4583, "step": 9496 }, { "epoch": 32.0008445945946, "grad_norm": 0.14344052970409393, "learning_rate": 3.90625e-07, "loss": 0.0019, "step": 9497 }, { "epoch": 32.00087837837838, "grad_norm": 0.010007938370108604, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9498 }, { "epoch": 32.00091216216216, "grad_norm": 0.002877247054129839, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9499 }, { "epoch": 32.00094594594594, "grad_norm": 0.006215399596840143, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9500 }, { "epoch": 32.00097972972973, "grad_norm": 0.0010571288876235485, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9501 }, { "epoch": 32.00101351351351, "grad_norm": 0.005229462869465351, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9502 }, { "epoch": 32.0010472972973, "grad_norm": 0.11979269981384277, "learning_rate": 3.90625e-07, "loss": 0.0006, "step": 9503 }, { "epoch": 32.00108108108108, "grad_norm": 0.000994714442640543, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9504 }, { "epoch": 32.00111486486487, "grad_norm": 8.70345401763916, "learning_rate": 3.90625e-07, "loss": 0.4188, "step": 9505 }, { "epoch": 32.00114864864865, "grad_norm": 0.05431199446320534, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9506 }, { "epoch": 32.00118243243243, "grad_norm": 0.029437292367219925, "learning_rate": 3.90625e-07, "loss": 0.0006, "step": 9507 }, { "epoch": 32.001216216216214, "grad_norm": 0.09454285353422165, "learning_rate": 3.90625e-07, "loss": 0.001, "step": 9508 }, { "epoch": 32.00125, "grad_norm": 0.017447100952267647, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9509 }, { "epoch": 32.001283783783784, "grad_norm": 0.024892577901482582, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9510 }, { "epoch": 32.00131756756757, "grad_norm": 0.0006133050192147493, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9511 }, { "epoch": 32.00135135135135, "grad_norm": 3.336237907409668, "learning_rate": 3.90625e-07, "loss": 0.1309, "step": 9512 }, { "epoch": 32.00138513513514, "grad_norm": 0.0042127808555960655, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9513 }, { "epoch": 32.00141891891892, "grad_norm": 0.0032327643129974604, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9514 }, { "epoch": 32.0014527027027, "grad_norm": 6.254719257354736, "learning_rate": 3.90625e-07, "loss": 0.2538, "step": 9515 }, { "epoch": 32.001486486486485, "grad_norm": 1.6828744411468506, "learning_rate": 3.90625e-07, "loss": 0.0422, "step": 9516 }, { "epoch": 32.00152027027027, "grad_norm": 0.007348434068262577, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9517 }, { "epoch": 32.001554054054054, "grad_norm": 0.003077026689425111, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9518 }, { "epoch": 32.00158783783784, "grad_norm": 0.00090641004499048, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9519 }, { "epoch": 32.001621621621624, "grad_norm": 0.8574332594871521, "learning_rate": 3.90625e-07, "loss": 0.0085, "step": 9520 }, { "epoch": 32.00165540540541, "grad_norm": 0.0022165547125041485, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9521 }, { "epoch": 32.001689189189186, "grad_norm": 0.004497376270592213, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9522 }, { "epoch": 32.00172297297297, "grad_norm": 0.0017753978027030826, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9523 }, { "epoch": 32.001756756756755, "grad_norm": 0.37030941247940063, "learning_rate": 3.90625e-07, "loss": 0.0117, "step": 9524 }, { "epoch": 32.00179054054054, "grad_norm": 0.0008071057964116335, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9525 }, { "epoch": 32.001824324324325, "grad_norm": 0.09607131034135818, "learning_rate": 3.90625e-07, "loss": 0.0006, "step": 9526 }, { "epoch": 32.00185810810811, "grad_norm": 0.0013557093916460872, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9527 }, { "epoch": 32.001891891891894, "grad_norm": 0.03373393416404724, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9528 }, { "epoch": 32.00192567567568, "grad_norm": 0.001555484370328486, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9529 }, { "epoch": 32.00195945945946, "grad_norm": 0.0025341655127704144, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9530 }, { "epoch": 32.00199324324324, "grad_norm": 0.002400748198851943, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9531 }, { "epoch": 32.002027027027026, "grad_norm": 2.527817726135254, "learning_rate": 3.90625e-07, "loss": 0.0177, "step": 9532 }, { "epoch": 32.00206081081081, "grad_norm": 0.0393373966217041, "learning_rate": 3.90625e-07, "loss": 0.0005, "step": 9533 }, { "epoch": 32.002094594594595, "grad_norm": 0.01057930663228035, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9534 }, { "epoch": 32.00212837837838, "grad_norm": 0.005317178089171648, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9535 }, { "epoch": 32.002162162162165, "grad_norm": 0.0028761725407093763, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9536 }, { "epoch": 32.00219594594594, "grad_norm": 0.3307130038738251, "learning_rate": 3.90625e-07, "loss": 0.0083, "step": 9537 }, { "epoch": 32.00222972972973, "grad_norm": 0.0061846585012972355, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9538 }, { "epoch": 32.00226351351351, "grad_norm": 5.758739948272705, "learning_rate": 3.90625e-07, "loss": 0.0669, "step": 9539 }, { "epoch": 32.0022972972973, "grad_norm": 0.22830265760421753, "learning_rate": 3.90625e-07, "loss": 0.0008, "step": 9540 }, { "epoch": 32.00233108108108, "grad_norm": 0.10430251806974411, "learning_rate": 3.90625e-07, "loss": 0.0039, "step": 9541 }, { "epoch": 32.002364864864866, "grad_norm": 0.2093607783317566, "learning_rate": 3.90625e-07, "loss": 0.0034, "step": 9542 }, { "epoch": 32.00239864864865, "grad_norm": 0.0016965451650321484, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9543 }, { "epoch": 32.002432432432435, "grad_norm": 0.004967299290001392, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9544 }, { "epoch": 32.00246621621621, "grad_norm": 3.873521327972412, "learning_rate": 3.90625e-07, "loss": 0.0811, "step": 9545 }, { "epoch": 32.0025, "grad_norm": 0.6138079166412354, "learning_rate": 3.90625e-07, "loss": 0.0095, "step": 9546 }, { "epoch": 32.00253378378378, "grad_norm": 0.623347818851471, "learning_rate": 3.90625e-07, "loss": 0.0101, "step": 9547 }, { "epoch": 32.00256756756757, "grad_norm": 0.00637208903208375, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9548 }, { "epoch": 32.00260135135135, "grad_norm": 10.775588989257812, "learning_rate": 3.90625e-07, "loss": 0.0394, "step": 9549 }, { "epoch": 32.00263513513514, "grad_norm": 0.001222169492393732, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9550 }, { "epoch": 32.00266891891892, "grad_norm": 0.0016362139722332358, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9551 }, { "epoch": 32.002702702702706, "grad_norm": 0.0015924531035125256, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9552 }, { "epoch": 32.002736486486484, "grad_norm": 0.0066328090615570545, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9553 }, { "epoch": 32.00277027027027, "grad_norm": 0.06594272702932358, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9554 }, { "epoch": 32.00280405405405, "grad_norm": 0.002014296595007181, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9555 }, { "epoch": 32.00283783783784, "grad_norm": 0.002987358020618558, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9556 }, { "epoch": 32.00287162162162, "grad_norm": 0.005419943947345018, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9557 }, { "epoch": 32.00290540540541, "grad_norm": 0.0040181479416787624, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9558 }, { "epoch": 32.00293918918919, "grad_norm": 0.006464343052357435, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9559 }, { "epoch": 32.00297297297297, "grad_norm": 0.0015463036252185702, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9560 }, { "epoch": 32.003006756756754, "grad_norm": 27.471702575683594, "learning_rate": 3.90625e-07, "loss": 0.9758, "step": 9561 }, { "epoch": 32.00304054054054, "grad_norm": 0.010770131833851337, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9562 }, { "epoch": 32.003074324324324, "grad_norm": 2.292661666870117, "learning_rate": 3.90625e-07, "loss": 0.0084, "step": 9563 }, { "epoch": 32.00310810810811, "grad_norm": 0.012550076469779015, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9564 }, { "epoch": 32.00314189189189, "grad_norm": 0.001897809561342001, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9565 }, { "epoch": 32.00317567567568, "grad_norm": 0.005508699454367161, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9566 }, { "epoch": 32.00320945945946, "grad_norm": 0.000963934522587806, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9567 }, { "epoch": 32.00324324324324, "grad_norm": 0.017456265166401863, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9568 }, { "epoch": 32.003277027027025, "grad_norm": 0.15869572758674622, "learning_rate": 3.90625e-07, "loss": 0.0007, "step": 9569 }, { "epoch": 32.00331081081081, "grad_norm": 0.0011834203032776713, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9570 }, { "epoch": 32.003344594594594, "grad_norm": 0.00282600917853415, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9571 }, { "epoch": 32.00337837837838, "grad_norm": 0.03839055076241493, "learning_rate": 3.90625e-07, "loss": 0.0011, "step": 9572 }, { "epoch": 32.003412162162164, "grad_norm": 0.0014049471355974674, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9573 }, { "epoch": 32.00344594594595, "grad_norm": 0.10606703907251358, "learning_rate": 3.90625e-07, "loss": 0.004, "step": 9574 }, { "epoch": 32.00347972972973, "grad_norm": 0.002932964824140072, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9575 }, { "epoch": 32.00351351351351, "grad_norm": 0.001642241608351469, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9576 }, { "epoch": 32.003547297297295, "grad_norm": 0.0020495299249887466, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9577 }, { "epoch": 32.00358108108108, "grad_norm": 0.12393060326576233, "learning_rate": 3.90625e-07, "loss": 0.0007, "step": 9578 }, { "epoch": 32.003614864864865, "grad_norm": 0.06247001141309738, "learning_rate": 3.90625e-07, "loss": 0.0006, "step": 9579 }, { "epoch": 32.00364864864865, "grad_norm": 0.01442977599799633, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9580 }, { "epoch": 32.003682432432434, "grad_norm": 0.0027274456806480885, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9581 }, { "epoch": 32.00371621621622, "grad_norm": 0.004417964722961187, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9582 }, { "epoch": 32.00375, "grad_norm": 0.0054365224204957485, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9583 }, { "epoch": 32.00378378378378, "grad_norm": 0.001094380277208984, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9584 }, { "epoch": 32.003817567567566, "grad_norm": 0.021019037812948227, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9585 }, { "epoch": 32.00385135135135, "grad_norm": 0.005596040282398462, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9586 }, { "epoch": 32.003885135135135, "grad_norm": 0.001584483659826219, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9587 }, { "epoch": 32.00391891891892, "grad_norm": 3.556396722793579, "learning_rate": 3.90625e-07, "loss": 0.4051, "step": 9588 }, { "epoch": 32.003952702702705, "grad_norm": 0.017344489693641663, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9589 }, { "epoch": 32.00398648648649, "grad_norm": 0.0010706461034715176, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9590 }, { "epoch": 32.00402027027027, "grad_norm": 0.00960533507168293, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9591 }, { "epoch": 32.00405405405405, "grad_norm": 0.5823484659194946, "learning_rate": 3.90625e-07, "loss": 0.0017, "step": 9592 }, { "epoch": 32.00408783783784, "grad_norm": 0.0019505227683112025, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9593 }, { "epoch": 32.00412162162162, "grad_norm": 0.0019797873683273792, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9594 }, { "epoch": 32.004155405405406, "grad_norm": 0.04752371832728386, "learning_rate": 3.90625e-07, "loss": 0.001, "step": 9595 }, { "epoch": 32.00418918918919, "grad_norm": 0.002713557332754135, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9596 }, { "epoch": 32.004222972972975, "grad_norm": 7.688516139984131, "learning_rate": 3.90625e-07, "loss": 0.7445, "step": 9597 }, { "epoch": 32.00425675675676, "grad_norm": 24.035985946655273, "learning_rate": 3.90625e-07, "loss": 0.0739, "step": 9598 }, { "epoch": 32.00429054054054, "grad_norm": 0.020235080271959305, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9599 }, { "epoch": 32.00432432432432, "grad_norm": 0.003244555788114667, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9600 }, { "epoch": 32.00435810810811, "grad_norm": 0.43461719155311584, "learning_rate": 3.90625e-07, "loss": 0.0114, "step": 9601 }, { "epoch": 32.00439189189189, "grad_norm": 0.0018091402016580105, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9602 }, { "epoch": 32.00442567567568, "grad_norm": 1.105826497077942, "learning_rate": 3.90625e-07, "loss": 0.0109, "step": 9603 }, { "epoch": 32.00445945945946, "grad_norm": 0.0016770611982792616, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9604 }, { "epoch": 32.004493243243246, "grad_norm": 31.393840789794922, "learning_rate": 3.90625e-07, "loss": 0.2456, "step": 9605 }, { "epoch": 32.004527027027024, "grad_norm": 0.0010625353315845132, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9606 }, { "epoch": 32.00456081081081, "grad_norm": 0.00882937852293253, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9607 }, { "epoch": 32.00459459459459, "grad_norm": 0.003374790772795677, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9608 }, { "epoch": 32.00462837837838, "grad_norm": 0.008231598883867264, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9609 }, { "epoch": 32.00466216216216, "grad_norm": 0.000987434177659452, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9610 }, { "epoch": 32.00469594594595, "grad_norm": 0.003145137568935752, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9611 }, { "epoch": 32.00472972972973, "grad_norm": 0.00207425351254642, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9612 }, { "epoch": 32.00476351351352, "grad_norm": 0.0012332696933299303, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9613 }, { "epoch": 32.004797297297294, "grad_norm": 0.0016113794408738613, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9614 }, { "epoch": 32.00483108108108, "grad_norm": 0.1277053952217102, "learning_rate": 3.90625e-07, "loss": 0.0015, "step": 9615 }, { "epoch": 32.004864864864864, "grad_norm": 0.0007517646881751716, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9616 }, { "epoch": 32.00489864864865, "grad_norm": 0.08653116971254349, "learning_rate": 3.90625e-07, "loss": 0.0006, "step": 9617 }, { "epoch": 32.00493243243243, "grad_norm": 0.00532695185393095, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9618 }, { "epoch": 32.00496621621622, "grad_norm": 5.842657089233398, "learning_rate": 3.90625e-07, "loss": 0.0128, "step": 9619 }, { "epoch": 32.005, "grad_norm": 37.76164245605469, "learning_rate": 3.90625e-07, "loss": 0.2798, "step": 9620 }, { "epoch": 32.00503378378379, "grad_norm": 0.07924987375736237, "learning_rate": 3.90625e-07, "loss": 0.0014, "step": 9621 }, { "epoch": 32.005067567567565, "grad_norm": 49.83127975463867, "learning_rate": 3.90625e-07, "loss": 0.1242, "step": 9622 }, { "epoch": 32.00510135135135, "grad_norm": 0.008498485200107098, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9623 }, { "epoch": 32.005135135135134, "grad_norm": 0.41557690501213074, "learning_rate": 3.90625e-07, "loss": 0.0017, "step": 9624 }, { "epoch": 32.00516891891892, "grad_norm": 0.0021825178992003202, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9625 }, { "epoch": 32.005202702702704, "grad_norm": 0.6361295580863953, "learning_rate": 3.90625e-07, "loss": 0.0034, "step": 9626 }, { "epoch": 32.00523648648649, "grad_norm": 0.012913806363940239, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9627 }, { "epoch": 32.00527027027027, "grad_norm": 0.003045415971428156, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9628 }, { "epoch": 32.00530405405405, "grad_norm": 0.13694393634796143, "learning_rate": 3.90625e-07, "loss": 0.0048, "step": 9629 }, { "epoch": 32.005337837837835, "grad_norm": 0.007186892442405224, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9630 }, { "epoch": 32.00537162162162, "grad_norm": 0.0008460301905870438, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9631 }, { "epoch": 32.005405405405405, "grad_norm": 0.00263551389798522, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9632 }, { "epoch": 32.00543918918919, "grad_norm": 0.0019996927585452795, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9633 }, { "epoch": 32.005472972972974, "grad_norm": 0.0989597961306572, "learning_rate": 3.90625e-07, "loss": 0.0036, "step": 9634 }, { "epoch": 32.00550675675676, "grad_norm": 26.200958251953125, "learning_rate": 3.90625e-07, "loss": 0.1361, "step": 9635 }, { "epoch": 32.005540540540544, "grad_norm": 0.002644095802679658, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9636 }, { "epoch": 32.00557432432432, "grad_norm": 0.01286973338574171, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9637 }, { "epoch": 32.005608108108106, "grad_norm": 0.27761608362197876, "learning_rate": 3.90625e-07, "loss": 0.003, "step": 9638 }, { "epoch": 32.00564189189189, "grad_norm": 0.6875995397567749, "learning_rate": 3.90625e-07, "loss": 0.0025, "step": 9639 }, { "epoch": 32.005675675675676, "grad_norm": 0.005521082319319248, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9640 }, { "epoch": 32.00570945945946, "grad_norm": 0.007518934551626444, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9641 }, { "epoch": 32.005743243243245, "grad_norm": 0.0010203705169260502, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9642 }, { "epoch": 32.00577702702703, "grad_norm": 0.003341070841997862, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9643 }, { "epoch": 32.005810810810814, "grad_norm": 0.0022485440131276846, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9644 }, { "epoch": 32.00584459459459, "grad_norm": 0.00505822105333209, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9645 }, { "epoch": 32.00587837837838, "grad_norm": 0.001338976202532649, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9646 }, { "epoch": 32.00591216216216, "grad_norm": 0.0007831120165064931, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9647 }, { "epoch": 32.005945945945946, "grad_norm": 0.0025600269436836243, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9648 }, { "epoch": 32.00597972972973, "grad_norm": 0.0018587023951113224, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9649 }, { "epoch": 32.006013513513516, "grad_norm": 0.0027031609788537025, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9650 }, { "epoch": 32.0060472972973, "grad_norm": 0.005904368124902248, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9651 }, { "epoch": 32.00608108108108, "grad_norm": 0.0010856715962290764, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9652 }, { "epoch": 32.00611486486486, "grad_norm": 0.003969921730458736, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9653 }, { "epoch": 32.00614864864865, "grad_norm": 0.10205432772636414, "learning_rate": 3.90625e-07, "loss": 0.0037, "step": 9654 }, { "epoch": 32.00618243243243, "grad_norm": 3.3773598670959473, "learning_rate": 3.90625e-07, "loss": 0.4396, "step": 9655 }, { "epoch": 32.00621621621622, "grad_norm": 3.350844383239746, "learning_rate": 3.90625e-07, "loss": 0.4265, "step": 9656 }, { "epoch": 32.00625, "grad_norm": 1.105200171470642, "learning_rate": 3.90625e-07, "loss": 0.0137, "step": 9657 }, { "epoch": 32.006283783783786, "grad_norm": 0.014209187589585781, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9658 }, { "epoch": 32.00631756756757, "grad_norm": 0.001638429006561637, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9659 }, { "epoch": 32.00635135135135, "grad_norm": 0.091592937707901, "learning_rate": 3.90625e-07, "loss": 0.0034, "step": 9660 }, { "epoch": 32.00638513513513, "grad_norm": 0.001619921182282269, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9661 }, { "epoch": 32.00641891891892, "grad_norm": 0.15045541524887085, "learning_rate": 3.90625e-07, "loss": 0.0052, "step": 9662 }, { "epoch": 32.0064527027027, "grad_norm": 0.0017746133962646127, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9663 }, { "epoch": 32.00648648648649, "grad_norm": 37.27669906616211, "learning_rate": 3.90625e-07, "loss": 0.1061, "step": 9664 }, { "epoch": 32.00652027027027, "grad_norm": 18.5060977935791, "learning_rate": 3.90625e-07, "loss": 0.7868, "step": 9665 }, { "epoch": 32.00655405405406, "grad_norm": 0.002436333568766713, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9666 }, { "epoch": 32.006587837837834, "grad_norm": 0.18632972240447998, "learning_rate": 3.90625e-07, "loss": 0.0028, "step": 9667 }, { "epoch": 32.00662162162162, "grad_norm": 0.002115025417879224, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9668 }, { "epoch": 32.006655405405404, "grad_norm": 0.0009217456681653857, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9669 }, { "epoch": 32.00668918918919, "grad_norm": 0.02699282579123974, "learning_rate": 3.90625e-07, "loss": 0.0005, "step": 9670 }, { "epoch": 32.00672297297297, "grad_norm": 0.13542218506336212, "learning_rate": 3.90625e-07, "loss": 0.005, "step": 9671 }, { "epoch": 32.00675675675676, "grad_norm": 0.0014943245332688093, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9672 }, { "epoch": 32.00679054054054, "grad_norm": 0.0064539117738604546, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9673 }, { "epoch": 32.00682432432433, "grad_norm": 0.1226477175951004, "learning_rate": 3.90625e-07, "loss": 0.0047, "step": 9674 }, { "epoch": 32.006858108108105, "grad_norm": 0.19133585691452026, "learning_rate": 3.90625e-07, "loss": 0.0043, "step": 9675 }, { "epoch": 32.00689189189189, "grad_norm": 0.0016244696453213692, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9676 }, { "epoch": 32.006925675675674, "grad_norm": 0.536575973033905, "learning_rate": 3.90625e-07, "loss": 0.0137, "step": 9677 }, { "epoch": 32.00695945945946, "grad_norm": 0.001148222596384585, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9678 }, { "epoch": 32.006993243243244, "grad_norm": 0.002590535907074809, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9679 }, { "epoch": 32.00702702702703, "grad_norm": 0.0010190911125391722, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9680 }, { "epoch": 32.00706081081081, "grad_norm": 11.19784164428711, "learning_rate": 3.90625e-07, "loss": 1.1359, "step": 9681 }, { "epoch": 32.0070945945946, "grad_norm": 0.2426580786705017, "learning_rate": 3.90625e-07, "loss": 0.0074, "step": 9682 }, { "epoch": 32.007128378378376, "grad_norm": 14.430485725402832, "learning_rate": 3.90625e-07, "loss": 0.0469, "step": 9683 }, { "epoch": 32.00716216216216, "grad_norm": 0.0019140520598739386, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9684 }, { "epoch": 32.007195945945945, "grad_norm": 0.0020962730050086975, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9685 }, { "epoch": 32.00722972972973, "grad_norm": 0.11255154758691788, "learning_rate": 3.90625e-07, "loss": 0.0005, "step": 9686 }, { "epoch": 32.007263513513514, "grad_norm": 0.5955376029014587, "learning_rate": 3.90625e-07, "loss": 0.0066, "step": 9687 }, { "epoch": 32.0072972972973, "grad_norm": 4.997396945953369, "learning_rate": 3.90625e-07, "loss": 0.3129, "step": 9688 }, { "epoch": 32.007331081081084, "grad_norm": 0.0016740977298468351, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9689 }, { "epoch": 32.00736486486486, "grad_norm": 0.06538591533899307, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9690 }, { "epoch": 32.007398648648646, "grad_norm": 0.11703834682703018, "learning_rate": 3.90625e-07, "loss": 0.0044, "step": 9691 }, { "epoch": 32.00743243243243, "grad_norm": 0.0014958048705011606, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9692 }, { "epoch": 32.007466216216216, "grad_norm": 0.12224584817886353, "learning_rate": 3.90625e-07, "loss": 0.0045, "step": 9693 }, { "epoch": 32.0075, "grad_norm": 0.03864789009094238, "learning_rate": 3.90625e-07, "loss": 0.0005, "step": 9694 }, { "epoch": 32.007533783783785, "grad_norm": 0.0018303912365809083, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9695 }, { "epoch": 32.00756756756757, "grad_norm": 0.0020272068213671446, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9696 }, { "epoch": 32.007601351351354, "grad_norm": 0.0012269223807379603, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9697 }, { "epoch": 32.00763513513513, "grad_norm": 0.009423335082828999, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9698 }, { "epoch": 32.00766891891892, "grad_norm": 0.25254908204078674, "learning_rate": 3.90625e-07, "loss": 0.0049, "step": 9699 }, { "epoch": 32.0077027027027, "grad_norm": 0.0013292854418978095, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9700 }, { "epoch": 32.007736486486486, "grad_norm": 0.00371723435819149, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9701 }, { "epoch": 32.00777027027027, "grad_norm": 0.0006562531925737858, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9702 }, { "epoch": 32.007804054054056, "grad_norm": 0.002422517165541649, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9703 }, { "epoch": 32.00783783783784, "grad_norm": 0.0012989884708076715, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9704 }, { "epoch": 32.007871621621625, "grad_norm": 0.0073317717760801315, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9705 }, { "epoch": 32.0079054054054, "grad_norm": 3.3933205604553223, "learning_rate": 3.90625e-07, "loss": 0.4521, "step": 9706 }, { "epoch": 32.00793918918919, "grad_norm": 0.0023247560020536184, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9707 }, { "epoch": 32.00797297297297, "grad_norm": 0.002668132074177265, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9708 }, { "epoch": 32.00800675675676, "grad_norm": 9.323480606079102, "learning_rate": 3.90625e-07, "loss": 0.392, "step": 9709 }, { "epoch": 32.00804054054054, "grad_norm": 0.009005660191178322, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9710 }, { "epoch": 32.008074324324326, "grad_norm": 0.9453467726707458, "learning_rate": 3.90625e-07, "loss": 0.0242, "step": 9711 }, { "epoch": 32.00810810810811, "grad_norm": 0.0052630361169576645, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9712 }, { "epoch": 32.00814189189189, "grad_norm": 0.002414366928860545, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9713 }, { "epoch": 32.00817567567567, "grad_norm": 0.001277641742490232, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9714 }, { "epoch": 32.00820945945946, "grad_norm": 9.12629508972168, "learning_rate": 3.90625e-07, "loss": 0.4787, "step": 9715 }, { "epoch": 32.00824324324324, "grad_norm": 0.0015096936840564013, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9716 }, { "epoch": 32.00827702702703, "grad_norm": 0.14285673201084137, "learning_rate": 3.90625e-07, "loss": 0.0048, "step": 9717 }, { "epoch": 32.00831081081081, "grad_norm": 0.0012328691082075238, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9718 }, { "epoch": 32.0083445945946, "grad_norm": 0.0010861554183065891, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9719 }, { "epoch": 32.00837837837838, "grad_norm": 6.1031951904296875, "learning_rate": 3.90625e-07, "loss": 0.3677, "step": 9720 }, { "epoch": 32.00841216216216, "grad_norm": 0.007686719298362732, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9721 }, { "epoch": 32.008445945945944, "grad_norm": 0.0016820087330415845, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9722 }, { "epoch": 32.00847972972973, "grad_norm": 0.0020569870248436928, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9723 }, { "epoch": 32.00851351351351, "grad_norm": 0.7221931219100952, "learning_rate": 3.90625e-07, "loss": 0.0049, "step": 9724 }, { "epoch": 32.0085472972973, "grad_norm": 0.0016533531015738845, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9725 }, { "epoch": 32.00858108108108, "grad_norm": 7.60900354385376, "learning_rate": 3.90625e-07, "loss": 0.31, "step": 9726 }, { "epoch": 32.00861486486487, "grad_norm": 0.11479120701551437, "learning_rate": 3.90625e-07, "loss": 0.0041, "step": 9727 }, { "epoch": 32.00864864864865, "grad_norm": 0.030616648495197296, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9728 }, { "epoch": 32.00868243243243, "grad_norm": 8.23251724243164, "learning_rate": 3.90625e-07, "loss": 0.1065, "step": 9729 }, { "epoch": 32.008716216216214, "grad_norm": 0.003288803854957223, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9730 }, { "epoch": 32.00875, "grad_norm": 0.029037537053227425, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9731 }, { "epoch": 32.008783783783784, "grad_norm": 0.0007226703455671668, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9732 }, { "epoch": 32.00881756756757, "grad_norm": 0.08017494529485703, "learning_rate": 3.90625e-07, "loss": 0.001, "step": 9733 }, { "epoch": 32.00885135135135, "grad_norm": 0.22894422709941864, "learning_rate": 3.90625e-07, "loss": 0.0053, "step": 9734 }, { "epoch": 32.00888513513514, "grad_norm": 0.07381044328212738, "learning_rate": 3.90625e-07, "loss": 0.0012, "step": 9735 }, { "epoch": 32.008918918918916, "grad_norm": 0.002397130476310849, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9736 }, { "epoch": 32.0089527027027, "grad_norm": 0.03926605358719826, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9737 }, { "epoch": 32.008986486486485, "grad_norm": 0.008402942679822445, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9738 }, { "epoch": 32.00902027027027, "grad_norm": 0.0028760205022990704, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9739 }, { "epoch": 32.009054054054054, "grad_norm": 0.008326753973960876, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9740 }, { "epoch": 32.00908783783784, "grad_norm": 0.0032610748894512653, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9741 }, { "epoch": 32.009121621621624, "grad_norm": 0.0011492125922814012, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9742 }, { "epoch": 32.00915540540541, "grad_norm": 0.03353358432650566, "learning_rate": 3.90625e-07, "loss": 0.0007, "step": 9743 }, { "epoch": 32.009189189189186, "grad_norm": 0.01561255007982254, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9744 }, { "epoch": 32.00922297297297, "grad_norm": 0.0024323691613972187, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9745 }, { "epoch": 32.009256756756756, "grad_norm": 0.0056068371050059795, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9746 }, { "epoch": 32.00929054054054, "grad_norm": 22.68852424621582, "learning_rate": 3.90625e-07, "loss": 0.2759, "step": 9747 }, { "epoch": 32.009324324324325, "grad_norm": 0.0011834215838462114, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9748 }, { "epoch": 32.00935810810811, "grad_norm": 0.002606042195111513, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9749 }, { "epoch": 32.009391891891894, "grad_norm": 0.0012309029698371887, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9750 }, { "epoch": 32.00942567567568, "grad_norm": 0.03618612512946129, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9751 }, { "epoch": 32.00945945945946, "grad_norm": 0.007820109836757183, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9752 }, { "epoch": 32.00949324324324, "grad_norm": 0.000608326809015125, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9753 }, { "epoch": 32.009527027027026, "grad_norm": 0.0006066768546588719, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9754 }, { "epoch": 32.00956081081081, "grad_norm": 0.0029689676593989134, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9755 }, { "epoch": 32.009594594594596, "grad_norm": 0.0016277555841952562, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9756 }, { "epoch": 32.00962837837838, "grad_norm": 0.11314038932323456, "learning_rate": 3.90625e-07, "loss": 0.0043, "step": 9757 }, { "epoch": 32.009662162162165, "grad_norm": 0.0013955411268398166, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9758 }, { "epoch": 32.00969594594594, "grad_norm": 0.0009310381719842553, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9759 }, { "epoch": 32.00972972972973, "grad_norm": 0.06268598884344101, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9760 }, { "epoch": 32.00976351351351, "grad_norm": 0.002853240817785263, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9761 }, { "epoch": 32.0097972972973, "grad_norm": 0.6875421404838562, "learning_rate": 3.90625e-07, "loss": 0.0131, "step": 9762 }, { "epoch": 32.00983108108108, "grad_norm": 3.2976393699645996, "learning_rate": 3.90625e-07, "loss": 0.0082, "step": 9763 }, { "epoch": 32.009864864864866, "grad_norm": 0.0008796048350632191, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9764 }, { "epoch": 32.00989864864865, "grad_norm": 0.003204341745004058, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9765 }, { "epoch": 32.009932432432436, "grad_norm": 0.0023197585251182318, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9766 }, { "epoch": 32.00996621621621, "grad_norm": 0.0007762701134197414, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9767 }, { "epoch": 32.01, "grad_norm": 0.003048843936994672, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9768 }, { "epoch": 32.01, "eval_accuracy": 0.8982229402261712, "eval_loss": 0.5693919062614441, "eval_runtime": 32.8252, "eval_samples_per_second": 18.857, "eval_steps_per_second": 2.376, "step": 9768 }, { "epoch": 33.000033783783785, "grad_norm": 0.008307844400405884, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9769 }, { "epoch": 33.00006756756757, "grad_norm": 0.004237343091517687, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9770 }, { "epoch": 33.000101351351354, "grad_norm": 0.008519764058291912, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9771 }, { "epoch": 33.00013513513513, "grad_norm": 0.002379706595093012, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9772 }, { "epoch": 33.00016891891892, "grad_norm": 0.07832315564155579, "learning_rate": 3.90625e-07, "loss": 0.0012, "step": 9773 }, { "epoch": 33.0002027027027, "grad_norm": 0.24631591141223907, "learning_rate": 3.90625e-07, "loss": 0.0006, "step": 9774 }, { "epoch": 33.000236486486486, "grad_norm": 0.0025779874995350838, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9775 }, { "epoch": 33.00027027027027, "grad_norm": 0.0028492126148194075, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9776 }, { "epoch": 33.000304054054055, "grad_norm": 0.0041694496758282185, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9777 }, { "epoch": 33.00033783783784, "grad_norm": 0.001545266597531736, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9778 }, { "epoch": 33.000371621621625, "grad_norm": 0.002342636464163661, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9779 }, { "epoch": 33.0004054054054, "grad_norm": 4.160382270812988, "learning_rate": 3.90625e-07, "loss": 0.1611, "step": 9780 }, { "epoch": 33.00043918918919, "grad_norm": 0.005406454671174288, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9781 }, { "epoch": 33.00047297297297, "grad_norm": 0.0029647524934262037, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9782 }, { "epoch": 33.00050675675676, "grad_norm": 6.56235408782959, "learning_rate": 3.90625e-07, "loss": 0.011, "step": 9783 }, { "epoch": 33.00054054054054, "grad_norm": 0.0012650941498577595, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9784 }, { "epoch": 33.000574324324326, "grad_norm": 0.02674684301018715, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9785 }, { "epoch": 33.00060810810811, "grad_norm": 0.0030168176162987947, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9786 }, { "epoch": 33.000641891891895, "grad_norm": 0.0018623662181198597, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9787 }, { "epoch": 33.00067567567567, "grad_norm": 0.10908326506614685, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9788 }, { "epoch": 33.00070945945946, "grad_norm": 0.0024899777490645647, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9789 }, { "epoch": 33.00074324324324, "grad_norm": 0.5532480478286743, "learning_rate": 3.90625e-07, "loss": 0.0181, "step": 9790 }, { "epoch": 33.00077702702703, "grad_norm": 0.04356026649475098, "learning_rate": 3.90625e-07, "loss": 0.0005, "step": 9791 }, { "epoch": 33.00081081081081, "grad_norm": 0.0011085937730967999, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9792 }, { "epoch": 33.0008445945946, "grad_norm": 0.0006032229284755886, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9793 }, { "epoch": 33.00087837837838, "grad_norm": 0.8840978145599365, "learning_rate": 3.90625e-07, "loss": 0.0177, "step": 9794 }, { "epoch": 33.00091216216216, "grad_norm": 0.0014690825482830405, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9795 }, { "epoch": 33.00094594594594, "grad_norm": 0.003157520666718483, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9796 }, { "epoch": 33.00097972972973, "grad_norm": 0.001454874756745994, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9797 }, { "epoch": 33.00101351351351, "grad_norm": 0.05675581842660904, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9798 }, { "epoch": 33.0010472972973, "grad_norm": 0.0019811070524156094, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9799 }, { "epoch": 33.00108108108108, "grad_norm": 0.0020536924712359905, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9800 }, { "epoch": 33.00111486486487, "grad_norm": 56.23830795288086, "learning_rate": 3.90625e-07, "loss": 0.429, "step": 9801 }, { "epoch": 33.00114864864865, "grad_norm": 0.005291324574500322, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9802 }, { "epoch": 33.00118243243243, "grad_norm": 0.005897529888898134, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9803 }, { "epoch": 33.001216216216214, "grad_norm": 0.00792973767966032, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9804 }, { "epoch": 33.00125, "grad_norm": 0.0010380141902714968, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9805 }, { "epoch": 33.001283783783784, "grad_norm": 0.8747628331184387, "learning_rate": 3.90625e-07, "loss": 0.0072, "step": 9806 }, { "epoch": 33.00131756756757, "grad_norm": 6.249261856079102, "learning_rate": 3.90625e-07, "loss": 0.0173, "step": 9807 }, { "epoch": 33.00135135135135, "grad_norm": 15.903351783752441, "learning_rate": 3.90625e-07, "loss": 0.0417, "step": 9808 }, { "epoch": 33.00138513513514, "grad_norm": 0.011904202401638031, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9809 }, { "epoch": 33.00141891891892, "grad_norm": 0.0010587163269519806, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9810 }, { "epoch": 33.0014527027027, "grad_norm": 0.003195153083652258, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9811 }, { "epoch": 33.001486486486485, "grad_norm": 0.013325950130820274, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9812 }, { "epoch": 33.00152027027027, "grad_norm": 12.150688171386719, "learning_rate": 3.90625e-07, "loss": 0.0546, "step": 9813 }, { "epoch": 33.001554054054054, "grad_norm": 0.002134148497134447, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9814 }, { "epoch": 33.00158783783784, "grad_norm": 0.005976546090096235, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9815 }, { "epoch": 33.001621621621624, "grad_norm": 0.005329979583621025, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9816 }, { "epoch": 33.00165540540541, "grad_norm": 0.08109404891729355, "learning_rate": 3.90625e-07, "loss": 0.0026, "step": 9817 }, { "epoch": 33.001689189189186, "grad_norm": 0.1292862892150879, "learning_rate": 3.90625e-07, "loss": 0.0005, "step": 9818 }, { "epoch": 33.00172297297297, "grad_norm": 0.0023701440077275038, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9819 }, { "epoch": 33.001756756756755, "grad_norm": 0.05857947841286659, "learning_rate": 3.90625e-07, "loss": 0.0013, "step": 9820 }, { "epoch": 33.00179054054054, "grad_norm": 0.03222072869539261, "learning_rate": 3.90625e-07, "loss": 0.0005, "step": 9821 }, { "epoch": 33.001824324324325, "grad_norm": 0.03219415992498398, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9822 }, { "epoch": 33.00185810810811, "grad_norm": 34.063507080078125, "learning_rate": 3.90625e-07, "loss": 0.1153, "step": 9823 }, { "epoch": 33.001891891891894, "grad_norm": 0.16882582008838654, "learning_rate": 3.90625e-07, "loss": 0.006, "step": 9824 }, { "epoch": 33.00192567567568, "grad_norm": 0.46077078580856323, "learning_rate": 3.90625e-07, "loss": 0.0071, "step": 9825 }, { "epoch": 33.00195945945946, "grad_norm": 0.016379525884985924, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9826 }, { "epoch": 33.00199324324324, "grad_norm": 0.7554235458374023, "learning_rate": 3.90625e-07, "loss": 0.0145, "step": 9827 }, { "epoch": 33.002027027027026, "grad_norm": 0.07280003279447556, "learning_rate": 3.90625e-07, "loss": 0.001, "step": 9828 }, { "epoch": 33.00206081081081, "grad_norm": 0.005584930069744587, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9829 }, { "epoch": 33.002094594594595, "grad_norm": 0.009718682616949081, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9830 }, { "epoch": 33.00212837837838, "grad_norm": 0.001149357296526432, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9831 }, { "epoch": 33.002162162162165, "grad_norm": 0.11996962130069733, "learning_rate": 3.90625e-07, "loss": 0.0044, "step": 9832 }, { "epoch": 33.00219594594594, "grad_norm": 0.0012803577119484544, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9833 }, { "epoch": 33.00222972972973, "grad_norm": 0.1308087855577469, "learning_rate": 3.90625e-07, "loss": 0.0048, "step": 9834 }, { "epoch": 33.00226351351351, "grad_norm": 0.002474862849339843, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9835 }, { "epoch": 33.0022972972973, "grad_norm": 0.0017497336957603693, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9836 }, { "epoch": 33.00233108108108, "grad_norm": 0.29835575819015503, "learning_rate": 3.90625e-07, "loss": 0.0008, "step": 9837 }, { "epoch": 33.002364864864866, "grad_norm": 0.020585453137755394, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9838 }, { "epoch": 33.00239864864865, "grad_norm": 0.0012516668066382408, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9839 }, { "epoch": 33.002432432432435, "grad_norm": 0.0011872905306518078, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9840 }, { "epoch": 33.00246621621621, "grad_norm": 0.0067343879491090775, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9841 }, { "epoch": 33.0025, "grad_norm": 0.024715710431337357, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9842 }, { "epoch": 33.00253378378378, "grad_norm": 0.001107691554352641, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9843 }, { "epoch": 33.00256756756757, "grad_norm": 0.007246755994856358, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9844 }, { "epoch": 33.00260135135135, "grad_norm": 0.003778967075049877, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9845 }, { "epoch": 33.00263513513514, "grad_norm": 0.015019598416984081, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9846 }, { "epoch": 33.00266891891892, "grad_norm": 0.0011606091866269708, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9847 }, { "epoch": 33.002702702702706, "grad_norm": 0.0012380999978631735, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9848 }, { "epoch": 33.002736486486484, "grad_norm": 0.0024845427833497524, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9849 }, { "epoch": 33.00277027027027, "grad_norm": 0.002239635679870844, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9850 }, { "epoch": 33.00280405405405, "grad_norm": 0.10973402112722397, "learning_rate": 3.90625e-07, "loss": 0.0041, "step": 9851 }, { "epoch": 33.00283783783784, "grad_norm": 0.002537005115300417, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9852 }, { "epoch": 33.00287162162162, "grad_norm": 0.003440961241722107, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9853 }, { "epoch": 33.00290540540541, "grad_norm": 0.024918239563703537, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9854 }, { "epoch": 33.00293918918919, "grad_norm": 0.002703051781281829, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9855 }, { "epoch": 33.00297297297297, "grad_norm": 0.0032270450610667467, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9856 }, { "epoch": 33.003006756756754, "grad_norm": 0.0018022955628111959, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9857 }, { "epoch": 33.00304054054054, "grad_norm": 0.0019344073953107, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9858 }, { "epoch": 33.003074324324324, "grad_norm": 0.11461341381072998, "learning_rate": 3.90625e-07, "loss": 0.0043, "step": 9859 }, { "epoch": 33.00310810810811, "grad_norm": 0.0014862609095871449, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9860 }, { "epoch": 33.00314189189189, "grad_norm": 0.1181907206773758, "learning_rate": 3.90625e-07, "loss": 0.0042, "step": 9861 }, { "epoch": 33.00317567567568, "grad_norm": 0.11954371631145477, "learning_rate": 3.90625e-07, "loss": 0.0042, "step": 9862 }, { "epoch": 33.00320945945946, "grad_norm": 1.6062026023864746, "learning_rate": 3.90625e-07, "loss": 0.0094, "step": 9863 }, { "epoch": 33.00324324324324, "grad_norm": 0.3767111301422119, "learning_rate": 3.90625e-07, "loss": 0.0088, "step": 9864 }, { "epoch": 33.003277027027025, "grad_norm": 0.0015893211821094155, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9865 }, { "epoch": 33.00331081081081, "grad_norm": 0.08460771292448044, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9866 }, { "epoch": 33.003344594594594, "grad_norm": 0.006209124810993671, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9867 }, { "epoch": 33.00337837837838, "grad_norm": 0.0024826228618621826, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9868 }, { "epoch": 33.003412162162164, "grad_norm": 0.022693101316690445, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9869 }, { "epoch": 33.00344594594595, "grad_norm": 0.15416061878204346, "learning_rate": 3.90625e-07, "loss": 0.0023, "step": 9870 }, { "epoch": 33.00347972972973, "grad_norm": 5.5727858543396, "learning_rate": 3.90625e-07, "loss": 0.0266, "step": 9871 }, { "epoch": 33.00351351351351, "grad_norm": 3.944774866104126, "learning_rate": 3.90625e-07, "loss": 0.1393, "step": 9872 }, { "epoch": 33.003547297297295, "grad_norm": 16.142175674438477, "learning_rate": 3.90625e-07, "loss": 0.5608, "step": 9873 }, { "epoch": 33.00358108108108, "grad_norm": 0.0012900487054139376, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9874 }, { "epoch": 33.003614864864865, "grad_norm": 0.0009653761517256498, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9875 }, { "epoch": 33.00364864864865, "grad_norm": 0.011690774001181126, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9876 }, { "epoch": 33.003682432432434, "grad_norm": 0.0010833467822521925, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9877 }, { "epoch": 33.00371621621622, "grad_norm": 0.0036289433483034372, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9878 }, { "epoch": 33.00375, "grad_norm": 0.10295391082763672, "learning_rate": 3.90625e-07, "loss": 0.0012, "step": 9879 }, { "epoch": 33.00378378378378, "grad_norm": 2.084244728088379, "learning_rate": 3.90625e-07, "loss": 0.0575, "step": 9880 }, { "epoch": 33.003817567567566, "grad_norm": 0.004840688779950142, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9881 }, { "epoch": 33.00385135135135, "grad_norm": 0.017380064353346825, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9882 }, { "epoch": 33.003885135135135, "grad_norm": 0.007175973150879145, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9883 }, { "epoch": 33.00391891891892, "grad_norm": 0.00307817617431283, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9884 }, { "epoch": 33.003952702702705, "grad_norm": 0.7705891728401184, "learning_rate": 3.90625e-07, "loss": 0.025, "step": 9885 }, { "epoch": 33.00398648648649, "grad_norm": 0.001953907310962677, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9886 }, { "epoch": 33.00402027027027, "grad_norm": 0.0010632678167894483, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9887 }, { "epoch": 33.00405405405405, "grad_norm": 0.002892283722758293, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9888 }, { "epoch": 33.00408783783784, "grad_norm": 0.0010337821440771222, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9889 }, { "epoch": 33.00412162162162, "grad_norm": 0.05470762774348259, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9890 }, { "epoch": 33.004155405405406, "grad_norm": 1.1352035999298096, "learning_rate": 3.90625e-07, "loss": 0.0095, "step": 9891 }, { "epoch": 33.00418918918919, "grad_norm": 0.001164642395451665, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9892 }, { "epoch": 33.004222972972975, "grad_norm": 0.09042828530073166, "learning_rate": 3.90625e-07, "loss": 0.002, "step": 9893 }, { "epoch": 33.00425675675676, "grad_norm": 0.0099716167896986, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9894 }, { "epoch": 33.00429054054054, "grad_norm": 0.0030961388256400824, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9895 }, { "epoch": 33.00432432432432, "grad_norm": 0.03861056640744209, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9896 }, { "epoch": 33.00435810810811, "grad_norm": 0.0021172186825424433, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9897 }, { "epoch": 33.00439189189189, "grad_norm": 0.10269692540168762, "learning_rate": 3.90625e-07, "loss": 0.0006, "step": 9898 }, { "epoch": 33.00442567567568, "grad_norm": 0.0021980833262205124, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9899 }, { "epoch": 33.00445945945946, "grad_norm": 0.0039147874340415, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9900 }, { "epoch": 33.004493243243246, "grad_norm": 0.06861382722854614, "learning_rate": 3.90625e-07, "loss": 0.0017, "step": 9901 }, { "epoch": 33.004527027027024, "grad_norm": 0.10186132788658142, "learning_rate": 3.90625e-07, "loss": 0.0039, "step": 9902 }, { "epoch": 33.00456081081081, "grad_norm": 2.723597288131714, "learning_rate": 3.90625e-07, "loss": 0.0087, "step": 9903 }, { "epoch": 33.00459459459459, "grad_norm": 10.63803768157959, "learning_rate": 3.90625e-07, "loss": 0.054, "step": 9904 }, { "epoch": 33.00462837837838, "grad_norm": 0.0028645945712924004, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9905 }, { "epoch": 33.00466216216216, "grad_norm": 0.005323644261807203, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9906 }, { "epoch": 33.00469594594595, "grad_norm": 0.18637306988239288, "learning_rate": 3.90625e-07, "loss": 0.0065, "step": 9907 }, { "epoch": 33.00472972972973, "grad_norm": 0.00714282039552927, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9908 }, { "epoch": 33.00476351351352, "grad_norm": 0.0013328511267900467, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9909 }, { "epoch": 33.004797297297294, "grad_norm": 0.0034301516134291887, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9910 }, { "epoch": 33.00483108108108, "grad_norm": 0.019348766654729843, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9911 }, { "epoch": 33.004864864864864, "grad_norm": 0.3134959638118744, "learning_rate": 3.90625e-07, "loss": 0.0012, "step": 9912 }, { "epoch": 33.00489864864865, "grad_norm": 0.14637143909931183, "learning_rate": 3.90625e-07, "loss": 0.0035, "step": 9913 }, { "epoch": 33.00493243243243, "grad_norm": 0.0027037812396883965, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9914 }, { "epoch": 33.00496621621622, "grad_norm": 0.0029494566842913628, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9915 }, { "epoch": 33.005, "grad_norm": 0.005582113284617662, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9916 }, { "epoch": 33.00503378378379, "grad_norm": 15.299764633178711, "learning_rate": 3.90625e-07, "loss": 0.0293, "step": 9917 }, { "epoch": 33.005067567567565, "grad_norm": 0.056285560131073, "learning_rate": 3.90625e-07, "loss": 0.0005, "step": 9918 }, { "epoch": 33.00510135135135, "grad_norm": 5.780520439147949, "learning_rate": 3.90625e-07, "loss": 0.052, "step": 9919 }, { "epoch": 33.005135135135134, "grad_norm": 0.008322244510054588, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9920 }, { "epoch": 33.00516891891892, "grad_norm": 0.001608594786375761, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9921 }, { "epoch": 33.005202702702704, "grad_norm": 0.005771491210907698, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9922 }, { "epoch": 33.00523648648649, "grad_norm": 0.13723860681056976, "learning_rate": 3.90625e-07, "loss": 0.0043, "step": 9923 }, { "epoch": 33.00527027027027, "grad_norm": 0.008610463701188564, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9924 }, { "epoch": 33.00530405405405, "grad_norm": 0.030459821224212646, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9925 }, { "epoch": 33.005337837837835, "grad_norm": 0.34900200366973877, "learning_rate": 3.90625e-07, "loss": 0.0087, "step": 9926 }, { "epoch": 33.00537162162162, "grad_norm": 0.04523064196109772, "learning_rate": 3.90625e-07, "loss": 0.001, "step": 9927 }, { "epoch": 33.005405405405405, "grad_norm": 16.021507263183594, "learning_rate": 3.90625e-07, "loss": 0.1228, "step": 9928 }, { "epoch": 33.00543918918919, "grad_norm": 0.12685655057430267, "learning_rate": 3.90625e-07, "loss": 0.0046, "step": 9929 }, { "epoch": 33.005472972972974, "grad_norm": 0.001827893080189824, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9930 }, { "epoch": 33.00550675675676, "grad_norm": 0.18133606016635895, "learning_rate": 3.90625e-07, "loss": 0.0007, "step": 9931 }, { "epoch": 33.005540540540544, "grad_norm": 0.002771381288766861, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9932 }, { "epoch": 33.00557432432432, "grad_norm": 0.12330419570207596, "learning_rate": 3.90625e-07, "loss": 0.0027, "step": 9933 }, { "epoch": 33.005608108108106, "grad_norm": 0.09384261071681976, "learning_rate": 3.90625e-07, "loss": 0.0033, "step": 9934 }, { "epoch": 33.00564189189189, "grad_norm": 0.000927562708966434, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9935 }, { "epoch": 33.005675675675676, "grad_norm": 4.678101539611816, "learning_rate": 3.90625e-07, "loss": 0.3722, "step": 9936 }, { "epoch": 33.00570945945946, "grad_norm": 0.0025832366663962603, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9937 }, { "epoch": 33.005743243243245, "grad_norm": 4.836427211761475, "learning_rate": 3.90625e-07, "loss": 0.6107, "step": 9938 }, { "epoch": 33.00577702702703, "grad_norm": 0.005408735014498234, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9939 }, { "epoch": 33.005810810810814, "grad_norm": 43.589725494384766, "learning_rate": 3.90625e-07, "loss": 0.6273, "step": 9940 }, { "epoch": 33.00584459459459, "grad_norm": 0.034425411373376846, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9941 }, { "epoch": 33.00587837837838, "grad_norm": 0.0024886385072022676, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9942 }, { "epoch": 33.00591216216216, "grad_norm": 0.002326544839888811, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9943 }, { "epoch": 33.005945945945946, "grad_norm": 0.0009147901437245309, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9944 }, { "epoch": 33.00597972972973, "grad_norm": 0.02319388836622238, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9945 }, { "epoch": 33.006013513513516, "grad_norm": 0.0014443029649555683, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9946 }, { "epoch": 33.0060472972973, "grad_norm": 0.5611678957939148, "learning_rate": 3.90625e-07, "loss": 0.0015, "step": 9947 }, { "epoch": 33.00608108108108, "grad_norm": 0.001997792860493064, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9948 }, { "epoch": 33.00611486486486, "grad_norm": 0.1011742651462555, "learning_rate": 3.90625e-07, "loss": 0.0036, "step": 9949 }, { "epoch": 33.00614864864865, "grad_norm": 0.005372282117605209, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9950 }, { "epoch": 33.00618243243243, "grad_norm": 0.0012268864084035158, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9951 }, { "epoch": 33.00621621621622, "grad_norm": 0.04497186467051506, "learning_rate": 3.90625e-07, "loss": 0.0006, "step": 9952 }, { "epoch": 33.00625, "grad_norm": 0.0009772077901288867, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9953 }, { "epoch": 33.006283783783786, "grad_norm": 0.002170124091207981, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9954 }, { "epoch": 33.00631756756757, "grad_norm": 0.23974570631980896, "learning_rate": 3.90625e-07, "loss": 0.0013, "step": 9955 }, { "epoch": 33.00635135135135, "grad_norm": 0.01912335492670536, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9956 }, { "epoch": 33.00638513513513, "grad_norm": 0.1600516140460968, "learning_rate": 3.90625e-07, "loss": 0.0059, "step": 9957 }, { "epoch": 33.00641891891892, "grad_norm": 0.034714002162218094, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 9958 }, { "epoch": 33.0064527027027, "grad_norm": 0.0006181748467497528, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9959 }, { "epoch": 33.00648648648649, "grad_norm": 0.0016032180283218622, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9960 }, { "epoch": 33.00652027027027, "grad_norm": 0.7852601408958435, "learning_rate": 3.90625e-07, "loss": 0.0068, "step": 9961 }, { "epoch": 33.00655405405406, "grad_norm": 0.004028369206935167, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9962 }, { "epoch": 33.006587837837834, "grad_norm": 49.37603759765625, "learning_rate": 3.90625e-07, "loss": 0.3935, "step": 9963 }, { "epoch": 33.00662162162162, "grad_norm": 0.15879318118095398, "learning_rate": 3.90625e-07, "loss": 0.0045, "step": 9964 }, { "epoch": 33.006655405405404, "grad_norm": 0.019234254956245422, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9965 }, { "epoch": 33.00668918918919, "grad_norm": 0.005070572718977928, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9966 }, { "epoch": 33.00672297297297, "grad_norm": 0.0944858267903328, "learning_rate": 3.90625e-07, "loss": 0.0034, "step": 9967 }, { "epoch": 33.00675675675676, "grad_norm": 0.027933111414313316, "learning_rate": 3.90625e-07, "loss": 0.0008, "step": 9968 }, { "epoch": 33.00679054054054, "grad_norm": 27.736967086791992, "learning_rate": 3.90625e-07, "loss": 0.3748, "step": 9969 }, { "epoch": 33.00682432432433, "grad_norm": 0.10129915177822113, "learning_rate": 3.90625e-07, "loss": 0.0006, "step": 9970 }, { "epoch": 33.006858108108105, "grad_norm": 0.005712281912565231, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9971 }, { "epoch": 33.00689189189189, "grad_norm": 0.000747232639696449, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9972 }, { "epoch": 33.006925675675674, "grad_norm": 0.013324668630957603, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9973 }, { "epoch": 33.00695945945946, "grad_norm": 3.316540479660034, "learning_rate": 3.90625e-07, "loss": 0.4452, "step": 9974 }, { "epoch": 33.006993243243244, "grad_norm": 0.0722678080201149, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9975 }, { "epoch": 33.00702702702703, "grad_norm": 0.02876143716275692, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 9976 }, { "epoch": 33.00706081081081, "grad_norm": 0.0010096505284309387, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9977 }, { "epoch": 33.0070945945946, "grad_norm": 0.004558845888823271, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9978 }, { "epoch": 33.007128378378376, "grad_norm": 0.0014641409507021308, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9979 }, { "epoch": 33.00716216216216, "grad_norm": 0.12996935844421387, "learning_rate": 3.90625e-07, "loss": 0.0047, "step": 9980 }, { "epoch": 33.007195945945945, "grad_norm": 0.005237491335719824, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9981 }, { "epoch": 33.00722972972973, "grad_norm": 0.0018863081932067871, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9982 }, { "epoch": 33.007263513513514, "grad_norm": 0.001645209384150803, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9983 }, { "epoch": 33.0072972972973, "grad_norm": 0.005136517342180014, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9984 }, { "epoch": 33.007331081081084, "grad_norm": 0.00309663824737072, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9985 }, { "epoch": 33.00736486486486, "grad_norm": 0.0009330154862254858, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 9986 }, { "epoch": 33.007398648648646, "grad_norm": 0.0021664046216756105, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9987 }, { "epoch": 33.00743243243243, "grad_norm": 0.002785063348710537, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9988 }, { "epoch": 33.007466216216216, "grad_norm": 5.108492851257324, "learning_rate": 3.90625e-07, "loss": 0.1193, "step": 9989 }, { "epoch": 33.0075, "grad_norm": 0.0020234095863997936, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9990 }, { "epoch": 33.007533783783785, "grad_norm": 0.004981684498488903, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9991 }, { "epoch": 33.00756756756757, "grad_norm": 0.08824238926172256, "learning_rate": 3.90625e-07, "loss": 0.002, "step": 9992 }, { "epoch": 33.007601351351354, "grad_norm": 52.506038665771484, "learning_rate": 3.90625e-07, "loss": 0.2879, "step": 9993 }, { "epoch": 33.00763513513513, "grad_norm": 0.07126250118017197, "learning_rate": 3.90625e-07, "loss": 0.0018, "step": 9994 }, { "epoch": 33.00766891891892, "grad_norm": 0.009630697779357433, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9995 }, { "epoch": 33.0077027027027, "grad_norm": 2.19697904586792, "learning_rate": 3.90625e-07, "loss": 0.0064, "step": 9996 }, { "epoch": 33.007736486486486, "grad_norm": 1.24393892288208, "learning_rate": 3.90625e-07, "loss": 0.031, "step": 9997 }, { "epoch": 33.00777027027027, "grad_norm": 0.0017417118651792407, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 9998 }, { "epoch": 33.007804054054056, "grad_norm": 0.041284359991550446, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 9999 }, { "epoch": 33.00783783783784, "grad_norm": 0.0006974176503717899, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10000 }, { "epoch": 33.007871621621625, "grad_norm": 0.0016777397831901908, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10001 }, { "epoch": 33.0079054054054, "grad_norm": 0.0009758779779076576, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10002 }, { "epoch": 33.00793918918919, "grad_norm": 0.6184316277503967, "learning_rate": 3.90625e-07, "loss": 0.0028, "step": 10003 }, { "epoch": 33.00797297297297, "grad_norm": 0.012710046954452991, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10004 }, { "epoch": 33.00800675675676, "grad_norm": 0.3455437123775482, "learning_rate": 3.90625e-07, "loss": 0.001, "step": 10005 }, { "epoch": 33.00804054054054, "grad_norm": 1.2949135303497314, "learning_rate": 3.90625e-07, "loss": 0.0029, "step": 10006 }, { "epoch": 33.008074324324326, "grad_norm": 0.0011081858538091183, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10007 }, { "epoch": 33.00810810810811, "grad_norm": 0.02657926268875599, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10008 }, { "epoch": 33.00814189189189, "grad_norm": 0.0015710284933447838, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10009 }, { "epoch": 33.00817567567567, "grad_norm": 0.00768659682944417, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10010 }, { "epoch": 33.00820945945946, "grad_norm": 0.0018912763334810734, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10011 }, { "epoch": 33.00824324324324, "grad_norm": 0.0032852531876415014, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10012 }, { "epoch": 33.00827702702703, "grad_norm": 0.0011108001926913857, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10013 }, { "epoch": 33.00831081081081, "grad_norm": 0.004534272011369467, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10014 }, { "epoch": 33.0083445945946, "grad_norm": 0.004048200789839029, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10015 }, { "epoch": 33.00837837837838, "grad_norm": 0.0011772726429626346, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10016 }, { "epoch": 33.00841216216216, "grad_norm": 0.0024486007168889046, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10017 }, { "epoch": 33.008445945945944, "grad_norm": 0.037170786410570145, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10018 }, { "epoch": 33.00847972972973, "grad_norm": 0.32026708126068115, "learning_rate": 3.90625e-07, "loss": 0.0055, "step": 10019 }, { "epoch": 33.00851351351351, "grad_norm": 0.002566184615716338, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10020 }, { "epoch": 33.0085472972973, "grad_norm": 0.0009559052414260805, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10021 }, { "epoch": 33.00858108108108, "grad_norm": 0.7789846062660217, "learning_rate": 3.90625e-07, "loss": 0.0042, "step": 10022 }, { "epoch": 33.00861486486487, "grad_norm": 23.90822410583496, "learning_rate": 3.90625e-07, "loss": 0.0505, "step": 10023 }, { "epoch": 33.00864864864865, "grad_norm": 7.839721202850342, "learning_rate": 3.90625e-07, "loss": 0.0521, "step": 10024 }, { "epoch": 33.00868243243243, "grad_norm": 0.0018870820058509707, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10025 }, { "epoch": 33.008716216216214, "grad_norm": 0.0019423151388764381, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10026 }, { "epoch": 33.00875, "grad_norm": 11.235243797302246, "learning_rate": 3.90625e-07, "loss": 0.8505, "step": 10027 }, { "epoch": 33.008783783783784, "grad_norm": 3.539841413497925, "learning_rate": 3.90625e-07, "loss": 0.3734, "step": 10028 }, { "epoch": 33.00881756756757, "grad_norm": 0.007754335645586252, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10029 }, { "epoch": 33.00885135135135, "grad_norm": 0.07212530821561813, "learning_rate": 3.90625e-07, "loss": 0.0007, "step": 10030 }, { "epoch": 33.00888513513514, "grad_norm": 0.242909774184227, "learning_rate": 3.90625e-07, "loss": 0.0009, "step": 10031 }, { "epoch": 33.008918918918916, "grad_norm": 0.025740912184119225, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 10032 }, { "epoch": 33.0089527027027, "grad_norm": 0.0017802831716835499, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10033 }, { "epoch": 33.008986486486485, "grad_norm": 0.0021711557637900114, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10034 }, { "epoch": 33.00902027027027, "grad_norm": 0.0014817595947533846, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10035 }, { "epoch": 33.009054054054054, "grad_norm": 0.001421425025910139, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10036 }, { "epoch": 33.00908783783784, "grad_norm": 0.007020855322480202, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10037 }, { "epoch": 33.009121621621624, "grad_norm": 0.003190558170899749, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10038 }, { "epoch": 33.00915540540541, "grad_norm": 0.003824579529464245, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10039 }, { "epoch": 33.009189189189186, "grad_norm": 0.00161017628852278, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10040 }, { "epoch": 33.00922297297297, "grad_norm": 0.002213026862591505, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10041 }, { "epoch": 33.009256756756756, "grad_norm": 0.017478371039032936, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10042 }, { "epoch": 33.00929054054054, "grad_norm": 5.0274176597595215, "learning_rate": 3.90625e-07, "loss": 0.0618, "step": 10043 }, { "epoch": 33.009324324324325, "grad_norm": 0.01380128227174282, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10044 }, { "epoch": 33.00935810810811, "grad_norm": 0.2514980435371399, "learning_rate": 3.90625e-07, "loss": 0.003, "step": 10045 }, { "epoch": 33.009391891891894, "grad_norm": 0.09885657578706741, "learning_rate": 3.90625e-07, "loss": 0.0038, "step": 10046 }, { "epoch": 33.00942567567568, "grad_norm": 0.5433288812637329, "learning_rate": 3.90625e-07, "loss": 0.0182, "step": 10047 }, { "epoch": 33.00945945945946, "grad_norm": 0.007010755594819784, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10048 }, { "epoch": 33.00949324324324, "grad_norm": 0.001822633552365005, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10049 }, { "epoch": 33.009527027027026, "grad_norm": 0.0007951149600557983, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10050 }, { "epoch": 33.00956081081081, "grad_norm": 0.010991823859512806, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10051 }, { "epoch": 33.009594594594596, "grad_norm": 3.038038969039917, "learning_rate": 3.90625e-07, "loss": 0.0606, "step": 10052 }, { "epoch": 33.00962837837838, "grad_norm": 0.024372771382331848, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10053 }, { "epoch": 33.009662162162165, "grad_norm": 0.05954931676387787, "learning_rate": 3.90625e-07, "loss": 0.0006, "step": 10054 }, { "epoch": 33.00969594594594, "grad_norm": 0.03825206682085991, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 10055 }, { "epoch": 33.00972972972973, "grad_norm": 1.9298776388168335, "learning_rate": 3.90625e-07, "loss": 0.0059, "step": 10056 }, { "epoch": 33.00976351351351, "grad_norm": 0.0019740869756788015, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10057 }, { "epoch": 33.0097972972973, "grad_norm": 0.026527216657996178, "learning_rate": 3.90625e-07, "loss": 0.0005, "step": 10058 }, { "epoch": 33.00983108108108, "grad_norm": 5.134125709533691, "learning_rate": 3.90625e-07, "loss": 0.0144, "step": 10059 }, { "epoch": 33.009864864864866, "grad_norm": 0.15807603299617767, "learning_rate": 3.90625e-07, "loss": 0.0056, "step": 10060 }, { "epoch": 33.00989864864865, "grad_norm": 16.30689811706543, "learning_rate": 3.90625e-07, "loss": 0.2467, "step": 10061 }, { "epoch": 33.009932432432436, "grad_norm": 0.028876859694719315, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 10062 }, { "epoch": 33.00996621621621, "grad_norm": 0.0012884816387668252, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10063 }, { "epoch": 33.01, "grad_norm": 0.001268040738068521, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10064 }, { "epoch": 33.01, "eval_accuracy": 0.9030694668820679, "eval_loss": 0.5874162316322327, "eval_runtime": 33.0393, "eval_samples_per_second": 18.735, "eval_steps_per_second": 2.361, "step": 10064 }, { "epoch": 34.000033783783785, "grad_norm": 0.0010706569300964475, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10065 }, { "epoch": 34.00006756756757, "grad_norm": 0.00158982805442065, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10066 }, { "epoch": 34.000101351351354, "grad_norm": 0.003156134160235524, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10067 }, { "epoch": 34.00013513513513, "grad_norm": 0.06057959049940109, "learning_rate": 3.90625e-07, "loss": 0.0005, "step": 10068 }, { "epoch": 34.00016891891892, "grad_norm": 0.013545027934014797, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10069 }, { "epoch": 34.0002027027027, "grad_norm": 0.000792150036431849, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10070 }, { "epoch": 34.000236486486486, "grad_norm": 0.008368190377950668, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10071 }, { "epoch": 34.00027027027027, "grad_norm": 0.0021537260618060827, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10072 }, { "epoch": 34.000304054054055, "grad_norm": 0.0013577926438301802, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10073 }, { "epoch": 34.00033783783784, "grad_norm": 0.002420135773718357, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10074 }, { "epoch": 34.000371621621625, "grad_norm": 0.0014709477545693517, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10075 }, { "epoch": 34.0004054054054, "grad_norm": 0.022617075592279434, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10076 }, { "epoch": 34.00043918918919, "grad_norm": 0.3454964756965637, "learning_rate": 3.90625e-07, "loss": 0.0066, "step": 10077 }, { "epoch": 34.00047297297297, "grad_norm": 10.90247917175293, "learning_rate": 3.90625e-07, "loss": 0.0355, "step": 10078 }, { "epoch": 34.00050675675676, "grad_norm": 0.0006995273288339376, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10079 }, { "epoch": 34.00054054054054, "grad_norm": 0.001055674976669252, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10080 }, { "epoch": 34.000574324324326, "grad_norm": 0.04982468858361244, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 10081 }, { "epoch": 34.00060810810811, "grad_norm": 28.151641845703125, "learning_rate": 3.90625e-07, "loss": 0.1866, "step": 10082 }, { "epoch": 34.000641891891895, "grad_norm": 0.0021209989208728075, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10083 }, { "epoch": 34.00067567567567, "grad_norm": 0.00243341620080173, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10084 }, { "epoch": 34.00070945945946, "grad_norm": 0.0027232752181589603, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10085 }, { "epoch": 34.00074324324324, "grad_norm": 0.0016448948299512267, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10086 }, { "epoch": 34.00077702702703, "grad_norm": 0.001889004954136908, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10087 }, { "epoch": 34.00081081081081, "grad_norm": 0.9204635620117188, "learning_rate": 3.90625e-07, "loss": 0.018, "step": 10088 }, { "epoch": 34.0008445945946, "grad_norm": 12.303531646728516, "learning_rate": 3.90625e-07, "loss": 0.0639, "step": 10089 }, { "epoch": 34.00087837837838, "grad_norm": 0.0012517641298472881, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10090 }, { "epoch": 34.00091216216216, "grad_norm": 0.010258719325065613, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10091 }, { "epoch": 34.00094594594594, "grad_norm": 0.0018738994840532541, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10092 }, { "epoch": 34.00097972972973, "grad_norm": 0.035987552255392075, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 10093 }, { "epoch": 34.00101351351351, "grad_norm": 0.1416044384241104, "learning_rate": 3.90625e-07, "loss": 0.0006, "step": 10094 }, { "epoch": 34.0010472972973, "grad_norm": 10.605558395385742, "learning_rate": 3.90625e-07, "loss": 0.8121, "step": 10095 }, { "epoch": 34.00108108108108, "grad_norm": 0.006891495082527399, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10096 }, { "epoch": 34.00111486486487, "grad_norm": 11.512628555297852, "learning_rate": 3.90625e-07, "loss": 0.519, "step": 10097 }, { "epoch": 34.00114864864865, "grad_norm": 0.0032252478413283825, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10098 }, { "epoch": 34.00118243243243, "grad_norm": 0.004207529127597809, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10099 }, { "epoch": 34.001216216216214, "grad_norm": 0.11077942699193954, "learning_rate": 3.90625e-07, "loss": 0.0041, "step": 10100 }, { "epoch": 34.00125, "grad_norm": 0.0020832568407058716, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10101 }, { "epoch": 34.001283783783784, "grad_norm": 0.050889380276203156, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 10102 }, { "epoch": 34.00131756756757, "grad_norm": 0.006965768989175558, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10103 }, { "epoch": 34.00135135135135, "grad_norm": 0.005876925773918629, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10104 }, { "epoch": 34.00138513513514, "grad_norm": 0.00065990089206025, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10105 }, { "epoch": 34.00141891891892, "grad_norm": 0.00100306689273566, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10106 }, { "epoch": 34.0014527027027, "grad_norm": 41.92544937133789, "learning_rate": 3.90625e-07, "loss": 0.5311, "step": 10107 }, { "epoch": 34.001486486486485, "grad_norm": 0.0018039846327155828, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10108 }, { "epoch": 34.00152027027027, "grad_norm": 0.001901142532005906, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10109 }, { "epoch": 34.001554054054054, "grad_norm": 0.21469777822494507, "learning_rate": 3.90625e-07, "loss": 0.0041, "step": 10110 }, { "epoch": 34.00158783783784, "grad_norm": 0.0014515123330056667, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10111 }, { "epoch": 34.001621621621624, "grad_norm": 0.024259231984615326, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10112 }, { "epoch": 34.00165540540541, "grad_norm": 0.003470849944278598, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10113 }, { "epoch": 34.001689189189186, "grad_norm": 0.11477972567081451, "learning_rate": 3.90625e-07, "loss": 0.0041, "step": 10114 }, { "epoch": 34.00172297297297, "grad_norm": 0.005050907842814922, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10115 }, { "epoch": 34.001756756756755, "grad_norm": 0.007254071533679962, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10116 }, { "epoch": 34.00179054054054, "grad_norm": 0.002347019501030445, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10117 }, { "epoch": 34.001824324324325, "grad_norm": 0.11543845385313034, "learning_rate": 3.90625e-07, "loss": 0.0042, "step": 10118 }, { "epoch": 34.00185810810811, "grad_norm": 2.5019619464874268, "learning_rate": 3.90625e-07, "loss": 0.0731, "step": 10119 }, { "epoch": 34.001891891891894, "grad_norm": 0.0015674771275371313, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10120 }, { "epoch": 34.00192567567568, "grad_norm": 0.0011085271835327148, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10121 }, { "epoch": 34.00195945945946, "grad_norm": 0.45739516615867615, "learning_rate": 3.90625e-07, "loss": 0.0067, "step": 10122 }, { "epoch": 34.00199324324324, "grad_norm": 0.09995150566101074, "learning_rate": 3.90625e-07, "loss": 0.0037, "step": 10123 }, { "epoch": 34.002027027027026, "grad_norm": 0.004898015409708023, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10124 }, { "epoch": 34.00206081081081, "grad_norm": 0.000970917462836951, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10125 }, { "epoch": 34.002094594594595, "grad_norm": 0.032720375806093216, "learning_rate": 3.90625e-07, "loss": 0.0008, "step": 10126 }, { "epoch": 34.00212837837838, "grad_norm": 0.006347624585032463, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10127 }, { "epoch": 34.002162162162165, "grad_norm": 0.0025067951064556837, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10128 }, { "epoch": 34.00219594594594, "grad_norm": 0.0022687374148517847, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10129 }, { "epoch": 34.00222972972973, "grad_norm": 6.438855171203613, "learning_rate": 3.90625e-07, "loss": 0.3928, "step": 10130 }, { "epoch": 34.00226351351351, "grad_norm": 0.0013045638334006071, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10131 }, { "epoch": 34.0022972972973, "grad_norm": 0.19621765613555908, "learning_rate": 3.90625e-07, "loss": 0.0047, "step": 10132 }, { "epoch": 34.00233108108108, "grad_norm": 2.637925386428833, "learning_rate": 3.90625e-07, "loss": 0.0071, "step": 10133 }, { "epoch": 34.002364864864866, "grad_norm": 0.018881790339946747, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10134 }, { "epoch": 34.00239864864865, "grad_norm": 0.005476081743836403, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10135 }, { "epoch": 34.002432432432435, "grad_norm": 0.0022836255375295877, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10136 }, { "epoch": 34.00246621621621, "grad_norm": 0.005617726594209671, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10137 }, { "epoch": 34.0025, "grad_norm": 0.030639739707112312, "learning_rate": 3.90625e-07, "loss": 0.0005, "step": 10138 }, { "epoch": 34.00253378378378, "grad_norm": 0.0031965970993041992, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10139 }, { "epoch": 34.00256756756757, "grad_norm": 0.0007062412914820015, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10140 }, { "epoch": 34.00260135135135, "grad_norm": 0.06302370876073837, "learning_rate": 3.90625e-07, "loss": 0.0008, "step": 10141 }, { "epoch": 34.00263513513514, "grad_norm": 0.0016330546932294965, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10142 }, { "epoch": 34.00266891891892, "grad_norm": 0.0009400186827406287, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10143 }, { "epoch": 34.002702702702706, "grad_norm": 0.0022616388741880655, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10144 }, { "epoch": 34.002736486486484, "grad_norm": 0.001724863308481872, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10145 }, { "epoch": 34.00277027027027, "grad_norm": 0.0018859464908018708, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10146 }, { "epoch": 34.00280405405405, "grad_norm": 0.0008648295770399272, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10147 }, { "epoch": 34.00283783783784, "grad_norm": 42.65065002441406, "learning_rate": 3.90625e-07, "loss": 0.7151, "step": 10148 }, { "epoch": 34.00287162162162, "grad_norm": 4.918276786804199, "learning_rate": 3.90625e-07, "loss": 0.266, "step": 10149 }, { "epoch": 34.00290540540541, "grad_norm": 45.0455322265625, "learning_rate": 3.90625e-07, "loss": 0.1928, "step": 10150 }, { "epoch": 34.00293918918919, "grad_norm": 0.0016684409929439425, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10151 }, { "epoch": 34.00297297297297, "grad_norm": 0.011045235209167004, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10152 }, { "epoch": 34.003006756756754, "grad_norm": 8.501822471618652, "learning_rate": 3.90625e-07, "loss": 0.5701, "step": 10153 }, { "epoch": 34.00304054054054, "grad_norm": 0.003259134478867054, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10154 }, { "epoch": 34.003074324324324, "grad_norm": 0.5698793530464172, "learning_rate": 3.90625e-07, "loss": 0.0017, "step": 10155 }, { "epoch": 34.00310810810811, "grad_norm": 0.002174455439671874, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10156 }, { "epoch": 34.00314189189189, "grad_norm": 0.1370430588722229, "learning_rate": 3.90625e-07, "loss": 0.0048, "step": 10157 }, { "epoch": 34.00317567567568, "grad_norm": 0.22218672931194305, "learning_rate": 3.90625e-07, "loss": 0.004, "step": 10158 }, { "epoch": 34.00320945945946, "grad_norm": 0.0009140759357251227, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10159 }, { "epoch": 34.00324324324324, "grad_norm": 0.0024850342888385057, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10160 }, { "epoch": 34.003277027027025, "grad_norm": 0.013183638453483582, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10161 }, { "epoch": 34.00331081081081, "grad_norm": 0.000761422561481595, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10162 }, { "epoch": 34.003344594594594, "grad_norm": 3.370870590209961, "learning_rate": 3.90625e-07, "loss": 0.4356, "step": 10163 }, { "epoch": 34.00337837837838, "grad_norm": 0.0013161110691726208, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10164 }, { "epoch": 34.003412162162164, "grad_norm": 0.036853887140750885, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 10165 }, { "epoch": 34.00344594594595, "grad_norm": 0.0024050050415098667, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10166 }, { "epoch": 34.00347972972973, "grad_norm": 4.696739196777344, "learning_rate": 3.90625e-07, "loss": 0.1216, "step": 10167 }, { "epoch": 34.00351351351351, "grad_norm": 0.0011531297350302339, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10168 }, { "epoch": 34.003547297297295, "grad_norm": 0.0023351535201072693, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10169 }, { "epoch": 34.00358108108108, "grad_norm": 0.005630151834338903, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10170 }, { "epoch": 34.003614864864865, "grad_norm": 0.11278029531240463, "learning_rate": 3.90625e-07, "loss": 0.0042, "step": 10171 }, { "epoch": 34.00364864864865, "grad_norm": 0.004565891809761524, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10172 }, { "epoch": 34.003682432432434, "grad_norm": 0.0019717400427907705, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10173 }, { "epoch": 34.00371621621622, "grad_norm": 0.0015702147502452135, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10174 }, { "epoch": 34.00375, "grad_norm": 0.10959473997354507, "learning_rate": 3.90625e-07, "loss": 0.0026, "step": 10175 }, { "epoch": 34.00378378378378, "grad_norm": 0.002130017150193453, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10176 }, { "epoch": 34.003817567567566, "grad_norm": 0.005916767753660679, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10177 }, { "epoch": 34.00385135135135, "grad_norm": 0.115421824157238, "learning_rate": 3.90625e-07, "loss": 0.0043, "step": 10178 }, { "epoch": 34.003885135135135, "grad_norm": 20.80235481262207, "learning_rate": 3.90625e-07, "loss": 0.9213, "step": 10179 }, { "epoch": 34.00391891891892, "grad_norm": 0.00852135382592678, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10180 }, { "epoch": 34.003952702702705, "grad_norm": 0.0020580310374498367, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10181 }, { "epoch": 34.00398648648649, "grad_norm": 0.0010457051685079932, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10182 }, { "epoch": 34.00402027027027, "grad_norm": 0.0012599460314959288, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10183 }, { "epoch": 34.00405405405405, "grad_norm": 0.003640576498582959, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10184 }, { "epoch": 34.00408783783784, "grad_norm": 0.010434589348733425, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10185 }, { "epoch": 34.00412162162162, "grad_norm": 0.00496313301846385, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10186 }, { "epoch": 34.004155405405406, "grad_norm": 0.00802877452224493, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10187 }, { "epoch": 34.00418918918919, "grad_norm": 0.0010474100708961487, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10188 }, { "epoch": 34.004222972972975, "grad_norm": 0.1149517148733139, "learning_rate": 3.90625e-07, "loss": 0.0005, "step": 10189 }, { "epoch": 34.00425675675676, "grad_norm": 0.0021221216302365065, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10190 }, { "epoch": 34.00429054054054, "grad_norm": 0.003437032224610448, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10191 }, { "epoch": 34.00432432432432, "grad_norm": 0.005004568491131067, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10192 }, { "epoch": 34.00435810810811, "grad_norm": 0.007714989595115185, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10193 }, { "epoch": 34.00439189189189, "grad_norm": 0.12628787755966187, "learning_rate": 3.90625e-07, "loss": 0.0046, "step": 10194 }, { "epoch": 34.00442567567568, "grad_norm": 0.06121869012713432, "learning_rate": 3.90625e-07, "loss": 0.0005, "step": 10195 }, { "epoch": 34.00445945945946, "grad_norm": 0.0037110529374331236, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10196 }, { "epoch": 34.004493243243246, "grad_norm": 0.002197252120822668, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10197 }, { "epoch": 34.004527027027024, "grad_norm": 0.0032409350387752056, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10198 }, { "epoch": 34.00456081081081, "grad_norm": 0.00267432676628232, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10199 }, { "epoch": 34.00459459459459, "grad_norm": 0.023540010675787926, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 10200 }, { "epoch": 34.00462837837838, "grad_norm": 0.09373756498098373, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 10201 }, { "epoch": 34.00466216216216, "grad_norm": 0.0006307609146460891, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10202 }, { "epoch": 34.00469594594595, "grad_norm": 0.5892778635025024, "learning_rate": 3.90625e-07, "loss": 0.0154, "step": 10203 }, { "epoch": 34.00472972972973, "grad_norm": 0.07447639107704163, "learning_rate": 3.90625e-07, "loss": 0.0007, "step": 10204 }, { "epoch": 34.00476351351352, "grad_norm": 0.0049636587500572205, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10205 }, { "epoch": 34.004797297297294, "grad_norm": 0.001940655754879117, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10206 }, { "epoch": 34.00483108108108, "grad_norm": 0.8016286492347717, "learning_rate": 3.90625e-07, "loss": 0.0215, "step": 10207 }, { "epoch": 34.004864864864864, "grad_norm": 0.010694782249629498, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10208 }, { "epoch": 34.00489864864865, "grad_norm": 0.000766862474847585, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10209 }, { "epoch": 34.00493243243243, "grad_norm": 0.2553204298019409, "learning_rate": 3.90625e-07, "loss": 0.0014, "step": 10210 }, { "epoch": 34.00496621621622, "grad_norm": 0.0019193406915292144, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10211 }, { "epoch": 34.005, "grad_norm": 0.14081591367721558, "learning_rate": 3.90625e-07, "loss": 0.001, "step": 10212 }, { "epoch": 34.00503378378379, "grad_norm": 0.0014393271412700415, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10213 }, { "epoch": 34.005067567567565, "grad_norm": 0.0009623607038520277, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10214 }, { "epoch": 34.00510135135135, "grad_norm": 3.2456271648406982, "learning_rate": 3.90625e-07, "loss": 0.3565, "step": 10215 }, { "epoch": 34.005135135135134, "grad_norm": 0.1103249043226242, "learning_rate": 3.90625e-07, "loss": 0.004, "step": 10216 }, { "epoch": 34.00516891891892, "grad_norm": 10.833685874938965, "learning_rate": 3.90625e-07, "loss": 0.0332, "step": 10217 }, { "epoch": 34.005202702702704, "grad_norm": 0.0013516682665795088, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10218 }, { "epoch": 34.00523648648649, "grad_norm": 0.11027068644762039, "learning_rate": 3.90625e-07, "loss": 0.0029, "step": 10219 }, { "epoch": 34.00527027027027, "grad_norm": 0.004579400643706322, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10220 }, { "epoch": 34.00530405405405, "grad_norm": 0.0010928206611424685, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10221 }, { "epoch": 34.005337837837835, "grad_norm": 9.405837059020996, "learning_rate": 3.90625e-07, "loss": 0.181, "step": 10222 }, { "epoch": 34.00537162162162, "grad_norm": 24.168556213378906, "learning_rate": 3.90625e-07, "loss": 0.0423, "step": 10223 }, { "epoch": 34.005405405405405, "grad_norm": 0.40383490920066833, "learning_rate": 3.90625e-07, "loss": 0.002, "step": 10224 }, { "epoch": 34.00543918918919, "grad_norm": 0.03458327054977417, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10225 }, { "epoch": 34.005472972972974, "grad_norm": 0.023976163938641548, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 10226 }, { "epoch": 34.00550675675676, "grad_norm": 0.001956065883859992, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10227 }, { "epoch": 34.005540540540544, "grad_norm": 0.002062438288703561, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10228 }, { "epoch": 34.00557432432432, "grad_norm": 0.0015319299418479204, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10229 }, { "epoch": 34.005608108108106, "grad_norm": 0.02303488552570343, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10230 }, { "epoch": 34.00564189189189, "grad_norm": 0.0016029097605496645, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10231 }, { "epoch": 34.005675675675676, "grad_norm": 18.248565673828125, "learning_rate": 3.90625e-07, "loss": 0.0747, "step": 10232 }, { "epoch": 34.00570945945946, "grad_norm": 0.13197852671146393, "learning_rate": 3.90625e-07, "loss": 0.005, "step": 10233 }, { "epoch": 34.005743243243245, "grad_norm": 0.0010218273382633924, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10234 }, { "epoch": 34.00577702702703, "grad_norm": 0.0026013641618192196, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10235 }, { "epoch": 34.005810810810814, "grad_norm": 0.0016248440369963646, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10236 }, { "epoch": 34.00584459459459, "grad_norm": 0.00840222742408514, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10237 }, { "epoch": 34.00587837837838, "grad_norm": 0.000695066642947495, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10238 }, { "epoch": 34.00591216216216, "grad_norm": 0.0016367865027859807, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10239 }, { "epoch": 34.005945945945946, "grad_norm": 0.001593286287970841, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10240 }, { "epoch": 34.00597972972973, "grad_norm": 1.2792601585388184, "learning_rate": 3.90625e-07, "loss": 0.0128, "step": 10241 }, { "epoch": 34.006013513513516, "grad_norm": 1.0148178339004517, "learning_rate": 3.90625e-07, "loss": 0.0133, "step": 10242 }, { "epoch": 34.0060472972973, "grad_norm": 0.010562626644968987, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10243 }, { "epoch": 34.00608108108108, "grad_norm": 0.13161002099514008, "learning_rate": 3.90625e-07, "loss": 0.0049, "step": 10244 }, { "epoch": 34.00611486486486, "grad_norm": 0.02976195141673088, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 10245 }, { "epoch": 34.00614864864865, "grad_norm": 0.0005549159832298756, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10246 }, { "epoch": 34.00618243243243, "grad_norm": 0.16422545909881592, "learning_rate": 3.90625e-07, "loss": 0.0006, "step": 10247 }, { "epoch": 34.00621621621622, "grad_norm": 0.20813381671905518, "learning_rate": 3.90625e-07, "loss": 0.0067, "step": 10248 }, { "epoch": 34.00625, "grad_norm": 0.025117939338088036, "learning_rate": 3.90625e-07, "loss": 0.0005, "step": 10249 }, { "epoch": 34.006283783783786, "grad_norm": 0.0008880704990588129, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10250 }, { "epoch": 34.00631756756757, "grad_norm": 0.1640029400587082, "learning_rate": 3.90625e-07, "loss": 0.0024, "step": 10251 }, { "epoch": 34.00635135135135, "grad_norm": 0.005463792011141777, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10252 }, { "epoch": 34.00638513513513, "grad_norm": 0.000725226360373199, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10253 }, { "epoch": 34.00641891891892, "grad_norm": 0.000624735897872597, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10254 }, { "epoch": 34.0064527027027, "grad_norm": 0.005527269560843706, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10255 }, { "epoch": 34.00648648648649, "grad_norm": 0.011659665033221245, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10256 }, { "epoch": 34.00652027027027, "grad_norm": 3.951486349105835, "learning_rate": 3.90625e-07, "loss": 0.4652, "step": 10257 }, { "epoch": 34.00655405405406, "grad_norm": 0.004324670415371656, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10258 }, { "epoch": 34.006587837837834, "grad_norm": 0.007706587202847004, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10259 }, { "epoch": 34.00662162162162, "grad_norm": 6.007850646972656, "learning_rate": 3.90625e-07, "loss": 0.2889, "step": 10260 }, { "epoch": 34.006655405405404, "grad_norm": 0.009136057458817959, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10261 }, { "epoch": 34.00668918918919, "grad_norm": 0.1074424758553505, "learning_rate": 3.90625e-07, "loss": 0.0039, "step": 10262 }, { "epoch": 34.00672297297297, "grad_norm": 0.002077963436022401, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10263 }, { "epoch": 34.00675675675676, "grad_norm": 0.009937801398336887, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10264 }, { "epoch": 34.00679054054054, "grad_norm": 0.001097019761800766, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10265 }, { "epoch": 34.00682432432433, "grad_norm": 0.004605032037943602, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10266 }, { "epoch": 34.006858108108105, "grad_norm": 0.0016382311005145311, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10267 }, { "epoch": 34.00689189189189, "grad_norm": 0.0011401150841265917, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10268 }, { "epoch": 34.006925675675674, "grad_norm": 0.05991801992058754, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10269 }, { "epoch": 34.00695945945946, "grad_norm": 0.0010995108168572187, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10270 }, { "epoch": 34.006993243243244, "grad_norm": 0.11372099071741104, "learning_rate": 3.90625e-07, "loss": 0.0041, "step": 10271 }, { "epoch": 34.00702702702703, "grad_norm": 1.3082914352416992, "learning_rate": 3.90625e-07, "loss": 0.0277, "step": 10272 }, { "epoch": 34.00706081081081, "grad_norm": 0.007657807320356369, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10273 }, { "epoch": 34.0070945945946, "grad_norm": 0.1454145759344101, "learning_rate": 3.90625e-07, "loss": 0.0005, "step": 10274 }, { "epoch": 34.007128378378376, "grad_norm": 0.009864446707069874, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10275 }, { "epoch": 34.00716216216216, "grad_norm": 0.0029125504661351442, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10276 }, { "epoch": 34.007195945945945, "grad_norm": 0.05658109858632088, "learning_rate": 3.90625e-07, "loss": 0.0007, "step": 10277 }, { "epoch": 34.00722972972973, "grad_norm": 1.7606074810028076, "learning_rate": 3.90625e-07, "loss": 0.004, "step": 10278 }, { "epoch": 34.007263513513514, "grad_norm": 0.07409996539354324, "learning_rate": 3.90625e-07, "loss": 0.0023, "step": 10279 }, { "epoch": 34.0072972972973, "grad_norm": 3.6872968673706055, "learning_rate": 3.90625e-07, "loss": 0.0958, "step": 10280 }, { "epoch": 34.007331081081084, "grad_norm": 0.0008781425422057509, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10281 }, { "epoch": 34.00736486486486, "grad_norm": 2.4249112606048584, "learning_rate": 3.90625e-07, "loss": 0.0863, "step": 10282 }, { "epoch": 34.007398648648646, "grad_norm": 0.1111249104142189, "learning_rate": 3.90625e-07, "loss": 0.0042, "step": 10283 }, { "epoch": 34.00743243243243, "grad_norm": 1.3301308155059814, "learning_rate": 3.90625e-07, "loss": 0.0242, "step": 10284 }, { "epoch": 34.007466216216216, "grad_norm": 0.002483893418684602, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10285 }, { "epoch": 34.0075, "grad_norm": 1.139278769493103, "learning_rate": 3.90625e-07, "loss": 0.032, "step": 10286 }, { "epoch": 34.007533783783785, "grad_norm": 0.0014151079813018441, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10287 }, { "epoch": 34.00756756756757, "grad_norm": 0.0011414389591664076, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10288 }, { "epoch": 34.007601351351354, "grad_norm": 0.0057556820102036, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10289 }, { "epoch": 34.00763513513513, "grad_norm": 3.3976075649261475, "learning_rate": 3.90625e-07, "loss": 0.4197, "step": 10290 }, { "epoch": 34.00766891891892, "grad_norm": 17.371110916137695, "learning_rate": 3.90625e-07, "loss": 0.7936, "step": 10291 }, { "epoch": 34.0077027027027, "grad_norm": 0.015302698127925396, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10292 }, { "epoch": 34.007736486486486, "grad_norm": 0.0012518562143668532, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10293 }, { "epoch": 34.00777027027027, "grad_norm": 0.001069920603185892, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10294 }, { "epoch": 34.007804054054056, "grad_norm": 0.25764963030815125, "learning_rate": 3.90625e-07, "loss": 0.0009, "step": 10295 }, { "epoch": 34.00783783783784, "grad_norm": 0.01229463703930378, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10296 }, { "epoch": 34.007871621621625, "grad_norm": 0.018636981025338173, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 10297 }, { "epoch": 34.0079054054054, "grad_norm": 0.001300255418755114, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10298 }, { "epoch": 34.00793918918919, "grad_norm": 0.0027812402695417404, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10299 }, { "epoch": 34.00797297297297, "grad_norm": 0.0023030147422105074, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10300 }, { "epoch": 34.00800675675676, "grad_norm": 0.2220362275838852, "learning_rate": 3.90625e-07, "loss": 0.0015, "step": 10301 }, { "epoch": 34.00804054054054, "grad_norm": 0.017995059490203857, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10302 }, { "epoch": 34.008074324324326, "grad_norm": 3.1788077354431152, "learning_rate": 3.90625e-07, "loss": 0.0115, "step": 10303 }, { "epoch": 34.00810810810811, "grad_norm": 19.426105499267578, "learning_rate": 3.90625e-07, "loss": 0.4254, "step": 10304 }, { "epoch": 34.00814189189189, "grad_norm": 0.004710909444838762, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10305 }, { "epoch": 34.00817567567567, "grad_norm": 0.0011849404545500875, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10306 }, { "epoch": 34.00820945945946, "grad_norm": 0.0034156108740717173, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10307 }, { "epoch": 34.00824324324324, "grad_norm": 0.07273173332214355, "learning_rate": 3.90625e-07, "loss": 0.0006, "step": 10308 }, { "epoch": 34.00827702702703, "grad_norm": 0.0009413044899702072, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10309 }, { "epoch": 34.00831081081081, "grad_norm": 0.005058085545897484, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10310 }, { "epoch": 34.0083445945946, "grad_norm": 0.0192573219537735, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10311 }, { "epoch": 34.00837837837838, "grad_norm": 0.32455506920814514, "learning_rate": 3.90625e-07, "loss": 0.0065, "step": 10312 }, { "epoch": 34.00841216216216, "grad_norm": 18.744728088378906, "learning_rate": 3.90625e-07, "loss": 0.6131, "step": 10313 }, { "epoch": 34.008445945945944, "grad_norm": 0.002479841699823737, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10314 }, { "epoch": 34.00847972972973, "grad_norm": 0.04068697988986969, "learning_rate": 3.90625e-07, "loss": 0.0003, "step": 10315 }, { "epoch": 34.00851351351351, "grad_norm": 0.3480031192302704, "learning_rate": 3.90625e-07, "loss": 0.0024, "step": 10316 }, { "epoch": 34.0085472972973, "grad_norm": 0.003474531229585409, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10317 }, { "epoch": 34.00858108108108, "grad_norm": 67.6323471069336, "learning_rate": 3.90625e-07, "loss": 0.2349, "step": 10318 }, { "epoch": 34.00861486486487, "grad_norm": 0.004268398974090815, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10319 }, { "epoch": 34.00864864864865, "grad_norm": 0.000811526901088655, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10320 }, { "epoch": 34.00868243243243, "grad_norm": 0.001097380300052464, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10321 }, { "epoch": 34.008716216216214, "grad_norm": 0.004593044985085726, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10322 }, { "epoch": 34.00875, "grad_norm": 0.0031586256809532642, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10323 }, { "epoch": 34.008783783783784, "grad_norm": 0.5292773842811584, "learning_rate": 3.90625e-07, "loss": 0.0061, "step": 10324 }, { "epoch": 34.00881756756757, "grad_norm": 0.003647193545475602, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10325 }, { "epoch": 34.00885135135135, "grad_norm": 8.389702796936035, "learning_rate": 3.90625e-07, "loss": 0.3336, "step": 10326 }, { "epoch": 34.00888513513514, "grad_norm": 0.001664226409047842, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10327 }, { "epoch": 34.008918918918916, "grad_norm": 0.07140666246414185, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10328 }, { "epoch": 34.0089527027027, "grad_norm": 0.0055926102213561535, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10329 }, { "epoch": 34.008986486486485, "grad_norm": 0.005324197467416525, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10330 }, { "epoch": 34.00902027027027, "grad_norm": 0.23064500093460083, "learning_rate": 3.90625e-07, "loss": 0.0045, "step": 10331 }, { "epoch": 34.009054054054054, "grad_norm": 0.050962191075086594, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 10332 }, { "epoch": 34.00908783783784, "grad_norm": 0.002515139291062951, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10333 }, { "epoch": 34.009121621621624, "grad_norm": 0.0008938582614064217, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10334 }, { "epoch": 34.00915540540541, "grad_norm": 1.1780438423156738, "learning_rate": 3.90625e-07, "loss": 0.0018, "step": 10335 }, { "epoch": 34.009189189189186, "grad_norm": 0.011711074970662594, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10336 }, { "epoch": 34.00922297297297, "grad_norm": 0.0006520373863168061, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10337 }, { "epoch": 34.009256756756756, "grad_norm": 0.001409023068845272, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10338 }, { "epoch": 34.00929054054054, "grad_norm": 0.0013923938386142254, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10339 }, { "epoch": 34.009324324324325, "grad_norm": 0.0029898236971348524, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10340 }, { "epoch": 34.00935810810811, "grad_norm": 0.002547228941693902, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10341 }, { "epoch": 34.009391891891894, "grad_norm": 0.09474105387926102, "learning_rate": 3.90625e-07, "loss": 0.0006, "step": 10342 }, { "epoch": 34.00942567567568, "grad_norm": 0.0008591265650466084, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10343 }, { "epoch": 34.00945945945946, "grad_norm": 0.11774266511201859, "learning_rate": 3.90625e-07, "loss": 0.0043, "step": 10344 }, { "epoch": 34.00949324324324, "grad_norm": 0.0030180984176695347, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10345 }, { "epoch": 34.009527027027026, "grad_norm": 0.006674756295979023, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10346 }, { "epoch": 34.00956081081081, "grad_norm": 0.03188442066311836, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10347 }, { "epoch": 34.009594594594596, "grad_norm": 0.15503562986850739, "learning_rate": 3.90625e-07, "loss": 0.0017, "step": 10348 }, { "epoch": 34.00962837837838, "grad_norm": 0.004664101172238588, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10349 }, { "epoch": 34.009662162162165, "grad_norm": 0.009932945482432842, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10350 }, { "epoch": 34.00969594594594, "grad_norm": 0.0011480285320430994, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10351 }, { "epoch": 34.00972972972973, "grad_norm": 13.146384239196777, "learning_rate": 3.90625e-07, "loss": 0.216, "step": 10352 }, { "epoch": 34.00976351351351, "grad_norm": 0.04930158331990242, "learning_rate": 3.90625e-07, "loss": 0.0004, "step": 10353 }, { "epoch": 34.0097972972973, "grad_norm": 0.0022436310537159443, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10354 }, { "epoch": 34.00983108108108, "grad_norm": 0.0018232460133731365, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10355 }, { "epoch": 34.009864864864866, "grad_norm": 0.0008130219648592174, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10356 }, { "epoch": 34.00989864864865, "grad_norm": 0.011346212588250637, "learning_rate": 3.90625e-07, "loss": 0.0002, "step": 10357 }, { "epoch": 34.009932432432436, "grad_norm": 0.0008322375360876322, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10358 }, { "epoch": 34.00996621621621, "grad_norm": 0.002898730803281069, "learning_rate": 3.90625e-07, "loss": 0.0001, "step": 10359 }, { "epoch": 34.01, "grad_norm": 0.0009738121880218387, "learning_rate": 3.90625e-07, "loss": 0.0, "step": 10360 }, { "epoch": 34.01, "eval_accuracy": 0.8966074313408724, "eval_loss": 0.6036667823791504, "eval_runtime": 32.9978, "eval_samples_per_second": 18.759, "eval_steps_per_second": 2.364, "step": 10360 }, { "epoch": 34.01, "step": 10360, "total_flos": 1.0327383317541814e+20, "train_loss": 0.08835294664600042, "train_runtime": 8140.7467, "train_samples_per_second": 29.088, "train_steps_per_second": 3.636 }, { "epoch": 34.01, "eval_accuracy": 0.939297124600639, "eval_loss": 0.3122161626815796, "eval_runtime": 17.1632, "eval_samples_per_second": 18.237, "eval_steps_per_second": 2.331, "step": 10360 }, { "epoch": 34.01, "eval_accuracy": 0.939297124600639, "eval_loss": 0.3122161626815796, "eval_runtime": 16.8893, "eval_samples_per_second": 18.532, "eval_steps_per_second": 2.368, "step": 10360 } ], "logging_steps": 1, "max_steps": 29600, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 10, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 10 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0327383317541814e+20, "train_batch_size": 8, "trial_name": null, "trial_params": null }