AraEuroBert-210M / trainer_state.json
Omartificial-Intelligence-Space's picture
upload files
65aa523 verified
{
"best_metric": 0.8237398652434835,
"best_model_checkpoint": "output/eurobert_simce_EuroBERT-EuroBERT-210m_32_bs_1_e/checkpoint-4500",
"epoch": 1.0,
"eval_steps": 500,
"global_step": 7813,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02559836170485089,
"grad_norm": 21741072.0,
"learning_rate": 1.2787723785166241e-05,
"loss": 8.8816,
"step": 200
},
{
"epoch": 0.05119672340970178,
"grad_norm": 11201852.0,
"learning_rate": 2.5575447570332482e-05,
"loss": 5.1404,
"step": 400
},
{
"epoch": 0.06399590426212723,
"eval_loss": 6.53036642074585,
"eval_runtime": 185.969,
"eval_samples_per_second": 35.538,
"eval_sequential_score": 0.785466694337272,
"eval_steps_per_second": 0.28,
"eval_sts-dev-128_pearson_cosine": 0.763999536812149,
"eval_sts-dev-128_spearman_cosine": 0.7711764958651725,
"eval_sts-dev-256_pearson_cosine": 0.7720362664289375,
"eval_sts-dev-256_spearman_cosine": 0.7765855658716928,
"eval_sts-dev-512_pearson_cosine": 0.7788408953704133,
"eval_sts-dev-512_spearman_cosine": 0.7817962751922598,
"eval_sts-dev-64_pearson_cosine": 0.7530255624130506,
"eval_sts-dev-64_spearman_cosine": 0.7635133266587258,
"eval_sts-dev-768_pearson_cosine": 0.7833066834602731,
"eval_sts-dev-768_spearman_cosine": 0.785466694337272,
"step": 500
},
{
"epoch": 0.07679508511455267,
"grad_norm": 97525040.0,
"learning_rate": 3.8363171355498725e-05,
"loss": 4.7789,
"step": 600
},
{
"epoch": 0.10239344681940356,
"grad_norm": 16968314.0,
"learning_rate": 4.987199544872707e-05,
"loss": 4.6845,
"step": 800
},
{
"epoch": 0.12799180852425446,
"grad_norm": 8517995.0,
"learning_rate": 4.844972265680558e-05,
"loss": 4.6628,
"step": 1000
},
{
"epoch": 0.12799180852425446,
"eval_loss": 6.329844951629639,
"eval_runtime": 443.2691,
"eval_samples_per_second": 14.91,
"eval_sequential_score": 0.7900474282041248,
"eval_steps_per_second": 0.117,
"eval_sts-dev-128_pearson_cosine": 0.7796253891599777,
"eval_sts-dev-128_spearman_cosine": 0.7828728829847589,
"eval_sts-dev-256_pearson_cosine": 0.7844919892349245,
"eval_sts-dev-256_spearman_cosine": 0.7854868395314357,
"eval_sts-dev-512_pearson_cosine": 0.7895438861374281,
"eval_sts-dev-512_spearman_cosine": 0.7887806711839089,
"eval_sts-dev-64_pearson_cosine": 0.7711864442090859,
"eval_sts-dev-64_spearman_cosine": 0.7756594477323392,
"eval_sts-dev-768_pearson_cosine": 0.7912711696792909,
"eval_sts-dev-768_spearman_cosine": 0.7900474282041248,
"step": 1000
},
{
"epoch": 0.15359017022910534,
"grad_norm": 9523596.0,
"learning_rate": 4.702744986488409e-05,
"loss": 4.2947,
"step": 1200
},
{
"epoch": 0.17918853193395623,
"grad_norm": 12880760.0,
"learning_rate": 4.56051770729626e-05,
"loss": 4.0669,
"step": 1400
},
{
"epoch": 0.19198771278638166,
"eval_loss": 6.11918830871582,
"eval_runtime": 442.8341,
"eval_samples_per_second": 14.924,
"eval_sequential_score": 0.776235475618058,
"eval_steps_per_second": 0.117,
"eval_sts-dev-128_pearson_cosine": 0.7556070600058153,
"eval_sts-dev-128_spearman_cosine": 0.7650883993782764,
"eval_sts-dev-256_pearson_cosine": 0.7615982531224779,
"eval_sts-dev-256_spearman_cosine": 0.7688372980643342,
"eval_sts-dev-512_pearson_cosine": 0.7658162533685451,
"eval_sts-dev-512_spearman_cosine": 0.7717076619559236,
"eval_sts-dev-64_pearson_cosine": 0.7412214467196323,
"eval_sts-dev-64_spearman_cosine": 0.7546465937668674,
"eval_sts-dev-768_pearson_cosine": 0.7709585212331336,
"eval_sts-dev-768_spearman_cosine": 0.776235475618058,
"step": 1500
},
{
"epoch": 0.20478689363880712,
"grad_norm": 5121008.5,
"learning_rate": 4.4182904281041105e-05,
"loss": 3.7798,
"step": 1600
},
{
"epoch": 0.230385255343658,
"grad_norm": 8532118.0,
"learning_rate": 4.276063148911961e-05,
"loss": 3.6295,
"step": 1800
},
{
"epoch": 0.2559836170485089,
"grad_norm": 5177615.0,
"learning_rate": 4.133835869719813e-05,
"loss": 3.4326,
"step": 2000
},
{
"epoch": 0.2559836170485089,
"eval_loss": 5.5250773429870605,
"eval_runtime": 129.0233,
"eval_samples_per_second": 51.223,
"eval_sequential_score": 0.7967876172773333,
"eval_steps_per_second": 0.403,
"eval_sts-dev-128_pearson_cosine": 0.784502590249166,
"eval_sts-dev-128_spearman_cosine": 0.7904922950886314,
"eval_sts-dev-256_pearson_cosine": 0.7879728763645151,
"eval_sts-dev-256_spearman_cosine": 0.7926398088881677,
"eval_sts-dev-512_pearson_cosine": 0.79034320657993,
"eval_sts-dev-512_spearman_cosine": 0.7941246347219933,
"eval_sts-dev-64_pearson_cosine": 0.7732918356795019,
"eval_sts-dev-64_spearman_cosine": 0.7822318891319547,
"eval_sts-dev-768_pearson_cosine": 0.7932888301272122,
"eval_sts-dev-768_spearman_cosine": 0.7967876172773333,
"step": 2000
},
{
"epoch": 0.2815819787533598,
"grad_norm": 3656165.0,
"learning_rate": 3.9916085905276635e-05,
"loss": 3.5024,
"step": 2200
},
{
"epoch": 0.3071803404582107,
"grad_norm": 4165362.5,
"learning_rate": 3.849381311335514e-05,
"loss": 3.2039,
"step": 2400
},
{
"epoch": 0.3199795213106361,
"eval_loss": 5.417263507843018,
"eval_runtime": 190.9081,
"eval_samples_per_second": 34.619,
"eval_sequential_score": 0.7985143359688982,
"eval_steps_per_second": 0.272,
"eval_sts-dev-128_pearson_cosine": 0.7806598252859374,
"eval_sts-dev-128_spearman_cosine": 0.7903599418037432,
"eval_sts-dev-256_pearson_cosine": 0.7884086400168866,
"eval_sts-dev-256_spearman_cosine": 0.7946240645451771,
"eval_sts-dev-512_pearson_cosine": 0.7915267570052114,
"eval_sts-dev-512_spearman_cosine": 0.7957194058029897,
"eval_sts-dev-64_pearson_cosine": 0.7654385410326767,
"eval_sts-dev-64_spearman_cosine": 0.7805870680928543,
"eval_sts-dev-768_pearson_cosine": 0.7950747299618343,
"eval_sts-dev-768_spearman_cosine": 0.7985143359688982,
"step": 2500
},
{
"epoch": 0.33277870216306155,
"grad_norm": 3111441.75,
"learning_rate": 3.707154032143365e-05,
"loss": 3.1517,
"step": 2600
},
{
"epoch": 0.35837706386791246,
"grad_norm": 4315246.0,
"learning_rate": 3.5649267529512165e-05,
"loss": 3.0409,
"step": 2800
},
{
"epoch": 0.3839754255727633,
"grad_norm": 4406995.0,
"learning_rate": 3.422699473759067e-05,
"loss": 2.9611,
"step": 3000
},
{
"epoch": 0.3839754255727633,
"eval_loss": 5.039449691772461,
"eval_runtime": 186.7714,
"eval_samples_per_second": 35.385,
"eval_sequential_score": 0.7922583016547031,
"eval_steps_per_second": 0.278,
"eval_sts-dev-128_pearson_cosine": 0.782435760922896,
"eval_sts-dev-128_spearman_cosine": 0.7847986500402634,
"eval_sts-dev-256_pearson_cosine": 0.785603571035357,
"eval_sts-dev-256_spearman_cosine": 0.7871271821843686,
"eval_sts-dev-512_pearson_cosine": 0.7882585155923556,
"eval_sts-dev-512_spearman_cosine": 0.7893899811664858,
"eval_sts-dev-64_pearson_cosine": 0.7731641525494573,
"eval_sts-dev-64_spearman_cosine": 0.7789396674425884,
"eval_sts-dev-768_pearson_cosine": 0.7914921308293312,
"eval_sts-dev-768_spearman_cosine": 0.7922583016547031,
"step": 3000
},
{
"epoch": 0.40957378727761423,
"grad_norm": 4295917.5,
"learning_rate": 3.280472194566918e-05,
"loss": 2.8913,
"step": 3200
},
{
"epoch": 0.43517214898246515,
"grad_norm": 3956720.0,
"learning_rate": 3.138244915374769e-05,
"loss": 2.6737,
"step": 3400
},
{
"epoch": 0.4479713298348906,
"eval_loss": 4.8449788093566895,
"eval_runtime": 381.2673,
"eval_samples_per_second": 17.334,
"eval_sequential_score": 0.8124001874236463,
"eval_steps_per_second": 0.136,
"eval_sts-dev-128_pearson_cosine": 0.7982139278806232,
"eval_sts-dev-128_spearman_cosine": 0.8076074750236868,
"eval_sts-dev-256_pearson_cosine": 0.7999244325189871,
"eval_sts-dev-256_spearman_cosine": 0.8075358060747592,
"eval_sts-dev-512_pearson_cosine": 0.8054012343851129,
"eval_sts-dev-512_spearman_cosine": 0.8110777459628828,
"eval_sts-dev-64_pearson_cosine": 0.7826221833277923,
"eval_sts-dev-64_spearman_cosine": 0.7968397814525646,
"eval_sts-dev-768_pearson_cosine": 0.8074395668796857,
"eval_sts-dev-768_spearman_cosine": 0.8124001874236463,
"step": 3500
},
{
"epoch": 0.460770510687316,
"grad_norm": 3219552.5,
"learning_rate": 2.99601763618262e-05,
"loss": 2.6488,
"step": 3600
},
{
"epoch": 0.4863688723921669,
"grad_norm": 3358243.75,
"learning_rate": 2.853790356990471e-05,
"loss": 2.6208,
"step": 3800
},
{
"epoch": 0.5119672340970178,
"grad_norm": 4434459.5,
"learning_rate": 2.7115630777983218e-05,
"loss": 2.4823,
"step": 4000
},
{
"epoch": 0.5119672340970178,
"eval_loss": 4.5710768699646,
"eval_runtime": 216.5327,
"eval_samples_per_second": 30.522,
"eval_sequential_score": 0.811115353774689,
"eval_steps_per_second": 0.24,
"eval_sts-dev-128_pearson_cosine": 0.7996500780619267,
"eval_sts-dev-128_spearman_cosine": 0.8075174124731305,
"eval_sts-dev-256_pearson_cosine": 0.8022455109638521,
"eval_sts-dev-256_spearman_cosine": 0.8081526580763048,
"eval_sts-dev-512_pearson_cosine": 0.8053149776225357,
"eval_sts-dev-512_spearman_cosine": 0.8101542245323032,
"eval_sts-dev-64_pearson_cosine": 0.7897403224233863,
"eval_sts-dev-64_spearman_cosine": 0.8014649100236256,
"eval_sts-dev-768_pearson_cosine": 0.8071703574474214,
"eval_sts-dev-768_spearman_cosine": 0.811115353774689,
"step": 4000
},
{
"epoch": 0.5375655958018687,
"grad_norm": 3851683.75,
"learning_rate": 2.569335798606173e-05,
"loss": 2.5081,
"step": 4200
},
{
"epoch": 0.5631639575067195,
"grad_norm": 3974658.0,
"learning_rate": 2.4271085194140237e-05,
"loss": 2.3827,
"step": 4400
},
{
"epoch": 0.575963138359145,
"eval_loss": 4.527626037597656,
"eval_runtime": 247.0855,
"eval_samples_per_second": 26.748,
"eval_sequential_score": 0.8237398652434835,
"eval_steps_per_second": 0.21,
"eval_sts-dev-128_pearson_cosine": 0.8088977658009835,
"eval_sts-dev-128_spearman_cosine": 0.8200038307453663,
"eval_sts-dev-256_pearson_cosine": 0.8120809058779974,
"eval_sts-dev-256_spearman_cosine": 0.8205438030370273,
"eval_sts-dev-512_pearson_cosine": 0.8157264805096027,
"eval_sts-dev-512_spearman_cosine": 0.8226500864435473,
"eval_sts-dev-64_pearson_cosine": 0.7964069841493276,
"eval_sts-dev-64_spearman_cosine": 0.8117000425044992,
"eval_sts-dev-768_pearson_cosine": 0.8179809027318157,
"eval_sts-dev-768_spearman_cosine": 0.8237398652434835,
"step": 4500
},
{
"epoch": 0.5887623192115704,
"grad_norm": 2969929.75,
"learning_rate": 2.284881240221875e-05,
"loss": 2.2867,
"step": 4600
},
{
"epoch": 0.6143606809164214,
"grad_norm": 3068778.0,
"learning_rate": 2.1426539610297256e-05,
"loss": 2.2608,
"step": 4800
},
{
"epoch": 0.6399590426212722,
"grad_norm": 5789758.5,
"learning_rate": 2.0004266818375767e-05,
"loss": 2.6285,
"step": 5000
},
{
"epoch": 0.6399590426212722,
"eval_loss": 2.69280743598938,
"eval_runtime": 587.2079,
"eval_samples_per_second": 11.255,
"eval_sequential_score": 0.812358680832537,
"eval_steps_per_second": 0.089,
"eval_sts-dev-128_pearson_cosine": 0.805215933596938,
"eval_sts-dev-128_spearman_cosine": 0.8087200580616569,
"eval_sts-dev-256_pearson_cosine": 0.8085076472836253,
"eval_sts-dev-256_spearman_cosine": 0.8099002132418758,
"eval_sts-dev-512_pearson_cosine": 0.8104747450142471,
"eval_sts-dev-512_spearman_cosine": 0.8112676803940946,
"eval_sts-dev-64_pearson_cosine": 0.7956052250413164,
"eval_sts-dev-64_spearman_cosine": 0.8022672223914163,
"eval_sts-dev-768_pearson_cosine": 0.8122169140710348,
"eval_sts-dev-768_spearman_cosine": 0.812358680832537,
"step": 5000
},
{
"epoch": 0.6655574043261231,
"grad_norm": 7265908.5,
"learning_rate": 1.8581994026454275e-05,
"loss": 3.2569,
"step": 5200
},
{
"epoch": 0.6911557660309741,
"grad_norm": 5744435.0,
"learning_rate": 1.7159721234532783e-05,
"loss": 2.7108,
"step": 5400
},
{
"epoch": 0.7039549468833994,
"eval_loss": 3.4081127643585205,
"eval_runtime": 113.2957,
"eval_samples_per_second": 58.334,
"eval_sequential_score": 0.8112072214643352,
"eval_steps_per_second": 0.459,
"eval_sts-dev-128_pearson_cosine": 0.8036120610302749,
"eval_sts-dev-128_spearman_cosine": 0.8060065068978162,
"eval_sts-dev-256_pearson_cosine": 0.8076653976665353,
"eval_sts-dev-256_spearman_cosine": 0.8079945036667597,
"eval_sts-dev-512_pearson_cosine": 0.8103438010262101,
"eval_sts-dev-512_spearman_cosine": 0.8099837098639602,
"eval_sts-dev-64_pearson_cosine": 0.7942652314065892,
"eval_sts-dev-64_spearman_cosine": 0.7993950394097328,
"eval_sts-dev-768_pearson_cosine": 0.8122845848971061,
"eval_sts-dev-768_spearman_cosine": 0.8112072214643352,
"step": 5500
},
{
"epoch": 0.7167541277358249,
"grad_norm": 5608753.5,
"learning_rate": 1.5737448442611294e-05,
"loss": 2.2756,
"step": 5600
},
{
"epoch": 0.7423524894406758,
"grad_norm": 4639111.5,
"learning_rate": 1.4315175650689802e-05,
"loss": 1.9964,
"step": 5800
},
{
"epoch": 0.7679508511455266,
"grad_norm": 3252142.5,
"learning_rate": 1.2892902858768313e-05,
"loss": 1.8278,
"step": 6000
},
{
"epoch": 0.7679508511455266,
"eval_loss": 3.626107931137085,
"eval_runtime": 402.388,
"eval_samples_per_second": 16.424,
"eval_sequential_score": 0.8115702497698598,
"eval_steps_per_second": 0.129,
"eval_sts-dev-128_pearson_cosine": 0.8045525818531101,
"eval_sts-dev-128_spearman_cosine": 0.8070988970713203,
"eval_sts-dev-256_pearson_cosine": 0.808009240681977,
"eval_sts-dev-256_spearman_cosine": 0.8087801893710216,
"eval_sts-dev-512_pearson_cosine": 0.8100529203161405,
"eval_sts-dev-512_spearman_cosine": 0.8101431817309978,
"eval_sts-dev-64_pearson_cosine": 0.7958099227865137,
"eval_sts-dev-64_spearman_cosine": 0.8012609269210379,
"eval_sts-dev-768_pearson_cosine": 0.8121800215134509,
"eval_sts-dev-768_spearman_cosine": 0.8115702497698598,
"step": 6000
},
{
"epoch": 0.7935492128503776,
"grad_norm": 7875703.0,
"learning_rate": 1.147063006684682e-05,
"loss": 1.7105,
"step": 6200
},
{
"epoch": 0.8191475745552285,
"grad_norm": 4817139.5,
"learning_rate": 1.0048357274925332e-05,
"loss": 1.5719,
"step": 6400
},
{
"epoch": 0.831946755407654,
"eval_loss": 3.7825615406036377,
"eval_runtime": 260.9688,
"eval_samples_per_second": 25.325,
"eval_sequential_score": 0.809746911527668,
"eval_steps_per_second": 0.199,
"eval_sts-dev-128_pearson_cosine": 0.8019992344978342,
"eval_sts-dev-128_spearman_cosine": 0.8040026735198376,
"eval_sts-dev-256_pearson_cosine": 0.8069582374847454,
"eval_sts-dev-256_spearman_cosine": 0.8072454625835676,
"eval_sts-dev-512_pearson_cosine": 0.8088522922289604,
"eval_sts-dev-512_spearman_cosine": 0.8085037600756491,
"eval_sts-dev-64_pearson_cosine": 0.7917208895813888,
"eval_sts-dev-64_spearman_cosine": 0.7966340332099681,
"eval_sts-dev-768_pearson_cosine": 0.8110657541385573,
"eval_sts-dev-768_spearman_cosine": 0.809746911527668,
"step": 6500
},
{
"epoch": 0.8447459362600793,
"grad_norm": 3659550.25,
"learning_rate": 8.626084483003841e-06,
"loss": 1.4569,
"step": 6600
},
{
"epoch": 0.8703442979649303,
"grad_norm": 5161893.0,
"learning_rate": 7.20381169108235e-06,
"loss": 1.3572,
"step": 6800
},
{
"epoch": 0.8959426596697811,
"grad_norm": 3728513.5,
"learning_rate": 5.781538899160859e-06,
"loss": 1.2607,
"step": 7000
},
{
"epoch": 0.8959426596697811,
"eval_loss": 3.732253074645996,
"eval_runtime": 343.8,
"eval_samples_per_second": 19.223,
"eval_sequential_score": 0.8113987863085653,
"eval_steps_per_second": 0.151,
"eval_sts-dev-128_pearson_cosine": 0.8052203471354245,
"eval_sts-dev-128_spearman_cosine": 0.8069675064136675,
"eval_sts-dev-256_pearson_cosine": 0.8090504276321419,
"eval_sts-dev-256_spearman_cosine": 0.8092690932878782,
"eval_sts-dev-512_pearson_cosine": 0.8106848356090132,
"eval_sts-dev-512_spearman_cosine": 0.8102076026050016,
"eval_sts-dev-64_pearson_cosine": 0.7959836620564662,
"eval_sts-dev-64_spearman_cosine": 0.8004825005334193,
"eval_sts-dev-768_pearson_cosine": 0.8127183729174601,
"eval_sts-dev-768_spearman_cosine": 0.8113987863085653,
"step": 7000
},
{
"epoch": 0.921541021374632,
"grad_norm": 3384944.0,
"learning_rate": 4.359266107239369e-06,
"loss": 1.1676,
"step": 7200
},
{
"epoch": 0.9471393830794829,
"grad_norm": 3170591.5,
"learning_rate": 2.936993315317878e-06,
"loss": 1.1663,
"step": 7400
},
{
"epoch": 0.9599385639319084,
"eval_loss": 3.83072829246521,
"eval_runtime": 240.3095,
"eval_samples_per_second": 27.502,
"eval_sequential_score": 0.8100586279907306,
"eval_steps_per_second": 0.216,
"eval_sts-dev-128_pearson_cosine": 0.8028710019029521,
"eval_sts-dev-128_spearman_cosine": 0.8054855987917489,
"eval_sts-dev-256_pearson_cosine": 0.8076510620939634,
"eval_sts-dev-256_spearman_cosine": 0.8080588277305082,
"eval_sts-dev-512_pearson_cosine": 0.8092891955563192,
"eval_sts-dev-512_spearman_cosine": 0.8087644228771842,
"eval_sts-dev-64_pearson_cosine": 0.7923252906438638,
"eval_sts-dev-64_spearman_cosine": 0.7975941111911333,
"eval_sts-dev-768_pearson_cosine": 0.8111988062913815,
"eval_sts-dev-768_spearman_cosine": 0.8100586279907306,
"step": 7500
},
{
"epoch": 0.9727377447843338,
"grad_norm": 3539899.5,
"learning_rate": 1.5147205233963876e-06,
"loss": 1.1079,
"step": 7600
},
{
"epoch": 0.9983361064891847,
"grad_norm": 4290327.5,
"learning_rate": 9.24477314748969e-08,
"loss": 1.0827,
"step": 7800
}
],
"logging_steps": 200,
"max_steps": 7813,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 128,
"trial_name": null,
"trial_params": null
}