eric-tramel commited on
Commit
c54dad3
·
verified ·
1 Parent(s): 984ca90

Training in progress, step 50, checkpoint

Browse files
checkpoint-50/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2429f0db82d6f54c3977f53dd544b7529d6322458c73a8466ac23ac09838833
3
  size 1976163472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20fa3baf6756e94e31dbdf2d129c16029be7c3edad043851b1654b96435a34a5
3
  size 1976163472
checkpoint-50/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e08d3c27bb734d1a7f5e6232947eec965f1a97f8953134c9f61a64d71676db8c
3
  size 3952505274
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2305c113c394478b5194c70c42e2cae0dd86712b86e88ff56dcdbebe7f7b01be
3
  size 3952505274
checkpoint-50/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fdcf4a103437b92127bd1dd3021524a9c27dc7e2ec2f6d2e41ad871cca920b4e
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:743d69234c1f9bbe7b75d618de78259f369b365fdc4d0ff60b04407b0ffde02e
3
  size 15024
checkpoint-50/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:496426dea16f7374fd996fbb9f1c9a400a305d948493f266a5a0f243875e7912
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17a289e6c01736bc0b4798b4132cb5ef587b566fa27b1ebed6c36bf6371ee37d
3
  size 14960
checkpoint-50/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e09450641d8b5f277734d6c1c8fc179a1ab3468e04fe71ff5eb6a41a61bfabf3
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4e1c711aa7ab8e083af11f3e41ee95efb2a49034b8306a01aaeabdad27f5da2
3
  size 14960
checkpoint-50/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e486719966634cac2b2e87419a65e82830284ccecc147674fffb99ed74aa9f3f
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9ef94998045ba904ed0bd8e95574fee171d672e45082ceb2d1086d05632d894
3
  size 15024
checkpoint-50/trainer_state.json CHANGED
@@ -9,653 +9,653 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "completion_length": 158.140625,
13
  "epoch": 0.0013333333333333333,
14
- "grad_norm": 3.1150918006896973,
15
  "kl": 0.0,
16
  "learning_rate": 4.998766400914329e-07,
17
  "loss": 0.0,
18
- "reward": 0.0625,
19
- "reward_std": 0.17078250646591187,
20
- "rewards/emotion_reward_func": 0.0625,
21
  "rewards/format_reward_func": 0.0,
22
  "step": 1
23
  },
24
  {
25
- "completion_length": 141.6875,
26
  "epoch": 0.0026666666666666666,
27
- "grad_norm": 2.0959103107452393,
28
- "kl": 0.0008207425271393731,
29
  "learning_rate": 4.995066821070679e-07,
30
  "loss": 0.0,
31
- "reward": 0.015625,
32
- "reward_std": 0.0625,
33
  "rewards/emotion_reward_func": 0.015625,
34
- "rewards/format_reward_func": 0.0,
35
  "step": 2
36
  },
37
  {
38
- "completion_length": 166.28125,
39
  "epoch": 0.004,
40
- "grad_norm": 3.0417022705078125,
41
- "kl": 0.0009600210760254413,
42
  "learning_rate": 4.9889049115077e-07,
43
  "loss": 0.0,
44
- "reward": 0.015625,
45
- "reward_std": 0.0625,
46
  "rewards/emotion_reward_func": 0.015625,
47
- "rewards/format_reward_func": 0.0,
48
  "step": 3
49
  },
50
  {
51
- "completion_length": 152.15625,
52
  "epoch": 0.005333333333333333,
53
- "grad_norm": 0.0009840590646490455,
54
- "kl": 0.0009705750271677971,
55
  "learning_rate": 4.980286753286194e-07,
56
  "loss": 0.0,
57
- "reward": 0.0,
58
- "reward_std": 0.0,
59
- "rewards/emotion_reward_func": 0.0,
60
- "rewards/format_reward_func": 0.0,
61
  "step": 4
62
  },
63
  {
64
- "completion_length": 149.6875,
65
  "epoch": 0.006666666666666667,
66
- "grad_norm": 1.7184406518936157,
67
- "kl": 0.0008838580542942509,
68
  "learning_rate": 4.969220851487844e-07,
69
  "loss": 0.0,
70
- "reward": 0.015625,
71
- "reward_std": 0.0625,
72
- "rewards/emotion_reward_func": 0.015625,
73
- "rewards/format_reward_func": 0.0,
74
  "step": 5
75
  },
76
  {
77
- "completion_length": 133.078125,
78
  "epoch": 0.008,
79
- "grad_norm": 3.0811102390289307,
80
- "kl": 0.001091994228772819,
81
  "learning_rate": 4.955718126821722e-07,
82
  "loss": 0.0,
83
- "reward": 0.015625,
84
- "reward_std": 0.0625,
85
- "rewards/emotion_reward_func": 0.015625,
86
- "rewards/format_reward_func": 0.0,
87
  "step": 6
88
  },
89
  {
90
- "completion_length": 171.171875,
91
  "epoch": 0.009333333333333334,
92
- "grad_norm": 2.815908908843994,
93
- "kl": 0.0014450001472141594,
94
  "learning_rate": 4.939791904846868e-07,
95
  "loss": 0.0,
96
  "reward": 0.03125,
97
  "reward_std": 0.125,
98
- "rewards/emotion_reward_func": 0.03125,
99
- "rewards/format_reward_func": 0.0,
100
  "step": 7
101
  },
102
  {
103
- "completion_length": 139.71875,
104
  "epoch": 0.010666666666666666,
105
- "grad_norm": 2.7541661262512207,
106
- "kl": 0.0015429891645908356,
107
  "learning_rate": 4.921457902821578e-07,
108
  "loss": 0.0,
109
- "reward": 0.046875,
110
- "reward_std": 0.1359764039516449,
111
- "rewards/emotion_reward_func": 0.03125,
112
- "rewards/format_reward_func": 0.015625,
113
  "step": 8
114
  },
115
  {
116
- "completion_length": 164.484375,
117
  "epoch": 0.012,
118
- "grad_norm": 4.821209907531738,
119
- "kl": 0.001787194050848484,
120
  "learning_rate": 4.900734214192358e-07,
121
  "loss": 0.0,
122
- "reward": 0.078125,
123
- "reward_std": 0.2257782220840454,
124
- "rewards/emotion_reward_func": 0.078125,
125
- "rewards/format_reward_func": 0.0,
126
  "step": 9
127
  },
128
  {
129
- "completion_length": 148.46875,
130
  "epoch": 0.013333333333333334,
131
- "grad_norm": 3.1241660118103027,
132
- "kl": 0.0024147421936504543,
133
  "learning_rate": 4.877641290737883e-07,
134
  "loss": 0.0,
135
- "reward": 0.015625,
136
- "reward_std": 0.0625,
137
- "rewards/emotion_reward_func": 0.015625,
138
- "rewards/format_reward_func": 0.0,
139
  "step": 10
140
  },
141
  {
142
- "completion_length": 131.171875,
143
  "epoch": 0.014666666666666666,
144
- "grad_norm": 4.040431022644043,
145
- "kl": 0.004135944647714496,
146
  "learning_rate": 4.852201922385564e-07,
147
  "loss": 0.0,
148
- "reward": 0.109375,
149
- "reward_std": 0.2050696462392807,
150
- "rewards/emotion_reward_func": 0.109375,
151
- "rewards/format_reward_func": 0.0,
152
  "step": 11
153
  },
154
  {
155
- "completion_length": 182.296875,
156
  "epoch": 0.016,
157
- "grad_norm": 4.632655143737793,
158
- "kl": 0.00541023223195225,
159
  "learning_rate": 4.824441214720628e-07,
160
  "loss": 0.0,
161
- "reward": 0.09375,
162
- "reward_std": 0.2561737596988678,
163
- "rewards/emotion_reward_func": 0.09375,
164
- "rewards/format_reward_func": 0.0,
165
  "step": 12
166
  },
167
  {
168
- "completion_length": 158.8125,
169
  "epoch": 0.017333333333333333,
170
- "grad_norm": 4.393741607666016,
171
- "kl": 0.005743494722992182,
172
  "learning_rate": 4.794386564209952e-07,
173
  "loss": 0.0,
174
- "reward": 0.140625,
175
- "reward_std": 0.34508590400218964,
176
- "rewards/emotion_reward_func": 0.140625,
177
- "rewards/format_reward_func": 0.0,
178
  "step": 13
179
  },
180
  {
181
- "completion_length": 146.359375,
182
  "epoch": 0.018666666666666668,
183
- "grad_norm": 4.499224662780762,
184
- "kl": 0.009640732081606984,
185
  "learning_rate": 4.762067631165049e-07,
186
  "loss": 0.0,
187
- "reward": 0.21875,
188
- "reward_std": 0.4176512658596039,
189
- "rewards/emotion_reward_func": 0.21875,
190
- "rewards/format_reward_func": 0.0,
191
  "step": 14
192
  },
193
  {
194
- "completion_length": 139.234375,
195
  "epoch": 0.02,
196
- "grad_norm": 4.588191509246826,
197
- "kl": 0.014436421450227499,
198
  "learning_rate": 4.7275163104709194e-07,
199
  "loss": 0.0,
200
- "reward": 0.140625,
201
- "reward_std": 0.21347813308238983,
202
- "rewards/emotion_reward_func": 0.140625,
203
- "rewards/format_reward_func": 0.0,
204
  "step": 15
205
  },
206
  {
207
- "completion_length": 150.03125,
208
  "epoch": 0.021333333333333333,
209
- "grad_norm": 6.439107894897461,
210
- "kl": 0.012775216484442353,
211
  "learning_rate": 4.6907667001096585e-07,
212
  "loss": 0.0,
213
  "reward": 0.296875,
214
- "reward_std": 0.4682851880788803,
215
- "rewards/emotion_reward_func": 0.296875,
216
- "rewards/format_reward_func": 0.0,
217
  "step": 16
218
  },
219
  {
220
- "completion_length": 134.5625,
221
  "epoch": 0.02266666666666667,
222
- "grad_norm": 4.521386623382568,
223
- "kl": 0.020883948309347034,
224
  "learning_rate": 4.6518550675098587e-07,
225
  "loss": 0.0,
226
- "reward": 0.171875,
227
- "reward_std": 0.29886938631534576,
228
- "rewards/emotion_reward_func": 0.171875,
229
- "rewards/format_reward_func": 0.0,
230
  "step": 17
231
  },
232
  {
233
- "completion_length": 142.671875,
234
  "epoch": 0.024,
235
- "grad_norm": 4.811304569244385,
236
- "kl": 0.018777580466121435,
237
  "learning_rate": 4.6108198137550377e-07,
238
  "loss": 0.0,
239
- "reward": 0.15625,
240
- "reward_std": 0.35296089947223663,
241
- "rewards/emotion_reward_func": 0.15625,
242
- "rewards/format_reward_func": 0.0,
243
  "step": 18
244
  },
245
  {
246
- "completion_length": 161.84375,
247
  "epoch": 0.025333333333333333,
248
- "grad_norm": 3.745910406112671,
249
- "kl": 0.015528417890891433,
250
  "learning_rate": 4.567701435686404e-07,
251
  "loss": 0.0,
252
- "reward": 0.125,
253
- "reward_std": 0.2750816196203232,
254
  "rewards/emotion_reward_func": 0.125,
255
- "rewards/format_reward_func": 0.0,
256
  "step": 19
257
  },
258
  {
259
- "completion_length": 130.953125,
260
  "epoch": 0.02666666666666667,
261
- "grad_norm": 4.955869674682617,
262
- "kl": 0.02359085949137807,
263
  "learning_rate": 4.5225424859373684e-07,
264
  "loss": 0.0,
265
- "reward": 0.359375,
266
- "reward_std": 0.49833427369594574,
267
- "rewards/emotion_reward_func": 0.328125,
268
- "rewards/format_reward_func": 0.03125,
269
  "step": 20
270
  },
271
  {
272
- "completion_length": 128.5625,
273
  "epoch": 0.028,
274
- "grad_norm": 4.614920616149902,
275
- "kl": 0.03107461892068386,
276
  "learning_rate": 4.475387530939226e-07,
277
  "loss": 0.0,
278
- "reward": 0.171875,
279
- "reward_std": 0.3683478683233261,
280
- "rewards/emotion_reward_func": 0.171875,
281
- "rewards/format_reward_func": 0.0,
282
  "step": 21
283
  },
284
  {
285
- "completion_length": 117.3125,
286
  "epoch": 0.029333333333333333,
287
- "grad_norm": 7.755206108093262,
288
- "kl": 0.03445248864591122,
289
  "learning_rate": 4.426283106939473e-07,
290
  "loss": 0.0,
291
- "reward": 0.4375,
292
- "reward_std": 0.41898179054260254,
293
- "rewards/emotion_reward_func": 0.4375,
294
- "rewards/format_reward_func": 0.0,
295
  "step": 22
296
  },
297
  {
298
- "completion_length": 136.0,
299
  "epoch": 0.030666666666666665,
300
- "grad_norm": 4.976964950561523,
301
- "kl": 0.030441070441156626,
302
  "learning_rate": 4.375277674076149e-07,
303
  "loss": 0.0,
304
- "reward": 0.4375,
305
- "reward_std": 0.43217839300632477,
306
  "rewards/emotion_reward_func": 0.4375,
307
- "rewards/format_reward_func": 0.0,
308
  "step": 23
309
  },
310
  {
311
- "completion_length": 113.140625,
312
  "epoch": 0.032,
313
- "grad_norm": 5.640017986297607,
314
- "kl": 0.04589735437184572,
315
  "learning_rate": 4.3224215685535287e-07,
316
- "loss": 0.0,
317
- "reward": 0.453125,
318
- "reward_std": 0.5008521527051926,
319
- "rewards/emotion_reward_func": 0.4375,
320
- "rewards/format_reward_func": 0.015625,
321
  "step": 24
322
  },
323
  {
324
- "completion_length": 132.65625,
325
  "epoch": 0.03333333333333333,
326
- "grad_norm": 4.010633945465088,
327
- "kl": 0.029425509739667177,
328
  "learning_rate": 4.2677669529663686e-07,
329
- "loss": 0.0,
330
- "reward": 0.15625,
331
- "reward_std": 0.29578250646591187,
332
- "rewards/emotion_reward_func": 0.140625,
333
- "rewards/format_reward_func": 0.015625,
334
  "step": 25
335
  },
336
  {
337
- "completion_length": 116.125,
338
  "epoch": 0.034666666666666665,
339
- "grad_norm": 5.199466228485107,
340
- "kl": 0.028916552662849426,
341
  "learning_rate": 4.2113677648217216e-07,
342
- "loss": 0.0,
343
- "reward": 0.609375,
344
- "reward_std": 0.4819519817829132,
345
- "rewards/emotion_reward_func": 0.609375,
346
- "rewards/format_reward_func": 0.0,
347
  "step": 26
348
  },
349
  {
350
- "completion_length": 119.578125,
351
  "epoch": 0.036,
352
- "grad_norm": 4.086185455322266,
353
- "kl": 0.0329075139015913,
354
  "learning_rate": 4.1532796633091294e-07,
355
- "loss": 0.0,
356
- "reward": 0.21875,
357
- "reward_std": 0.31425635516643524,
358
- "rewards/emotion_reward_func": 0.21875,
359
- "rewards/format_reward_func": 0.0,
360
  "step": 27
361
  },
362
  {
363
- "completion_length": 100.296875,
364
  "epoch": 0.037333333333333336,
365
- "grad_norm": 5.692497253417969,
366
- "kl": 0.03961053770035505,
367
  "learning_rate": 4.0935599743717244e-07,
368
- "loss": 0.0,
369
- "reward": 0.359375,
370
- "reward_std": 0.41067278385162354,
371
- "rewards/emotion_reward_func": 0.359375,
372
- "rewards/format_reward_func": 0.0,
373
  "step": 28
374
  },
375
  {
376
- "completion_length": 103.234375,
377
  "epoch": 0.03866666666666667,
378
- "grad_norm": 5.965167045593262,
379
- "kl": 0.03921728301793337,
380
  "learning_rate": 4.0322676341324414e-07,
381
- "loss": 0.0,
382
- "reward": 0.484375,
383
- "reward_std": 0.49467839300632477,
384
- "rewards/emotion_reward_func": 0.484375,
385
- "rewards/format_reward_func": 0.0,
386
  "step": 29
387
  },
388
  {
389
- "completion_length": 111.0625,
390
  "epoch": 0.04,
391
- "grad_norm": 5.3481645584106445,
392
- "kl": 0.040639642626047134,
393
  "learning_rate": 3.9694631307311825e-07,
394
- "loss": 0.0,
395
- "reward": 0.453125,
396
- "reward_std": 0.49808916449546814,
397
- "rewards/emotion_reward_func": 0.453125,
398
- "rewards/format_reward_func": 0.0,
399
  "step": 30
400
  },
401
  {
402
- "completion_length": 107.796875,
403
  "epoch": 0.04133333333333333,
404
- "grad_norm": 5.7155890464782715,
405
- "kl": 0.037587080616503954,
406
  "learning_rate": 3.9052084446303265e-07,
407
- "loss": 0.0,
408
- "reward": 0.3125,
409
- "reward_std": 0.40507757663726807,
410
- "rewards/emotion_reward_func": 0.3125,
411
- "rewards/format_reward_func": 0.0,
412
  "step": 31
413
  },
414
  {
415
- "completion_length": 132.078125,
416
  "epoch": 0.042666666666666665,
417
- "grad_norm": 4.6740241050720215,
418
- "kl": 0.033237069845199585,
419
  "learning_rate": 3.839566987447491e-07,
420
- "loss": 0.0,
421
- "reward": 0.609375,
422
- "reward_std": 0.47669367492198944,
423
- "rewards/emotion_reward_func": 0.609375,
424
- "rewards/format_reward_func": 0.0,
425
  "step": 32
426
  },
427
  {
428
- "completion_length": 87.25,
429
  "epoch": 0.044,
430
- "grad_norm": 4.911775588989258,
431
- "kl": 0.03579402016475797,
432
  "learning_rate": 3.7726035393759283e-07,
433
- "loss": 0.0,
434
- "reward": 0.4375,
435
- "reward_std": 0.3133598417043686,
436
- "rewards/emotion_reward_func": 0.4375,
437
- "rewards/format_reward_func": 0.0,
438
  "step": 33
439
  },
440
  {
441
- "completion_length": 106.5625,
442
  "epoch": 0.04533333333333334,
443
- "grad_norm": 4.858055114746094,
444
- "kl": 0.03773342818021774,
445
  "learning_rate": 3.704384185254288e-07,
446
- "loss": 0.0,
447
- "reward": 0.515625,
448
- "reward_std": 0.4550696462392807,
449
- "rewards/emotion_reward_func": 0.515625,
450
- "rewards/format_reward_func": 0.0,
451
  "step": 34
452
  },
453
  {
454
- "completion_length": 126.578125,
455
  "epoch": 0.04666666666666667,
456
- "grad_norm": 5.445562839508057,
457
- "kl": 0.02881002752110362,
458
  "learning_rate": 3.634976249348867e-07,
459
- "loss": 0.0,
460
- "reward": 0.28125,
461
- "reward_std": 0.4230812340974808,
462
- "rewards/emotion_reward_func": 0.28125,
463
- "rewards/format_reward_func": 0.0,
464
  "step": 35
465
  },
466
  {
467
- "completion_length": 117.078125,
468
  "epoch": 0.048,
469
- "grad_norm": 4.025181770324707,
470
- "kl": 0.03474720939993858,
471
  "learning_rate": 3.5644482289126813e-07,
472
- "loss": 0.0,
473
- "reward": 0.265625,
474
- "reward_std": 0.2979728728532791,
475
- "rewards/emotion_reward_func": 0.265625,
476
- "rewards/format_reward_func": 0.0,
477
  "step": 36
478
  },
479
  {
480
- "completion_length": 99.609375,
481
  "epoch": 0.04933333333333333,
482
- "grad_norm": 11.872902870178223,
483
- "kl": 0.045760439708828926,
484
  "learning_rate": 3.492869726586951e-07,
485
- "loss": 0.0,
486
- "reward": 0.328125,
487
- "reward_std": 0.42430339753627777,
488
- "rewards/emotion_reward_func": 0.328125,
489
- "rewards/format_reward_func": 0.0,
490
  "step": 37
491
  },
492
  {
493
- "completion_length": 108.265625,
494
  "epoch": 0.050666666666666665,
495
- "grad_norm": 6.217259407043457,
496
- "kl": 0.04085303423926234,
497
  "learning_rate": 3.4203113817116953e-07,
498
- "loss": 0.0,
499
- "reward": 0.625,
500
- "reward_std": 0.48296454548835754,
501
- "rewards/emotion_reward_func": 0.625,
502
- "rewards/format_reward_func": 0.0,
503
  "step": 38
504
  },
505
  {
506
- "completion_length": 136.34375,
507
  "epoch": 0.052,
508
- "grad_norm": 3.5724854469299316,
509
- "kl": 0.03312604874372482,
510
  "learning_rate": 3.346844800613229e-07,
511
- "loss": 0.0,
512
- "reward": 0.125,
513
- "reward_std": 0.28694769740104675,
514
- "rewards/emotion_reward_func": 0.125,
515
- "rewards/format_reward_func": 0.0,
516
  "step": 39
517
  },
518
  {
519
- "completion_length": 104.203125,
520
  "epoch": 0.05333333333333334,
521
- "grad_norm": 4.63241720199585,
522
- "kl": 0.04127589985728264,
523
  "learning_rate": 3.272542485937368e-07,
524
- "loss": 0.0,
525
- "reward": 0.484375,
526
- "reward_std": 0.45283491909503937,
527
- "rewards/emotion_reward_func": 0.484375,
528
- "rewards/format_reward_func": 0.0,
529
  "step": 40
530
  },
531
  {
532
- "completion_length": 121.25,
533
  "epoch": 0.05466666666666667,
534
- "grad_norm": 3.7740116119384766,
535
- "kl": 0.03728105220943689,
536
  "learning_rate": 3.1974777650980734e-07,
537
- "loss": 0.0,
538
- "reward": 0.4375,
539
- "reward_std": 0.3221946507692337,
540
- "rewards/emotion_reward_func": 0.421875,
541
- "rewards/format_reward_func": 0.015625,
542
  "step": 41
543
  },
544
  {
545
- "completion_length": 100.125,
546
  "epoch": 0.056,
547
- "grad_norm": 4.631157875061035,
548
- "kl": 0.04315151646733284,
549
  "learning_rate": 3.121724717912138e-07,
550
- "loss": 0.0,
551
- "reward": 0.46875,
552
- "reward_std": 0.3454566150903702,
553
- "rewards/emotion_reward_func": 0.46875,
554
- "rewards/format_reward_func": 0.0,
555
  "step": 42
556
  },
557
  {
558
- "completion_length": 108.109375,
559
  "epoch": 0.05733333333333333,
560
- "grad_norm": 4.478452205657959,
561
- "kl": 0.03689518291503191,
562
  "learning_rate": 3.0453581034913565e-07,
563
- "loss": 0.0,
564
- "reward": 0.265625,
565
- "reward_std": 0.43616947531700134,
566
- "rewards/emotion_reward_func": 0.265625,
567
- "rewards/format_reward_func": 0.0,
568
  "step": 43
569
  },
570
  {
571
- "completion_length": 103.59375,
572
  "epoch": 0.058666666666666666,
573
- "grad_norm": 4.84773063659668,
574
- "kl": 0.0415381109341979,
575
  "learning_rate": 2.968453286464312e-07,
576
- "loss": 0.0,
577
- "reward": 0.625,
578
- "reward_std": 0.42078250646591187,
579
- "rewards/emotion_reward_func": 0.625,
580
- "rewards/format_reward_func": 0.0,
581
  "step": 44
582
  },
583
  {
584
- "completion_length": 114.421875,
585
  "epoch": 0.06,
586
- "grad_norm": 5.1858391761779785,
587
- "kl": 0.03836139664053917,
588
  "learning_rate": 2.8910861626005773e-07,
589
- "loss": 0.0,
590
- "reward": 0.40625,
591
- "reward_std": 0.49500228464603424,
592
- "rewards/emotion_reward_func": 0.40625,
593
- "rewards/format_reward_func": 0.0,
594
  "step": 45
595
  },
596
  {
597
- "completion_length": 108.421875,
598
  "epoch": 0.06133333333333333,
599
- "grad_norm": 5.418513298034668,
600
- "kl": 0.03823486668989062,
601
  "learning_rate": 2.8133330839107604e-07,
602
- "loss": 0.0,
603
- "reward": 0.4375,
604
- "reward_std": 0.49776527285575867,
605
- "rewards/emotion_reward_func": 0.4375,
606
- "rewards/format_reward_func": 0.0,
607
  "step": 46
608
  },
609
  {
610
- "completion_length": 114.15625,
611
  "epoch": 0.06266666666666666,
612
- "grad_norm": 4.45364236831665,
613
- "kl": 0.032252665143460035,
614
  "learning_rate": 2.735270783296286e-07,
615
- "loss": 0.0,
616
- "reward": 0.3125,
617
- "reward_std": 0.43925635516643524,
618
- "rewards/emotion_reward_func": 0.3125,
619
- "rewards/format_reward_func": 0.0,
620
  "step": 47
621
  },
622
  {
623
- "completion_length": 119.5,
624
  "epoch": 0.064,
625
- "grad_norm": 4.792967319488525,
626
- "kl": 0.038752755150198936,
627
  "learning_rate": 2.6569762988232837e-07,
628
- "loss": 0.0,
629
- "reward": 0.15625,
630
- "reward_std": 0.3058478683233261,
631
- "rewards/emotion_reward_func": 0.15625,
632
- "rewards/format_reward_func": 0.0,
633
  "step": 48
634
  },
635
  {
636
- "completion_length": 113.0,
637
  "epoch": 0.06533333333333333,
638
- "grad_norm": 4.534658908843994,
639
- "kl": 0.041261011734604836,
640
  "learning_rate": 2.5785268976953204e-07,
641
- "loss": 0.0,
642
- "reward": 0.4375,
643
- "reward_std": 0.4795328378677368,
644
- "rewards/emotion_reward_func": 0.421875,
645
- "rewards/format_reward_func": 0.015625,
646
  "step": 49
647
  },
648
  {
649
- "completion_length": 94.046875,
650
  "epoch": 0.06666666666666667,
651
- "grad_norm": 5.302418231964111,
652
- "kl": 0.04163787979632616,
653
  "learning_rate": 2.5e-07,
654
- "loss": 0.0,
655
- "reward": 0.390625,
656
- "reward_std": 0.4757782220840454,
657
- "rewards/emotion_reward_func": 0.390625,
658
- "rewards/format_reward_func": 0.0,
659
  "step": 50
660
  }
661
  ],
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "completion_length": 113.171875,
13
  "epoch": 0.0013333333333333333,
14
+ "grad_norm": 0.0,
15
  "kl": 0.0,
16
  "learning_rate": 4.998766400914329e-07,
17
  "loss": 0.0,
18
+ "reward": 0.0,
19
+ "reward_std": 0.0,
20
+ "rewards/emotion_reward_func": 0.0,
21
  "rewards/format_reward_func": 0.0,
22
  "step": 1
23
  },
24
  {
25
+ "completion_length": 100.609375,
26
  "epoch": 0.0026666666666666666,
27
+ "grad_norm": 7.206315517425537,
28
+ "kl": 0.0,
29
  "learning_rate": 4.995066821070679e-07,
30
  "loss": 0.0,
31
+ "reward": 0.046875,
32
+ "reward_std": 0.1875,
33
  "rewards/emotion_reward_func": 0.015625,
34
+ "rewards/format_reward_func": 0.03125,
35
  "step": 2
36
  },
37
  {
38
+ "completion_length": 130.65625,
39
  "epoch": 0.004,
40
+ "grad_norm": 7.190981864929199,
41
+ "kl": 0.0008925930160330608,
42
  "learning_rate": 4.9889049115077e-07,
43
  "loss": 0.0,
44
+ "reward": 0.0625,
45
+ "reward_std": 0.25,
46
  "rewards/emotion_reward_func": 0.015625,
47
+ "rewards/format_reward_func": 0.046875,
48
  "step": 3
49
  },
50
  {
51
+ "completion_length": 112.484375,
52
  "epoch": 0.005333333333333333,
53
+ "grad_norm": 4.745754718780518,
54
+ "kl": 0.0009372199856443331,
55
  "learning_rate": 4.980286753286194e-07,
56
  "loss": 0.0,
57
+ "reward": 0.046875,
58
+ "reward_std": 0.1875,
59
+ "rewards/emotion_reward_func": 0.015625,
60
+ "rewards/format_reward_func": 0.03125,
61
  "step": 4
62
  },
63
  {
64
+ "completion_length": 100.359375,
65
  "epoch": 0.006666666666666667,
66
+ "grad_norm": 6.957683086395264,
67
+ "kl": 0.0009052860696101561,
68
  "learning_rate": 4.969220851487844e-07,
69
  "loss": 0.0,
70
+ "reward": 0.078125,
71
+ "reward_std": 0.3125,
72
+ "rewards/emotion_reward_func": 0.03125,
73
+ "rewards/format_reward_func": 0.046875,
74
  "step": 5
75
  },
76
  {
77
+ "completion_length": 109.5625,
78
  "epoch": 0.008,
79
+ "grad_norm": 8.517982482910156,
80
+ "kl": 0.0011545967281563208,
81
  "learning_rate": 4.955718126821722e-07,
82
  "loss": 0.0,
83
+ "reward": 0.140625,
84
+ "reward_std": 0.34228479862213135,
85
+ "rewards/emotion_reward_func": 0.03125,
86
+ "rewards/format_reward_func": 0.109375,
87
  "step": 6
88
  },
89
  {
90
+ "completion_length": 117.484375,
91
  "epoch": 0.009333333333333334,
92
+ "grad_norm": 2.9639530181884766,
93
+ "kl": 0.001261903322301805,
94
  "learning_rate": 4.939791904846868e-07,
95
  "loss": 0.0,
96
  "reward": 0.03125,
97
  "reward_std": 0.125,
98
+ "rewards/emotion_reward_func": 0.015625,
99
+ "rewards/format_reward_func": 0.015625,
100
  "step": 7
101
  },
102
  {
103
+ "completion_length": 118.234375,
104
  "epoch": 0.010666666666666666,
105
+ "grad_norm": 8.49864387512207,
106
+ "kl": 0.002180565701564774,
107
  "learning_rate": 4.921457902821578e-07,
108
  "loss": 0.0,
109
+ "reward": 0.140625,
110
+ "reward_std": 0.4436737596988678,
111
+ "rewards/emotion_reward_func": 0.046875,
112
+ "rewards/format_reward_func": 0.09375,
113
  "step": 8
114
  },
115
  {
116
+ "completion_length": 106.546875,
117
  "epoch": 0.012,
118
+ "grad_norm": 7.236983299255371,
119
+ "kl": 0.0018676594190765172,
120
  "learning_rate": 4.900734214192358e-07,
121
  "loss": 0.0,
122
+ "reward": 0.109375,
123
+ "reward_std": 0.3186737596988678,
124
+ "rewards/emotion_reward_func": 0.03125,
125
+ "rewards/format_reward_func": 0.078125,
126
  "step": 9
127
  },
128
  {
129
+ "completion_length": 94.640625,
130
  "epoch": 0.013333333333333334,
131
+ "grad_norm": 6.612936973571777,
132
+ "kl": 0.00287746504181996,
133
  "learning_rate": 4.877641290737883e-07,
134
  "loss": 0.0,
135
+ "reward": 0.15625,
136
+ "reward_std": 0.38375620543956757,
137
+ "rewards/emotion_reward_func": 0.03125,
138
+ "rewards/format_reward_func": 0.125,
139
  "step": 10
140
  },
141
  {
142
+ "completion_length": 78.453125,
143
  "epoch": 0.014666666666666666,
144
+ "grad_norm": 6.714486598968506,
145
+ "kl": 0.0037596136680804193,
146
  "learning_rate": 4.852201922385564e-07,
147
  "loss": 0.0,
148
+ "reward": 0.15625,
149
+ "reward_std": 0.34297704696655273,
150
+ "rewards/emotion_reward_func": 0.03125,
151
+ "rewards/format_reward_func": 0.125,
152
  "step": 11
153
  },
154
  {
155
+ "completion_length": 113.765625,
156
  "epoch": 0.016,
157
+ "grad_norm": 6.222474098205566,
158
+ "kl": 0.005189417512156069,
159
  "learning_rate": 4.824441214720628e-07,
160
  "loss": 0.0,
161
+ "reward": 0.328125,
162
+ "reward_std": 0.6560364216566086,
163
+ "rewards/emotion_reward_func": 0.109375,
164
+ "rewards/format_reward_func": 0.21875,
165
  "step": 12
166
  },
167
  {
168
+ "completion_length": 93.890625,
169
  "epoch": 0.017333333333333333,
170
+ "grad_norm": 7.199591636657715,
171
+ "kl": 0.011019598576240242,
172
  "learning_rate": 4.794386564209952e-07,
173
  "loss": 0.0,
174
+ "reward": 0.359375,
175
+ "reward_std": 0.647876650094986,
176
+ "rewards/emotion_reward_func": 0.125,
177
+ "rewards/format_reward_func": 0.234375,
178
  "step": 13
179
  },
180
  {
181
+ "completion_length": 93.6875,
182
  "epoch": 0.018666666666666668,
183
+ "grad_norm": 7.459893226623535,
184
+ "kl": 0.012794358422979712,
185
  "learning_rate": 4.762067631165049e-07,
186
  "loss": 0.0,
187
+ "reward": 0.28125,
188
+ "reward_std": 0.4561576098203659,
189
+ "rewards/emotion_reward_func": 0.015625,
190
+ "rewards/format_reward_func": 0.265625,
191
  "step": 14
192
  },
193
  {
194
+ "completion_length": 114.03125,
195
  "epoch": 0.02,
196
+ "grad_norm": 7.204787731170654,
197
+ "kl": 0.01207199739292264,
198
  "learning_rate": 4.7275163104709194e-07,
199
  "loss": 0.0,
200
+ "reward": 0.421875,
201
+ "reward_std": 0.6076867878437042,
202
+ "rewards/emotion_reward_func": 0.0625,
203
+ "rewards/format_reward_func": 0.359375,
204
  "step": 15
205
  },
206
  {
207
+ "completion_length": 90.1875,
208
  "epoch": 0.021333333333333333,
209
+ "grad_norm": 6.9398369789123535,
210
+ "kl": 0.012930819648317993,
211
  "learning_rate": 4.6907667001096585e-07,
212
  "loss": 0.0,
213
  "reward": 0.296875,
214
+ "reward_std": 0.6791985481977463,
215
+ "rewards/emotion_reward_func": 0.125,
216
+ "rewards/format_reward_func": 0.171875,
217
  "step": 16
218
  },
219
  {
220
+ "completion_length": 100.265625,
221
  "epoch": 0.02266666666666667,
222
+ "grad_norm": 7.01741361618042,
223
+ "kl": 0.014807499013841152,
224
  "learning_rate": 4.6518550675098587e-07,
225
  "loss": 0.0,
226
+ "reward": 0.609375,
227
+ "reward_std": 0.7960269749164581,
228
+ "rewards/emotion_reward_func": 0.1875,
229
+ "rewards/format_reward_func": 0.421875,
230
  "step": 17
231
  },
232
  {
233
+ "completion_length": 101.59375,
234
  "epoch": 0.024,
235
+ "grad_norm": 6.980762004852295,
236
+ "kl": 0.02019192511215806,
237
  "learning_rate": 4.6108198137550377e-07,
238
  "loss": 0.0,
239
+ "reward": 0.53125,
240
+ "reward_std": 0.6885540634393692,
241
+ "rewards/emotion_reward_func": 0.125,
242
+ "rewards/format_reward_func": 0.40625,
243
  "step": 18
244
  },
245
  {
246
+ "completion_length": 93.859375,
247
  "epoch": 0.025333333333333333,
248
+ "grad_norm": 8.390938758850098,
249
+ "kl": 0.03694334626197815,
250
  "learning_rate": 4.567701435686404e-07,
251
  "loss": 0.0,
252
+ "reward": 0.65625,
253
+ "reward_std": 0.651764303445816,
254
  "rewards/emotion_reward_func": 0.125,
255
+ "rewards/format_reward_func": 0.53125,
256
  "step": 19
257
  },
258
  {
259
+ "completion_length": 85.546875,
260
  "epoch": 0.02666666666666667,
261
+ "grad_norm": 8.39484691619873,
262
+ "kl": 0.03380461875349283,
263
  "learning_rate": 4.5225424859373684e-07,
264
  "loss": 0.0,
265
+ "reward": 0.765625,
266
+ "reward_std": 0.7304560542106628,
267
+ "rewards/emotion_reward_func": 0.203125,
268
+ "rewards/format_reward_func": 0.5625,
269
  "step": 20
270
  },
271
  {
272
+ "completion_length": 91.3125,
273
  "epoch": 0.028,
274
+ "grad_norm": 6.440164089202881,
275
+ "kl": 0.04492605570703745,
276
  "learning_rate": 4.475387530939226e-07,
277
  "loss": 0.0,
278
+ "reward": 0.765625,
279
+ "reward_std": 0.5731314420700073,
280
+ "rewards/emotion_reward_func": 0.078125,
281
+ "rewards/format_reward_func": 0.6875,
282
  "step": 21
283
  },
284
  {
285
+ "completion_length": 91.0625,
286
  "epoch": 0.029333333333333333,
287
+ "grad_norm": 6.099334716796875,
288
+ "kl": 0.040256964042782784,
289
  "learning_rate": 4.426283106939473e-07,
290
  "loss": 0.0,
291
+ "reward": 1.1875,
292
+ "reward_std": 0.6744011342525482,
293
+ "rewards/emotion_reward_func": 0.390625,
294
+ "rewards/format_reward_func": 0.796875,
295
  "step": 22
296
  },
297
  {
298
+ "completion_length": 79.328125,
299
  "epoch": 0.030666666666666665,
300
+ "grad_norm": 7.1284871101379395,
301
+ "kl": 0.04640346672385931,
302
  "learning_rate": 4.375277674076149e-07,
303
  "loss": 0.0,
304
+ "reward": 1.21875,
305
+ "reward_std": 0.7463032901287079,
306
  "rewards/emotion_reward_func": 0.4375,
307
+ "rewards/format_reward_func": 0.78125,
308
  "step": 23
309
  },
310
  {
311
+ "completion_length": 71.84375,
312
  "epoch": 0.032,
313
+ "grad_norm": 6.066468715667725,
314
+ "kl": 0.07137730903923512,
315
  "learning_rate": 4.3224215685535287e-07,
316
+ "loss": 0.0001,
317
+ "reward": 1.328125,
318
+ "reward_std": 0.8113406747579575,
319
+ "rewards/emotion_reward_func": 0.546875,
320
+ "rewards/format_reward_func": 0.78125,
321
  "step": 24
322
  },
323
  {
324
+ "completion_length": 101.0625,
325
  "epoch": 0.03333333333333333,
326
+ "grad_norm": 5.047848224639893,
327
+ "kl": 0.0665823919698596,
328
  "learning_rate": 4.2677669529663686e-07,
329
+ "loss": 0.0001,
330
+ "reward": 0.828125,
331
+ "reward_std": 0.5247472077608109,
332
+ "rewards/emotion_reward_func": 0.078125,
333
+ "rewards/format_reward_func": 0.75,
334
  "step": 25
335
  },
336
  {
337
+ "completion_length": 68.1875,
338
  "epoch": 0.034666666666666665,
339
+ "grad_norm": 6.7862958908081055,
340
+ "kl": 0.07357331551611423,
341
  "learning_rate": 4.2113677648217216e-07,
342
+ "loss": 0.0001,
343
+ "reward": 1.515625,
344
+ "reward_std": 0.7178780436515808,
345
+ "rewards/emotion_reward_func": 0.671875,
346
+ "rewards/format_reward_func": 0.84375,
347
  "step": 26
348
  },
349
  {
350
+ "completion_length": 73.5625,
351
  "epoch": 0.036,
352
+ "grad_norm": 6.041502475738525,
353
+ "kl": 0.0839837146922946,
354
  "learning_rate": 4.1532796633091294e-07,
355
+ "loss": 0.0001,
356
+ "reward": 1.0,
357
+ "reward_std": 0.5787727609276772,
358
+ "rewards/emotion_reward_func": 0.1875,
359
+ "rewards/format_reward_func": 0.8125,
360
  "step": 27
361
  },
362
  {
363
+ "completion_length": 75.515625,
364
  "epoch": 0.037333333333333336,
365
+ "grad_norm": 5.880768299102783,
366
+ "kl": 0.07887133583426476,
367
  "learning_rate": 4.0935599743717244e-07,
368
+ "loss": 0.0001,
369
+ "reward": 1.265625,
370
+ "reward_std": 0.6387931928038597,
371
+ "rewards/emotion_reward_func": 0.390625,
372
+ "rewards/format_reward_func": 0.875,
373
  "step": 28
374
  },
375
  {
376
+ "completion_length": 84.65625,
377
  "epoch": 0.03866666666666667,
378
+ "grad_norm": 4.456689357757568,
379
+ "kl": 0.05541729833930731,
380
  "learning_rate": 4.0322676341324414e-07,
381
+ "loss": 0.0001,
382
+ "reward": 1.546875,
383
+ "reward_std": 0.6021395623683929,
384
+ "rewards/emotion_reward_func": 0.6875,
385
+ "rewards/format_reward_func": 0.859375,
386
  "step": 29
387
  },
388
  {
389
+ "completion_length": 72.03125,
390
  "epoch": 0.04,
391
+ "grad_norm": 6.281215667724609,
392
+ "kl": 0.07642886973917484,
393
  "learning_rate": 3.9694631307311825e-07,
394
+ "loss": 0.0001,
395
+ "reward": 1.421875,
396
+ "reward_std": 0.6625982969999313,
397
+ "rewards/emotion_reward_func": 0.515625,
398
+ "rewards/format_reward_func": 0.90625,
399
  "step": 30
400
  },
401
  {
402
+ "completion_length": 64.96875,
403
  "epoch": 0.04133333333333333,
404
+ "grad_norm": 6.267475128173828,
405
+ "kl": 0.07545926049351692,
406
  "learning_rate": 3.9052084446303265e-07,
407
+ "loss": 0.0001,
408
+ "reward": 1.28125,
409
+ "reward_std": 0.5209204778075218,
410
+ "rewards/emotion_reward_func": 0.34375,
411
+ "rewards/format_reward_func": 0.9375,
412
  "step": 31
413
  },
414
  {
415
+ "completion_length": 74.28125,
416
  "epoch": 0.042666666666666665,
417
+ "grad_norm": 6.10364294052124,
418
+ "kl": 0.08546704892069101,
419
  "learning_rate": 3.839566987447491e-07,
420
+ "loss": 0.0001,
421
+ "reward": 1.59375,
422
+ "reward_std": 0.7168828397989273,
423
+ "rewards/emotion_reward_func": 0.71875,
424
+ "rewards/format_reward_func": 0.875,
425
  "step": 32
426
  },
427
  {
428
+ "completion_length": 70.03125,
429
  "epoch": 0.044,
430
+ "grad_norm": 4.039772033691406,
431
+ "kl": 0.06002845522016287,
432
  "learning_rate": 3.7726035393759283e-07,
433
+ "loss": 0.0001,
434
+ "reward": 1.3125,
435
+ "reward_std": 0.2808031141757965,
436
+ "rewards/emotion_reward_func": 0.390625,
437
+ "rewards/format_reward_func": 0.921875,
438
  "step": 33
439
  },
440
  {
441
+ "completion_length": 83.109375,
442
  "epoch": 0.04533333333333334,
443
+ "grad_norm": 6.683216094970703,
444
+ "kl": 0.0706396009773016,
445
  "learning_rate": 3.704384185254288e-07,
446
+ "loss": 0.0001,
447
+ "reward": 1.421875,
448
+ "reward_std": 0.7211004346609116,
449
+ "rewards/emotion_reward_func": 0.578125,
450
+ "rewards/format_reward_func": 0.84375,
451
  "step": 34
452
  },
453
  {
454
+ "completion_length": 72.765625,
455
  "epoch": 0.04666666666666667,
456
+ "grad_norm": 6.402747631072998,
457
+ "kl": 0.10811681114137173,
458
  "learning_rate": 3.634976249348867e-07,
459
+ "loss": 0.0001,
460
+ "reward": 1.28125,
461
+ "reward_std": 0.5904398858547211,
462
+ "rewards/emotion_reward_func": 0.375,
463
+ "rewards/format_reward_func": 0.90625,
464
  "step": 35
465
  },
466
  {
467
+ "completion_length": 78.734375,
468
  "epoch": 0.048,
469
+ "grad_norm": 5.202386856079102,
470
+ "kl": 0.07850308250635862,
471
  "learning_rate": 3.5644482289126813e-07,
472
+ "loss": 0.0001,
473
+ "reward": 1.09375,
474
+ "reward_std": 0.5113069340586662,
475
+ "rewards/emotion_reward_func": 0.203125,
476
+ "rewards/format_reward_func": 0.890625,
477
  "step": 36
478
  },
479
  {
480
+ "completion_length": 68.953125,
481
  "epoch": 0.04933333333333333,
482
+ "grad_norm": 7.1071085929870605,
483
+ "kl": 0.08383779786527157,
484
  "learning_rate": 3.492869726586951e-07,
485
+ "loss": 0.0001,
486
+ "reward": 1.1875,
487
+ "reward_std": 0.4691474586725235,
488
+ "rewards/emotion_reward_func": 0.21875,
489
+ "rewards/format_reward_func": 0.96875,
490
  "step": 37
491
  },
492
  {
493
+ "completion_length": 65.734375,
494
  "epoch": 0.050666666666666665,
495
+ "grad_norm": 6.272955417633057,
496
+ "kl": 0.06441066134721041,
497
  "learning_rate": 3.4203113817116953e-07,
498
+ "loss": 0.0001,
499
+ "reward": 1.75,
500
+ "reward_std": 0.41095855832099915,
501
+ "rewards/emotion_reward_func": 0.765625,
502
+ "rewards/format_reward_func": 0.984375,
503
  "step": 38
504
  },
505
  {
506
+ "completion_length": 69.9375,
507
  "epoch": 0.052,
508
+ "grad_norm": 3.7793548107147217,
509
+ "kl": 0.06628133170306683,
510
  "learning_rate": 3.346844800613229e-07,
511
+ "loss": 0.0001,
512
+ "reward": 1.078125,
513
+ "reward_std": 0.21347813308238983,
514
+ "rewards/emotion_reward_func": 0.109375,
515
+ "rewards/format_reward_func": 0.96875,
516
  "step": 39
517
  },
518
  {
519
+ "completion_length": 64.34375,
520
  "epoch": 0.05333333333333334,
521
+ "grad_norm": 6.500398635864258,
522
+ "kl": 0.06995576526969671,
523
  "learning_rate": 3.272542485937368e-07,
524
+ "loss": 0.0001,
525
+ "reward": 1.546875,
526
+ "reward_std": 0.47020626068115234,
527
+ "rewards/emotion_reward_func": 0.5625,
528
+ "rewards/format_reward_func": 0.984375,
529
  "step": 40
530
  },
531
  {
532
+ "completion_length": 73.375,
533
  "epoch": 0.05466666666666667,
534
+ "grad_norm": 4.637302875518799,
535
+ "kl": 0.06445631105452776,
536
  "learning_rate": 3.1974777650980734e-07,
537
+ "loss": 0.0001,
538
+ "reward": 1.578125,
539
+ "reward_std": 0.38923946768045425,
540
+ "rewards/emotion_reward_func": 0.609375,
541
+ "rewards/format_reward_func": 0.96875,
542
  "step": 41
543
  },
544
  {
545
+ "completion_length": 69.875,
546
  "epoch": 0.056,
547
+ "grad_norm": 5.5001220703125,
548
+ "kl": 0.06374164298176765,
549
  "learning_rate": 3.121724717912138e-07,
550
+ "loss": 0.0001,
551
+ "reward": 1.671875,
552
+ "reward_std": 0.18616947531700134,
553
+ "rewards/emotion_reward_func": 0.671875,
554
+ "rewards/format_reward_func": 1.0,
555
  "step": 42
556
  },
557
  {
558
+ "completion_length": 65.75,
559
  "epoch": 0.05733333333333333,
560
+ "grad_norm": 5.561408996582031,
561
+ "kl": 0.11185399815440178,
562
  "learning_rate": 3.0453581034913565e-07,
563
+ "loss": 0.0001,
564
+ "reward": 1.125,
565
+ "reward_std": 0.4263191595673561,
566
+ "rewards/emotion_reward_func": 0.1875,
567
+ "rewards/format_reward_func": 0.9375,
568
  "step": 43
569
  },
570
  {
571
+ "completion_length": 57.59375,
572
  "epoch": 0.058666666666666666,
573
+ "grad_norm": 6.903532028198242,
574
+ "kl": 0.08089348301291466,
575
  "learning_rate": 2.968453286464312e-07,
576
+ "loss": 0.0001,
577
+ "reward": 1.75,
578
+ "reward_std": 0.3838024437427521,
579
+ "rewards/emotion_reward_func": 0.765625,
580
+ "rewards/format_reward_func": 0.984375,
581
  "step": 44
582
  },
583
  {
584
+ "completion_length": 81.0625,
585
  "epoch": 0.06,
586
+ "grad_norm": 5.9377288818359375,
587
+ "kl": 0.06169276125729084,
588
  "learning_rate": 2.8910861626005773e-07,
589
+ "loss": 0.0001,
590
+ "reward": 1.375,
591
+ "reward_std": 0.5583916157484055,
592
+ "rewards/emotion_reward_func": 0.421875,
593
+ "rewards/format_reward_func": 0.953125,
594
  "step": 45
595
  },
596
  {
597
+ "completion_length": 64.09375,
598
  "epoch": 0.06133333333333333,
599
+ "grad_norm": 6.421321392059326,
600
+ "kl": 0.08067317306995392,
601
  "learning_rate": 2.8133330839107604e-07,
602
+ "loss": 0.0001,
603
+ "reward": 1.484375,
604
+ "reward_std": 0.6873095482587814,
605
+ "rewards/emotion_reward_func": 0.59375,
606
+ "rewards/format_reward_func": 0.890625,
607
  "step": 46
608
  },
609
  {
610
+ "completion_length": 65.859375,
611
  "epoch": 0.06266666666666666,
612
+ "grad_norm": 5.465454578399658,
613
+ "kl": 0.08742601610720158,
614
  "learning_rate": 2.735270783296286e-07,
615
+ "loss": 0.0001,
616
+ "reward": 1.421875,
617
+ "reward_std": 0.37412673234939575,
618
+ "rewards/emotion_reward_func": 0.4375,
619
+ "rewards/format_reward_func": 0.984375,
620
  "step": 47
621
  },
622
  {
623
+ "completion_length": 61.734375,
624
  "epoch": 0.064,
625
+ "grad_norm": 6.597202777862549,
626
+ "kl": 0.09020566754043102,
627
  "learning_rate": 2.6569762988232837e-07,
628
+ "loss": 0.0001,
629
+ "reward": 1.203125,
630
+ "reward_std": 0.4003961533308029,
631
+ "rewards/emotion_reward_func": 0.265625,
632
+ "rewards/format_reward_func": 0.9375,
633
  "step": 48
634
  },
635
  {
636
+ "completion_length": 63.21875,
637
  "epoch": 0.06533333333333333,
638
+ "grad_norm": 5.847234725952148,
639
+ "kl": 0.09095379617065191,
640
  "learning_rate": 2.5785268976953204e-07,
641
+ "loss": 0.0001,
642
+ "reward": 1.375,
643
+ "reward_std": 0.3876233473420143,
644
+ "rewards/emotion_reward_func": 0.390625,
645
+ "rewards/format_reward_func": 0.984375,
646
  "step": 49
647
  },
648
  {
649
+ "completion_length": 59.890625,
650
  "epoch": 0.06666666666666667,
651
+ "grad_norm": 7.6002936363220215,
652
+ "kl": 0.08561510033905506,
653
  "learning_rate": 2.5e-07,
654
+ "loss": 0.0001,
655
+ "reward": 1.25,
656
+ "reward_std": 0.3925696462392807,
657
+ "rewards/emotion_reward_func": 0.28125,
658
+ "rewards/format_reward_func": 0.96875,
659
  "step": 50
660
  }
661
  ],