eric-tramel commited on
Commit
cc8efd4
·
verified ·
1 Parent(s): fe16ea8

Training in progress, step 100, checkpoint

Browse files
checkpoint-100/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b4d33389f1cc4a587fb1cc0e286e0fb0f48ea297e318d76ce7fc7dc4e15d4ac
3
  size 1976163472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d46bbda891252d1bc7d7b207072338c59dd511f23a0bb1a77a233dd55bf64bc3
3
  size 1976163472
checkpoint-100/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:96d319c90d917e3d81f0107f270f4c2887636b656c752ce11241c1c9632e8d8b
3
  size 3952505274
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e61137fa392006841b3c9d3b0075e477b2bfd4da9800e8738d48e950a612ad64
3
  size 3952505274
checkpoint-100/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d952ecf17d9b51ed5e329f95c84c1249d0107fc4d0109eb87fb465326e8bff24
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8016113faecd368858eb6ebc4fcb61a9f1956107a452dada8e98bfda76288f6
3
  size 15024
checkpoint-100/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3dcf2b47c9a735f13280e89c61a0263a149b2cdaf95d263753b5201739a39512
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:974125c47b4b1edfbbcd3caeae8511abb215ef1494e888b05e7fde18c3ed08ed
3
  size 15024
checkpoint-100/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:006dea8e4200032c17f87db4801d9905390e53a088c7e9a09835fe1319c7bc83
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ac54bed7d9d2e9cb07f31f09406c3bf48fced29450844d31d747045d9f2f6ea
3
  size 15024
checkpoint-100/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cc83440fa410c77e3d675e7539df9956319717fb0fd9b5adaf4caf1f6ef19a1c
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:817aecd630d55ef1ec1da71dbd2daae7b12d4db119f38d05d48fdc7f0cb7134f
3
  size 15024
checkpoint-100/trainer_state.json CHANGED
@@ -9,7 +9,7 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "completion_length": 134.4375,
13
  "epoch": 0.0013333333333333333,
14
  "grad_norm": 0.0,
15
  "kl": 0.0,
@@ -22,23 +22,23 @@
22
  "step": 1
23
  },
24
  {
25
- "completion_length": 157.375,
26
  "epoch": 0.0026666666666666666,
27
- "grad_norm": 0.0,
28
  "kl": 0.0,
29
  "learning_rate": 4.995066821070679e-07,
30
- "loss": 0.0,
31
- "reward": 0.0,
32
- "reward_std": 0.0,
33
  "rewards/emotion_reward_func": 0.0,
34
- "rewards/format_reward_func": 0.0,
35
  "step": 2
36
  },
37
  {
38
- "completion_length": 161.75,
39
  "epoch": 0.004,
40
- "grad_norm": 0.0,
41
- "kl": 0.0,
42
  "learning_rate": 4.9889049115077e-07,
43
  "loss": 0.0,
44
  "reward": 0.0,
@@ -48,10 +48,10 @@
48
  "step": 3
49
  },
50
  {
51
- "completion_length": 147.8125,
52
  "epoch": 0.005333333333333333,
53
- "grad_norm": 0.0,
54
- "kl": 0.0,
55
  "learning_rate": 4.980286753286194e-07,
56
  "loss": 0.0,
57
  "reward": 0.0,
@@ -61,23 +61,23 @@
61
  "step": 4
62
  },
63
  {
64
- "completion_length": 207.5625,
65
  "epoch": 0.006666666666666667,
66
- "grad_norm": 0.0,
67
- "kl": 0.0,
68
  "learning_rate": 4.969220851487844e-07,
69
  "loss": 0.0,
70
- "reward": 0.0,
71
- "reward_std": 0.0,
72
  "rewards/emotion_reward_func": 0.0,
73
- "rewards/format_reward_func": 0.0,
74
  "step": 5
75
  },
76
  {
77
- "completion_length": 188.875,
78
  "epoch": 0.008,
79
- "grad_norm": 0.0,
80
- "kl": 0.0,
81
  "learning_rate": 4.955718126821722e-07,
82
  "loss": 0.0,
83
  "reward": 0.0,
@@ -87,23 +87,23 @@
87
  "step": 6
88
  },
89
  {
90
- "completion_length": 197.5,
91
  "epoch": 0.009333333333333334,
92
- "grad_norm": 0.0,
93
- "kl": 0.0,
94
  "learning_rate": 4.939791904846868e-07,
95
  "loss": 0.0,
96
- "reward": 0.0,
97
- "reward_std": 0.0,
98
  "rewards/emotion_reward_func": 0.0,
99
- "rewards/format_reward_func": 0.0,
100
  "step": 7
101
  },
102
  {
103
- "completion_length": 158.5625,
104
  "epoch": 0.010666666666666666,
105
- "grad_norm": 0.0,
106
- "kl": 0.0,
107
  "learning_rate": 4.921457902821578e-07,
108
  "loss": 0.0,
109
  "reward": 0.0,
@@ -113,10 +113,10 @@
113
  "step": 8
114
  },
115
  {
116
- "completion_length": 149.5,
117
  "epoch": 0.012,
118
- "grad_norm": 5.7435736656188965,
119
- "kl": 0.0,
120
  "learning_rate": 4.900734214192358e-07,
121
  "loss": 0.0,
122
  "reward": 0.0625,
@@ -126,10 +126,10 @@
126
  "step": 9
127
  },
128
  {
129
- "completion_length": 184.0625,
130
  "epoch": 0.013333333333333334,
131
- "grad_norm": 2.868870973587036,
132
- "kl": 0.0008592925732955337,
133
  "learning_rate": 4.877641290737883e-07,
134
  "loss": 0.0,
135
  "reward": 0.0625,
@@ -139,725 +139,725 @@
139
  "step": 10
140
  },
141
  {
142
- "completion_length": 173.5,
143
  "epoch": 0.014666666666666666,
144
- "grad_norm": 0.001981294248253107,
145
- "kl": 0.000871832889970392,
146
  "learning_rate": 4.852201922385564e-07,
147
  "loss": 0.0,
148
- "reward": 0.0,
149
- "reward_std": 0.0,
150
  "rewards/emotion_reward_func": 0.0,
151
- "rewards/format_reward_func": 0.0,
152
  "step": 11
153
  },
154
  {
155
- "completion_length": 181.0625,
156
  "epoch": 0.016,
157
- "grad_norm": 0.004562250338494778,
158
- "kl": 0.001329638296738267,
159
  "learning_rate": 4.824441214720628e-07,
160
  "loss": 0.0,
161
- "reward": 0.0,
162
- "reward_std": 0.0,
163
  "rewards/emotion_reward_func": 0.0,
164
- "rewards/format_reward_func": 0.0,
165
  "step": 12
166
  },
167
  {
168
- "completion_length": 183.375,
169
  "epoch": 0.017333333333333333,
170
- "grad_norm": 0.0010324150789529085,
171
- "kl": 0.0008550028433091938,
172
  "learning_rate": 4.794386564209952e-07,
173
  "loss": 0.0,
174
- "reward": 0.0,
175
- "reward_std": 0.0,
176
  "rewards/emotion_reward_func": 0.0,
177
- "rewards/format_reward_func": 0.0,
178
  "step": 13
179
  },
180
  {
181
- "completion_length": 192.875,
182
  "epoch": 0.018666666666666668,
183
- "grad_norm": 0.0010999701917171478,
184
- "kl": 0.0008370049763470888,
185
  "learning_rate": 4.762067631165049e-07,
186
  "loss": 0.0,
187
- "reward": 0.0,
188
- "reward_std": 0.0,
189
  "rewards/emotion_reward_func": 0.0,
190
- "rewards/format_reward_func": 0.0,
191
  "step": 14
192
  },
193
  {
194
- "completion_length": 205.6875,
195
  "epoch": 0.02,
196
- "grad_norm": 0.0007266216562129557,
197
- "kl": 0.0007688422920182347,
198
  "learning_rate": 4.7275163104709194e-07,
199
- "loss": 0.0,
200
- "reward": 0.0,
201
- "reward_std": 0.0,
202
  "rewards/emotion_reward_func": 0.0,
203
- "rewards/format_reward_func": 0.0,
204
  "step": 15
205
  },
206
  {
207
- "completion_length": 163.0625,
208
  "epoch": 0.021333333333333333,
209
- "grad_norm": 0.002079131081700325,
210
- "kl": 0.0009201083448715508,
211
  "learning_rate": 4.6907667001096585e-07,
212
  "loss": 0.0,
213
- "reward": 0.0,
214
- "reward_std": 0.0,
215
  "rewards/emotion_reward_func": 0.0,
216
- "rewards/format_reward_func": 0.0,
217
  "step": 16
218
  },
219
  {
220
- "completion_length": 131.6875,
221
  "epoch": 0.02266666666666667,
222
- "grad_norm": 0.005897476337850094,
223
- "kl": 0.0009303554543294013,
224
  "learning_rate": 4.6518550675098587e-07,
225
  "loss": 0.0,
226
- "reward": 0.0,
227
- "reward_std": 0.0,
228
  "rewards/emotion_reward_func": 0.0,
229
- "rewards/format_reward_func": 0.0,
230
  "step": 17
231
  },
232
  {
233
- "completion_length": 151.4375,
234
  "epoch": 0.024,
235
- "grad_norm": 0.002442071447148919,
236
- "kl": 0.0010876132873818278,
237
  "learning_rate": 4.6108198137550377e-07,
238
  "loss": 0.0,
239
- "reward": 0.0,
240
- "reward_std": 0.0,
241
  "rewards/emotion_reward_func": 0.0,
242
- "rewards/format_reward_func": 0.0,
243
  "step": 18
244
  },
245
  {
246
- "completion_length": 177.9375,
247
  "epoch": 0.025333333333333333,
248
- "grad_norm": 9.569433212280273,
249
- "kl": 0.004583852831274271,
250
  "learning_rate": 4.567701435686404e-07,
251
  "loss": 0.0,
252
- "reward": 0.0625,
253
- "reward_std": 0.125,
254
  "rewards/emotion_reward_func": 0.0,
255
- "rewards/format_reward_func": 0.0625,
256
  "step": 19
257
  },
258
  {
259
- "completion_length": 176.8125,
260
  "epoch": 0.02666666666666667,
261
- "grad_norm": 0.0038311942480504513,
262
- "kl": 0.001420724904164672,
263
  "learning_rate": 4.5225424859373684e-07,
264
  "loss": 0.0,
265
- "reward": 0.0,
266
- "reward_std": 0.0,
267
  "rewards/emotion_reward_func": 0.0,
268
- "rewards/format_reward_func": 0.0,
269
  "step": 20
270
  },
271
  {
272
- "completion_length": 145.6875,
273
  "epoch": 0.028,
274
- "grad_norm": 0.019769007340073586,
275
- "kl": 0.003168366150930524,
276
  "learning_rate": 4.475387530939226e-07,
277
  "loss": 0.0,
278
- "reward": 0.0,
279
- "reward_std": 0.0,
280
  "rewards/emotion_reward_func": 0.0,
281
- "rewards/format_reward_func": 0.0,
282
  "step": 21
283
  },
284
  {
285
- "completion_length": 99.125,
286
  "epoch": 0.029333333333333333,
287
- "grad_norm": 0.019412707537412643,
288
- "kl": 0.0031863912008702755,
289
  "learning_rate": 4.426283106939473e-07,
290
  "loss": 0.0,
291
- "reward": 0.0,
292
- "reward_std": 0.0,
293
  "rewards/emotion_reward_func": 0.0,
294
- "rewards/format_reward_func": 0.0,
295
  "step": 22
296
  },
297
  {
298
- "completion_length": 166.4375,
299
  "epoch": 0.030666666666666665,
300
- "grad_norm": 3.5533502101898193,
301
- "kl": 0.002899652114138007,
302
  "learning_rate": 4.375277674076149e-07,
303
  "loss": 0.0,
304
- "reward": 0.0625,
305
- "reward_std": 0.125,
306
  "rewards/emotion_reward_func": 0.0,
307
- "rewards/format_reward_func": 0.0625,
308
  "step": 23
309
  },
310
  {
311
- "completion_length": 138.3125,
312
  "epoch": 0.032,
313
- "grad_norm": 0.006241905968636274,
314
- "kl": 0.0016803923062980175,
315
  "learning_rate": 4.3224215685535287e-07,
316
  "loss": 0.0,
317
- "reward": 0.0,
318
- "reward_std": 0.0,
319
  "rewards/emotion_reward_func": 0.0,
320
- "rewards/format_reward_func": 0.0,
321
  "step": 24
322
  },
323
  {
324
- "completion_length": 158.4375,
325
  "epoch": 0.03333333333333333,
326
- "grad_norm": 4.496654987335205,
327
- "kl": 0.001099013490602374,
328
  "learning_rate": 4.2677669529663686e-07,
329
  "loss": 0.0,
330
- "reward": 0.0625,
331
- "reward_std": 0.125,
332
  "rewards/emotion_reward_func": 0.0,
333
- "rewards/format_reward_func": 0.0625,
334
  "step": 25
335
  },
336
  {
337
- "completion_length": 189.25,
338
  "epoch": 0.034666666666666665,
339
- "grad_norm": 0.004112588707357645,
340
- "kl": 0.0016956878826022148,
341
  "learning_rate": 4.2113677648217216e-07,
342
- "loss": 0.0,
343
- "reward": 0.0,
344
- "reward_std": 0.0,
345
  "rewards/emotion_reward_func": 0.0,
346
- "rewards/format_reward_func": 0.0,
347
  "step": 26
348
  },
349
  {
350
- "completion_length": 188.1875,
351
  "epoch": 0.036,
352
- "grad_norm": 3.3644537925720215,
353
- "kl": 0.0010217109229415655,
354
  "learning_rate": 4.1532796633091294e-07,
355
  "loss": 0.0,
356
- "reward": 0.0625,
357
- "reward_std": 0.125,
358
  "rewards/emotion_reward_func": 0.0,
359
- "rewards/format_reward_func": 0.0625,
360
  "step": 27
361
  },
362
  {
363
- "completion_length": 185.1875,
364
  "epoch": 0.037333333333333336,
365
- "grad_norm": 0.0016409006202593446,
366
- "kl": 0.0010863029165193439,
367
  "learning_rate": 4.0935599743717244e-07,
368
- "loss": 0.0,
369
- "reward": 0.0,
370
- "reward_std": 0.0,
371
  "rewards/emotion_reward_func": 0.0,
372
- "rewards/format_reward_func": 0.0,
373
  "step": 28
374
  },
375
  {
376
- "completion_length": 144.3125,
377
  "epoch": 0.03866666666666667,
378
- "grad_norm": 0.011104789562523365,
379
- "kl": 0.0030689446721225977,
380
  "learning_rate": 4.0322676341324414e-07,
381
  "loss": 0.0,
382
- "reward": 0.0,
383
- "reward_std": 0.0,
384
  "rewards/emotion_reward_func": 0.0,
385
- "rewards/format_reward_func": 0.0,
386
  "step": 29
387
  },
388
  {
389
- "completion_length": 221.75,
390
  "epoch": 0.04,
391
- "grad_norm": 3.384615659713745,
392
- "kl": 0.0014688875526189804,
393
  "learning_rate": 3.9694631307311825e-07,
394
  "loss": 0.0,
395
- "reward": 0.0625,
396
- "reward_std": 0.125,
397
  "rewards/emotion_reward_func": 0.0,
398
- "rewards/format_reward_func": 0.0625,
399
  "step": 30
400
  },
401
  {
402
- "completion_length": 178.25,
403
  "epoch": 0.04133333333333333,
404
- "grad_norm": 0.007718691602349281,
405
- "kl": 0.002737755421549082,
406
  "learning_rate": 3.9052084446303265e-07,
407
- "loss": 0.0,
408
- "reward": 0.0,
409
- "reward_std": 0.0,
410
  "rewards/emotion_reward_func": 0.0,
411
- "rewards/format_reward_func": 0.0,
412
  "step": 31
413
  },
414
  {
415
- "completion_length": 165.1875,
416
  "epoch": 0.042666666666666665,
417
- "grad_norm": 3.6017088890075684,
418
- "kl": 0.0028503944631665945,
419
  "learning_rate": 3.839566987447491e-07,
420
  "loss": 0.0,
421
- "reward": 0.0625,
422
- "reward_std": 0.125,
423
  "rewards/emotion_reward_func": 0.0,
424
- "rewards/format_reward_func": 0.0625,
425
  "step": 32
426
  },
427
  {
428
- "completion_length": 144.0,
429
  "epoch": 0.044,
430
- "grad_norm": 4.880671977996826,
431
- "kl": 0.009817617945373058,
432
  "learning_rate": 3.7726035393759283e-07,
433
- "loss": 0.0,
434
- "reward": 0.0625,
435
- "reward_std": 0.125,
436
  "rewards/emotion_reward_func": 0.0,
437
- "rewards/format_reward_func": 0.0625,
438
  "step": 33
439
  },
440
  {
441
- "completion_length": 129.4375,
442
  "epoch": 0.04533333333333334,
443
- "grad_norm": 0.002933287527412176,
444
- "kl": 0.0015820721164345741,
445
  "learning_rate": 3.704384185254288e-07,
446
- "loss": 0.0,
447
- "reward": 0.0,
448
- "reward_std": 0.0,
449
  "rewards/emotion_reward_func": 0.0,
450
- "rewards/format_reward_func": 0.0,
451
  "step": 34
452
  },
453
  {
454
- "completion_length": 102.9375,
455
  "epoch": 0.04666666666666667,
456
- "grad_norm": 8.057755470275879,
457
- "kl": 0.011732880026102066,
458
  "learning_rate": 3.634976249348867e-07,
459
- "loss": 0.0,
460
- "reward": 0.0625,
461
- "reward_std": 0.125,
462
  "rewards/emotion_reward_func": 0.0,
463
- "rewards/format_reward_func": 0.0625,
464
  "step": 35
465
  },
466
  {
467
- "completion_length": 144.0,
468
  "epoch": 0.048,
469
- "grad_norm": 3.886326551437378,
470
- "kl": 0.0062111858278512955,
471
  "learning_rate": 3.5644482289126813e-07,
472
- "loss": 0.0,
473
- "reward": 0.0625,
474
- "reward_std": 0.125,
475
  "rewards/emotion_reward_func": 0.0,
476
- "rewards/format_reward_func": 0.0625,
477
  "step": 36
478
  },
479
  {
480
- "completion_length": 138.25,
481
  "epoch": 0.04933333333333333,
482
- "grad_norm": 8.39665412902832,
483
- "kl": 0.0049956561997532845,
484
  "learning_rate": 3.492869726586951e-07,
485
- "loss": 0.0,
486
- "reward": 0.125,
487
- "reward_std": 0.25,
488
  "rewards/emotion_reward_func": 0.0,
489
- "rewards/format_reward_func": 0.125,
490
  "step": 37
491
  },
492
  {
493
- "completion_length": 184.1875,
494
  "epoch": 0.050666666666666665,
495
- "grad_norm": 3.575878858566284,
496
- "kl": 0.009746416471898556,
497
  "learning_rate": 3.4203113817116953e-07,
498
- "loss": 0.0,
499
- "reward": 0.0625,
500
- "reward_std": 0.125,
501
  "rewards/emotion_reward_func": 0.0,
502
- "rewards/format_reward_func": 0.0625,
503
  "step": 38
504
  },
505
  {
506
- "completion_length": 173.1875,
507
  "epoch": 0.052,
508
- "grad_norm": 4.618286609649658,
509
- "kl": 0.004162437282502651,
510
  "learning_rate": 3.346844800613229e-07,
511
- "loss": 0.0,
512
- "reward": 0.0625,
513
- "reward_std": 0.125,
514
  "rewards/emotion_reward_func": 0.0,
515
- "rewards/format_reward_func": 0.0625,
516
  "step": 39
517
  },
518
  {
519
- "completion_length": 144.75,
520
  "epoch": 0.05333333333333334,
521
- "grad_norm": 4.964860916137695,
522
- "kl": 0.0039725033566355705,
523
  "learning_rate": 3.272542485937368e-07,
524
- "loss": 0.0,
525
- "reward": 0.0625,
526
- "reward_std": 0.125,
527
  "rewards/emotion_reward_func": 0.0,
528
- "rewards/format_reward_func": 0.0625,
529
  "step": 40
530
  },
531
  {
532
- "completion_length": 191.5,
533
  "epoch": 0.05466666666666667,
534
- "grad_norm": 5.967226028442383,
535
- "kl": 0.008939428254961967,
536
  "learning_rate": 3.1974777650980734e-07,
537
- "loss": 0.0,
538
- "reward": 0.125,
539
- "reward_std": 0.25,
540
  "rewards/emotion_reward_func": 0.0,
541
- "rewards/format_reward_func": 0.125,
542
  "step": 41
543
  },
544
  {
545
- "completion_length": 149.6875,
546
  "epoch": 0.056,
547
- "grad_norm": 5.210999488830566,
548
- "kl": 0.01842350885272026,
549
  "learning_rate": 3.121724717912138e-07,
550
- "loss": 0.0,
551
- "reward": 0.0625,
552
- "reward_std": 0.125,
553
  "rewards/emotion_reward_func": 0.0,
554
- "rewards/format_reward_func": 0.0625,
555
  "step": 42
556
  },
557
  {
558
- "completion_length": 138.5,
559
  "epoch": 0.05733333333333333,
560
- "grad_norm": 12.056984901428223,
561
- "kl": 0.019806817173957825,
562
  "learning_rate": 3.0453581034913565e-07,
563
- "loss": 0.0,
564
- "reward": 0.3125,
565
- "reward_std": 0.51933753490448,
566
  "rewards/emotion_reward_func": 0.0,
567
- "rewards/format_reward_func": 0.3125,
568
  "step": 43
569
  },
570
  {
571
- "completion_length": 115.9375,
572
  "epoch": 0.058666666666666666,
573
- "grad_norm": 17.690818786621094,
574
- "kl": 0.054062891751527786,
575
  "learning_rate": 2.968453286464312e-07,
576
  "loss": 0.0001,
577
- "reward": 0.1875,
578
  "reward_std": 0.375,
579
  "rewards/emotion_reward_func": 0.0,
580
- "rewards/format_reward_func": 0.1875,
581
  "step": 44
582
  },
583
  {
584
- "completion_length": 137.1875,
585
  "epoch": 0.06,
586
- "grad_norm": 12.18422794342041,
587
- "kl": 0.025158584117889404,
588
  "learning_rate": 2.8910861626005773e-07,
589
- "loss": 0.0,
590
- "reward": 0.25,
591
- "reward_std": 0.5,
592
  "rewards/emotion_reward_func": 0.0,
593
- "rewards/format_reward_func": 0.25,
594
  "step": 45
595
  },
596
  {
597
- "completion_length": 145.0625,
598
  "epoch": 0.06133333333333333,
599
- "grad_norm": 4.150605201721191,
600
- "kl": 0.011470025405287743,
601
  "learning_rate": 2.8133330839107604e-07,
602
- "loss": 0.0,
603
- "reward": 0.0625,
604
- "reward_std": 0.125,
605
  "rewards/emotion_reward_func": 0.0,
606
- "rewards/format_reward_func": 0.0625,
607
  "step": 46
608
  },
609
  {
610
- "completion_length": 149.6875,
611
  "epoch": 0.06266666666666666,
612
- "grad_norm": 7.5838541984558105,
613
- "kl": 0.028349004685878754,
614
  "learning_rate": 2.735270783296286e-07,
615
- "loss": 0.0,
616
- "reward": 0.3125,
617
- "reward_std": 0.41367512941360474,
618
  "rewards/emotion_reward_func": 0.0,
619
- "rewards/format_reward_func": 0.3125,
620
  "step": 47
621
  },
622
  {
623
- "completion_length": 129.6875,
624
  "epoch": 0.064,
625
- "grad_norm": 7.82366943359375,
626
- "kl": 0.012228617444634438,
627
  "learning_rate": 2.6569762988232837e-07,
628
- "loss": 0.0,
629
- "reward": 0.1875,
630
- "reward_std": 0.375,
631
  "rewards/emotion_reward_func": 0.0,
632
- "rewards/format_reward_func": 0.1875,
633
  "step": 48
634
  },
635
  {
636
- "completion_length": 167.125,
637
  "epoch": 0.06533333333333333,
638
- "grad_norm": 6.554472923278809,
639
- "kl": 0.01310904324054718,
640
  "learning_rate": 2.5785268976953204e-07,
641
- "loss": 0.0,
642
- "reward": 0.1875,
643
- "reward_std": 0.375,
644
  "rewards/emotion_reward_func": 0.0,
645
- "rewards/format_reward_func": 0.1875,
646
  "step": 49
647
  },
648
  {
649
- "completion_length": 157.1875,
650
  "epoch": 0.06666666666666667,
651
- "grad_norm": 6.399176120758057,
652
- "kl": 0.02061685547232628,
653
  "learning_rate": 2.5e-07,
654
- "loss": 0.0,
655
- "reward": 0.4375,
656
- "reward_std": 0.41367512941360474,
657
  "rewards/emotion_reward_func": 0.0,
658
- "rewards/format_reward_func": 0.4375,
659
  "step": 50
660
  },
661
  {
662
- "completion_length": 164.875,
663
  "epoch": 0.068,
664
- "grad_norm": 7.1559977531433105,
665
- "kl": 0.02672770991921425,
666
  "learning_rate": 2.4214731023046794e-07,
667
- "loss": 0.0,
668
- "reward": 0.25,
669
- "reward_std": 0.39433756470680237,
670
  "rewards/emotion_reward_func": 0.0,
671
- "rewards/format_reward_func": 0.25,
672
  "step": 51
673
  },
674
  {
675
- "completion_length": 131.75,
676
  "epoch": 0.06933333333333333,
677
- "grad_norm": 10.416351318359375,
678
- "kl": 0.09771859645843506,
679
  "learning_rate": 2.3430237011767164e-07,
680
  "loss": 0.0001,
681
- "reward": 0.6875,
682
- "reward_std": 0.41367512941360474,
683
  "rewards/emotion_reward_func": 0.0,
684
- "rewards/format_reward_func": 0.6875,
685
  "step": 52
686
  },
687
  {
688
- "completion_length": 134.4375,
689
  "epoch": 0.07066666666666667,
690
- "grad_norm": 9.832062721252441,
691
- "kl": 0.03194844722747803,
692
  "learning_rate": 2.264729216703714e-07,
693
- "loss": 0.0,
694
- "reward": 0.375,
695
- "reward_std": 0.5386751294136047,
696
  "rewards/emotion_reward_func": 0.0,
697
- "rewards/format_reward_func": 0.375,
698
  "step": 53
699
  },
700
  {
701
- "completion_length": 155.375,
702
  "epoch": 0.072,
703
- "grad_norm": 7.258142471313477,
704
- "kl": 0.022360337898135185,
705
  "learning_rate": 2.1866669160892389e-07,
706
- "loss": 0.0,
707
- "reward": 0.5,
708
- "reward_std": 0.39433756470680237,
709
  "rewards/emotion_reward_func": 0.0,
710
- "rewards/format_reward_func": 0.5,
711
  "step": 54
712
  },
713
  {
714
- "completion_length": 128.375,
715
  "epoch": 0.07333333333333333,
716
- "grad_norm": 8.793972969055176,
717
- "kl": 0.029077202081680298,
718
  "learning_rate": 2.1089138373994222e-07,
719
- "loss": 0.0,
720
- "reward": 0.4375,
721
- "reward_std": 0.5580127239227295,
722
  "rewards/emotion_reward_func": 0.0,
723
- "rewards/format_reward_func": 0.4375,
724
  "step": 55
725
  },
726
  {
727
- "completion_length": 127.9375,
728
  "epoch": 0.07466666666666667,
729
- "grad_norm": 22.435115814208984,
730
- "kl": 0.23208478093147278,
731
  "learning_rate": 2.0315467135356878e-07,
732
- "loss": 0.0002,
733
- "reward": 0.5,
734
- "reward_std": 0.5386751294136047,
735
  "rewards/emotion_reward_func": 0.0,
736
- "rewards/format_reward_func": 0.5,
737
  "step": 56
738
  },
739
  {
740
- "completion_length": 115.6875,
741
  "epoch": 0.076,
742
- "grad_norm": 7.384529113769531,
743
- "kl": 0.02935698628425598,
744
  "learning_rate": 1.954641896508644e-07,
745
- "loss": 0.0,
746
- "reward": 0.5625,
747
- "reward_std": 0.41367512941360474,
748
  "rewards/emotion_reward_func": 0.0,
749
- "rewards/format_reward_func": 0.5625,
750
  "step": 57
751
  },
752
  {
753
- "completion_length": 161.4375,
754
  "epoch": 0.07733333333333334,
755
- "grad_norm": 5.652832984924316,
756
- "kl": 0.02788429707288742,
757
  "learning_rate": 1.8782752820878633e-07,
758
- "loss": 0.0,
759
- "reward": 0.5625,
760
- "reward_std": 0.26933756470680237,
761
  "rewards/emotion_reward_func": 0.0,
762
- "rewards/format_reward_func": 0.5625,
763
  "step": 58
764
  },
765
  {
766
- "completion_length": 114.0625,
767
  "epoch": 0.07866666666666666,
768
- "grad_norm": 7.544243335723877,
769
- "kl": 0.0415709987282753,
770
  "learning_rate": 1.802522234901927e-07,
771
- "loss": 0.0,
772
- "reward": 0.75,
773
- "reward_std": 0.25,
774
  "rewards/emotion_reward_func": 0.0,
775
- "rewards/format_reward_func": 0.75,
776
  "step": 59
777
  },
778
  {
779
- "completion_length": 128.25,
780
  "epoch": 0.08,
781
- "grad_norm": 6.015617847442627,
782
- "kl": 0.038651518523693085,
783
  "learning_rate": 1.7274575140626315e-07,
784
- "loss": 0.0,
785
- "reward": 0.875,
786
- "reward_std": 0.25,
787
  "rewards/emotion_reward_func": 0.0,
788
- "rewards/format_reward_func": 0.875,
789
  "step": 60
790
  },
791
  {
792
- "completion_length": 146.125,
793
  "epoch": 0.08133333333333333,
794
- "grad_norm": 7.534911155700684,
795
- "kl": 0.023068124428391457,
796
  "learning_rate": 1.6531551993867715e-07,
797
- "loss": 0.0,
798
- "reward": 0.6875,
799
- "reward_std": 0.375,
800
  "rewards/emotion_reward_func": 0.0,
801
- "rewards/format_reward_func": 0.6875,
802
  "step": 61
803
  },
804
  {
805
- "completion_length": 136.6875,
806
  "epoch": 0.08266666666666667,
807
- "grad_norm": 7.638873100280762,
808
- "kl": 0.03138712793588638,
809
  "learning_rate": 1.579688618288305e-07,
810
- "loss": 0.0,
811
- "reward": 0.75,
812
- "reward_std": 0.39433756470680237,
813
  "rewards/emotion_reward_func": 0.0,
814
- "rewards/format_reward_func": 0.75,
815
  "step": 62
816
  },
817
  {
818
- "completion_length": 124.8125,
819
  "epoch": 0.084,
820
- "grad_norm": 22.380794525146484,
821
- "kl": 0.28954756259918213,
822
  "learning_rate": 1.5071302734130486e-07,
823
- "loss": 0.0003,
824
- "reward": 0.75,
825
- "reward_std": 0.5,
826
  "rewards/emotion_reward_func": 0.0,
827
- "rewards/format_reward_func": 0.75,
828
  "step": 63
829
  },
830
  {
831
- "completion_length": 126.3125,
832
  "epoch": 0.08533333333333333,
833
- "grad_norm": 7.005850791931152,
834
- "kl": 0.03995266556739807,
835
  "learning_rate": 1.4355517710873182e-07,
836
- "loss": 0.0,
837
- "reward": 0.8125,
838
- "reward_std": 0.375,
839
  "rewards/emotion_reward_func": 0.0,
840
- "rewards/format_reward_func": 0.8125,
841
  "step": 64
842
  },
843
  {
844
- "completion_length": 167.5,
845
  "epoch": 0.08666666666666667,
846
- "grad_norm": 6.591019153594971,
847
- "kl": 0.029363617300987244,
848
  "learning_rate": 1.365023750651133e-07,
849
- "loss": 0.0,
850
- "reward": 0.8125,
851
- "reward_std": 0.375,
852
  "rewards/emotion_reward_func": 0.0,
853
- "rewards/format_reward_func": 0.8125,
854
  "step": 65
855
  },
856
  {
857
- "completion_length": 120.0,
858
  "epoch": 0.088,
859
- "grad_norm": 3.859731674194336,
860
- "kl": 0.05216670036315918,
861
  "learning_rate": 1.2956158147457114e-07,
862
  "loss": 0.0001,
863
  "reward": 0.9375,
@@ -867,207 +867,207 @@
867
  "step": 66
868
  },
869
  {
870
- "completion_length": 116.0,
871
  "epoch": 0.08933333333333333,
872
- "grad_norm": 11.705424308776855,
873
- "kl": 0.42816463112831116,
874
  "learning_rate": 1.2273964606240718e-07,
875
- "loss": 0.0004,
876
- "reward": 0.625,
877
- "reward_std": 0.4330126941204071,
878
  "rewards/emotion_reward_func": 0.0,
879
- "rewards/format_reward_func": 0.625,
880
  "step": 67
881
  },
882
  {
883
- "completion_length": 157.125,
884
  "epoch": 0.09066666666666667,
885
- "grad_norm": 7.195036888122559,
886
- "kl": 0.030014928430318832,
887
  "learning_rate": 1.1604330125525078e-07,
888
- "loss": 0.0,
889
- "reward": 0.6875,
890
- "reward_std": 0.41367512941360474,
891
  "rewards/emotion_reward_func": 0.0,
892
- "rewards/format_reward_func": 0.6875,
893
  "step": 68
894
  },
895
  {
896
- "completion_length": 114.5,
897
  "epoch": 0.092,
898
- "grad_norm": 6.199704647064209,
899
- "kl": 0.04237810894846916,
900
  "learning_rate": 1.0947915553696741e-07,
901
- "loss": 0.0,
902
- "reward": 0.8125,
903
- "reward_std": 0.26933756470680237,
904
  "rewards/emotion_reward_func": 0.0,
905
- "rewards/format_reward_func": 0.8125,
906
  "step": 69
907
  },
908
  {
909
- "completion_length": 143.125,
910
  "epoch": 0.09333333333333334,
911
- "grad_norm": 8.729787826538086,
912
- "kl": 0.0419083908200264,
913
  "learning_rate": 1.0305368692688174e-07,
914
- "loss": 0.0,
915
- "reward": 0.625,
916
- "reward_std": 0.5386751294136047,
917
  "rewards/emotion_reward_func": 0.0,
918
- "rewards/format_reward_func": 0.625,
919
  "step": 70
920
  },
921
  {
922
- "completion_length": 142.0,
923
  "epoch": 0.09466666666666666,
924
- "grad_norm": 7.905228137969971,
925
- "kl": 0.04787478223443031,
926
  "learning_rate": 9.677323658675593e-08,
927
- "loss": 0.0,
928
- "reward": 0.625,
929
- "reward_std": 0.5,
930
  "rewards/emotion_reward_func": 0.0,
931
- "rewards/format_reward_func": 0.625,
932
  "step": 71
933
  },
934
  {
935
- "completion_length": 144.75,
936
  "epoch": 0.096,
937
- "grad_norm": 6.076593399047852,
938
- "kl": 0.03787325695157051,
939
  "learning_rate": 9.064400256282755e-08,
940
- "loss": 0.0,
941
- "reward": 0.5625,
942
- "reward_std": 0.26933756470680237,
943
  "rewards/emotion_reward_func": 0.0,
944
- "rewards/format_reward_func": 0.5625,
945
  "step": 72
946
  },
947
  {
948
- "completion_length": 170.625,
949
  "epoch": 0.09733333333333333,
950
- "grad_norm": 5.793034076690674,
951
- "kl": 0.025196196511387825,
952
  "learning_rate": 8.467203366908707e-08,
953
- "loss": 0.0,
954
- "reward": 0.8125,
955
- "reward_std": 0.375,
956
  "rewards/emotion_reward_func": 0.0,
957
- "rewards/format_reward_func": 0.8125,
958
  "step": 73
959
  },
960
  {
961
- "completion_length": 128.25,
962
  "epoch": 0.09866666666666667,
963
- "grad_norm": 7.563605308532715,
964
- "kl": 0.04877077788114548,
965
  "learning_rate": 7.886322351782782e-08,
966
- "loss": 0.0,
967
- "reward": 0.75,
968
- "reward_std": 0.39433756470680237,
969
  "rewards/emotion_reward_func": 0.0,
970
- "rewards/format_reward_func": 0.75,
971
  "step": 74
972
  },
973
  {
974
- "completion_length": 104.3125,
975
  "epoch": 0.1,
976
- "grad_norm": 9.703227043151855,
977
- "kl": 0.0765969455242157,
978
  "learning_rate": 7.322330470336313e-08,
979
  "loss": 0.0001,
980
- "reward": 0.5625,
981
- "reward_std": 0.41367512941360474,
982
  "rewards/emotion_reward_func": 0.0,
983
- "rewards/format_reward_func": 0.5625,
984
  "step": 75
985
  },
986
  {
987
- "completion_length": 131.5,
988
  "epoch": 0.10133333333333333,
989
- "grad_norm": 8.99862003326416,
990
- "kl": 0.04248180240392685,
991
  "learning_rate": 6.775784314464716e-08,
992
- "loss": 0.0,
993
- "reward": 0.75,
994
- "reward_std": 0.5,
995
  "rewards/emotion_reward_func": 0.0,
996
- "rewards/format_reward_func": 0.75,
997
  "step": 76
998
  },
999
  {
1000
- "completion_length": 140.625,
1001
  "epoch": 0.10266666666666667,
1002
- "grad_norm": 8.795363426208496,
1003
- "kl": 0.043973423540592194,
1004
  "learning_rate": 6.24722325923851e-08,
1005
- "loss": 0.0,
1006
- "reward": 0.5625,
1007
- "reward_std": 0.51933753490448,
1008
  "rewards/emotion_reward_func": 0.0,
1009
- "rewards/format_reward_func": 0.5625,
1010
  "step": 77
1011
  },
1012
  {
1013
- "completion_length": 104.5625,
1014
  "epoch": 0.104,
1015
- "grad_norm": 9.773259162902832,
1016
- "kl": 0.07939579337835312,
1017
  "learning_rate": 5.737168930605271e-08,
1018
- "loss": 0.0001,
1019
- "reward": 0.75,
1020
- "reward_std": 0.25,
1021
  "rewards/emotion_reward_func": 0.0,
1022
- "rewards/format_reward_func": 0.75,
1023
  "step": 78
1024
  },
1025
  {
1026
- "completion_length": 119.1875,
1027
  "epoch": 0.10533333333333333,
1028
- "grad_norm": 6.493321418762207,
1029
- "kl": 0.07017514854669571,
1030
  "learning_rate": 5.246124690607739e-08,
1031
- "loss": 0.0001,
1032
- "reward": 0.75,
1033
- "reward_std": 0.28867512941360474,
1034
  "rewards/emotion_reward_func": 0.0,
1035
- "rewards/format_reward_func": 0.75,
1036
  "step": 79
1037
  },
1038
  {
1039
- "completion_length": 121.125,
1040
  "epoch": 0.10666666666666667,
1041
- "grad_norm": 6.501302719116211,
1042
- "kl": 0.04043734073638916,
1043
  "learning_rate": 4.774575140626316e-08,
1044
- "loss": 0.0,
1045
- "reward": 0.875,
1046
- "reward_std": 0.25,
1047
  "rewards/emotion_reward_func": 0.0,
1048
- "rewards/format_reward_func": 0.875,
1049
  "step": 80
1050
  },
1051
  {
1052
- "completion_length": 171.9375,
1053
  "epoch": 0.108,
1054
- "grad_norm": 5.958766937255859,
1055
- "kl": 0.03187550604343414,
1056
  "learning_rate": 4.3229856431359513e-08,
1057
- "loss": 0.0,
1058
- "reward": 0.75,
1059
- "reward_std": 0.39433756470680237,
1060
  "rewards/emotion_reward_func": 0.0,
1061
- "rewards/format_reward_func": 0.75,
1062
  "step": 81
1063
  },
1064
  {
1065
- "completion_length": 145.1875,
1066
  "epoch": 0.10933333333333334,
1067
- "grad_norm": 3.5965261459350586,
1068
- "kl": 0.045581966638565063,
1069
  "learning_rate": 3.8918018624496286e-08,
1070
- "loss": 0.0,
1071
  "reward": 0.9375,
1072
  "reward_std": 0.125,
1073
  "rewards/emotion_reward_func": 0.0,
@@ -1075,101 +1075,101 @@
1075
  "step": 82
1076
  },
1077
  {
1078
- "completion_length": 150.375,
1079
  "epoch": 0.11066666666666666,
1080
- "grad_norm": 3.3176019191741943,
1081
- "kl": 0.04733135551214218,
1082
  "learning_rate": 3.481449324901411e-08,
1083
- "loss": 0.0,
1084
- "reward": 0.875,
1085
- "reward_std": 0.14433756470680237,
1086
  "rewards/emotion_reward_func": 0.0,
1087
- "rewards/format_reward_func": 0.875,
1088
  "step": 83
1089
  },
1090
  {
1091
- "completion_length": 125.0,
1092
  "epoch": 0.112,
1093
- "grad_norm": 5.251501560211182,
1094
- "kl": 0.04413582757115364,
1095
  "learning_rate": 3.092332998903416e-08,
1096
- "loss": 0.0,
1097
- "reward": 0.875,
1098
- "reward_std": 0.14433756470680237,
1099
  "rewards/emotion_reward_func": 0.0,
1100
- "rewards/format_reward_func": 0.875,
1101
  "step": 84
1102
  },
1103
  {
1104
- "completion_length": 124.5,
1105
  "epoch": 0.11333333333333333,
1106
- "grad_norm": 10.067094802856445,
1107
- "kl": 0.05217205733060837,
1108
  "learning_rate": 2.724836895290805e-08,
1109
  "loss": 0.0001,
1110
- "reward": 0.6875,
1111
- "reward_std": 0.51933753490448,
1112
  "rewards/emotion_reward_func": 0.0,
1113
- "rewards/format_reward_func": 0.6875,
1114
  "step": 85
1115
  },
1116
  {
1117
- "completion_length": 129.8125,
1118
  "epoch": 0.11466666666666667,
1119
- "grad_norm": 5.6904096603393555,
1120
- "kl": 0.039646558463573456,
1121
  "learning_rate": 2.379323688349516e-08,
1122
- "loss": 0.0,
1123
- "reward": 0.875,
1124
- "reward_std": 0.25,
1125
  "rewards/emotion_reward_func": 0.0,
1126
- "rewards/format_reward_func": 0.875,
1127
  "step": 86
1128
  },
1129
  {
1130
- "completion_length": 114.8125,
1131
  "epoch": 0.116,
1132
- "grad_norm": 3.087869644165039,
1133
- "kl": 0.050276342779397964,
1134
  "learning_rate": 2.0561343579004715e-08,
1135
  "loss": 0.0001,
1136
- "reward": 0.9375,
1137
- "reward_std": 0.125,
1138
  "rewards/emotion_reward_func": 0.0,
1139
- "rewards/format_reward_func": 0.9375,
1140
  "step": 87
1141
  },
1142
  {
1143
- "completion_length": 116.875,
1144
  "epoch": 0.11733333333333333,
1145
- "grad_norm": 13.770880699157715,
1146
- "kl": 0.655558168888092,
1147
  "learning_rate": 1.7555878527937163e-08,
1148
- "loss": 0.0007,
1149
- "reward": 0.75,
1150
- "reward_std": 0.39433756470680237,
1151
  "rewards/emotion_reward_func": 0.0,
1152
- "rewards/format_reward_func": 0.75,
1153
  "step": 88
1154
  },
1155
  {
1156
- "completion_length": 113.375,
1157
  "epoch": 0.11866666666666667,
1158
- "grad_norm": 5.708318710327148,
1159
- "kl": 0.0568830668926239,
1160
  "learning_rate": 1.4779807761443635e-08,
1161
  "loss": 0.0001,
1162
- "reward": 0.8125,
1163
- "reward_std": 0.26933756470680237,
1164
  "rewards/emotion_reward_func": 0.0,
1165
- "rewards/format_reward_func": 0.8125,
1166
  "step": 89
1167
  },
1168
  {
1169
- "completion_length": 144.25,
1170
  "epoch": 0.12,
1171
- "grad_norm": 5.573944091796875,
1172
- "kl": 0.05078238993883133,
1173
  "learning_rate": 1.2235870926211616e-08,
1174
  "loss": 0.0001,
1175
  "reward": 0.8125,
@@ -1179,75 +1179,75 @@
1179
  "step": 90
1180
  },
1181
  {
1182
- "completion_length": 144.3125,
1183
  "epoch": 0.12133333333333333,
1184
- "grad_norm": 7.064413547515869,
1185
- "kl": 0.04295295104384422,
1186
  "learning_rate": 9.926578580764234e-09,
1187
- "loss": 0.0,
1188
- "reward": 0.75,
1189
- "reward_std": 0.39433756470680237,
1190
  "rewards/emotion_reward_func": 0.0,
1191
- "rewards/format_reward_func": 0.75,
1192
  "step": 91
1193
  },
1194
  {
1195
- "completion_length": 107.625,
1196
  "epoch": 0.12266666666666666,
1197
- "grad_norm": 10.970053672790527,
1198
- "kl": 0.04969753324985504,
1199
  "learning_rate": 7.85420971784223e-09,
1200
- "loss": 0.0,
1201
- "reward": 0.75,
1202
- "reward_std": 0.5,
1203
  "rewards/emotion_reward_func": 0.0,
1204
- "rewards/format_reward_func": 0.75,
1205
  "step": 92
1206
  },
1207
  {
1208
- "completion_length": 134.9375,
1209
  "epoch": 0.124,
1210
- "grad_norm": 5.796207427978516,
1211
- "kl": 0.044127851724624634,
1212
  "learning_rate": 6.020809515313141e-09,
1213
- "loss": 0.0,
1214
- "reward": 0.875,
1215
- "reward_std": 0.25,
1216
  "rewards/emotion_reward_func": 0.0,
1217
- "rewards/format_reward_func": 0.875,
1218
  "step": 93
1219
  },
1220
  {
1221
- "completion_length": 109.75,
1222
  "epoch": 0.12533333333333332,
1223
- "grad_norm": 6.654936790466309,
1224
- "kl": 0.05904502421617508,
1225
  "learning_rate": 4.4281873178278475e-09,
1226
- "loss": 0.0001,
1227
- "reward": 0.875,
1228
- "reward_std": 0.25,
1229
  "rewards/emotion_reward_func": 0.0,
1230
- "rewards/format_reward_func": 0.875,
1231
  "step": 94
1232
  },
1233
  {
1234
- "completion_length": 124.125,
1235
  "epoch": 0.12666666666666668,
1236
- "grad_norm": 8.560564041137695,
1237
- "kl": 0.6496566534042358,
1238
  "learning_rate": 3.077914851215585e-09,
1239
- "loss": 0.0006,
1240
- "reward": 0.5625,
1241
- "reward_std": 0.375,
1242
  "rewards/emotion_reward_func": 0.0,
1243
- "rewards/format_reward_func": 0.5625,
1244
  "step": 95
1245
  },
1246
  {
1247
- "completion_length": 131.9375,
1248
  "epoch": 0.128,
1249
- "grad_norm": 5.740026473999023,
1250
- "kl": 0.06654007732868195,
1251
  "learning_rate": 1.9713246713805587e-09,
1252
  "loss": 0.0001,
1253
  "reward": 0.875,
@@ -1257,55 +1257,55 @@
1257
  "step": 96
1258
  },
1259
  {
1260
- "completion_length": 104.1875,
1261
  "epoch": 0.12933333333333333,
1262
- "grad_norm": 0.006113509647548199,
1263
- "kl": 0.06723961234092712,
1264
  "learning_rate": 1.1095088492300008e-09,
1265
- "loss": 0.0001,
1266
- "reward": 1.0,
1267
- "reward_std": 0.0,
1268
  "rewards/emotion_reward_func": 0.0,
1269
- "rewards/format_reward_func": 1.0,
1270
  "step": 97
1271
  },
1272
  {
1273
- "completion_length": 127.5625,
1274
  "epoch": 0.13066666666666665,
1275
- "grad_norm": 3.994433879852295,
1276
- "kl": 0.048512913286685944,
1277
  "learning_rate": 4.933178929321102e-10,
1278
- "loss": 0.0,
1279
- "reward": 0.9375,
1280
- "reward_std": 0.125,
1281
  "rewards/emotion_reward_func": 0.0,
1282
- "rewards/format_reward_func": 0.9375,
1283
  "step": 98
1284
  },
1285
  {
1286
- "completion_length": 150.25,
1287
  "epoch": 0.132,
1288
- "grad_norm": 5.778770446777344,
1289
- "kl": 0.03687442094087601,
1290
  "learning_rate": 1.2335990856709998e-10,
1291
- "loss": 0.0,
1292
- "reward": 0.8125,
1293
- "reward_std": 0.375,
1294
  "rewards/emotion_reward_func": 0.0,
1295
- "rewards/format_reward_func": 0.8125,
1296
  "step": 99
1297
  },
1298
  {
1299
- "completion_length": 127.8125,
1300
  "epoch": 0.13333333333333333,
1301
- "grad_norm": 26.98400115966797,
1302
- "kl": 0.097214475274086,
1303
  "learning_rate": 0.0,
1304
  "loss": 0.0001,
1305
- "reward": 0.9375,
1306
- "reward_std": 0.125,
1307
  "rewards/emotion_reward_func": 0.0,
1308
- "rewards/format_reward_func": 0.9375,
1309
  "step": 100
1310
  }
1311
  ],
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "completion_length": 25.8125,
13
  "epoch": 0.0013333333333333333,
14
  "grad_norm": 0.0,
15
  "kl": 0.0,
 
22
  "step": 1
23
  },
24
  {
25
+ "completion_length": 45.0,
26
  "epoch": 0.0026666666666666666,
27
+ "grad_norm": 9.964546203613281,
28
  "kl": 0.0,
29
  "learning_rate": 4.995066821070679e-07,
30
+ "loss": -0.0,
31
+ "reward": 0.0625,
32
+ "reward_std": 0.125,
33
  "rewards/emotion_reward_func": 0.0,
34
+ "rewards/format_reward_func": 0.0625,
35
  "step": 2
36
  },
37
  {
38
+ "completion_length": 38.9375,
39
  "epoch": 0.004,
40
+ "grad_norm": 0.002867324510589242,
41
+ "kl": 0.0005742026260122657,
42
  "learning_rate": 4.9889049115077e-07,
43
  "loss": 0.0,
44
  "reward": 0.0,
 
48
  "step": 3
49
  },
50
  {
51
+ "completion_length": 27.4375,
52
  "epoch": 0.005333333333333333,
53
+ "grad_norm": 0.006247804034501314,
54
+ "kl": 0.0010731846559792757,
55
  "learning_rate": 4.980286753286194e-07,
56
  "loss": 0.0,
57
  "reward": 0.0,
 
61
  "step": 4
62
  },
63
  {
64
+ "completion_length": 47.125,
65
  "epoch": 0.006666666666666667,
66
+ "grad_norm": 4.878363132476807,
67
+ "kl": 0.0029146450106054544,
68
  "learning_rate": 4.969220851487844e-07,
69
  "loss": 0.0,
70
+ "reward": 0.0625,
71
+ "reward_std": 0.125,
72
  "rewards/emotion_reward_func": 0.0,
73
+ "rewards/format_reward_func": 0.0625,
74
  "step": 5
75
  },
76
  {
77
+ "completion_length": 48.3125,
78
  "epoch": 0.008,
79
+ "grad_norm": 0.011296601966023445,
80
+ "kl": 0.0013835413847118616,
81
  "learning_rate": 4.955718126821722e-07,
82
  "loss": 0.0,
83
  "reward": 0.0,
 
87
  "step": 6
88
  },
89
  {
90
+ "completion_length": 64.5625,
91
  "epoch": 0.009333333333333334,
92
+ "grad_norm": 9.235820770263672,
93
+ "kl": 0.0008675489807501435,
94
  "learning_rate": 4.939791904846868e-07,
95
  "loss": 0.0,
96
+ "reward": 0.0625,
97
+ "reward_std": 0.125,
98
  "rewards/emotion_reward_func": 0.0,
99
+ "rewards/format_reward_func": 0.0625,
100
  "step": 7
101
  },
102
  {
103
+ "completion_length": 40.6875,
104
  "epoch": 0.010666666666666666,
105
+ "grad_norm": 0.01409083604812622,
106
+ "kl": 0.002122239675372839,
107
  "learning_rate": 4.921457902821578e-07,
108
  "loss": 0.0,
109
  "reward": 0.0,
 
113
  "step": 8
114
  },
115
  {
116
+ "completion_length": 55.0,
117
  "epoch": 0.012,
118
+ "grad_norm": 12.573073387145996,
119
+ "kl": 0.006520974449813366,
120
  "learning_rate": 4.900734214192358e-07,
121
  "loss": 0.0,
122
  "reward": 0.0625,
 
126
  "step": 9
127
  },
128
  {
129
+ "completion_length": 83.875,
130
  "epoch": 0.013333333333333334,
131
+ "grad_norm": 6.103325843811035,
132
+ "kl": 0.003683926770463586,
133
  "learning_rate": 4.877641290737883e-07,
134
  "loss": 0.0,
135
  "reward": 0.0625,
 
139
  "step": 10
140
  },
141
  {
142
+ "completion_length": 47.125,
143
  "epoch": 0.014666666666666666,
144
+ "grad_norm": 6.05898380279541,
145
+ "kl": 0.010808728635311127,
146
  "learning_rate": 4.852201922385564e-07,
147
  "loss": 0.0,
148
+ "reward": 0.0625,
149
+ "reward_std": 0.125,
150
  "rewards/emotion_reward_func": 0.0,
151
+ "rewards/format_reward_func": 0.0625,
152
  "step": 11
153
  },
154
  {
155
+ "completion_length": 52.75,
156
  "epoch": 0.016,
157
+ "grad_norm": 10.170279502868652,
158
+ "kl": 0.017805274575948715,
159
  "learning_rate": 4.824441214720628e-07,
160
  "loss": 0.0,
161
+ "reward": 0.0625,
162
+ "reward_std": 0.125,
163
  "rewards/emotion_reward_func": 0.0,
164
+ "rewards/format_reward_func": 0.0625,
165
  "step": 12
166
  },
167
  {
168
+ "completion_length": 86.5,
169
  "epoch": 0.017333333333333333,
170
+ "grad_norm": 3.2019193172454834,
171
+ "kl": 0.00332952244207263,
172
  "learning_rate": 4.794386564209952e-07,
173
  "loss": 0.0,
174
+ "reward": 0.0625,
175
+ "reward_std": 0.125,
176
  "rewards/emotion_reward_func": 0.0,
177
+ "rewards/format_reward_func": 0.0625,
178
  "step": 13
179
  },
180
  {
181
+ "completion_length": 69.3125,
182
  "epoch": 0.018666666666666668,
183
+ "grad_norm": 11.347992897033691,
184
+ "kl": 0.015205658972263336,
185
  "learning_rate": 4.762067631165049e-07,
186
  "loss": 0.0,
187
+ "reward": 0.0625,
188
+ "reward_std": 0.125,
189
  "rewards/emotion_reward_func": 0.0,
190
+ "rewards/format_reward_func": 0.0625,
191
  "step": 14
192
  },
193
  {
194
+ "completion_length": 91.6875,
195
  "epoch": 0.02,
196
+ "grad_norm": 6.206334114074707,
197
+ "kl": 0.0514555498957634,
198
  "learning_rate": 4.7275163104709194e-07,
199
+ "loss": 0.0001,
200
+ "reward": 0.0625,
201
+ "reward_std": 0.125,
202
  "rewards/emotion_reward_func": 0.0,
203
+ "rewards/format_reward_func": 0.0625,
204
  "step": 15
205
  },
206
  {
207
+ "completion_length": 123.125,
208
  "epoch": 0.021333333333333333,
209
+ "grad_norm": 9.800585746765137,
210
+ "kl": 0.0126947071403265,
211
  "learning_rate": 4.6907667001096585e-07,
212
  "loss": 0.0,
213
+ "reward": 0.125,
214
+ "reward_std": 0.25,
215
  "rewards/emotion_reward_func": 0.0,
216
+ "rewards/format_reward_func": 0.125,
217
  "step": 16
218
  },
219
  {
220
+ "completion_length": 110.625,
221
  "epoch": 0.02266666666666667,
222
+ "grad_norm": 8.219995498657227,
223
+ "kl": 0.037900131195783615,
224
  "learning_rate": 4.6518550675098587e-07,
225
  "loss": 0.0,
226
+ "reward": 0.125,
227
+ "reward_std": 0.25,
228
  "rewards/emotion_reward_func": 0.0,
229
+ "rewards/format_reward_func": 0.125,
230
  "step": 17
231
  },
232
  {
233
+ "completion_length": 102.5625,
234
  "epoch": 0.024,
235
+ "grad_norm": 5.030233383178711,
236
+ "kl": 0.015008080750703812,
237
  "learning_rate": 4.6108198137550377e-07,
238
  "loss": 0.0,
239
+ "reward": 0.125,
240
+ "reward_std": 0.14433756470680237,
241
  "rewards/emotion_reward_func": 0.0,
242
+ "rewards/format_reward_func": 0.125,
243
  "step": 18
244
  },
245
  {
246
+ "completion_length": 88.25,
247
  "epoch": 0.025333333333333333,
248
+ "grad_norm": 3.730710029602051,
249
+ "kl": 0.04922018200159073,
250
  "learning_rate": 4.567701435686404e-07,
251
  "loss": 0.0,
252
+ "reward": 0.03125,
253
+ "reward_std": 0.0625,
254
  "rewards/emotion_reward_func": 0.0,
255
+ "rewards/format_reward_func": 0.03125,
256
  "step": 19
257
  },
258
  {
259
+ "completion_length": 120.0625,
260
  "epoch": 0.02666666666666667,
261
+ "grad_norm": 8.545513153076172,
262
+ "kl": 0.0323660746216774,
263
  "learning_rate": 4.5225424859373684e-07,
264
  "loss": 0.0,
265
+ "reward": 0.25,
266
+ "reward_std": 0.39433756470680237,
267
  "rewards/emotion_reward_func": 0.0,
268
+ "rewards/format_reward_func": 0.25,
269
  "step": 20
270
  },
271
  {
272
+ "completion_length": 133.875,
273
  "epoch": 0.028,
274
+ "grad_norm": 7.563712120056152,
275
+ "kl": 0.032230477780103683,
276
  "learning_rate": 4.475387530939226e-07,
277
  "loss": 0.0,
278
+ "reward": 0.25,
279
+ "reward_std": 0.25,
280
  "rewards/emotion_reward_func": 0.0,
281
+ "rewards/format_reward_func": 0.25,
282
  "step": 21
283
  },
284
  {
285
+ "completion_length": 109.4375,
286
  "epoch": 0.029333333333333333,
287
+ "grad_norm": 13.283954620361328,
288
+ "kl": 0.036050185561180115,
289
  "learning_rate": 4.426283106939473e-07,
290
  "loss": 0.0,
291
+ "reward": 0.3125,
292
+ "reward_std": 0.41367512941360474,
293
  "rewards/emotion_reward_func": 0.0,
294
+ "rewards/format_reward_func": 0.3125,
295
  "step": 22
296
  },
297
  {
298
+ "completion_length": 139.0,
299
  "epoch": 0.030666666666666665,
300
+ "grad_norm": 8.06175422668457,
301
+ "kl": 0.01777772419154644,
302
  "learning_rate": 4.375277674076149e-07,
303
  "loss": 0.0,
304
+ "reward": 0.3125,
305
+ "reward_std": 0.41367512941360474,
306
  "rewards/emotion_reward_func": 0.0,
307
+ "rewards/format_reward_func": 0.3125,
308
  "step": 23
309
  },
310
  {
311
+ "completion_length": 130.5,
312
  "epoch": 0.032,
313
+ "grad_norm": 10.465758323669434,
314
+ "kl": 0.028637699782848358,
315
  "learning_rate": 4.3224215685535287e-07,
316
  "loss": 0.0,
317
+ "reward": 0.46875,
318
+ "reward_std": 0.3696783781051636,
319
  "rewards/emotion_reward_func": 0.0,
320
+ "rewards/format_reward_func": 0.46875,
321
  "step": 24
322
  },
323
  {
324
+ "completion_length": 180.6875,
325
  "epoch": 0.03333333333333333,
326
+ "grad_norm": 9.501301765441895,
327
+ "kl": 0.02521451562643051,
328
  "learning_rate": 4.2677669529663686e-07,
329
  "loss": 0.0,
330
+ "reward": 0.3125,
331
+ "reward_std": 0.51933753490448,
332
  "rewards/emotion_reward_func": 0.0,
333
+ "rewards/format_reward_func": 0.3125,
334
  "step": 25
335
  },
336
  {
337
+ "completion_length": 158.6875,
338
  "epoch": 0.034666666666666665,
339
+ "grad_norm": 10.960349082946777,
340
+ "kl": 0.0829106867313385,
341
  "learning_rate": 4.2113677648217216e-07,
342
+ "loss": 0.0001,
343
+ "reward": 0.25,
344
+ "reward_std": 0.28867512941360474,
345
  "rewards/emotion_reward_func": 0.0,
346
+ "rewards/format_reward_func": 0.25,
347
  "step": 26
348
  },
349
  {
350
+ "completion_length": 138.25,
351
  "epoch": 0.036,
352
+ "grad_norm": 7.017608165740967,
353
+ "kl": 0.02953716553747654,
354
  "learning_rate": 4.1532796633091294e-07,
355
  "loss": 0.0,
356
+ "reward": 0.5,
357
+ "reward_std": 0.39433756470680237,
358
  "rewards/emotion_reward_func": 0.0,
359
+ "rewards/format_reward_func": 0.5,
360
  "step": 27
361
  },
362
  {
363
+ "completion_length": 112.125,
364
  "epoch": 0.037333333333333336,
365
+ "grad_norm": 9.538881301879883,
366
+ "kl": 0.06156347692012787,
367
  "learning_rate": 4.0935599743717244e-07,
368
+ "loss": 0.0001,
369
+ "reward": 0.625,
370
+ "reward_std": 0.5,
371
  "rewards/emotion_reward_func": 0.0,
372
+ "rewards/format_reward_func": 0.625,
373
  "step": 28
374
  },
375
  {
376
+ "completion_length": 151.25,
377
  "epoch": 0.03866666666666667,
378
+ "grad_norm": 10.00710678100586,
379
+ "kl": 0.027312466874718666,
380
  "learning_rate": 4.0322676341324414e-07,
381
  "loss": 0.0,
382
+ "reward": 0.4375,
383
+ "reward_std": 0.51933753490448,
384
  "rewards/emotion_reward_func": 0.0,
385
+ "rewards/format_reward_func": 0.4375,
386
  "step": 29
387
  },
388
  {
389
+ "completion_length": 160.1875,
390
  "epoch": 0.04,
391
+ "grad_norm": 10.74777603149414,
392
+ "kl": 0.04307672381401062,
393
  "learning_rate": 3.9694631307311825e-07,
394
  "loss": 0.0,
395
+ "reward": 0.25,
396
+ "reward_std": 0.5,
397
  "rewards/emotion_reward_func": 0.0,
398
+ "rewards/format_reward_func": 0.25,
399
  "step": 30
400
  },
401
  {
402
+ "completion_length": 141.0625,
403
  "epoch": 0.04133333333333333,
404
+ "grad_norm": 11.85556697845459,
405
+ "kl": 0.05974256619811058,
406
  "learning_rate": 3.9052084446303265e-07,
407
+ "loss": 0.0001,
408
+ "reward": 0.40625,
409
+ "reward_std": 0.4946783781051636,
410
  "rewards/emotion_reward_func": 0.0,
411
+ "rewards/format_reward_func": 0.40625,
412
  "step": 31
413
  },
414
  {
415
+ "completion_length": 153.875,
416
  "epoch": 0.042666666666666665,
417
+ "grad_norm": 8.729798316955566,
418
+ "kl": 0.042332496494054794,
419
  "learning_rate": 3.839566987447491e-07,
420
  "loss": 0.0,
421
+ "reward": 0.4375,
422
+ "reward_std": 0.5580127239227295,
423
  "rewards/emotion_reward_func": 0.0,
424
+ "rewards/format_reward_func": 0.4375,
425
  "step": 32
426
  },
427
  {
428
+ "completion_length": 118.25,
429
  "epoch": 0.044,
430
+ "grad_norm": 6.3447585105896,
431
+ "kl": 0.07207943499088287,
432
  "learning_rate": 3.7726035393759283e-07,
433
+ "loss": 0.0001,
434
+ "reward": 0.8125,
435
+ "reward_std": 0.26933756470680237,
436
  "rewards/emotion_reward_func": 0.0,
437
+ "rewards/format_reward_func": 0.8125,
438
  "step": 33
439
  },
440
  {
441
+ "completion_length": 112.5,
442
  "epoch": 0.04533333333333334,
443
+ "grad_norm": 6.1037797927856445,
444
+ "kl": 0.09316730499267578,
445
  "learning_rate": 3.704384185254288e-07,
446
+ "loss": 0.0001,
447
+ "reward": 0.8125,
448
+ "reward_std": 0.26933756470680237,
449
  "rewards/emotion_reward_func": 0.0,
450
+ "rewards/format_reward_func": 0.8125,
451
  "step": 34
452
  },
453
  {
454
+ "completion_length": 139.8125,
455
  "epoch": 0.04666666666666667,
456
+ "grad_norm": 7.664778232574463,
457
+ "kl": 0.0723225474357605,
458
  "learning_rate": 3.634976249348867e-07,
459
+ "loss": 0.0001,
460
+ "reward": 0.78125,
461
+ "reward_std": 0.3696783781051636,
462
  "rewards/emotion_reward_func": 0.0,
463
+ "rewards/format_reward_func": 0.78125,
464
  "step": 35
465
  },
466
  {
467
+ "completion_length": 112.375,
468
  "epoch": 0.048,
469
+ "grad_norm": 9.651317596435547,
470
+ "kl": 0.09862169623374939,
471
  "learning_rate": 3.5644482289126813e-07,
472
+ "loss": 0.0001,
473
+ "reward": 0.5625,
474
+ "reward_std": 0.51933753490448,
475
  "rewards/emotion_reward_func": 0.0,
476
+ "rewards/format_reward_func": 0.5625,
477
  "step": 36
478
  },
479
  {
480
+ "completion_length": 108.9375,
481
  "epoch": 0.04933333333333333,
482
+ "grad_norm": 8.387043952941895,
483
+ "kl": 0.10006917268037796,
484
  "learning_rate": 3.492869726586951e-07,
485
+ "loss": 0.0001,
486
+ "reward": 0.8125,
487
+ "reward_std": 0.375,
488
  "rewards/emotion_reward_func": 0.0,
489
+ "rewards/format_reward_func": 0.8125,
490
  "step": 37
491
  },
492
  {
493
+ "completion_length": 122.0,
494
  "epoch": 0.050666666666666665,
495
+ "grad_norm": 8.030802726745605,
496
+ "kl": 0.13344745337963104,
497
  "learning_rate": 3.4203113817116953e-07,
498
+ "loss": 0.0001,
499
+ "reward": 0.8125,
500
+ "reward_std": 0.375,
501
  "rewards/emotion_reward_func": 0.0,
502
+ "rewards/format_reward_func": 0.8125,
503
  "step": 38
504
  },
505
  {
506
+ "completion_length": 171.8125,
507
  "epoch": 0.052,
508
+ "grad_norm": 68.77984619140625,
509
+ "kl": 0.05845522880554199,
510
  "learning_rate": 3.346844800613229e-07,
511
+ "loss": 0.0001,
512
+ "reward": 0.6875,
513
+ "reward_std": 0.48935678601264954,
514
  "rewards/emotion_reward_func": 0.0,
515
+ "rewards/format_reward_func": 0.6875,
516
  "step": 39
517
  },
518
  {
519
+ "completion_length": 109.4375,
520
  "epoch": 0.05333333333333334,
521
+ "grad_norm": 7.004795551300049,
522
+ "kl": 0.07453721761703491,
523
  "learning_rate": 3.272542485937368e-07,
524
+ "loss": 0.0001,
525
+ "reward": 0.90625,
526
+ "reward_std": 0.1875,
527
  "rewards/emotion_reward_func": 0.0,
528
+ "rewards/format_reward_func": 0.90625,
529
  "step": 40
530
  },
531
  {
532
+ "completion_length": 129.75,
533
  "epoch": 0.05466666666666667,
534
+ "grad_norm": 7.910297393798828,
535
+ "kl": 0.09665161371231079,
536
  "learning_rate": 3.1974777650980734e-07,
537
+ "loss": 0.0001,
538
+ "reward": 0.75,
539
+ "reward_std": 0.36435678601264954,
540
  "rewards/emotion_reward_func": 0.0,
541
+ "rewards/format_reward_func": 0.75,
542
  "step": 41
543
  },
544
  {
545
+ "completion_length": 138.6875,
546
  "epoch": 0.056,
547
+ "grad_norm": 6.366047382354736,
548
+ "kl": 0.06816712021827698,
549
  "learning_rate": 3.121724717912138e-07,
550
+ "loss": 0.0001,
551
+ "reward": 0.875,
552
+ "reward_std": 0.18217839300632477,
553
  "rewards/emotion_reward_func": 0.0,
554
+ "rewards/format_reward_func": 0.875,
555
  "step": 42
556
  },
557
  {
558
+ "completion_length": 163.4375,
559
  "epoch": 0.05733333333333333,
560
+ "grad_norm": 9.733158111572266,
561
+ "kl": 0.07456796616315842,
562
  "learning_rate": 3.0453581034913565e-07,
563
+ "loss": 0.0001,
564
+ "reward": 0.75,
565
+ "reward_std": 0.39433756470680237,
566
  "rewards/emotion_reward_func": 0.0,
567
+ "rewards/format_reward_func": 0.75,
568
  "step": 43
569
  },
570
  {
571
+ "completion_length": 149.375,
572
  "epoch": 0.058666666666666666,
573
+ "grad_norm": 7.283127784729004,
574
+ "kl": 0.10417325794696808,
575
  "learning_rate": 2.968453286464312e-07,
576
  "loss": 0.0001,
577
+ "reward": 0.8125,
578
  "reward_std": 0.375,
579
  "rewards/emotion_reward_func": 0.0,
580
+ "rewards/format_reward_func": 0.8125,
581
  "step": 44
582
  },
583
  {
584
+ "completion_length": 151.8125,
585
  "epoch": 0.06,
586
+ "grad_norm": 8.65285873413086,
587
+ "kl": 0.08351579308509827,
588
  "learning_rate": 2.8910861626005773e-07,
589
+ "loss": 0.0001,
590
+ "reward": 0.71875,
591
+ "reward_std": 0.45683756470680237,
592
  "rewards/emotion_reward_func": 0.0,
593
+ "rewards/format_reward_func": 0.71875,
594
  "step": 45
595
  },
596
  {
597
+ "completion_length": 122.1875,
598
  "epoch": 0.06133333333333333,
599
+ "grad_norm": 6.481723308563232,
600
+ "kl": 0.0788784921169281,
601
  "learning_rate": 2.8133330839107604e-07,
602
+ "loss": 0.0001,
603
+ "reward": 0.84375,
604
+ "reward_std": 0.24467839300632477,
605
  "rewards/emotion_reward_func": 0.0,
606
+ "rewards/format_reward_func": 0.84375,
607
  "step": 46
608
  },
609
  {
610
+ "completion_length": 177.125,
611
  "epoch": 0.06266666666666666,
612
+ "grad_norm": 6.107300281524658,
613
+ "kl": 0.10843782126903534,
614
  "learning_rate": 2.735270783296286e-07,
615
+ "loss": 0.0001,
616
+ "reward": 0.84375,
617
+ "reward_std": 0.24467839300632477,
618
  "rewards/emotion_reward_func": 0.0,
619
+ "rewards/format_reward_func": 0.84375,
620
  "step": 47
621
  },
622
  {
623
+ "completion_length": 115.875,
624
  "epoch": 0.064,
625
+ "grad_norm": 5.656793117523193,
626
+ "kl": 0.11855285614728928,
627
  "learning_rate": 2.6569762988232837e-07,
628
+ "loss": 0.0001,
629
+ "reward": 0.90625,
630
+ "reward_std": 0.1875,
631
  "rewards/emotion_reward_func": 0.0,
632
+ "rewards/format_reward_func": 0.90625,
633
  "step": 48
634
  },
635
  {
636
+ "completion_length": 137.6875,
637
  "epoch": 0.06533333333333333,
638
+ "grad_norm": 7.453767776489258,
639
+ "kl": 0.10749398171901703,
640
  "learning_rate": 2.5785268976953204e-07,
641
+ "loss": 0.0001,
642
+ "reward": 0.90625,
643
+ "reward_std": 0.1875,
644
  "rewards/emotion_reward_func": 0.0,
645
+ "rewards/format_reward_func": 0.90625,
646
  "step": 49
647
  },
648
  {
649
+ "completion_length": 117.375,
650
  "epoch": 0.06666666666666667,
651
+ "grad_norm": 6.2615861892700195,
652
+ "kl": 0.09339688718318939,
653
  "learning_rate": 2.5e-07,
654
+ "loss": 0.0001,
655
+ "reward": 0.8125,
656
+ "reward_std": 0.26933756470680237,
657
  "rewards/emotion_reward_func": 0.0,
658
+ "rewards/format_reward_func": 0.8125,
659
  "step": 50
660
  },
661
  {
662
+ "completion_length": 120.625,
663
  "epoch": 0.068,
664
+ "grad_norm": 7.3459577560424805,
665
+ "kl": 0.08987420052289963,
666
  "learning_rate": 2.4214731023046794e-07,
667
+ "loss": 0.0001,
668
+ "reward": 0.84375,
669
+ "reward_std": 0.3125,
670
  "rewards/emotion_reward_func": 0.0,
671
+ "rewards/format_reward_func": 0.84375,
672
  "step": 51
673
  },
674
  {
675
+ "completion_length": 130.4375,
676
  "epoch": 0.06933333333333333,
677
+ "grad_norm": 7.253162384033203,
678
+ "kl": 0.0824127197265625,
679
  "learning_rate": 2.3430237011767164e-07,
680
  "loss": 0.0001,
681
+ "reward": 0.8125,
682
+ "reward_std": 0.26933756470680237,
683
  "rewards/emotion_reward_func": 0.0,
684
+ "rewards/format_reward_func": 0.8125,
685
  "step": 52
686
  },
687
  {
688
+ "completion_length": 102.875,
689
  "epoch": 0.07066666666666667,
690
+ "grad_norm": 7.457027435302734,
691
+ "kl": 0.08334946632385254,
692
  "learning_rate": 2.264729216703714e-07,
693
+ "loss": 0.0001,
694
+ "reward": 0.875,
695
+ "reward_std": 0.25,
696
  "rewards/emotion_reward_func": 0.0,
697
+ "rewards/format_reward_func": 0.875,
698
  "step": 53
699
  },
700
  {
701
+ "completion_length": 127.75,
702
  "epoch": 0.072,
703
+ "grad_norm": 0.01185312308371067,
704
+ "kl": 0.094039186835289,
705
  "learning_rate": 2.1866669160892389e-07,
706
+ "loss": 0.0001,
707
+ "reward": 1.0,
708
+ "reward_std": 0.0,
709
  "rewards/emotion_reward_func": 0.0,
710
+ "rewards/format_reward_func": 1.0,
711
  "step": 54
712
  },
713
  {
714
+ "completion_length": 86.5,
715
  "epoch": 0.07333333333333333,
716
+ "grad_norm": 11.868236541748047,
717
+ "kl": 0.22406581044197083,
718
  "learning_rate": 2.1089138373994222e-07,
719
+ "loss": 0.0002,
720
+ "reward": 0.84375,
721
+ "reward_std": 0.3125,
722
  "rewards/emotion_reward_func": 0.0,
723
+ "rewards/format_reward_func": 0.84375,
724
  "step": 55
725
  },
726
  {
727
+ "completion_length": 105.0625,
728
  "epoch": 0.07466666666666667,
729
+ "grad_norm": 3.1902968883514404,
730
+ "kl": 0.10381343960762024,
731
  "learning_rate": 2.0315467135356878e-07,
732
+ "loss": 0.0001,
733
+ "reward": 0.96875,
734
+ "reward_std": 0.0625,
735
  "rewards/emotion_reward_func": 0.0,
736
+ "rewards/format_reward_func": 0.96875,
737
  "step": 56
738
  },
739
  {
740
+ "completion_length": 77.8125,
741
  "epoch": 0.076,
742
+ "grad_norm": 0.04087565839290619,
743
+ "kl": 0.17592526972293854,
744
  "learning_rate": 1.954641896508644e-07,
745
+ "loss": 0.0002,
746
+ "reward": 1.0,
747
+ "reward_std": 0.0,
748
  "rewards/emotion_reward_func": 0.0,
749
+ "rewards/format_reward_func": 1.0,
750
  "step": 57
751
  },
752
  {
753
+ "completion_length": 114.5,
754
  "epoch": 0.07733333333333334,
755
+ "grad_norm": 4.14316463470459,
756
+ "kl": 0.09431131184101105,
757
  "learning_rate": 1.8782752820878633e-07,
758
+ "loss": 0.0001,
759
+ "reward": 0.875,
760
+ "reward_std": 0.14433756470680237,
761
  "rewards/emotion_reward_func": 0.0,
762
+ "rewards/format_reward_func": 0.875,
763
  "step": 58
764
  },
765
  {
766
+ "completion_length": 99.25,
767
  "epoch": 0.07866666666666666,
768
+ "grad_norm": 12.474014282226562,
769
+ "kl": 0.16972073912620544,
770
  "learning_rate": 1.802522234901927e-07,
771
+ "loss": 0.0002,
772
+ "reward": 0.6875,
773
+ "reward_std": 0.41367512941360474,
774
  "rewards/emotion_reward_func": 0.0,
775
+ "rewards/format_reward_func": 0.6875,
776
  "step": 59
777
  },
778
  {
779
+ "completion_length": 114.6875,
780
  "epoch": 0.08,
781
+ "grad_norm": 0.014834249392151833,
782
+ "kl": 0.11283782124519348,
783
  "learning_rate": 1.7274575140626315e-07,
784
+ "loss": 0.0001,
785
+ "reward": 1.0,
786
+ "reward_std": 0.0,
787
  "rewards/emotion_reward_func": 0.0,
788
+ "rewards/format_reward_func": 1.0,
789
  "step": 60
790
  },
791
  {
792
+ "completion_length": 107.6875,
793
  "epoch": 0.08133333333333333,
794
+ "grad_norm": 11.97496509552002,
795
+ "kl": 0.2880978286266327,
796
  "learning_rate": 1.6531551993867715e-07,
797
+ "loss": 0.0003,
798
+ "reward": 0.84375,
799
+ "reward_std": 0.3125,
800
  "rewards/emotion_reward_func": 0.0,
801
+ "rewards/format_reward_func": 0.84375,
802
  "step": 61
803
  },
804
  {
805
+ "completion_length": 115.3125,
806
  "epoch": 0.08266666666666667,
807
+ "grad_norm": 0.06830989569425583,
808
+ "kl": 0.12032976001501083,
809
  "learning_rate": 1.579688618288305e-07,
810
+ "loss": 0.0001,
811
+ "reward": 1.0,
812
+ "reward_std": 0.0,
813
  "rewards/emotion_reward_func": 0.0,
814
+ "rewards/format_reward_func": 1.0,
815
  "step": 62
816
  },
817
  {
818
+ "completion_length": 109.9375,
819
  "epoch": 0.084,
820
+ "grad_norm": 12.59013557434082,
821
+ "kl": 0.16462820768356323,
822
  "learning_rate": 1.5071302734130486e-07,
823
+ "loss": 0.0002,
824
+ "reward": 0.875,
825
+ "reward_std": 0.25,
826
  "rewards/emotion_reward_func": 0.0,
827
+ "rewards/format_reward_func": 0.875,
828
  "step": 63
829
  },
830
  {
831
+ "completion_length": 112.6875,
832
  "epoch": 0.08533333333333333,
833
+ "grad_norm": 0.011938858777284622,
834
+ "kl": 0.105996273458004,
835
  "learning_rate": 1.4355517710873182e-07,
836
+ "loss": 0.0001,
837
+ "reward": 1.0,
838
+ "reward_std": 0.0,
839
  "rewards/emotion_reward_func": 0.0,
840
+ "rewards/format_reward_func": 1.0,
841
  "step": 64
842
  },
843
  {
844
+ "completion_length": 113.3125,
845
  "epoch": 0.08666666666666667,
846
+ "grad_norm": 3.5281906127929688,
847
+ "kl": 0.11741024255752563,
848
  "learning_rate": 1.365023750651133e-07,
849
+ "loss": 0.0001,
850
+ "reward": 0.96875,
851
+ "reward_std": 0.0625,
852
  "rewards/emotion_reward_func": 0.0,
853
+ "rewards/format_reward_func": 0.96875,
854
  "step": 65
855
  },
856
  {
857
+ "completion_length": 115.375,
858
  "epoch": 0.088,
859
+ "grad_norm": 5.508726596832275,
860
+ "kl": 0.11176759004592896,
861
  "learning_rate": 1.2956158147457114e-07,
862
  "loss": 0.0001,
863
  "reward": 0.9375,
 
867
  "step": 66
868
  },
869
  {
870
+ "completion_length": 118.75,
871
  "epoch": 0.08933333333333333,
872
+ "grad_norm": 6.57678747177124,
873
+ "kl": 0.12585385143756866,
874
  "learning_rate": 1.2273964606240718e-07,
875
+ "loss": 0.0001,
876
+ "reward": 0.90625,
877
+ "reward_std": 0.1875,
878
  "rewards/emotion_reward_func": 0.0,
879
+ "rewards/format_reward_func": 0.90625,
880
  "step": 67
881
  },
882
  {
883
+ "completion_length": 128.8125,
884
  "epoch": 0.09066666666666667,
885
+ "grad_norm": 0.012199473567306995,
886
+ "kl": 0.1194550171494484,
887
  "learning_rate": 1.1604330125525078e-07,
888
+ "loss": 0.0001,
889
+ "reward": 1.0,
890
+ "reward_std": 0.0,
891
  "rewards/emotion_reward_func": 0.0,
892
+ "rewards/format_reward_func": 1.0,
893
  "step": 68
894
  },
895
  {
896
+ "completion_length": 102.75,
897
  "epoch": 0.092,
898
+ "grad_norm": 6.777124881744385,
899
+ "kl": 0.14668866991996765,
900
  "learning_rate": 1.0947915553696741e-07,
901
+ "loss": 0.0001,
902
+ "reward": 0.9375,
903
+ "reward_std": 0.125,
904
  "rewards/emotion_reward_func": 0.0,
905
+ "rewards/format_reward_func": 0.9375,
906
  "step": 69
907
  },
908
  {
909
+ "completion_length": 96.25,
910
  "epoch": 0.09333333333333334,
911
+ "grad_norm": 6.230812072753906,
912
+ "kl": 0.1095753163099289,
913
  "learning_rate": 1.0305368692688174e-07,
914
+ "loss": 0.0001,
915
+ "reward": 0.90625,
916
+ "reward_std": 0.1875,
917
  "rewards/emotion_reward_func": 0.0,
918
+ "rewards/format_reward_func": 0.90625,
919
  "step": 70
920
  },
921
  {
922
+ "completion_length": 98.25,
923
  "epoch": 0.09466666666666666,
924
+ "grad_norm": 3.248274326324463,
925
+ "kl": 0.1126992404460907,
926
  "learning_rate": 9.677323658675593e-08,
927
+ "loss": 0.0001,
928
+ "reward": 0.9375,
929
+ "reward_std": 0.125,
930
  "rewards/emotion_reward_func": 0.0,
931
+ "rewards/format_reward_func": 0.9375,
932
  "step": 71
933
  },
934
  {
935
+ "completion_length": 112.8125,
936
  "epoch": 0.096,
937
+ "grad_norm": 5.262601852416992,
938
+ "kl": 0.1305035650730133,
939
  "learning_rate": 9.064400256282755e-08,
940
+ "loss": 0.0001,
941
+ "reward": 0.9375,
942
+ "reward_std": 0.125,
943
  "rewards/emotion_reward_func": 0.0,
944
+ "rewards/format_reward_func": 0.9375,
945
  "step": 72
946
  },
947
  {
948
+ "completion_length": 121.375,
949
  "epoch": 0.09733333333333333,
950
+ "grad_norm": 0.01645738258957863,
951
+ "kl": 0.12455137819051743,
952
  "learning_rate": 8.467203366908707e-08,
953
+ "loss": 0.0001,
954
+ "reward": 1.0,
955
+ "reward_std": 0.0,
956
  "rewards/emotion_reward_func": 0.0,
957
+ "rewards/format_reward_func": 1.0,
958
  "step": 73
959
  },
960
  {
961
+ "completion_length": 113.625,
962
  "epoch": 0.09866666666666667,
963
+ "grad_norm": 5.738455295562744,
964
+ "kl": 0.14311644434928894,
965
  "learning_rate": 7.886322351782782e-08,
966
+ "loss": 0.0001,
967
+ "reward": 0.875,
968
+ "reward_std": 0.25,
969
  "rewards/emotion_reward_func": 0.0,
970
+ "rewards/format_reward_func": 0.875,
971
  "step": 74
972
  },
973
  {
974
+ "completion_length": 104.375,
975
  "epoch": 0.1,
976
+ "grad_norm": 0.014866613782942295,
977
+ "kl": 0.13841284811496735,
978
  "learning_rate": 7.322330470336313e-08,
979
  "loss": 0.0001,
980
+ "reward": 1.0,
981
+ "reward_std": 0.0,
982
  "rewards/emotion_reward_func": 0.0,
983
+ "rewards/format_reward_func": 1.0,
984
  "step": 75
985
  },
986
  {
987
+ "completion_length": 96.125,
988
  "epoch": 0.10133333333333333,
989
+ "grad_norm": 4.531452178955078,
990
+ "kl": 0.11935670673847198,
991
  "learning_rate": 6.775784314464716e-08,
992
+ "loss": 0.0001,
993
+ "reward": 0.9375,
994
+ "reward_std": 0.125,
995
  "rewards/emotion_reward_func": 0.0,
996
+ "rewards/format_reward_func": 0.9375,
997
  "step": 76
998
  },
999
  {
1000
+ "completion_length": 102.0,
1001
  "epoch": 0.10266666666666667,
1002
+ "grad_norm": 0.017895404249429703,
1003
+ "kl": 0.15568867325782776,
1004
  "learning_rate": 6.24722325923851e-08,
1005
+ "loss": 0.0002,
1006
+ "reward": 1.0,
1007
+ "reward_std": 0.0,
1008
  "rewards/emotion_reward_func": 0.0,
1009
+ "rewards/format_reward_func": 1.0,
1010
  "step": 77
1011
  },
1012
  {
1013
+ "completion_length": 85.25,
1014
  "epoch": 0.104,
1015
+ "grad_norm": 6.45017147064209,
1016
+ "kl": 0.20556744933128357,
1017
  "learning_rate": 5.737168930605271e-08,
1018
+ "loss": 0.0002,
1019
+ "reward": 0.96875,
1020
+ "reward_std": 0.0625,
1021
  "rewards/emotion_reward_func": 0.0,
1022
+ "rewards/format_reward_func": 0.96875,
1023
  "step": 78
1024
  },
1025
  {
1026
+ "completion_length": 90.25,
1027
  "epoch": 0.10533333333333333,
1028
+ "grad_norm": 7.30122709274292,
1029
+ "kl": 0.1739582121372223,
1030
  "learning_rate": 5.246124690607739e-08,
1031
+ "loss": 0.0002,
1032
+ "reward": 0.9375,
1033
+ "reward_std": 0.125,
1034
  "rewards/emotion_reward_func": 0.0,
1035
+ "rewards/format_reward_func": 0.9375,
1036
  "step": 79
1037
  },
1038
  {
1039
+ "completion_length": 99.4375,
1040
  "epoch": 0.10666666666666667,
1041
+ "grad_norm": 7.5901103019714355,
1042
+ "kl": 0.1773838996887207,
1043
  "learning_rate": 4.774575140626316e-08,
1044
+ "loss": 0.0002,
1045
+ "reward": 0.90625,
1046
+ "reward_std": 0.1875,
1047
  "rewards/emotion_reward_func": 0.0,
1048
+ "rewards/format_reward_func": 0.90625,
1049
  "step": 80
1050
  },
1051
  {
1052
+ "completion_length": 101.875,
1053
  "epoch": 0.108,
1054
+ "grad_norm": 0.016729678958654404,
1055
+ "kl": 0.16674719750881195,
1056
  "learning_rate": 4.3229856431359513e-08,
1057
+ "loss": 0.0002,
1058
+ "reward": 1.0,
1059
+ "reward_std": 0.0,
1060
  "rewards/emotion_reward_func": 0.0,
1061
+ "rewards/format_reward_func": 1.0,
1062
  "step": 81
1063
  },
1064
  {
1065
+ "completion_length": 104.5,
1066
  "epoch": 0.10933333333333334,
1067
+ "grad_norm": 5.888128757476807,
1068
+ "kl": 0.13316090404987335,
1069
  "learning_rate": 3.8918018624496286e-08,
1070
+ "loss": 0.0001,
1071
  "reward": 0.9375,
1072
  "reward_std": 0.125,
1073
  "rewards/emotion_reward_func": 0.0,
 
1075
  "step": 82
1076
  },
1077
  {
1078
+ "completion_length": 96.5625,
1079
  "epoch": 0.11066666666666666,
1080
+ "grad_norm": 5.195379734039307,
1081
+ "kl": 0.13178151845932007,
1082
  "learning_rate": 3.481449324901411e-08,
1083
+ "loss": 0.0001,
1084
+ "reward": 0.96875,
1085
+ "reward_std": 0.0625,
1086
  "rewards/emotion_reward_func": 0.0,
1087
+ "rewards/format_reward_func": 0.96875,
1088
  "step": 83
1089
  },
1090
  {
1091
+ "completion_length": 120.3125,
1092
  "epoch": 0.112,
1093
+ "grad_norm": 0.023672526702284813,
1094
+ "kl": 0.09863791614770889,
1095
  "learning_rate": 3.092332998903416e-08,
1096
+ "loss": 0.0001,
1097
+ "reward": 1.0,
1098
+ "reward_std": 0.0,
1099
  "rewards/emotion_reward_func": 0.0,
1100
+ "rewards/format_reward_func": 1.0,
1101
  "step": 84
1102
  },
1103
  {
1104
+ "completion_length": 120.3125,
1105
  "epoch": 0.11333333333333333,
1106
+ "grad_norm": 0.016033878549933434,
1107
+ "kl": 0.1298094093799591,
1108
  "learning_rate": 2.724836895290805e-08,
1109
  "loss": 0.0001,
1110
+ "reward": 1.0,
1111
+ "reward_std": 0.0,
1112
  "rewards/emotion_reward_func": 0.0,
1113
+ "rewards/format_reward_func": 1.0,
1114
  "step": 85
1115
  },
1116
  {
1117
+ "completion_length": 123.875,
1118
  "epoch": 0.11466666666666667,
1119
+ "grad_norm": 0.013259505853056908,
1120
+ "kl": 0.13770553469657898,
1121
  "learning_rate": 2.379323688349516e-08,
1122
+ "loss": 0.0001,
1123
+ "reward": 1.0,
1124
+ "reward_std": 0.0,
1125
  "rewards/emotion_reward_func": 0.0,
1126
+ "rewards/format_reward_func": 1.0,
1127
  "step": 86
1128
  },
1129
  {
1130
+ "completion_length": 93.375,
1131
  "epoch": 0.116,
1132
+ "grad_norm": 7.885369777679443,
1133
+ "kl": 0.14231295883655548,
1134
  "learning_rate": 2.0561343579004715e-08,
1135
  "loss": 0.0001,
1136
+ "reward": 0.875,
1137
+ "reward_std": 0.25,
1138
  "rewards/emotion_reward_func": 0.0,
1139
+ "rewards/format_reward_func": 0.875,
1140
  "step": 87
1141
  },
1142
  {
1143
+ "completion_length": 106.8125,
1144
  "epoch": 0.11733333333333333,
1145
+ "grad_norm": 10.611775398254395,
1146
+ "kl": 0.25752192735671997,
1147
  "learning_rate": 1.7555878527937163e-08,
1148
+ "loss": 0.0003,
1149
+ "reward": 0.84375,
1150
+ "reward_std": 0.3125,
1151
  "rewards/emotion_reward_func": 0.0,
1152
+ "rewards/format_reward_func": 0.84375,
1153
  "step": 88
1154
  },
1155
  {
1156
+ "completion_length": 105.25,
1157
  "epoch": 0.11866666666666667,
1158
+ "grad_norm": 5.855681419372559,
1159
+ "kl": 0.1377657949924469,
1160
  "learning_rate": 1.4779807761443635e-08,
1161
  "loss": 0.0001,
1162
+ "reward": 0.9375,
1163
+ "reward_std": 0.125,
1164
  "rewards/emotion_reward_func": 0.0,
1165
+ "rewards/format_reward_func": 0.9375,
1166
  "step": 89
1167
  },
1168
  {
1169
+ "completion_length": 112.9375,
1170
  "epoch": 0.12,
1171
+ "grad_norm": 7.280832290649414,
1172
+ "kl": 0.13963450491428375,
1173
  "learning_rate": 1.2235870926211616e-08,
1174
  "loss": 0.0001,
1175
  "reward": 0.8125,
 
1179
  "step": 90
1180
  },
1181
  {
1182
+ "completion_length": 113.0625,
1183
  "epoch": 0.12133333333333333,
1184
+ "grad_norm": 5.792718410491943,
1185
+ "kl": 0.10216458886861801,
1186
  "learning_rate": 9.926578580764234e-09,
1187
+ "loss": 0.0001,
1188
+ "reward": 0.875,
1189
+ "reward_std": 0.25,
1190
  "rewards/emotion_reward_func": 0.0,
1191
+ "rewards/format_reward_func": 0.875,
1192
  "step": 91
1193
  },
1194
  {
1195
+ "completion_length": 130.625,
1196
  "epoch": 0.12266666666666666,
1197
+ "grad_norm": 3.8016581535339355,
1198
+ "kl": 0.09364865720272064,
1199
  "learning_rate": 7.85420971784223e-09,
1200
+ "loss": 0.0001,
1201
+ "reward": 0.9375,
1202
+ "reward_std": 0.125,
1203
  "rewards/emotion_reward_func": 0.0,
1204
+ "rewards/format_reward_func": 0.9375,
1205
  "step": 92
1206
  },
1207
  {
1208
+ "completion_length": 128.8125,
1209
  "epoch": 0.124,
1210
+ "grad_norm": 4.038362979888916,
1211
+ "kl": 0.09943213313817978,
1212
  "learning_rate": 6.020809515313141e-09,
1213
+ "loss": 0.0001,
1214
+ "reward": 0.96875,
1215
+ "reward_std": 0.0625,
1216
  "rewards/emotion_reward_func": 0.0,
1217
+ "rewards/format_reward_func": 0.96875,
1218
  "step": 93
1219
  },
1220
  {
1221
+ "completion_length": 81.0,
1222
  "epoch": 0.12533333333333332,
1223
+ "grad_norm": 5.979770660400391,
1224
+ "kl": 0.15632514655590057,
1225
  "learning_rate": 4.4281873178278475e-09,
1226
+ "loss": 0.0002,
1227
+ "reward": 0.9375,
1228
+ "reward_std": 0.125,
1229
  "rewards/emotion_reward_func": 0.0,
1230
+ "rewards/format_reward_func": 0.9375,
1231
  "step": 94
1232
  },
1233
  {
1234
+ "completion_length": 95.9375,
1235
  "epoch": 0.12666666666666668,
1236
+ "grad_norm": 8.092153549194336,
1237
+ "kl": 0.16083820164203644,
1238
  "learning_rate": 3.077914851215585e-09,
1239
+ "loss": 0.0002,
1240
+ "reward": 0.9375,
1241
+ "reward_std": 0.125,
1242
  "rewards/emotion_reward_func": 0.0,
1243
+ "rewards/format_reward_func": 0.9375,
1244
  "step": 95
1245
  },
1246
  {
1247
+ "completion_length": 109.5,
1248
  "epoch": 0.128,
1249
+ "grad_norm": 7.098442077636719,
1250
+ "kl": 0.11922727525234222,
1251
  "learning_rate": 1.9713246713805587e-09,
1252
  "loss": 0.0001,
1253
  "reward": 0.875,
 
1257
  "step": 96
1258
  },
1259
  {
1260
+ "completion_length": 104.0,
1261
  "epoch": 0.12933333333333333,
1262
+ "grad_norm": 6.17384147644043,
1263
+ "kl": 0.23637814819812775,
1264
  "learning_rate": 1.1095088492300008e-09,
1265
+ "loss": 0.0002,
1266
+ "reward": 0.9375,
1267
+ "reward_std": 0.125,
1268
  "rewards/emotion_reward_func": 0.0,
1269
+ "rewards/format_reward_func": 0.9375,
1270
  "step": 97
1271
  },
1272
  {
1273
+ "completion_length": 97.75,
1274
  "epoch": 0.13066666666666665,
1275
+ "grad_norm": 0.04274175688624382,
1276
+ "kl": 0.18250277638435364,
1277
  "learning_rate": 4.933178929321102e-10,
1278
+ "loss": 0.0002,
1279
+ "reward": 1.0,
1280
+ "reward_std": 0.0,
1281
  "rewards/emotion_reward_func": 0.0,
1282
+ "rewards/format_reward_func": 1.0,
1283
  "step": 98
1284
  },
1285
  {
1286
+ "completion_length": 124.6875,
1287
  "epoch": 0.132,
1288
+ "grad_norm": 0.03310486301779747,
1289
+ "kl": 0.09437094628810883,
1290
  "learning_rate": 1.2335990856709998e-10,
1291
+ "loss": 0.0001,
1292
+ "reward": 1.0,
1293
+ "reward_std": 0.0,
1294
  "rewards/emotion_reward_func": 0.0,
1295
+ "rewards/format_reward_func": 1.0,
1296
  "step": 99
1297
  },
1298
  {
1299
+ "completion_length": 116.0,
1300
  "epoch": 0.13333333333333333,
1301
+ "grad_norm": 0.0801442489027977,
1302
+ "kl": 0.10857471823692322,
1303
  "learning_rate": 0.0,
1304
  "loss": 0.0001,
1305
+ "reward": 1.0,
1306
+ "reward_std": 0.0,
1307
  "rewards/emotion_reward_func": 0.0,
1308
+ "rewards/format_reward_func": 1.0,
1309
  "step": 100
1310
  }
1311
  ],