eric-tramel commited on
Commit
35a4d25
·
verified ·
1 Parent(s): d6c1856

Training in progress, step 100, checkpoint

Browse files
checkpoint-100/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d46bbda891252d1bc7d7b207072338c59dd511f23a0bb1a77a233dd55bf64bc3
3
  size 1976163472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad242d2f09d0c8055654d9bf3cc76ef68efe210b059d9b41e2e0bb07f5607f65
3
  size 1976163472
checkpoint-100/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e61137fa392006841b3c9d3b0075e477b2bfd4da9800e8738d48e950a612ad64
3
  size 3952505274
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d18f3ee815fe1b06aa8c6b95f73354f741c1d7cb3d51e6010f8769c80ec5b6a
3
  size 3952505274
checkpoint-100/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e8016113faecd368858eb6ebc4fcb61a9f1956107a452dada8e98bfda76288f6
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae69984fdd13c24018ecb1aa802b54afa3ca3e6a5ff1a82b51b4de545f616c19
3
  size 15024
checkpoint-100/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:974125c47b4b1edfbbcd3caeae8511abb215ef1494e888b05e7fde18c3ed08ed
3
- size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cba81e50fb33c36c59d80bf8793f39dc24433a2190b46f8d2803c7db580c7ef5
3
+ size 14960
checkpoint-100/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ac54bed7d9d2e9cb07f31f09406c3bf48fced29450844d31d747045d9f2f6ea
3
- size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f63266ab578a5d64086913db0f644de0a295ba0be0b8d58ecad8228d9505d57
3
+ size 14960
checkpoint-100/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:817aecd630d55ef1ec1da71dbd2daae7b12d4db119f38d05d48fdc7f0cb7134f
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:701c2a23e45bbff29fa1f6df5af29dcc2cdb24bd86078096f4fb5abd74ce9d23
3
  size 15024
checkpoint-100/trainer_state.json CHANGED
@@ -9,7 +9,7 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "completion_length": 25.8125,
13
  "epoch": 0.0013333333333333333,
14
  "grad_norm": 0.0,
15
  "kl": 0.0,
@@ -22,1290 +22,1290 @@
22
  "step": 1
23
  },
24
  {
25
- "completion_length": 45.0,
26
  "epoch": 0.0026666666666666666,
27
- "grad_norm": 9.964546203613281,
28
  "kl": 0.0,
29
  "learning_rate": 4.995066821070679e-07,
30
- "loss": -0.0,
31
- "reward": 0.0625,
32
- "reward_std": 0.125,
33
- "rewards/emotion_reward_func": 0.0,
34
- "rewards/format_reward_func": 0.0625,
35
  "step": 2
36
  },
37
  {
38
- "completion_length": 38.9375,
39
  "epoch": 0.004,
40
- "grad_norm": 0.002867324510589242,
41
- "kl": 0.0005742026260122657,
42
  "learning_rate": 4.9889049115077e-07,
43
  "loss": 0.0,
44
- "reward": 0.0,
45
- "reward_std": 0.0,
46
- "rewards/emotion_reward_func": 0.0,
47
- "rewards/format_reward_func": 0.0,
48
  "step": 3
49
  },
50
  {
51
- "completion_length": 27.4375,
52
  "epoch": 0.005333333333333333,
53
- "grad_norm": 0.006247804034501314,
54
- "kl": 0.0010731846559792757,
55
  "learning_rate": 4.980286753286194e-07,
56
  "loss": 0.0,
57
- "reward": 0.0,
58
- "reward_std": 0.0,
59
- "rewards/emotion_reward_func": 0.0,
60
- "rewards/format_reward_func": 0.0,
61
  "step": 4
62
  },
63
  {
64
- "completion_length": 47.125,
65
  "epoch": 0.006666666666666667,
66
- "grad_norm": 4.878363132476807,
67
- "kl": 0.0029146450106054544,
68
  "learning_rate": 4.969220851487844e-07,
69
  "loss": 0.0,
70
- "reward": 0.0625,
71
- "reward_std": 0.125,
72
- "rewards/emotion_reward_func": 0.0,
73
- "rewards/format_reward_func": 0.0625,
74
  "step": 5
75
  },
76
  {
77
- "completion_length": 48.3125,
78
  "epoch": 0.008,
79
- "grad_norm": 0.011296601966023445,
80
- "kl": 0.0013835413847118616,
81
  "learning_rate": 4.955718126821722e-07,
82
  "loss": 0.0,
83
- "reward": 0.0,
84
- "reward_std": 0.0,
85
- "rewards/emotion_reward_func": 0.0,
86
- "rewards/format_reward_func": 0.0,
87
  "step": 6
88
  },
89
  {
90
- "completion_length": 64.5625,
91
  "epoch": 0.009333333333333334,
92
- "grad_norm": 9.235820770263672,
93
- "kl": 0.0008675489807501435,
94
  "learning_rate": 4.939791904846868e-07,
95
  "loss": 0.0,
96
- "reward": 0.0625,
97
  "reward_std": 0.125,
98
- "rewards/emotion_reward_func": 0.0,
99
- "rewards/format_reward_func": 0.0625,
100
  "step": 7
101
  },
102
  {
103
- "completion_length": 40.6875,
104
  "epoch": 0.010666666666666666,
105
- "grad_norm": 0.01409083604812622,
106
- "kl": 0.002122239675372839,
107
  "learning_rate": 4.921457902821578e-07,
108
  "loss": 0.0,
109
- "reward": 0.0,
110
- "reward_std": 0.0,
111
- "rewards/emotion_reward_func": 0.0,
112
- "rewards/format_reward_func": 0.0,
113
  "step": 8
114
  },
115
  {
116
- "completion_length": 55.0,
117
  "epoch": 0.012,
118
- "grad_norm": 12.573073387145996,
119
- "kl": 0.006520974449813366,
120
  "learning_rate": 4.900734214192358e-07,
121
  "loss": 0.0,
122
- "reward": 0.0625,
123
- "reward_std": 0.125,
124
- "rewards/emotion_reward_func": 0.0,
125
- "rewards/format_reward_func": 0.0625,
126
  "step": 9
127
  },
128
  {
129
- "completion_length": 83.875,
130
  "epoch": 0.013333333333333334,
131
- "grad_norm": 6.103325843811035,
132
- "kl": 0.003683926770463586,
133
  "learning_rate": 4.877641290737883e-07,
134
  "loss": 0.0,
135
- "reward": 0.0625,
136
- "reward_std": 0.125,
137
- "rewards/emotion_reward_func": 0.0,
138
- "rewards/format_reward_func": 0.0625,
139
  "step": 10
140
  },
141
  {
142
- "completion_length": 47.125,
143
  "epoch": 0.014666666666666666,
144
- "grad_norm": 6.05898380279541,
145
- "kl": 0.010808728635311127,
146
  "learning_rate": 4.852201922385564e-07,
147
  "loss": 0.0,
148
- "reward": 0.0625,
149
- "reward_std": 0.125,
150
- "rewards/emotion_reward_func": 0.0,
151
- "rewards/format_reward_func": 0.0625,
152
  "step": 11
153
  },
154
  {
155
- "completion_length": 52.75,
156
  "epoch": 0.016,
157
- "grad_norm": 10.170279502868652,
158
- "kl": 0.017805274575948715,
159
  "learning_rate": 4.824441214720628e-07,
160
  "loss": 0.0,
161
- "reward": 0.0625,
162
- "reward_std": 0.125,
163
- "rewards/emotion_reward_func": 0.0,
164
- "rewards/format_reward_func": 0.0625,
165
  "step": 12
166
  },
167
  {
168
- "completion_length": 86.5,
169
  "epoch": 0.017333333333333333,
170
- "grad_norm": 3.2019193172454834,
171
- "kl": 0.00332952244207263,
172
  "learning_rate": 4.794386564209952e-07,
173
  "loss": 0.0,
174
- "reward": 0.0625,
175
- "reward_std": 0.125,
176
- "rewards/emotion_reward_func": 0.0,
177
- "rewards/format_reward_func": 0.0625,
178
  "step": 13
179
  },
180
  {
181
- "completion_length": 69.3125,
182
  "epoch": 0.018666666666666668,
183
- "grad_norm": 11.347992897033691,
184
- "kl": 0.015205658972263336,
185
  "learning_rate": 4.762067631165049e-07,
186
  "loss": 0.0,
187
- "reward": 0.0625,
188
- "reward_std": 0.125,
189
- "rewards/emotion_reward_func": 0.0,
190
- "rewards/format_reward_func": 0.0625,
191
  "step": 14
192
  },
193
  {
194
- "completion_length": 91.6875,
195
  "epoch": 0.02,
196
- "grad_norm": 6.206334114074707,
197
- "kl": 0.0514555498957634,
198
  "learning_rate": 4.7275163104709194e-07,
199
- "loss": 0.0001,
200
- "reward": 0.0625,
201
- "reward_std": 0.125,
202
- "rewards/emotion_reward_func": 0.0,
203
- "rewards/format_reward_func": 0.0625,
204
  "step": 15
205
  },
206
  {
207
- "completion_length": 123.125,
208
  "epoch": 0.021333333333333333,
209
- "grad_norm": 9.800585746765137,
210
- "kl": 0.0126947071403265,
211
  "learning_rate": 4.6907667001096585e-07,
212
  "loss": 0.0,
213
- "reward": 0.125,
214
- "reward_std": 0.25,
215
- "rewards/emotion_reward_func": 0.0,
216
- "rewards/format_reward_func": 0.125,
217
  "step": 16
218
  },
219
  {
220
- "completion_length": 110.625,
221
  "epoch": 0.02266666666666667,
222
- "grad_norm": 8.219995498657227,
223
- "kl": 0.037900131195783615,
224
  "learning_rate": 4.6518550675098587e-07,
225
  "loss": 0.0,
226
- "reward": 0.125,
227
- "reward_std": 0.25,
228
- "rewards/emotion_reward_func": 0.0,
229
- "rewards/format_reward_func": 0.125,
230
  "step": 17
231
  },
232
  {
233
- "completion_length": 102.5625,
234
  "epoch": 0.024,
235
- "grad_norm": 5.030233383178711,
236
- "kl": 0.015008080750703812,
237
  "learning_rate": 4.6108198137550377e-07,
238
  "loss": 0.0,
239
- "reward": 0.125,
240
- "reward_std": 0.14433756470680237,
241
- "rewards/emotion_reward_func": 0.0,
242
- "rewards/format_reward_func": 0.125,
243
  "step": 18
244
  },
245
  {
246
- "completion_length": 88.25,
247
  "epoch": 0.025333333333333333,
248
- "grad_norm": 3.730710029602051,
249
- "kl": 0.04922018200159073,
250
  "learning_rate": 4.567701435686404e-07,
251
  "loss": 0.0,
252
- "reward": 0.03125,
253
- "reward_std": 0.0625,
254
- "rewards/emotion_reward_func": 0.0,
255
- "rewards/format_reward_func": 0.03125,
256
  "step": 19
257
  },
258
  {
259
- "completion_length": 120.0625,
260
  "epoch": 0.02666666666666667,
261
- "grad_norm": 8.545513153076172,
262
- "kl": 0.0323660746216774,
263
  "learning_rate": 4.5225424859373684e-07,
264
  "loss": 0.0,
265
- "reward": 0.25,
266
- "reward_std": 0.39433756470680237,
267
- "rewards/emotion_reward_func": 0.0,
268
- "rewards/format_reward_func": 0.25,
269
  "step": 20
270
  },
271
  {
272
- "completion_length": 133.875,
273
  "epoch": 0.028,
274
- "grad_norm": 7.563712120056152,
275
- "kl": 0.032230477780103683,
276
  "learning_rate": 4.475387530939226e-07,
277
  "loss": 0.0,
278
- "reward": 0.25,
279
- "reward_std": 0.25,
280
- "rewards/emotion_reward_func": 0.0,
281
- "rewards/format_reward_func": 0.25,
282
  "step": 21
283
  },
284
  {
285
- "completion_length": 109.4375,
286
  "epoch": 0.029333333333333333,
287
- "grad_norm": 13.283954620361328,
288
- "kl": 0.036050185561180115,
289
  "learning_rate": 4.426283106939473e-07,
290
  "loss": 0.0,
291
- "reward": 0.3125,
292
- "reward_std": 0.41367512941360474,
293
- "rewards/emotion_reward_func": 0.0,
294
- "rewards/format_reward_func": 0.3125,
295
  "step": 22
296
  },
297
  {
298
- "completion_length": 139.0,
299
  "epoch": 0.030666666666666665,
300
- "grad_norm": 8.06175422668457,
301
- "kl": 0.01777772419154644,
302
  "learning_rate": 4.375277674076149e-07,
303
  "loss": 0.0,
304
- "reward": 0.3125,
305
- "reward_std": 0.41367512941360474,
306
- "rewards/emotion_reward_func": 0.0,
307
- "rewards/format_reward_func": 0.3125,
308
  "step": 23
309
  },
310
  {
311
- "completion_length": 130.5,
312
  "epoch": 0.032,
313
- "grad_norm": 10.465758323669434,
314
- "kl": 0.028637699782848358,
315
  "learning_rate": 4.3224215685535287e-07,
316
- "loss": 0.0,
317
- "reward": 0.46875,
318
- "reward_std": 0.3696783781051636,
319
- "rewards/emotion_reward_func": 0.0,
320
- "rewards/format_reward_func": 0.46875,
321
  "step": 24
322
  },
323
  {
324
- "completion_length": 180.6875,
325
  "epoch": 0.03333333333333333,
326
- "grad_norm": 9.501301765441895,
327
- "kl": 0.02521451562643051,
328
  "learning_rate": 4.2677669529663686e-07,
329
- "loss": 0.0,
330
- "reward": 0.3125,
331
- "reward_std": 0.51933753490448,
332
- "rewards/emotion_reward_func": 0.0,
333
- "rewards/format_reward_func": 0.3125,
334
  "step": 25
335
  },
336
  {
337
- "completion_length": 158.6875,
338
  "epoch": 0.034666666666666665,
339
- "grad_norm": 10.960349082946777,
340
- "kl": 0.0829106867313385,
341
  "learning_rate": 4.2113677648217216e-07,
342
  "loss": 0.0001,
343
- "reward": 0.25,
344
- "reward_std": 0.28867512941360474,
345
- "rewards/emotion_reward_func": 0.0,
346
- "rewards/format_reward_func": 0.25,
347
  "step": 26
348
  },
349
  {
350
- "completion_length": 138.25,
351
  "epoch": 0.036,
352
- "grad_norm": 7.017608165740967,
353
- "kl": 0.02953716553747654,
354
  "learning_rate": 4.1532796633091294e-07,
355
- "loss": 0.0,
356
- "reward": 0.5,
357
- "reward_std": 0.39433756470680237,
358
- "rewards/emotion_reward_func": 0.0,
359
- "rewards/format_reward_func": 0.5,
360
  "step": 27
361
  },
362
  {
363
- "completion_length": 112.125,
364
  "epoch": 0.037333333333333336,
365
- "grad_norm": 9.538881301879883,
366
- "kl": 0.06156347692012787,
367
  "learning_rate": 4.0935599743717244e-07,
368
  "loss": 0.0001,
369
- "reward": 0.625,
370
- "reward_std": 0.5,
371
- "rewards/emotion_reward_func": 0.0,
372
- "rewards/format_reward_func": 0.625,
373
  "step": 28
374
  },
375
  {
376
- "completion_length": 151.25,
377
  "epoch": 0.03866666666666667,
378
- "grad_norm": 10.00710678100586,
379
- "kl": 0.027312466874718666,
380
  "learning_rate": 4.0322676341324414e-07,
381
- "loss": 0.0,
382
- "reward": 0.4375,
383
- "reward_std": 0.51933753490448,
384
- "rewards/emotion_reward_func": 0.0,
385
- "rewards/format_reward_func": 0.4375,
386
  "step": 29
387
  },
388
  {
389
- "completion_length": 160.1875,
390
  "epoch": 0.04,
391
- "grad_norm": 10.74777603149414,
392
- "kl": 0.04307672381401062,
393
  "learning_rate": 3.9694631307311825e-07,
394
- "loss": 0.0,
395
- "reward": 0.25,
396
- "reward_std": 0.5,
397
- "rewards/emotion_reward_func": 0.0,
398
- "rewards/format_reward_func": 0.25,
399
  "step": 30
400
  },
401
  {
402
- "completion_length": 141.0625,
403
  "epoch": 0.04133333333333333,
404
- "grad_norm": 11.85556697845459,
405
- "kl": 0.05974256619811058,
406
  "learning_rate": 3.9052084446303265e-07,
407
  "loss": 0.0001,
408
- "reward": 0.40625,
409
- "reward_std": 0.4946783781051636,
410
- "rewards/emotion_reward_func": 0.0,
411
- "rewards/format_reward_func": 0.40625,
412
  "step": 31
413
  },
414
  {
415
- "completion_length": 153.875,
416
  "epoch": 0.042666666666666665,
417
- "grad_norm": 8.729798316955566,
418
- "kl": 0.042332496494054794,
419
  "learning_rate": 3.839566987447491e-07,
420
- "loss": 0.0,
421
- "reward": 0.4375,
422
- "reward_std": 0.5580127239227295,
423
- "rewards/emotion_reward_func": 0.0,
424
- "rewards/format_reward_func": 0.4375,
425
  "step": 32
426
  },
427
  {
428
- "completion_length": 118.25,
429
  "epoch": 0.044,
430
- "grad_norm": 6.3447585105896,
431
- "kl": 0.07207943499088287,
432
  "learning_rate": 3.7726035393759283e-07,
433
  "loss": 0.0001,
434
- "reward": 0.8125,
435
- "reward_std": 0.26933756470680237,
436
- "rewards/emotion_reward_func": 0.0,
437
- "rewards/format_reward_func": 0.8125,
438
  "step": 33
439
  },
440
  {
441
- "completion_length": 112.5,
442
  "epoch": 0.04533333333333334,
443
- "grad_norm": 6.1037797927856445,
444
- "kl": 0.09316730499267578,
445
  "learning_rate": 3.704384185254288e-07,
446
  "loss": 0.0001,
447
- "reward": 0.8125,
448
- "reward_std": 0.26933756470680237,
449
- "rewards/emotion_reward_func": 0.0,
450
- "rewards/format_reward_func": 0.8125,
451
  "step": 34
452
  },
453
  {
454
- "completion_length": 139.8125,
455
  "epoch": 0.04666666666666667,
456
- "grad_norm": 7.664778232574463,
457
- "kl": 0.0723225474357605,
458
  "learning_rate": 3.634976249348867e-07,
459
  "loss": 0.0001,
460
- "reward": 0.78125,
461
- "reward_std": 0.3696783781051636,
462
- "rewards/emotion_reward_func": 0.0,
463
- "rewards/format_reward_func": 0.78125,
464
  "step": 35
465
  },
466
  {
467
- "completion_length": 112.375,
468
  "epoch": 0.048,
469
- "grad_norm": 9.651317596435547,
470
- "kl": 0.09862169623374939,
471
  "learning_rate": 3.5644482289126813e-07,
472
  "loss": 0.0001,
473
- "reward": 0.5625,
474
- "reward_std": 0.51933753490448,
475
- "rewards/emotion_reward_func": 0.0,
476
- "rewards/format_reward_func": 0.5625,
477
  "step": 36
478
  },
479
  {
480
- "completion_length": 108.9375,
481
  "epoch": 0.04933333333333333,
482
- "grad_norm": 8.387043952941895,
483
- "kl": 0.10006917268037796,
484
  "learning_rate": 3.492869726586951e-07,
485
  "loss": 0.0001,
486
- "reward": 0.8125,
487
- "reward_std": 0.375,
488
- "rewards/emotion_reward_func": 0.0,
489
- "rewards/format_reward_func": 0.8125,
490
  "step": 37
491
  },
492
  {
493
- "completion_length": 122.0,
494
  "epoch": 0.050666666666666665,
495
- "grad_norm": 8.030802726745605,
496
- "kl": 0.13344745337963104,
497
  "learning_rate": 3.4203113817116953e-07,
498
  "loss": 0.0001,
499
- "reward": 0.8125,
500
- "reward_std": 0.375,
501
- "rewards/emotion_reward_func": 0.0,
502
- "rewards/format_reward_func": 0.8125,
503
  "step": 38
504
  },
505
  {
506
- "completion_length": 171.8125,
507
  "epoch": 0.052,
508
- "grad_norm": 68.77984619140625,
509
- "kl": 0.05845522880554199,
510
  "learning_rate": 3.346844800613229e-07,
511
  "loss": 0.0001,
512
- "reward": 0.6875,
513
- "reward_std": 0.48935678601264954,
514
- "rewards/emotion_reward_func": 0.0,
515
- "rewards/format_reward_func": 0.6875,
516
  "step": 39
517
  },
518
  {
519
- "completion_length": 109.4375,
520
  "epoch": 0.05333333333333334,
521
- "grad_norm": 7.004795551300049,
522
- "kl": 0.07453721761703491,
523
  "learning_rate": 3.272542485937368e-07,
524
  "loss": 0.0001,
525
- "reward": 0.90625,
526
- "reward_std": 0.1875,
527
- "rewards/emotion_reward_func": 0.0,
528
- "rewards/format_reward_func": 0.90625,
529
  "step": 40
530
  },
531
  {
532
- "completion_length": 129.75,
533
  "epoch": 0.05466666666666667,
534
- "grad_norm": 7.910297393798828,
535
- "kl": 0.09665161371231079,
536
  "learning_rate": 3.1974777650980734e-07,
537
  "loss": 0.0001,
538
- "reward": 0.75,
539
- "reward_std": 0.36435678601264954,
540
- "rewards/emotion_reward_func": 0.0,
541
- "rewards/format_reward_func": 0.75,
542
  "step": 41
543
  },
544
  {
545
- "completion_length": 138.6875,
546
  "epoch": 0.056,
547
- "grad_norm": 6.366047382354736,
548
- "kl": 0.06816712021827698,
549
  "learning_rate": 3.121724717912138e-07,
550
  "loss": 0.0001,
551
- "reward": 0.875,
552
- "reward_std": 0.18217839300632477,
553
- "rewards/emotion_reward_func": 0.0,
554
- "rewards/format_reward_func": 0.875,
555
  "step": 42
556
  },
557
  {
558
- "completion_length": 163.4375,
559
  "epoch": 0.05733333333333333,
560
- "grad_norm": 9.733158111572266,
561
- "kl": 0.07456796616315842,
562
  "learning_rate": 3.0453581034913565e-07,
563
  "loss": 0.0001,
564
- "reward": 0.75,
565
- "reward_std": 0.39433756470680237,
566
- "rewards/emotion_reward_func": 0.0,
567
- "rewards/format_reward_func": 0.75,
568
  "step": 43
569
  },
570
  {
571
- "completion_length": 149.375,
572
  "epoch": 0.058666666666666666,
573
- "grad_norm": 7.283127784729004,
574
- "kl": 0.10417325794696808,
575
  "learning_rate": 2.968453286464312e-07,
576
  "loss": 0.0001,
577
- "reward": 0.8125,
578
- "reward_std": 0.375,
579
- "rewards/emotion_reward_func": 0.0,
580
- "rewards/format_reward_func": 0.8125,
581
  "step": 44
582
  },
583
  {
584
- "completion_length": 151.8125,
585
  "epoch": 0.06,
586
- "grad_norm": 8.65285873413086,
587
- "kl": 0.08351579308509827,
588
  "learning_rate": 2.8910861626005773e-07,
589
  "loss": 0.0001,
590
- "reward": 0.71875,
591
- "reward_std": 0.45683756470680237,
592
- "rewards/emotion_reward_func": 0.0,
593
- "rewards/format_reward_func": 0.71875,
594
  "step": 45
595
  },
596
  {
597
- "completion_length": 122.1875,
598
  "epoch": 0.06133333333333333,
599
- "grad_norm": 6.481723308563232,
600
- "kl": 0.0788784921169281,
601
  "learning_rate": 2.8133330839107604e-07,
602
  "loss": 0.0001,
603
- "reward": 0.84375,
604
- "reward_std": 0.24467839300632477,
605
- "rewards/emotion_reward_func": 0.0,
606
- "rewards/format_reward_func": 0.84375,
607
  "step": 46
608
  },
609
  {
610
- "completion_length": 177.125,
611
  "epoch": 0.06266666666666666,
612
- "grad_norm": 6.107300281524658,
613
- "kl": 0.10843782126903534,
614
  "learning_rate": 2.735270783296286e-07,
615
  "loss": 0.0001,
616
- "reward": 0.84375,
617
- "reward_std": 0.24467839300632477,
618
- "rewards/emotion_reward_func": 0.0,
619
- "rewards/format_reward_func": 0.84375,
620
  "step": 47
621
  },
622
  {
623
- "completion_length": 115.875,
624
  "epoch": 0.064,
625
- "grad_norm": 5.656793117523193,
626
- "kl": 0.11855285614728928,
627
  "learning_rate": 2.6569762988232837e-07,
628
  "loss": 0.0001,
629
- "reward": 0.90625,
630
- "reward_std": 0.1875,
631
- "rewards/emotion_reward_func": 0.0,
632
- "rewards/format_reward_func": 0.90625,
633
  "step": 48
634
  },
635
  {
636
- "completion_length": 137.6875,
637
  "epoch": 0.06533333333333333,
638
- "grad_norm": 7.453767776489258,
639
- "kl": 0.10749398171901703,
640
  "learning_rate": 2.5785268976953204e-07,
641
  "loss": 0.0001,
642
- "reward": 0.90625,
643
- "reward_std": 0.1875,
644
- "rewards/emotion_reward_func": 0.0,
645
- "rewards/format_reward_func": 0.90625,
646
  "step": 49
647
  },
648
  {
649
- "completion_length": 117.375,
650
  "epoch": 0.06666666666666667,
651
- "grad_norm": 6.2615861892700195,
652
- "kl": 0.09339688718318939,
653
  "learning_rate": 2.5e-07,
654
  "loss": 0.0001,
655
- "reward": 0.8125,
656
- "reward_std": 0.26933756470680237,
657
- "rewards/emotion_reward_func": 0.0,
658
- "rewards/format_reward_func": 0.8125,
659
  "step": 50
660
  },
661
  {
662
- "completion_length": 120.625,
663
  "epoch": 0.068,
664
- "grad_norm": 7.3459577560424805,
665
- "kl": 0.08987420052289963,
666
  "learning_rate": 2.4214731023046794e-07,
667
  "loss": 0.0001,
668
- "reward": 0.84375,
669
- "reward_std": 0.3125,
670
- "rewards/emotion_reward_func": 0.0,
671
- "rewards/format_reward_func": 0.84375,
672
  "step": 51
673
  },
674
  {
675
- "completion_length": 130.4375,
676
  "epoch": 0.06933333333333333,
677
- "grad_norm": 7.253162384033203,
678
- "kl": 0.0824127197265625,
679
  "learning_rate": 2.3430237011767164e-07,
680
  "loss": 0.0001,
681
- "reward": 0.8125,
682
- "reward_std": 0.26933756470680237,
683
- "rewards/emotion_reward_func": 0.0,
684
- "rewards/format_reward_func": 0.8125,
685
  "step": 52
686
  },
687
  {
688
- "completion_length": 102.875,
689
  "epoch": 0.07066666666666667,
690
- "grad_norm": 7.457027435302734,
691
- "kl": 0.08334946632385254,
692
  "learning_rate": 2.264729216703714e-07,
693
  "loss": 0.0001,
694
- "reward": 0.875,
695
- "reward_std": 0.25,
696
- "rewards/emotion_reward_func": 0.0,
697
- "rewards/format_reward_func": 0.875,
698
  "step": 53
699
  },
700
  {
701
- "completion_length": 127.75,
702
  "epoch": 0.072,
703
- "grad_norm": 0.01185312308371067,
704
- "kl": 0.094039186835289,
705
  "learning_rate": 2.1866669160892389e-07,
706
  "loss": 0.0001,
707
- "reward": 1.0,
708
- "reward_std": 0.0,
709
- "rewards/emotion_reward_func": 0.0,
710
- "rewards/format_reward_func": 1.0,
711
  "step": 54
712
  },
713
  {
714
- "completion_length": 86.5,
715
  "epoch": 0.07333333333333333,
716
- "grad_norm": 11.868236541748047,
717
- "kl": 0.22406581044197083,
718
  "learning_rate": 2.1089138373994222e-07,
719
- "loss": 0.0002,
720
- "reward": 0.84375,
721
- "reward_std": 0.3125,
722
- "rewards/emotion_reward_func": 0.0,
723
- "rewards/format_reward_func": 0.84375,
724
  "step": 55
725
  },
726
  {
727
- "completion_length": 105.0625,
728
  "epoch": 0.07466666666666667,
729
- "grad_norm": 3.1902968883514404,
730
- "kl": 0.10381343960762024,
731
  "learning_rate": 2.0315467135356878e-07,
732
  "loss": 0.0001,
733
- "reward": 0.96875,
734
- "reward_std": 0.0625,
735
- "rewards/emotion_reward_func": 0.0,
736
- "rewards/format_reward_func": 0.96875,
737
  "step": 56
738
  },
739
  {
740
- "completion_length": 77.8125,
741
  "epoch": 0.076,
742
- "grad_norm": 0.04087565839290619,
743
- "kl": 0.17592526972293854,
744
  "learning_rate": 1.954641896508644e-07,
745
- "loss": 0.0002,
746
- "reward": 1.0,
747
- "reward_std": 0.0,
748
- "rewards/emotion_reward_func": 0.0,
749
- "rewards/format_reward_func": 1.0,
750
  "step": 57
751
  },
752
  {
753
- "completion_length": 114.5,
754
  "epoch": 0.07733333333333334,
755
- "grad_norm": 4.14316463470459,
756
- "kl": 0.09431131184101105,
757
  "learning_rate": 1.8782752820878633e-07,
758
- "loss": 0.0001,
759
- "reward": 0.875,
760
- "reward_std": 0.14433756470680237,
761
- "rewards/emotion_reward_func": 0.0,
762
- "rewards/format_reward_func": 0.875,
763
  "step": 58
764
  },
765
  {
766
- "completion_length": 99.25,
767
  "epoch": 0.07866666666666666,
768
- "grad_norm": 12.474014282226562,
769
- "kl": 0.16972073912620544,
770
  "learning_rate": 1.802522234901927e-07,
771
  "loss": 0.0002,
772
- "reward": 0.6875,
773
- "reward_std": 0.41367512941360474,
774
- "rewards/emotion_reward_func": 0.0,
775
- "rewards/format_reward_func": 0.6875,
776
  "step": 59
777
  },
778
  {
779
- "completion_length": 114.6875,
780
  "epoch": 0.08,
781
- "grad_norm": 0.014834249392151833,
782
- "kl": 0.11283782124519348,
783
  "learning_rate": 1.7274575140626315e-07,
784
  "loss": 0.0001,
785
- "reward": 1.0,
786
- "reward_std": 0.0,
787
- "rewards/emotion_reward_func": 0.0,
788
- "rewards/format_reward_func": 1.0,
789
  "step": 60
790
  },
791
  {
792
- "completion_length": 107.6875,
793
  "epoch": 0.08133333333333333,
794
- "grad_norm": 11.97496509552002,
795
- "kl": 0.2880978286266327,
796
  "learning_rate": 1.6531551993867715e-07,
797
- "loss": 0.0003,
798
- "reward": 0.84375,
799
- "reward_std": 0.3125,
800
- "rewards/emotion_reward_func": 0.0,
801
- "rewards/format_reward_func": 0.84375,
802
  "step": 61
803
  },
804
  {
805
- "completion_length": 115.3125,
806
  "epoch": 0.08266666666666667,
807
- "grad_norm": 0.06830989569425583,
808
- "kl": 0.12032976001501083,
809
  "learning_rate": 1.579688618288305e-07,
810
  "loss": 0.0001,
811
- "reward": 1.0,
812
- "reward_std": 0.0,
813
- "rewards/emotion_reward_func": 0.0,
814
  "rewards/format_reward_func": 1.0,
815
  "step": 62
816
  },
817
  {
818
- "completion_length": 109.9375,
819
  "epoch": 0.084,
820
- "grad_norm": 12.59013557434082,
821
- "kl": 0.16462820768356323,
822
  "learning_rate": 1.5071302734130486e-07,
823
- "loss": 0.0002,
824
- "reward": 0.875,
825
- "reward_std": 0.25,
826
- "rewards/emotion_reward_func": 0.0,
827
- "rewards/format_reward_func": 0.875,
828
  "step": 63
829
  },
830
  {
831
- "completion_length": 112.6875,
832
  "epoch": 0.08533333333333333,
833
- "grad_norm": 0.011938858777284622,
834
- "kl": 0.105996273458004,
835
  "learning_rate": 1.4355517710873182e-07,
836
  "loss": 0.0001,
837
- "reward": 1.0,
838
- "reward_std": 0.0,
839
- "rewards/emotion_reward_func": 0.0,
840
- "rewards/format_reward_func": 1.0,
841
  "step": 64
842
  },
843
  {
844
- "completion_length": 113.3125,
845
  "epoch": 0.08666666666666667,
846
- "grad_norm": 3.5281906127929688,
847
- "kl": 0.11741024255752563,
848
  "learning_rate": 1.365023750651133e-07,
849
  "loss": 0.0001,
850
- "reward": 0.96875,
851
- "reward_std": 0.0625,
852
- "rewards/emotion_reward_func": 0.0,
853
- "rewards/format_reward_func": 0.96875,
854
  "step": 65
855
  },
856
  {
857
- "completion_length": 115.375,
858
  "epoch": 0.088,
859
- "grad_norm": 5.508726596832275,
860
- "kl": 0.11176759004592896,
861
  "learning_rate": 1.2956158147457114e-07,
862
  "loss": 0.0001,
863
- "reward": 0.9375,
864
- "reward_std": 0.125,
865
- "rewards/emotion_reward_func": 0.0,
866
- "rewards/format_reward_func": 0.9375,
867
  "step": 66
868
  },
869
  {
870
- "completion_length": 118.75,
871
  "epoch": 0.08933333333333333,
872
- "grad_norm": 6.57678747177124,
873
- "kl": 0.12585385143756866,
874
  "learning_rate": 1.2273964606240718e-07,
875
  "loss": 0.0001,
876
- "reward": 0.90625,
877
- "reward_std": 0.1875,
878
- "rewards/emotion_reward_func": 0.0,
879
- "rewards/format_reward_func": 0.90625,
880
  "step": 67
881
  },
882
  {
883
- "completion_length": 128.8125,
884
  "epoch": 0.09066666666666667,
885
- "grad_norm": 0.012199473567306995,
886
- "kl": 0.1194550171494484,
887
  "learning_rate": 1.1604330125525078e-07,
888
  "loss": 0.0001,
889
- "reward": 1.0,
890
- "reward_std": 0.0,
891
- "rewards/emotion_reward_func": 0.0,
892
- "rewards/format_reward_func": 1.0,
893
  "step": 68
894
  },
895
  {
896
- "completion_length": 102.75,
897
  "epoch": 0.092,
898
- "grad_norm": 6.777124881744385,
899
- "kl": 0.14668866991996765,
900
  "learning_rate": 1.0947915553696741e-07,
901
  "loss": 0.0001,
902
- "reward": 0.9375,
903
- "reward_std": 0.125,
904
- "rewards/emotion_reward_func": 0.0,
905
- "rewards/format_reward_func": 0.9375,
906
  "step": 69
907
  },
908
  {
909
- "completion_length": 96.25,
910
  "epoch": 0.09333333333333334,
911
- "grad_norm": 6.230812072753906,
912
- "kl": 0.1095753163099289,
913
  "learning_rate": 1.0305368692688174e-07,
914
- "loss": 0.0001,
915
- "reward": 0.90625,
916
- "reward_std": 0.1875,
917
- "rewards/emotion_reward_func": 0.0,
918
- "rewards/format_reward_func": 0.90625,
919
  "step": 70
920
  },
921
  {
922
- "completion_length": 98.25,
923
  "epoch": 0.09466666666666666,
924
- "grad_norm": 3.248274326324463,
925
- "kl": 0.1126992404460907,
926
  "learning_rate": 9.677323658675593e-08,
927
  "loss": 0.0001,
928
- "reward": 0.9375,
929
- "reward_std": 0.125,
930
- "rewards/emotion_reward_func": 0.0,
931
- "rewards/format_reward_func": 0.9375,
932
  "step": 71
933
  },
934
  {
935
- "completion_length": 112.8125,
936
  "epoch": 0.096,
937
- "grad_norm": 5.262601852416992,
938
- "kl": 0.1305035650730133,
939
  "learning_rate": 9.064400256282755e-08,
940
- "loss": 0.0001,
941
- "reward": 0.9375,
942
- "reward_std": 0.125,
943
- "rewards/emotion_reward_func": 0.0,
944
- "rewards/format_reward_func": 0.9375,
945
  "step": 72
946
  },
947
  {
948
- "completion_length": 121.375,
949
  "epoch": 0.09733333333333333,
950
- "grad_norm": 0.01645738258957863,
951
- "kl": 0.12455137819051743,
952
  "learning_rate": 8.467203366908707e-08,
953
  "loss": 0.0001,
954
- "reward": 1.0,
955
- "reward_std": 0.0,
956
- "rewards/emotion_reward_func": 0.0,
957
- "rewards/format_reward_func": 1.0,
958
  "step": 73
959
  },
960
  {
961
- "completion_length": 113.625,
962
  "epoch": 0.09866666666666667,
963
- "grad_norm": 5.738455295562744,
964
- "kl": 0.14311644434928894,
965
  "learning_rate": 7.886322351782782e-08,
966
- "loss": 0.0001,
967
- "reward": 0.875,
968
- "reward_std": 0.25,
969
- "rewards/emotion_reward_func": 0.0,
970
- "rewards/format_reward_func": 0.875,
971
  "step": 74
972
  },
973
  {
974
- "completion_length": 104.375,
975
  "epoch": 0.1,
976
- "grad_norm": 0.014866613782942295,
977
- "kl": 0.13841284811496735,
978
  "learning_rate": 7.322330470336313e-08,
979
  "loss": 0.0001,
980
- "reward": 1.0,
981
- "reward_std": 0.0,
982
- "rewards/emotion_reward_func": 0.0,
983
- "rewards/format_reward_func": 1.0,
984
  "step": 75
985
  },
986
  {
987
- "completion_length": 96.125,
988
  "epoch": 0.10133333333333333,
989
- "grad_norm": 4.531452178955078,
990
- "kl": 0.11935670673847198,
991
  "learning_rate": 6.775784314464716e-08,
992
  "loss": 0.0001,
993
- "reward": 0.9375,
994
- "reward_std": 0.125,
995
- "rewards/emotion_reward_func": 0.0,
996
- "rewards/format_reward_func": 0.9375,
997
  "step": 76
998
  },
999
  {
1000
- "completion_length": 102.0,
1001
  "epoch": 0.10266666666666667,
1002
- "grad_norm": 0.017895404249429703,
1003
- "kl": 0.15568867325782776,
1004
  "learning_rate": 6.24722325923851e-08,
1005
  "loss": 0.0002,
1006
- "reward": 1.0,
1007
- "reward_std": 0.0,
1008
- "rewards/emotion_reward_func": 0.0,
1009
  "rewards/format_reward_func": 1.0,
1010
  "step": 77
1011
  },
1012
  {
1013
- "completion_length": 85.25,
1014
  "epoch": 0.104,
1015
- "grad_norm": 6.45017147064209,
1016
- "kl": 0.20556744933128357,
1017
  "learning_rate": 5.737168930605271e-08,
1018
  "loss": 0.0002,
1019
- "reward": 0.96875,
1020
- "reward_std": 0.0625,
1021
- "rewards/emotion_reward_func": 0.0,
1022
  "rewards/format_reward_func": 0.96875,
1023
  "step": 78
1024
  },
1025
  {
1026
- "completion_length": 90.25,
1027
  "epoch": 0.10533333333333333,
1028
- "grad_norm": 7.30122709274292,
1029
- "kl": 0.1739582121372223,
1030
  "learning_rate": 5.246124690607739e-08,
1031
- "loss": 0.0002,
1032
- "reward": 0.9375,
1033
- "reward_std": 0.125,
1034
- "rewards/emotion_reward_func": 0.0,
1035
- "rewards/format_reward_func": 0.9375,
1036
  "step": 79
1037
  },
1038
  {
1039
- "completion_length": 99.4375,
1040
  "epoch": 0.10666666666666667,
1041
- "grad_norm": 7.5901103019714355,
1042
- "kl": 0.1773838996887207,
1043
  "learning_rate": 4.774575140626316e-08,
1044
- "loss": 0.0002,
1045
- "reward": 0.90625,
1046
- "reward_std": 0.1875,
1047
- "rewards/emotion_reward_func": 0.0,
1048
- "rewards/format_reward_func": 0.90625,
1049
  "step": 80
1050
  },
1051
  {
1052
- "completion_length": 101.875,
1053
  "epoch": 0.108,
1054
- "grad_norm": 0.016729678958654404,
1055
- "kl": 0.16674719750881195,
1056
  "learning_rate": 4.3229856431359513e-08,
1057
- "loss": 0.0002,
1058
- "reward": 1.0,
1059
- "reward_std": 0.0,
1060
- "rewards/emotion_reward_func": 0.0,
1061
- "rewards/format_reward_func": 1.0,
1062
  "step": 81
1063
  },
1064
  {
1065
- "completion_length": 104.5,
1066
  "epoch": 0.10933333333333334,
1067
- "grad_norm": 5.888128757476807,
1068
- "kl": 0.13316090404987335,
1069
  "learning_rate": 3.8918018624496286e-08,
1070
  "loss": 0.0001,
1071
- "reward": 0.9375,
1072
- "reward_std": 0.125,
1073
- "rewards/emotion_reward_func": 0.0,
1074
- "rewards/format_reward_func": 0.9375,
1075
  "step": 82
1076
  },
1077
  {
1078
- "completion_length": 96.5625,
1079
  "epoch": 0.11066666666666666,
1080
- "grad_norm": 5.195379734039307,
1081
- "kl": 0.13178151845932007,
1082
  "learning_rate": 3.481449324901411e-08,
1083
  "loss": 0.0001,
1084
- "reward": 0.96875,
1085
- "reward_std": 0.0625,
1086
- "rewards/emotion_reward_func": 0.0,
1087
  "rewards/format_reward_func": 0.96875,
1088
  "step": 83
1089
  },
1090
  {
1091
- "completion_length": 120.3125,
1092
  "epoch": 0.112,
1093
- "grad_norm": 0.023672526702284813,
1094
- "kl": 0.09863791614770889,
1095
  "learning_rate": 3.092332998903416e-08,
1096
- "loss": 0.0001,
1097
- "reward": 1.0,
1098
- "reward_std": 0.0,
1099
- "rewards/emotion_reward_func": 0.0,
1100
- "rewards/format_reward_func": 1.0,
1101
  "step": 84
1102
  },
1103
  {
1104
- "completion_length": 120.3125,
1105
  "epoch": 0.11333333333333333,
1106
- "grad_norm": 0.016033878549933434,
1107
- "kl": 0.1298094093799591,
1108
  "learning_rate": 2.724836895290805e-08,
1109
  "loss": 0.0001,
1110
- "reward": 1.0,
1111
- "reward_std": 0.0,
1112
- "rewards/emotion_reward_func": 0.0,
1113
  "rewards/format_reward_func": 1.0,
1114
  "step": 85
1115
  },
1116
  {
1117
- "completion_length": 123.875,
1118
  "epoch": 0.11466666666666667,
1119
- "grad_norm": 0.013259505853056908,
1120
- "kl": 0.13770553469657898,
1121
  "learning_rate": 2.379323688349516e-08,
1122
  "loss": 0.0001,
1123
- "reward": 1.0,
1124
- "reward_std": 0.0,
1125
- "rewards/emotion_reward_func": 0.0,
1126
  "rewards/format_reward_func": 1.0,
1127
  "step": 86
1128
  },
1129
  {
1130
- "completion_length": 93.375,
1131
  "epoch": 0.116,
1132
- "grad_norm": 7.885369777679443,
1133
- "kl": 0.14231295883655548,
1134
  "learning_rate": 2.0561343579004715e-08,
1135
  "loss": 0.0001,
1136
- "reward": 0.875,
1137
- "reward_std": 0.25,
1138
- "rewards/emotion_reward_func": 0.0,
1139
- "rewards/format_reward_func": 0.875,
1140
  "step": 87
1141
  },
1142
  {
1143
- "completion_length": 106.8125,
1144
  "epoch": 0.11733333333333333,
1145
- "grad_norm": 10.611775398254395,
1146
- "kl": 0.25752192735671997,
1147
  "learning_rate": 1.7555878527937163e-08,
1148
- "loss": 0.0003,
1149
- "reward": 0.84375,
1150
- "reward_std": 0.3125,
1151
- "rewards/emotion_reward_func": 0.0,
1152
- "rewards/format_reward_func": 0.84375,
1153
  "step": 88
1154
  },
1155
  {
1156
- "completion_length": 105.25,
1157
  "epoch": 0.11866666666666667,
1158
- "grad_norm": 5.855681419372559,
1159
- "kl": 0.1377657949924469,
1160
  "learning_rate": 1.4779807761443635e-08,
1161
  "loss": 0.0001,
1162
- "reward": 0.9375,
1163
- "reward_std": 0.125,
1164
- "rewards/emotion_reward_func": 0.0,
1165
- "rewards/format_reward_func": 0.9375,
1166
  "step": 89
1167
  },
1168
  {
1169
- "completion_length": 112.9375,
1170
  "epoch": 0.12,
1171
- "grad_norm": 7.280832290649414,
1172
- "kl": 0.13963450491428375,
1173
  "learning_rate": 1.2235870926211616e-08,
1174
  "loss": 0.0001,
1175
- "reward": 0.8125,
1176
- "reward_std": 0.26933756470680237,
1177
- "rewards/emotion_reward_func": 0.0,
1178
- "rewards/format_reward_func": 0.8125,
1179
  "step": 90
1180
  },
1181
  {
1182
- "completion_length": 113.0625,
1183
  "epoch": 0.12133333333333333,
1184
- "grad_norm": 5.792718410491943,
1185
- "kl": 0.10216458886861801,
1186
  "learning_rate": 9.926578580764234e-09,
1187
  "loss": 0.0001,
1188
- "reward": 0.875,
1189
- "reward_std": 0.25,
1190
- "rewards/emotion_reward_func": 0.0,
1191
- "rewards/format_reward_func": 0.875,
1192
  "step": 91
1193
  },
1194
  {
1195
- "completion_length": 130.625,
1196
  "epoch": 0.12266666666666666,
1197
- "grad_norm": 3.8016581535339355,
1198
- "kl": 0.09364865720272064,
1199
  "learning_rate": 7.85420971784223e-09,
1200
  "loss": 0.0001,
1201
- "reward": 0.9375,
1202
- "reward_std": 0.125,
1203
- "rewards/emotion_reward_func": 0.0,
1204
- "rewards/format_reward_func": 0.9375,
1205
  "step": 92
1206
  },
1207
  {
1208
- "completion_length": 128.8125,
1209
  "epoch": 0.124,
1210
- "grad_norm": 4.038362979888916,
1211
- "kl": 0.09943213313817978,
1212
  "learning_rate": 6.020809515313141e-09,
1213
  "loss": 0.0001,
1214
- "reward": 0.96875,
1215
- "reward_std": 0.0625,
1216
- "rewards/emotion_reward_func": 0.0,
1217
- "rewards/format_reward_func": 0.96875,
1218
  "step": 93
1219
  },
1220
  {
1221
- "completion_length": 81.0,
1222
  "epoch": 0.12533333333333332,
1223
- "grad_norm": 5.979770660400391,
1224
- "kl": 0.15632514655590057,
1225
  "learning_rate": 4.4281873178278475e-09,
1226
  "loss": 0.0002,
1227
- "reward": 0.9375,
1228
- "reward_std": 0.125,
1229
- "rewards/emotion_reward_func": 0.0,
1230
- "rewards/format_reward_func": 0.9375,
1231
  "step": 94
1232
  },
1233
  {
1234
- "completion_length": 95.9375,
1235
  "epoch": 0.12666666666666668,
1236
- "grad_norm": 8.092153549194336,
1237
- "kl": 0.16083820164203644,
1238
  "learning_rate": 3.077914851215585e-09,
1239
- "loss": 0.0002,
1240
- "reward": 0.9375,
1241
- "reward_std": 0.125,
1242
- "rewards/emotion_reward_func": 0.0,
1243
- "rewards/format_reward_func": 0.9375,
1244
  "step": 95
1245
  },
1246
  {
1247
- "completion_length": 109.5,
1248
  "epoch": 0.128,
1249
- "grad_norm": 7.098442077636719,
1250
- "kl": 0.11922727525234222,
1251
  "learning_rate": 1.9713246713805587e-09,
1252
  "loss": 0.0001,
1253
- "reward": 0.875,
1254
- "reward_std": 0.25,
1255
- "rewards/emotion_reward_func": 0.0,
1256
- "rewards/format_reward_func": 0.875,
1257
  "step": 96
1258
  },
1259
  {
1260
- "completion_length": 104.0,
1261
  "epoch": 0.12933333333333333,
1262
- "grad_norm": 6.17384147644043,
1263
- "kl": 0.23637814819812775,
1264
  "learning_rate": 1.1095088492300008e-09,
1265
- "loss": 0.0002,
1266
- "reward": 0.9375,
1267
- "reward_std": 0.125,
1268
- "rewards/emotion_reward_func": 0.0,
1269
- "rewards/format_reward_func": 0.9375,
1270
  "step": 97
1271
  },
1272
  {
1273
- "completion_length": 97.75,
1274
  "epoch": 0.13066666666666665,
1275
- "grad_norm": 0.04274175688624382,
1276
- "kl": 0.18250277638435364,
1277
  "learning_rate": 4.933178929321102e-10,
1278
- "loss": 0.0002,
1279
- "reward": 1.0,
1280
- "reward_std": 0.0,
1281
- "rewards/emotion_reward_func": 0.0,
1282
- "rewards/format_reward_func": 1.0,
1283
  "step": 98
1284
  },
1285
  {
1286
- "completion_length": 124.6875,
1287
  "epoch": 0.132,
1288
- "grad_norm": 0.03310486301779747,
1289
- "kl": 0.09437094628810883,
1290
  "learning_rate": 1.2335990856709998e-10,
1291
  "loss": 0.0001,
1292
- "reward": 1.0,
1293
- "reward_std": 0.0,
1294
- "rewards/emotion_reward_func": 0.0,
1295
- "rewards/format_reward_func": 1.0,
1296
  "step": 99
1297
  },
1298
  {
1299
- "completion_length": 116.0,
1300
  "epoch": 0.13333333333333333,
1301
- "grad_norm": 0.0801442489027977,
1302
- "kl": 0.10857471823692322,
1303
  "learning_rate": 0.0,
1304
  "loss": 0.0001,
1305
- "reward": 1.0,
1306
- "reward_std": 0.0,
1307
- "rewards/emotion_reward_func": 0.0,
1308
- "rewards/format_reward_func": 1.0,
1309
  "step": 100
1310
  }
1311
  ],
@@ -1327,7 +1327,7 @@
1327
  }
1328
  },
1329
  "total_flos": 0.0,
1330
- "train_batch_size": 1,
1331
  "trial_name": null,
1332
  "trial_params": null
1333
  }
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "completion_length": 113.171875,
13
  "epoch": 0.0013333333333333333,
14
  "grad_norm": 0.0,
15
  "kl": 0.0,
 
22
  "step": 1
23
  },
24
  {
25
+ "completion_length": 100.609375,
26
  "epoch": 0.0026666666666666666,
27
+ "grad_norm": 7.206315517425537,
28
  "kl": 0.0,
29
  "learning_rate": 4.995066821070679e-07,
30
+ "loss": 0.0,
31
+ "reward": 0.046875,
32
+ "reward_std": 0.1875,
33
+ "rewards/emotion_reward_func": 0.015625,
34
+ "rewards/format_reward_func": 0.03125,
35
  "step": 2
36
  },
37
  {
38
+ "completion_length": 130.65625,
39
  "epoch": 0.004,
40
+ "grad_norm": 7.190981864929199,
41
+ "kl": 0.0008925930160330608,
42
  "learning_rate": 4.9889049115077e-07,
43
  "loss": 0.0,
44
+ "reward": 0.0625,
45
+ "reward_std": 0.25,
46
+ "rewards/emotion_reward_func": 0.015625,
47
+ "rewards/format_reward_func": 0.046875,
48
  "step": 3
49
  },
50
  {
51
+ "completion_length": 112.484375,
52
  "epoch": 0.005333333333333333,
53
+ "grad_norm": 4.745754718780518,
54
+ "kl": 0.0009372199856443331,
55
  "learning_rate": 4.980286753286194e-07,
56
  "loss": 0.0,
57
+ "reward": 0.046875,
58
+ "reward_std": 0.1875,
59
+ "rewards/emotion_reward_func": 0.015625,
60
+ "rewards/format_reward_func": 0.03125,
61
  "step": 4
62
  },
63
  {
64
+ "completion_length": 100.359375,
65
  "epoch": 0.006666666666666667,
66
+ "grad_norm": 6.957683086395264,
67
+ "kl": 0.0009052860696101561,
68
  "learning_rate": 4.969220851487844e-07,
69
  "loss": 0.0,
70
+ "reward": 0.078125,
71
+ "reward_std": 0.3125,
72
+ "rewards/emotion_reward_func": 0.03125,
73
+ "rewards/format_reward_func": 0.046875,
74
  "step": 5
75
  },
76
  {
77
+ "completion_length": 109.5625,
78
  "epoch": 0.008,
79
+ "grad_norm": 8.517982482910156,
80
+ "kl": 0.0011545967281563208,
81
  "learning_rate": 4.955718126821722e-07,
82
  "loss": 0.0,
83
+ "reward": 0.140625,
84
+ "reward_std": 0.34228479862213135,
85
+ "rewards/emotion_reward_func": 0.03125,
86
+ "rewards/format_reward_func": 0.109375,
87
  "step": 6
88
  },
89
  {
90
+ "completion_length": 117.484375,
91
  "epoch": 0.009333333333333334,
92
+ "grad_norm": 2.9639530181884766,
93
+ "kl": 0.001261903322301805,
94
  "learning_rate": 4.939791904846868e-07,
95
  "loss": 0.0,
96
+ "reward": 0.03125,
97
  "reward_std": 0.125,
98
+ "rewards/emotion_reward_func": 0.015625,
99
+ "rewards/format_reward_func": 0.015625,
100
  "step": 7
101
  },
102
  {
103
+ "completion_length": 118.234375,
104
  "epoch": 0.010666666666666666,
105
+ "grad_norm": 8.49864387512207,
106
+ "kl": 0.002180565701564774,
107
  "learning_rate": 4.921457902821578e-07,
108
  "loss": 0.0,
109
+ "reward": 0.140625,
110
+ "reward_std": 0.4436737596988678,
111
+ "rewards/emotion_reward_func": 0.046875,
112
+ "rewards/format_reward_func": 0.09375,
113
  "step": 8
114
  },
115
  {
116
+ "completion_length": 106.546875,
117
  "epoch": 0.012,
118
+ "grad_norm": 7.236983299255371,
119
+ "kl": 0.0018676594190765172,
120
  "learning_rate": 4.900734214192358e-07,
121
  "loss": 0.0,
122
+ "reward": 0.109375,
123
+ "reward_std": 0.3186737596988678,
124
+ "rewards/emotion_reward_func": 0.03125,
125
+ "rewards/format_reward_func": 0.078125,
126
  "step": 9
127
  },
128
  {
129
+ "completion_length": 94.640625,
130
  "epoch": 0.013333333333333334,
131
+ "grad_norm": 6.612936973571777,
132
+ "kl": 0.00287746504181996,
133
  "learning_rate": 4.877641290737883e-07,
134
  "loss": 0.0,
135
+ "reward": 0.15625,
136
+ "reward_std": 0.38375620543956757,
137
+ "rewards/emotion_reward_func": 0.03125,
138
+ "rewards/format_reward_func": 0.125,
139
  "step": 10
140
  },
141
  {
142
+ "completion_length": 78.453125,
143
  "epoch": 0.014666666666666666,
144
+ "grad_norm": 6.714486598968506,
145
+ "kl": 0.0037596136680804193,
146
  "learning_rate": 4.852201922385564e-07,
147
  "loss": 0.0,
148
+ "reward": 0.15625,
149
+ "reward_std": 0.34297704696655273,
150
+ "rewards/emotion_reward_func": 0.03125,
151
+ "rewards/format_reward_func": 0.125,
152
  "step": 11
153
  },
154
  {
155
+ "completion_length": 113.765625,
156
  "epoch": 0.016,
157
+ "grad_norm": 6.222474098205566,
158
+ "kl": 0.005189417512156069,
159
  "learning_rate": 4.824441214720628e-07,
160
  "loss": 0.0,
161
+ "reward": 0.328125,
162
+ "reward_std": 0.6560364216566086,
163
+ "rewards/emotion_reward_func": 0.109375,
164
+ "rewards/format_reward_func": 0.21875,
165
  "step": 12
166
  },
167
  {
168
+ "completion_length": 93.890625,
169
  "epoch": 0.017333333333333333,
170
+ "grad_norm": 7.199591636657715,
171
+ "kl": 0.011019598576240242,
172
  "learning_rate": 4.794386564209952e-07,
173
  "loss": 0.0,
174
+ "reward": 0.359375,
175
+ "reward_std": 0.647876650094986,
176
+ "rewards/emotion_reward_func": 0.125,
177
+ "rewards/format_reward_func": 0.234375,
178
  "step": 13
179
  },
180
  {
181
+ "completion_length": 93.6875,
182
  "epoch": 0.018666666666666668,
183
+ "grad_norm": 7.459893226623535,
184
+ "kl": 0.012794358422979712,
185
  "learning_rate": 4.762067631165049e-07,
186
  "loss": 0.0,
187
+ "reward": 0.28125,
188
+ "reward_std": 0.4561576098203659,
189
+ "rewards/emotion_reward_func": 0.015625,
190
+ "rewards/format_reward_func": 0.265625,
191
  "step": 14
192
  },
193
  {
194
+ "completion_length": 114.03125,
195
  "epoch": 0.02,
196
+ "grad_norm": 7.204787731170654,
197
+ "kl": 0.01207199739292264,
198
  "learning_rate": 4.7275163104709194e-07,
199
+ "loss": 0.0,
200
+ "reward": 0.421875,
201
+ "reward_std": 0.6076867878437042,
202
+ "rewards/emotion_reward_func": 0.0625,
203
+ "rewards/format_reward_func": 0.359375,
204
  "step": 15
205
  },
206
  {
207
+ "completion_length": 90.1875,
208
  "epoch": 0.021333333333333333,
209
+ "grad_norm": 6.9398369789123535,
210
+ "kl": 0.012930819648317993,
211
  "learning_rate": 4.6907667001096585e-07,
212
  "loss": 0.0,
213
+ "reward": 0.296875,
214
+ "reward_std": 0.6791985481977463,
215
+ "rewards/emotion_reward_func": 0.125,
216
+ "rewards/format_reward_func": 0.171875,
217
  "step": 16
218
  },
219
  {
220
+ "completion_length": 100.265625,
221
  "epoch": 0.02266666666666667,
222
+ "grad_norm": 7.01741361618042,
223
+ "kl": 0.014807499013841152,
224
  "learning_rate": 4.6518550675098587e-07,
225
  "loss": 0.0,
226
+ "reward": 0.609375,
227
+ "reward_std": 0.7960269749164581,
228
+ "rewards/emotion_reward_func": 0.1875,
229
+ "rewards/format_reward_func": 0.421875,
230
  "step": 17
231
  },
232
  {
233
+ "completion_length": 101.59375,
234
  "epoch": 0.024,
235
+ "grad_norm": 6.980762004852295,
236
+ "kl": 0.02019192511215806,
237
  "learning_rate": 4.6108198137550377e-07,
238
  "loss": 0.0,
239
+ "reward": 0.53125,
240
+ "reward_std": 0.6885540634393692,
241
+ "rewards/emotion_reward_func": 0.125,
242
+ "rewards/format_reward_func": 0.40625,
243
  "step": 18
244
  },
245
  {
246
+ "completion_length": 93.859375,
247
  "epoch": 0.025333333333333333,
248
+ "grad_norm": 8.390938758850098,
249
+ "kl": 0.03694334626197815,
250
  "learning_rate": 4.567701435686404e-07,
251
  "loss": 0.0,
252
+ "reward": 0.65625,
253
+ "reward_std": 0.651764303445816,
254
+ "rewards/emotion_reward_func": 0.125,
255
+ "rewards/format_reward_func": 0.53125,
256
  "step": 19
257
  },
258
  {
259
+ "completion_length": 85.546875,
260
  "epoch": 0.02666666666666667,
261
+ "grad_norm": 8.39484691619873,
262
+ "kl": 0.03380461875349283,
263
  "learning_rate": 4.5225424859373684e-07,
264
  "loss": 0.0,
265
+ "reward": 0.765625,
266
+ "reward_std": 0.7304560542106628,
267
+ "rewards/emotion_reward_func": 0.203125,
268
+ "rewards/format_reward_func": 0.5625,
269
  "step": 20
270
  },
271
  {
272
+ "completion_length": 91.3125,
273
  "epoch": 0.028,
274
+ "grad_norm": 6.440164089202881,
275
+ "kl": 0.04492605570703745,
276
  "learning_rate": 4.475387530939226e-07,
277
  "loss": 0.0,
278
+ "reward": 0.765625,
279
+ "reward_std": 0.5731314420700073,
280
+ "rewards/emotion_reward_func": 0.078125,
281
+ "rewards/format_reward_func": 0.6875,
282
  "step": 21
283
  },
284
  {
285
+ "completion_length": 91.0625,
286
  "epoch": 0.029333333333333333,
287
+ "grad_norm": 6.099334716796875,
288
+ "kl": 0.040256964042782784,
289
  "learning_rate": 4.426283106939473e-07,
290
  "loss": 0.0,
291
+ "reward": 1.1875,
292
+ "reward_std": 0.6744011342525482,
293
+ "rewards/emotion_reward_func": 0.390625,
294
+ "rewards/format_reward_func": 0.796875,
295
  "step": 22
296
  },
297
  {
298
+ "completion_length": 79.328125,
299
  "epoch": 0.030666666666666665,
300
+ "grad_norm": 7.1284871101379395,
301
+ "kl": 0.04640346672385931,
302
  "learning_rate": 4.375277674076149e-07,
303
  "loss": 0.0,
304
+ "reward": 1.21875,
305
+ "reward_std": 0.7463032901287079,
306
+ "rewards/emotion_reward_func": 0.4375,
307
+ "rewards/format_reward_func": 0.78125,
308
  "step": 23
309
  },
310
  {
311
+ "completion_length": 71.84375,
312
  "epoch": 0.032,
313
+ "grad_norm": 6.066468715667725,
314
+ "kl": 0.07137730903923512,
315
  "learning_rate": 4.3224215685535287e-07,
316
+ "loss": 0.0001,
317
+ "reward": 1.328125,
318
+ "reward_std": 0.8113406747579575,
319
+ "rewards/emotion_reward_func": 0.546875,
320
+ "rewards/format_reward_func": 0.78125,
321
  "step": 24
322
  },
323
  {
324
+ "completion_length": 101.0625,
325
  "epoch": 0.03333333333333333,
326
+ "grad_norm": 5.047848224639893,
327
+ "kl": 0.0665823919698596,
328
  "learning_rate": 4.2677669529663686e-07,
329
+ "loss": 0.0001,
330
+ "reward": 0.828125,
331
+ "reward_std": 0.5247472077608109,
332
+ "rewards/emotion_reward_func": 0.078125,
333
+ "rewards/format_reward_func": 0.75,
334
  "step": 25
335
  },
336
  {
337
+ "completion_length": 68.1875,
338
  "epoch": 0.034666666666666665,
339
+ "grad_norm": 6.7862958908081055,
340
+ "kl": 0.07357331551611423,
341
  "learning_rate": 4.2113677648217216e-07,
342
  "loss": 0.0001,
343
+ "reward": 1.515625,
344
+ "reward_std": 0.7178780436515808,
345
+ "rewards/emotion_reward_func": 0.671875,
346
+ "rewards/format_reward_func": 0.84375,
347
  "step": 26
348
  },
349
  {
350
+ "completion_length": 73.5625,
351
  "epoch": 0.036,
352
+ "grad_norm": 6.041502475738525,
353
+ "kl": 0.0839837146922946,
354
  "learning_rate": 4.1532796633091294e-07,
355
+ "loss": 0.0001,
356
+ "reward": 1.0,
357
+ "reward_std": 0.5787727609276772,
358
+ "rewards/emotion_reward_func": 0.1875,
359
+ "rewards/format_reward_func": 0.8125,
360
  "step": 27
361
  },
362
  {
363
+ "completion_length": 75.515625,
364
  "epoch": 0.037333333333333336,
365
+ "grad_norm": 5.880768299102783,
366
+ "kl": 0.07887133583426476,
367
  "learning_rate": 4.0935599743717244e-07,
368
  "loss": 0.0001,
369
+ "reward": 1.265625,
370
+ "reward_std": 0.6387931928038597,
371
+ "rewards/emotion_reward_func": 0.390625,
372
+ "rewards/format_reward_func": 0.875,
373
  "step": 28
374
  },
375
  {
376
+ "completion_length": 84.65625,
377
  "epoch": 0.03866666666666667,
378
+ "grad_norm": 4.456689357757568,
379
+ "kl": 0.05541729833930731,
380
  "learning_rate": 4.0322676341324414e-07,
381
+ "loss": 0.0001,
382
+ "reward": 1.546875,
383
+ "reward_std": 0.6021395623683929,
384
+ "rewards/emotion_reward_func": 0.6875,
385
+ "rewards/format_reward_func": 0.859375,
386
  "step": 29
387
  },
388
  {
389
+ "completion_length": 72.03125,
390
  "epoch": 0.04,
391
+ "grad_norm": 6.281215667724609,
392
+ "kl": 0.07642886973917484,
393
  "learning_rate": 3.9694631307311825e-07,
394
+ "loss": 0.0001,
395
+ "reward": 1.421875,
396
+ "reward_std": 0.6625982969999313,
397
+ "rewards/emotion_reward_func": 0.515625,
398
+ "rewards/format_reward_func": 0.90625,
399
  "step": 30
400
  },
401
  {
402
+ "completion_length": 64.96875,
403
  "epoch": 0.04133333333333333,
404
+ "grad_norm": 6.267475128173828,
405
+ "kl": 0.07545926049351692,
406
  "learning_rate": 3.9052084446303265e-07,
407
  "loss": 0.0001,
408
+ "reward": 1.28125,
409
+ "reward_std": 0.5209204778075218,
410
+ "rewards/emotion_reward_func": 0.34375,
411
+ "rewards/format_reward_func": 0.9375,
412
  "step": 31
413
  },
414
  {
415
+ "completion_length": 74.28125,
416
  "epoch": 0.042666666666666665,
417
+ "grad_norm": 6.10364294052124,
418
+ "kl": 0.08546704892069101,
419
  "learning_rate": 3.839566987447491e-07,
420
+ "loss": 0.0001,
421
+ "reward": 1.59375,
422
+ "reward_std": 0.7168828397989273,
423
+ "rewards/emotion_reward_func": 0.71875,
424
+ "rewards/format_reward_func": 0.875,
425
  "step": 32
426
  },
427
  {
428
+ "completion_length": 70.03125,
429
  "epoch": 0.044,
430
+ "grad_norm": 4.039772033691406,
431
+ "kl": 0.06002845522016287,
432
  "learning_rate": 3.7726035393759283e-07,
433
  "loss": 0.0001,
434
+ "reward": 1.3125,
435
+ "reward_std": 0.2808031141757965,
436
+ "rewards/emotion_reward_func": 0.390625,
437
+ "rewards/format_reward_func": 0.921875,
438
  "step": 33
439
  },
440
  {
441
+ "completion_length": 83.109375,
442
  "epoch": 0.04533333333333334,
443
+ "grad_norm": 6.683216094970703,
444
+ "kl": 0.0706396009773016,
445
  "learning_rate": 3.704384185254288e-07,
446
  "loss": 0.0001,
447
+ "reward": 1.421875,
448
+ "reward_std": 0.7211004346609116,
449
+ "rewards/emotion_reward_func": 0.578125,
450
+ "rewards/format_reward_func": 0.84375,
451
  "step": 34
452
  },
453
  {
454
+ "completion_length": 72.765625,
455
  "epoch": 0.04666666666666667,
456
+ "grad_norm": 6.402747631072998,
457
+ "kl": 0.10811681114137173,
458
  "learning_rate": 3.634976249348867e-07,
459
  "loss": 0.0001,
460
+ "reward": 1.28125,
461
+ "reward_std": 0.5904398858547211,
462
+ "rewards/emotion_reward_func": 0.375,
463
+ "rewards/format_reward_func": 0.90625,
464
  "step": 35
465
  },
466
  {
467
+ "completion_length": 78.734375,
468
  "epoch": 0.048,
469
+ "grad_norm": 5.202386856079102,
470
+ "kl": 0.07850308250635862,
471
  "learning_rate": 3.5644482289126813e-07,
472
  "loss": 0.0001,
473
+ "reward": 1.09375,
474
+ "reward_std": 0.5113069340586662,
475
+ "rewards/emotion_reward_func": 0.203125,
476
+ "rewards/format_reward_func": 0.890625,
477
  "step": 36
478
  },
479
  {
480
+ "completion_length": 68.953125,
481
  "epoch": 0.04933333333333333,
482
+ "grad_norm": 7.1071085929870605,
483
+ "kl": 0.08383779786527157,
484
  "learning_rate": 3.492869726586951e-07,
485
  "loss": 0.0001,
486
+ "reward": 1.1875,
487
+ "reward_std": 0.4691474586725235,
488
+ "rewards/emotion_reward_func": 0.21875,
489
+ "rewards/format_reward_func": 0.96875,
490
  "step": 37
491
  },
492
  {
493
+ "completion_length": 65.734375,
494
  "epoch": 0.050666666666666665,
495
+ "grad_norm": 6.272955417633057,
496
+ "kl": 0.06441066134721041,
497
  "learning_rate": 3.4203113817116953e-07,
498
  "loss": 0.0001,
499
+ "reward": 1.75,
500
+ "reward_std": 0.41095855832099915,
501
+ "rewards/emotion_reward_func": 0.765625,
502
+ "rewards/format_reward_func": 0.984375,
503
  "step": 38
504
  },
505
  {
506
+ "completion_length": 69.9375,
507
  "epoch": 0.052,
508
+ "grad_norm": 3.7793548107147217,
509
+ "kl": 0.06628133170306683,
510
  "learning_rate": 3.346844800613229e-07,
511
  "loss": 0.0001,
512
+ "reward": 1.078125,
513
+ "reward_std": 0.21347813308238983,
514
+ "rewards/emotion_reward_func": 0.109375,
515
+ "rewards/format_reward_func": 0.96875,
516
  "step": 39
517
  },
518
  {
519
+ "completion_length": 64.34375,
520
  "epoch": 0.05333333333333334,
521
+ "grad_norm": 6.500398635864258,
522
+ "kl": 0.06995576526969671,
523
  "learning_rate": 3.272542485937368e-07,
524
  "loss": 0.0001,
525
+ "reward": 1.546875,
526
+ "reward_std": 0.47020626068115234,
527
+ "rewards/emotion_reward_func": 0.5625,
528
+ "rewards/format_reward_func": 0.984375,
529
  "step": 40
530
  },
531
  {
532
+ "completion_length": 73.375,
533
  "epoch": 0.05466666666666667,
534
+ "grad_norm": 4.637302875518799,
535
+ "kl": 0.06445631105452776,
536
  "learning_rate": 3.1974777650980734e-07,
537
  "loss": 0.0001,
538
+ "reward": 1.578125,
539
+ "reward_std": 0.38923946768045425,
540
+ "rewards/emotion_reward_func": 0.609375,
541
+ "rewards/format_reward_func": 0.96875,
542
  "step": 41
543
  },
544
  {
545
+ "completion_length": 69.875,
546
  "epoch": 0.056,
547
+ "grad_norm": 5.5001220703125,
548
+ "kl": 0.06374164298176765,
549
  "learning_rate": 3.121724717912138e-07,
550
  "loss": 0.0001,
551
+ "reward": 1.671875,
552
+ "reward_std": 0.18616947531700134,
553
+ "rewards/emotion_reward_func": 0.671875,
554
+ "rewards/format_reward_func": 1.0,
555
  "step": 42
556
  },
557
  {
558
+ "completion_length": 65.75,
559
  "epoch": 0.05733333333333333,
560
+ "grad_norm": 5.561408996582031,
561
+ "kl": 0.11185399815440178,
562
  "learning_rate": 3.0453581034913565e-07,
563
  "loss": 0.0001,
564
+ "reward": 1.125,
565
+ "reward_std": 0.4263191595673561,
566
+ "rewards/emotion_reward_func": 0.1875,
567
+ "rewards/format_reward_func": 0.9375,
568
  "step": 43
569
  },
570
  {
571
+ "completion_length": 57.59375,
572
  "epoch": 0.058666666666666666,
573
+ "grad_norm": 6.903532028198242,
574
+ "kl": 0.08089348301291466,
575
  "learning_rate": 2.968453286464312e-07,
576
  "loss": 0.0001,
577
+ "reward": 1.75,
578
+ "reward_std": 0.3838024437427521,
579
+ "rewards/emotion_reward_func": 0.765625,
580
+ "rewards/format_reward_func": 0.984375,
581
  "step": 44
582
  },
583
  {
584
+ "completion_length": 81.0625,
585
  "epoch": 0.06,
586
+ "grad_norm": 5.9377288818359375,
587
+ "kl": 0.06169276125729084,
588
  "learning_rate": 2.8910861626005773e-07,
589
  "loss": 0.0001,
590
+ "reward": 1.375,
591
+ "reward_std": 0.5583916157484055,
592
+ "rewards/emotion_reward_func": 0.421875,
593
+ "rewards/format_reward_func": 0.953125,
594
  "step": 45
595
  },
596
  {
597
+ "completion_length": 64.09375,
598
  "epoch": 0.06133333333333333,
599
+ "grad_norm": 6.421321392059326,
600
+ "kl": 0.08067317306995392,
601
  "learning_rate": 2.8133330839107604e-07,
602
  "loss": 0.0001,
603
+ "reward": 1.484375,
604
+ "reward_std": 0.6873095482587814,
605
+ "rewards/emotion_reward_func": 0.59375,
606
+ "rewards/format_reward_func": 0.890625,
607
  "step": 46
608
  },
609
  {
610
+ "completion_length": 65.859375,
611
  "epoch": 0.06266666666666666,
612
+ "grad_norm": 5.465454578399658,
613
+ "kl": 0.08742601610720158,
614
  "learning_rate": 2.735270783296286e-07,
615
  "loss": 0.0001,
616
+ "reward": 1.421875,
617
+ "reward_std": 0.37412673234939575,
618
+ "rewards/emotion_reward_func": 0.4375,
619
+ "rewards/format_reward_func": 0.984375,
620
  "step": 47
621
  },
622
  {
623
+ "completion_length": 61.734375,
624
  "epoch": 0.064,
625
+ "grad_norm": 6.597202777862549,
626
+ "kl": 0.09020566754043102,
627
  "learning_rate": 2.6569762988232837e-07,
628
  "loss": 0.0001,
629
+ "reward": 1.203125,
630
+ "reward_std": 0.4003961533308029,
631
+ "rewards/emotion_reward_func": 0.265625,
632
+ "rewards/format_reward_func": 0.9375,
633
  "step": 48
634
  },
635
  {
636
+ "completion_length": 63.21875,
637
  "epoch": 0.06533333333333333,
638
+ "grad_norm": 5.847234725952148,
639
+ "kl": 0.09095379617065191,
640
  "learning_rate": 2.5785268976953204e-07,
641
  "loss": 0.0001,
642
+ "reward": 1.375,
643
+ "reward_std": 0.3876233473420143,
644
+ "rewards/emotion_reward_func": 0.390625,
645
+ "rewards/format_reward_func": 0.984375,
646
  "step": 49
647
  },
648
  {
649
+ "completion_length": 59.890625,
650
  "epoch": 0.06666666666666667,
651
+ "grad_norm": 7.6002936363220215,
652
+ "kl": 0.08561510033905506,
653
  "learning_rate": 2.5e-07,
654
  "loss": 0.0001,
655
+ "reward": 1.25,
656
+ "reward_std": 0.3925696462392807,
657
+ "rewards/emotion_reward_func": 0.28125,
658
+ "rewards/format_reward_func": 0.96875,
659
  "step": 50
660
  },
661
  {
662
+ "completion_length": 57.890625,
663
  "epoch": 0.068,
664
+ "grad_norm": 6.88557243347168,
665
+ "kl": 0.08306531235575676,
666
  "learning_rate": 2.4214731023046794e-07,
667
  "loss": 0.0001,
668
+ "reward": 1.453125,
669
+ "reward_std": 0.5443578287959099,
670
+ "rewards/emotion_reward_func": 0.515625,
671
+ "rewards/format_reward_func": 0.9375,
672
  "step": 51
673
  },
674
  {
675
+ "completion_length": 64.234375,
676
  "epoch": 0.06933333333333333,
677
+ "grad_norm": 6.265974044799805,
678
+ "kl": 0.09051466174423695,
679
  "learning_rate": 2.3430237011767164e-07,
680
  "loss": 0.0001,
681
+ "reward": 1.484375,
682
+ "reward_std": 0.1875,
683
+ "rewards/emotion_reward_func": 0.484375,
684
+ "rewards/format_reward_func": 1.0,
685
  "step": 52
686
  },
687
  {
688
+ "completion_length": 55.609375,
689
  "epoch": 0.07066666666666667,
690
+ "grad_norm": 5.614126205444336,
691
+ "kl": 0.0768609419465065,
692
  "learning_rate": 2.264729216703714e-07,
693
  "loss": 0.0001,
694
+ "reward": 1.875,
695
+ "reward_std": 0.31027980148792267,
696
+ "rewards/emotion_reward_func": 0.890625,
697
+ "rewards/format_reward_func": 0.984375,
698
  "step": 53
699
  },
700
  {
701
+ "completion_length": 68.859375,
702
  "epoch": 0.072,
703
+ "grad_norm": 5.596243381500244,
704
+ "kl": 0.07597188651561737,
705
  "learning_rate": 2.1866669160892389e-07,
706
  "loss": 0.0001,
707
+ "reward": 1.125,
708
+ "reward_std": 0.3412870988249779,
709
+ "rewards/emotion_reward_func": 0.171875,
710
+ "rewards/format_reward_func": 0.953125,
711
  "step": 54
712
  },
713
  {
714
+ "completion_length": 51.609375,
715
  "epoch": 0.07333333333333333,
716
+ "grad_norm": 7.993938446044922,
717
+ "kl": 0.09897390753030777,
718
  "learning_rate": 2.1089138373994222e-07,
719
+ "loss": 0.0001,
720
+ "reward": 1.578125,
721
+ "reward_std": 0.47475843876600266,
722
+ "rewards/emotion_reward_func": 0.625,
723
+ "rewards/format_reward_func": 0.953125,
724
  "step": 55
725
  },
726
  {
727
+ "completion_length": 48.265625,
728
  "epoch": 0.07466666666666667,
729
+ "grad_norm": 8.550956726074219,
730
+ "kl": 0.10356861166656017,
731
  "learning_rate": 2.0315467135356878e-07,
732
  "loss": 0.0001,
733
+ "reward": 1.28125,
734
+ "reward_std": 0.3340607285499573,
735
+ "rewards/emotion_reward_func": 0.28125,
736
+ "rewards/format_reward_func": 1.0,
737
  "step": 56
738
  },
739
  {
740
+ "completion_length": 56.953125,
741
  "epoch": 0.076,
742
+ "grad_norm": 6.799314498901367,
743
+ "kl": 0.11332254763692617,
744
  "learning_rate": 1.954641896508644e-07,
745
+ "loss": 0.0001,
746
+ "reward": 1.671875,
747
+ "reward_std": 0.3965607285499573,
748
+ "rewards/emotion_reward_func": 0.6875,
749
+ "rewards/format_reward_func": 0.984375,
750
  "step": 57
751
  },
752
  {
753
+ "completion_length": 57.15625,
754
  "epoch": 0.07733333333333334,
755
+ "grad_norm": 6.939026832580566,
756
+ "kl": 0.1864840853959322,
757
  "learning_rate": 1.8782752820878633e-07,
758
+ "loss": 0.0002,
759
+ "reward": 1.53125,
760
+ "reward_std": 0.4032272547483444,
761
+ "rewards/emotion_reward_func": 0.609375,
762
+ "rewards/format_reward_func": 0.921875,
763
  "step": 58
764
  },
765
  {
766
+ "completion_length": 50.34375,
767
  "epoch": 0.07866666666666666,
768
+ "grad_norm": 7.448709487915039,
769
+ "kl": 0.15000650100409985,
770
  "learning_rate": 1.802522234901927e-07,
771
  "loss": 0.0002,
772
+ "reward": 1.453125,
773
+ "reward_std": 0.4035460501909256,
774
+ "rewards/emotion_reward_func": 0.46875,
775
+ "rewards/format_reward_func": 0.984375,
776
  "step": 59
777
  },
778
  {
779
+ "completion_length": 56.296875,
780
  "epoch": 0.08,
781
+ "grad_norm": 6.004992485046387,
782
+ "kl": 0.08795534446835518,
783
  "learning_rate": 1.7274575140626315e-07,
784
  "loss": 0.0001,
785
+ "reward": 1.53125,
786
+ "reward_std": 0.4612579345703125,
787
+ "rewards/emotion_reward_func": 0.546875,
788
+ "rewards/format_reward_func": 0.984375,
789
  "step": 60
790
  },
791
  {
792
+ "completion_length": 55.703125,
793
  "epoch": 0.08133333333333333,
794
+ "grad_norm": 6.890044212341309,
795
+ "kl": 0.09031901508569717,
796
  "learning_rate": 1.6531551993867715e-07,
797
+ "loss": 0.0001,
798
+ "reward": 1.453125,
799
+ "reward_std": 0.41576120257377625,
800
+ "rewards/emotion_reward_func": 0.484375,
801
+ "rewards/format_reward_func": 0.96875,
802
  "step": 61
803
  },
804
  {
805
+ "completion_length": 58.625,
806
  "epoch": 0.08266666666666667,
807
+ "grad_norm": 6.977153778076172,
808
+ "kl": 0.09828664548695087,
809
  "learning_rate": 1.579688618288305e-07,
810
  "loss": 0.0001,
811
+ "reward": 1.609375,
812
+ "reward_std": 0.3605812340974808,
813
+ "rewards/emotion_reward_func": 0.609375,
814
  "rewards/format_reward_func": 1.0,
815
  "step": 62
816
  },
817
  {
818
+ "completion_length": 52.421875,
819
  "epoch": 0.084,
820
+ "grad_norm": 3.109772205352783,
821
+ "kl": 0.1292141806334257,
822
  "learning_rate": 1.5071302734130486e-07,
823
+ "loss": 0.0001,
824
+ "reward": 1.65625,
825
+ "reward_std": 0.15478479862213135,
826
+ "rewards/emotion_reward_func": 0.671875,
827
+ "rewards/format_reward_func": 0.984375,
828
  "step": 63
829
  },
830
  {
831
+ "completion_length": 53.578125,
832
  "epoch": 0.08533333333333333,
833
+ "grad_norm": 7.28511905670166,
834
+ "kl": 0.1237227451056242,
835
  "learning_rate": 1.4355517710873182e-07,
836
  "loss": 0.0001,
837
+ "reward": 1.765625,
838
+ "reward_std": 0.5836244821548462,
839
+ "rewards/emotion_reward_func": 0.84375,
840
+ "rewards/format_reward_func": 0.921875,
841
  "step": 64
842
  },
843
  {
844
+ "completion_length": 54.265625,
845
  "epoch": 0.08666666666666667,
846
+ "grad_norm": 5.760799884796143,
847
+ "kl": 0.10565520823001862,
848
  "learning_rate": 1.365023750651133e-07,
849
  "loss": 0.0001,
850
+ "reward": 1.59375,
851
+ "reward_std": 0.38039760291576385,
852
+ "rewards/emotion_reward_func": 0.609375,
853
+ "rewards/format_reward_func": 0.984375,
854
  "step": 65
855
  },
856
  {
857
+ "completion_length": 55.9375,
858
  "epoch": 0.088,
859
+ "grad_norm": 7.823127269744873,
860
+ "kl": 0.09790175035595894,
861
  "learning_rate": 1.2956158147457114e-07,
862
  "loss": 0.0001,
863
+ "reward": 1.625,
864
+ "reward_std": 0.40478479862213135,
865
+ "rewards/emotion_reward_func": 0.65625,
866
+ "rewards/format_reward_func": 0.96875,
867
  "step": 66
868
  },
869
  {
870
+ "completion_length": 44.671875,
871
  "epoch": 0.08933333333333333,
872
+ "grad_norm": 4.364702224731445,
873
+ "kl": 0.11339546367526054,
874
  "learning_rate": 1.2273964606240718e-07,
875
  "loss": 0.0001,
876
+ "reward": 1.34375,
877
+ "reward_std": 0.18217839300632477,
878
+ "rewards/emotion_reward_func": 0.34375,
879
+ "rewards/format_reward_func": 1.0,
880
  "step": 67
881
  },
882
  {
883
+ "completion_length": 57.421875,
884
  "epoch": 0.09066666666666667,
885
+ "grad_norm": 5.267661094665527,
886
+ "kl": 0.10790612921118736,
887
  "learning_rate": 1.1604330125525078e-07,
888
  "loss": 0.0001,
889
+ "reward": 1.1875,
890
+ "reward_std": 0.2545653209090233,
891
+ "rewards/emotion_reward_func": 0.21875,
892
+ "rewards/format_reward_func": 0.96875,
893
  "step": 68
894
  },
895
  {
896
+ "completion_length": 48.921875,
897
  "epoch": 0.092,
898
+ "grad_norm": 5.409526824951172,
899
+ "kl": 0.11567417718470097,
900
  "learning_rate": 1.0947915553696741e-07,
901
  "loss": 0.0001,
902
+ "reward": 1.453125,
903
+ "reward_std": 0.23328250646591187,
904
+ "rewards/emotion_reward_func": 0.453125,
905
+ "rewards/format_reward_func": 1.0,
906
  "step": 69
907
  },
908
  {
909
+ "completion_length": 55.828125,
910
  "epoch": 0.09333333333333334,
911
+ "grad_norm": 7.669867992401123,
912
+ "kl": 3.9372363202273846,
913
  "learning_rate": 1.0305368692688174e-07,
914
+ "loss": 0.0039,
915
+ "reward": 1.34375,
916
+ "reward_std": 0.3306438848376274,
917
+ "rewards/emotion_reward_func": 0.359375,
918
+ "rewards/format_reward_func": 0.984375,
919
  "step": 70
920
  },
921
  {
922
+ "completion_length": 43.125,
923
  "epoch": 0.09466666666666666,
924
+ "grad_norm": 8.619794845581055,
925
+ "kl": 0.13703159242868423,
926
  "learning_rate": 9.677323658675593e-08,
927
  "loss": 0.0001,
928
+ "reward": 1.5625,
929
+ "reward_std": 0.3234764039516449,
930
+ "rewards/emotion_reward_func": 0.578125,
931
+ "rewards/format_reward_func": 0.984375,
932
  "step": 71
933
  },
934
  {
935
+ "completion_length": 47.484375,
936
  "epoch": 0.096,
937
+ "grad_norm": 5.2266740798950195,
938
+ "kl": 0.1516919508576393,
939
  "learning_rate": 9.064400256282755e-08,
940
+ "loss": 0.0002,
941
+ "reward": 1.296875,
942
+ "reward_std": 0.34940721094608307,
943
+ "rewards/emotion_reward_func": 0.3125,
944
+ "rewards/format_reward_func": 0.984375,
945
  "step": 72
946
  },
947
  {
948
+ "completion_length": 60.5,
949
  "epoch": 0.09733333333333333,
950
+ "grad_norm": 6.890865325927734,
951
+ "kl": 0.10698151029646397,
952
  "learning_rate": 8.467203366908707e-08,
953
  "loss": 0.0001,
954
+ "reward": 1.375,
955
+ "reward_std": 0.3380199372768402,
956
+ "rewards/emotion_reward_func": 0.40625,
957
+ "rewards/format_reward_func": 0.96875,
958
  "step": 73
959
  },
960
  {
961
+ "completion_length": 45.8125,
962
  "epoch": 0.09866666666666667,
963
+ "grad_norm": 4.76033878326416,
964
+ "kl": 0.2001918088644743,
965
  "learning_rate": 7.886322351782782e-08,
966
+ "loss": 0.0002,
967
+ "reward": 1.6875,
968
+ "reward_std": 0.26933756470680237,
969
+ "rewards/emotion_reward_func": 0.703125,
970
+ "rewards/format_reward_func": 0.984375,
971
  "step": 74
972
  },
973
  {
974
+ "completion_length": 53.640625,
975
  "epoch": 0.1,
976
+ "grad_norm": 7.104291915893555,
977
+ "kl": 0.09334802068769932,
978
  "learning_rate": 7.322330470336313e-08,
979
  "loss": 0.0001,
980
+ "reward": 1.453125,
981
+ "reward_std": 0.3290881961584091,
982
+ "rewards/emotion_reward_func": 0.484375,
983
+ "rewards/format_reward_func": 0.96875,
984
  "step": 75
985
  },
986
  {
987
+ "completion_length": 45.109375,
988
  "epoch": 0.10133333333333333,
989
+ "grad_norm": 10.217005729675293,
990
+ "kl": 0.12721979059278965,
991
  "learning_rate": 6.775784314464716e-08,
992
  "loss": 0.0001,
993
+ "reward": 1.4375,
994
+ "reward_std": 0.3354101926088333,
995
+ "rewards/emotion_reward_func": 0.4375,
996
+ "rewards/format_reward_func": 1.0,
997
  "step": 76
998
  },
999
  {
1000
+ "completion_length": 49.390625,
1001
  "epoch": 0.10266666666666667,
1002
+ "grad_norm": 5.090665817260742,
1003
+ "kl": 0.1952703520655632,
1004
  "learning_rate": 6.24722325923851e-08,
1005
  "loss": 0.0002,
1006
+ "reward": 1.5625,
1007
+ "reward_std": 0.21039125323295593,
1008
+ "rewards/emotion_reward_func": 0.5625,
1009
  "rewards/format_reward_func": 1.0,
1010
  "step": 77
1011
  },
1012
  {
1013
+ "completion_length": 40.953125,
1014
  "epoch": 0.104,
1015
+ "grad_norm": 10.18713665008545,
1016
+ "kl": 0.1544223688542843,
1017
  "learning_rate": 5.737168930605271e-08,
1018
  "loss": 0.0002,
1019
+ "reward": 1.609375,
1020
+ "reward_std": 0.44701361656188965,
1021
+ "rewards/emotion_reward_func": 0.640625,
1022
  "rewards/format_reward_func": 0.96875,
1023
  "step": 78
1024
  },
1025
  {
1026
+ "completion_length": 45.84375,
1027
  "epoch": 0.10533333333333333,
1028
+ "grad_norm": 6.143667221069336,
1029
+ "kl": 0.14481117203831673,
1030
  "learning_rate": 5.246124690607739e-08,
1031
+ "loss": 0.0001,
1032
+ "reward": 1.515625,
1033
+ "reward_std": 0.2715607285499573,
1034
+ "rewards/emotion_reward_func": 0.515625,
1035
+ "rewards/format_reward_func": 1.0,
1036
  "step": 79
1037
  },
1038
  {
1039
+ "completion_length": 46.59375,
1040
  "epoch": 0.10666666666666667,
1041
+ "grad_norm": 5.600602149963379,
1042
+ "kl": 0.10117345489561558,
1043
  "learning_rate": 4.774575140626316e-08,
1044
+ "loss": 0.0001,
1045
+ "reward": 1.890625,
1046
+ "reward_std": 0.1280868798494339,
1047
+ "rewards/emotion_reward_func": 0.890625,
1048
+ "rewards/format_reward_func": 1.0,
1049
  "step": 80
1050
  },
1051
  {
1052
+ "completion_length": 55.265625,
1053
  "epoch": 0.108,
1054
+ "grad_norm": 8.253979682922363,
1055
+ "kl": 0.11770397983491421,
1056
  "learning_rate": 4.3229856431359513e-08,
1057
+ "loss": 0.0001,
1058
+ "reward": 1.515625,
1059
+ "reward_std": 0.5620162785053253,
1060
+ "rewards/emotion_reward_func": 0.5625,
1061
+ "rewards/format_reward_func": 0.953125,
1062
  "step": 81
1063
  },
1064
  {
1065
+ "completion_length": 55.34375,
1066
  "epoch": 0.10933333333333334,
1067
+ "grad_norm": 7.1831135749816895,
1068
+ "kl": 0.1115355659276247,
1069
  "learning_rate": 3.8918018624496286e-08,
1070
  "loss": 0.0001,
1071
+ "reward": 1.375,
1072
+ "reward_std": 0.3838024437427521,
1073
+ "rewards/emotion_reward_func": 0.390625,
1074
+ "rewards/format_reward_func": 0.984375,
1075
  "step": 82
1076
  },
1077
  {
1078
+ "completion_length": 48.40625,
1079
  "epoch": 0.11066666666666666,
1080
+ "grad_norm": 7.065600395202637,
1081
+ "kl": 0.14161021262407303,
1082
  "learning_rate": 3.481449324901411e-08,
1083
  "loss": 0.0001,
1084
+ "reward": 1.515625,
1085
+ "reward_std": 0.3940916210412979,
1086
+ "rewards/emotion_reward_func": 0.546875,
1087
  "rewards/format_reward_func": 0.96875,
1088
  "step": 83
1089
  },
1090
  {
1091
+ "completion_length": 52.5625,
1092
  "epoch": 0.112,
1093
+ "grad_norm": 7.827910423278809,
1094
+ "kl": 0.16811930015683174,
1095
  "learning_rate": 3.092332998903416e-08,
1096
+ "loss": 0.0002,
1097
+ "reward": 1.5,
1098
+ "reward_std": 0.5367372632026672,
1099
+ "rewards/emotion_reward_func": 0.546875,
1100
+ "rewards/format_reward_func": 0.953125,
1101
  "step": 84
1102
  },
1103
  {
1104
+ "completion_length": 51.328125,
1105
  "epoch": 0.11333333333333333,
1106
+ "grad_norm": 4.83447265625,
1107
+ "kl": 0.11310878023505211,
1108
  "learning_rate": 2.724836895290805e-08,
1109
  "loss": 0.0001,
1110
+ "reward": 1.640625,
1111
+ "reward_std": 0.1280868798494339,
1112
+ "rewards/emotion_reward_func": 0.640625,
1113
  "rewards/format_reward_func": 1.0,
1114
  "step": 85
1115
  },
1116
  {
1117
+ "completion_length": 47.578125,
1118
  "epoch": 0.11466666666666667,
1119
+ "grad_norm": 5.901933670043945,
1120
+ "kl": 0.1336957048624754,
1121
  "learning_rate": 2.379323688349516e-08,
1122
  "loss": 0.0001,
1123
+ "reward": 1.375,
1124
+ "reward_std": 0.21039125323295593,
1125
+ "rewards/emotion_reward_func": 0.375,
1126
  "rewards/format_reward_func": 1.0,
1127
  "step": 86
1128
  },
1129
  {
1130
+ "completion_length": 50.53125,
1131
  "epoch": 0.116,
1132
+ "grad_norm": 5.392825603485107,
1133
+ "kl": 0.1246971283107996,
1134
  "learning_rate": 2.0561343579004715e-08,
1135
  "loss": 0.0001,
1136
+ "reward": 1.6875,
1137
+ "reward_std": 0.3024514466524124,
1138
+ "rewards/emotion_reward_func": 0.71875,
1139
+ "rewards/format_reward_func": 0.96875,
1140
  "step": 87
1141
  },
1142
  {
1143
+ "completion_length": 52.515625,
1144
  "epoch": 0.11733333333333333,
1145
+ "grad_norm": 7.34331750869751,
1146
+ "kl": 0.11970937624573708,
1147
  "learning_rate": 1.7555878527937163e-08,
1148
+ "loss": 0.0001,
1149
+ "reward": 1.34375,
1150
+ "reward_std": 0.40886765718460083,
1151
+ "rewards/emotion_reward_func": 0.359375,
1152
+ "rewards/format_reward_func": 0.984375,
1153
  "step": 88
1154
  },
1155
  {
1156
+ "completion_length": 58.03125,
1157
  "epoch": 0.11866666666666667,
1158
+ "grad_norm": 4.543256759643555,
1159
+ "kl": 0.12226046249270439,
1160
  "learning_rate": 1.4779807761443635e-08,
1161
  "loss": 0.0001,
1162
+ "reward": 1.546875,
1163
+ "reward_std": 0.3359370082616806,
1164
+ "rewards/emotion_reward_func": 0.578125,
1165
+ "rewards/format_reward_func": 0.96875,
1166
  "step": 89
1167
  },
1168
  {
1169
+ "completion_length": 52.265625,
1170
  "epoch": 0.12,
1171
+ "grad_norm": 7.780374050140381,
1172
+ "kl": 0.1240895576775074,
1173
  "learning_rate": 1.2235870926211616e-08,
1174
  "loss": 0.0001,
1175
+ "reward": 1.59375,
1176
+ "reward_std": 0.25,
1177
+ "rewards/emotion_reward_func": 0.59375,
1178
+ "rewards/format_reward_func": 1.0,
1179
  "step": 90
1180
  },
1181
  {
1182
+ "completion_length": 52.5,
1183
  "epoch": 0.12133333333333333,
1184
+ "grad_norm": 3.1433615684509277,
1185
+ "kl": 0.10472103767096996,
1186
  "learning_rate": 9.926578580764234e-09,
1187
  "loss": 0.0001,
1188
+ "reward": 1.734375,
1189
+ "reward_std": 0.0625,
1190
+ "rewards/emotion_reward_func": 0.734375,
1191
+ "rewards/format_reward_func": 1.0,
1192
  "step": 91
1193
  },
1194
  {
1195
+ "completion_length": 50.921875,
1196
  "epoch": 0.12266666666666666,
1197
+ "grad_norm": 7.484493732452393,
1198
+ "kl": 0.10997971147298813,
1199
  "learning_rate": 7.85420971784223e-09,
1200
  "loss": 0.0001,
1201
+ "reward": 1.640625,
1202
+ "reward_std": 0.2829566150903702,
1203
+ "rewards/emotion_reward_func": 0.640625,
1204
+ "rewards/format_reward_func": 1.0,
1205
  "step": 92
1206
  },
1207
  {
1208
+ "completion_length": 52.015625,
1209
  "epoch": 0.124,
1210
+ "grad_norm": 8.367355346679688,
1211
+ "kl": 0.1366768404841423,
1212
  "learning_rate": 6.020809515313141e-09,
1213
  "loss": 0.0001,
1214
+ "reward": 1.28125,
1215
+ "reward_std": 0.3538651019334793,
1216
+ "rewards/emotion_reward_func": 0.28125,
1217
+ "rewards/format_reward_func": 1.0,
1218
  "step": 93
1219
  },
1220
  {
1221
+ "completion_length": 45.796875,
1222
  "epoch": 0.12533333333333332,
1223
+ "grad_norm": 6.152498245239258,
1224
+ "kl": 0.1532982587814331,
1225
  "learning_rate": 4.4281873178278475e-09,
1226
  "loss": 0.0002,
1227
+ "reward": 1.640625,
1228
+ "reward_std": 0.414583221077919,
1229
+ "rewards/emotion_reward_func": 0.671875,
1230
+ "rewards/format_reward_func": 0.96875,
1231
  "step": 94
1232
  },
1233
  {
1234
+ "completion_length": 50.25,
1235
  "epoch": 0.12666666666666668,
1236
+ "grad_norm": 0.020403465256094933,
1237
+ "kl": 0.11631555296480656,
1238
  "learning_rate": 3.077914851215585e-09,
1239
+ "loss": 0.0001,
1240
+ "reward": 1.5,
1241
+ "reward_std": 0.0,
1242
+ "rewards/emotion_reward_func": 0.5,
1243
+ "rewards/format_reward_func": 1.0,
1244
  "step": 95
1245
  },
1246
  {
1247
+ "completion_length": 53.671875,
1248
  "epoch": 0.128,
1249
+ "grad_norm": 6.355586528778076,
1250
+ "kl": 0.11114749684929848,
1251
  "learning_rate": 1.9713246713805587e-09,
1252
  "loss": 0.0001,
1253
+ "reward": 1.578125,
1254
+ "reward_std": 0.33949775248765945,
1255
+ "rewards/emotion_reward_func": 0.59375,
1256
+ "rewards/format_reward_func": 0.984375,
1257
  "step": 96
1258
  },
1259
  {
1260
+ "completion_length": 51.921875,
1261
  "epoch": 0.12933333333333333,
1262
+ "grad_norm": 7.896821022033691,
1263
+ "kl": 0.14222519844770432,
1264
  "learning_rate": 1.1095088492300008e-09,
1265
+ "loss": 0.0001,
1266
+ "reward": 1.625,
1267
+ "reward_std": 0.1905868798494339,
1268
+ "rewards/emotion_reward_func": 0.625,
1269
+ "rewards/format_reward_func": 1.0,
1270
  "step": 97
1271
  },
1272
  {
1273
+ "completion_length": 52.90625,
1274
  "epoch": 0.13066666666666665,
1275
+ "grad_norm": 8.117393493652344,
1276
+ "kl": 0.29374901205301285,
1277
  "learning_rate": 4.933178929321102e-10,
1278
+ "loss": 0.0003,
1279
+ "reward": 1.65625,
1280
+ "reward_std": 0.5836337506771088,
1281
+ "rewards/emotion_reward_func": 0.71875,
1282
+ "rewards/format_reward_func": 0.9375,
1283
  "step": 98
1284
  },
1285
  {
1286
+ "completion_length": 49.234375,
1287
  "epoch": 0.132,
1288
+ "grad_norm": 4.081859111785889,
1289
+ "kl": 0.12134127877652645,
1290
  "learning_rate": 1.2335990856709998e-10,
1291
  "loss": 0.0001,
1292
+ "reward": 1.75,
1293
+ "reward_std": 0.2367546260356903,
1294
+ "rewards/emotion_reward_func": 0.765625,
1295
+ "rewards/format_reward_func": 0.984375,
1296
  "step": 99
1297
  },
1298
  {
1299
+ "completion_length": 51.25,
1300
  "epoch": 0.13333333333333333,
1301
+ "grad_norm": 6.719541072845459,
1302
+ "kl": 0.1454935073852539,
1303
  "learning_rate": 0.0,
1304
  "loss": 0.0001,
1305
+ "reward": 1.609375,
1306
+ "reward_std": 0.36967839300632477,
1307
+ "rewards/emotion_reward_func": 0.640625,
1308
+ "rewards/format_reward_func": 0.96875,
1309
  "step": 100
1310
  }
1311
  ],
 
1327
  }
1328
  },
1329
  "total_flos": 0.0,
1330
+ "train_batch_size": 4,
1331
  "trial_name": null,
1332
  "trial_params": null
1333
  }
checkpoint-100/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:44028d40b3841ea820da6e7b4c46c26072bff1fc53ba336bed63b0030abe9bdd
3
- size 5560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc6e728504ceeed070362223f2228cf2aa1bf386361fcc0c0d4d877e62196c58
3
+ size 5752