File size: 25,073 Bytes
ac1ec4d
 
20261c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac1ec4d
20261c8
 
 
 
 
 
 
 
 
 
9f5b8d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b71ffb9
 
 
 
1280a74
 
 
 
 
 
 
 
 
20261c8
 
 
 
 
 
 
b71ffb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc63ed3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20261c8
 
 
 
 
 
 
 
ef05d5a
 
 
1280a74
ef05d5a
 
20261c8
 
 
 
 
 
 
1280a74
20261c8
 
 
1280a74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20261c8
18f918f
 
 
 
ef18151
 
 
 
91d88a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c65045f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91d88a3
1f19c01
988877d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1aece92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
988877d
 
1f19c01
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9a4f571
 
 
 
 
 
54daf0f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9a4f571
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
---

license: mit
pipeline_tag: text-generation
library_name: transformers
language: [
    'en', 'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'br', 'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el',
    'eo', 'es', 'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gn', 'gu', 'ha', 'he',
    'hi', 'hr', 'ht', 'hu', 'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km', 'kn', 'ko',
    'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt', 'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my',
    'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt', 'qu', 'rm', 'ro', 'ru', 'sa', 'si',
    'sc', 'sd', 'sk', 'sl', 'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'te', 'th', 'tl', 'tn',
    'tr', 'ug', 'uk', 'ur', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo', 'zu',
]
datasets:
# core - base
- ontocord/fineweb-permissive-multilingual-2m
- distily/c4_multilingual_1M
- data-silence/sumnews
- xu-song/cc100-samples
- badrex/llm-emoji-dataset
- fblgit/simple-math
- Gusarich/math-expressions-1m
- neuralwork/arxiver
- christopher/rosetta-code
- nampdn-ai/tiny-codes
- JeanKaddour/minipile
# core - instruct
- NousResearch/hermes-function-calling-v1
- simplescaling/s1K-1.1
# base - instruct
- mlabonne/open-perfectblend
- allenai/tulu-3-sft-mixture
- rombodawg/Everything_Instruct_Multilingual
# base - reason
- open-r1/OpenR1-Math-220k
- open-thoughts/OpenThoughts-114k
- cognitivecomputations/dolphin-r1
- simplescaling/s1K-1.1
tags:
- chat
- core
- base
- instruct
- reason
---


# tangled-alpha-0.9-core

![logo](./misc/logo.jpg)

```bash

time python -B prepare_core_datasets.py

```

```

i=0, min_len=0, max_len=1073741824, block_size=1025, chunk_size=16400000, len(dataset)=5146620, len(dataset) * block_size=5275285500

Total number of tokens in the optimized dataset '../core-data-0-0-1073741824-1025-16000' is 5275285500



i=1, min_len=1025, max_len=2049, block_size=2049, chunk_size=16392000, len(dataset)=309838, len(dataset) * block_size=634858062

Total number of tokens in the optimized dataset '../core-data-1-1025-2049-2049-8000' is 634858062



i=2, min_len=2049, max_len=4097, block_size=4097, chunk_size=16388000, len(dataset)=113843, len(dataset) * block_size=466414771

Total number of tokens in the optimized dataset '../core-data-2-2049-4097-4097-4000' is 466414771



i=3, min_len=4097, max_len=8193, block_size=8193, chunk_size=16386000, len(dataset)=56713, len(dataset) * block_size=464649609

Total number of tokens in the optimized dataset '../core-data-3-4097-8193-8193-2000' is 464649609



i=4, min_len=8193, max_len=16385, block_size=16385, chunk_size=16385000, len(dataset)=37406, len(dataset) * block_size=612897310

Total number of tokens in the optimized dataset '../core-data-4-8193-16385-16385-1000' is 612897310



i=5, min_len=16385, max_len=32769, block_size=32769, chunk_size=16384500, len(dataset)=12737, len(dataset) * block_size=417378753

Total number of tokens in the optimized dataset '../core-data-5-16385-32769-32769-500' is 417378753



i=6, min_len=32769, max_len=65537, block_size=65537, chunk_size=16384250, len(dataset)=2824, len(dataset) * block_size=185076488

Total number of tokens in the optimized dataset '../core-data-6-32769-65537-65537-250' is 185076488



i=7, min_len=65537, max_len=131073, block_size=131073, chunk_size=16384125, len(dataset)=634, len(dataset) * block_size=83100282

Total number of tokens in the optimized dataset '../core-data-7-65537-131073-131073-125' is 83100282



real    292m54.341s

user    2118m1.154s

sys     12m2.746s



20G     tangled-alpha-0.9-core/core-data-0-0-1073741824-1025-16000

2.4G    tangled-alpha-0.9-core/core-data-1-1025-2049-2049-8000

1.8G    tangled-alpha-0.9-core/core-data-2-2049-4097-4097-4000

1.8G    tangled-alpha-0.9-core/core-data-3-4097-8193-8193-2000

2.3G    tangled-alpha-0.9-core/core-data-4-8193-16385-16385-1000

1.6G    tangled-alpha-0.9-core/core-data-5-16385-32769-32769-500

709M    tangled-alpha-0.9-core/core-data-6-32769-65537-65537-250

321M    tangled-alpha-0.9-core/core-data-7-65537-131073-131073-125

```

```bash

CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain_core_model_0.yaml

```

```

Seed set to 23

Time to instantiate model: 0.44 seconds.

Total parameters: 234,914,304

Verifying settings ...

Measured TFLOPs: 55520.94

Epoch 1 | iter 64 step 1 | loss train: 11.977, val: n/a | iter time: 490.27 ms (step) remaining time: 6 days, 22:47:04

Epoch 1 | iter 128 step 2 | loss train: 11.970, val: n/a | iter time: 351.11 ms (step) remaining time: 4 days, 16:53:01

Epoch 1 | iter 192 step 3 | loss train: 11.971, val: n/a | iter time: 353.74 ms (step) remaining time: 3 days, 23:43:23

Epoch 1 | iter 256 step 4 | loss train: 11.974, val: n/a | iter time: 355.03 ms (step) remaining time: 3 days, 14:41:57

Epoch 1 | iter 320 step 5 | loss train: 11.964, val: n/a | iter time: 357.36 ms (step) remaining time: 3 days, 9:21:54

Epoch 1 | iter 384 step 6 | loss train: 11.957, val: n/a | iter time: 362.27 ms (step) remaining time: 3 days, 5:53:20

Epoch 1 | iter 448 step 7 | loss train: 11.948, val: n/a | iter time: 359.89 ms (step) remaining time: 3 days, 3:26:34

Epoch 1 | iter 512 step 8 | loss train: 11.938, val: n/a | iter time: 363.84 ms (step) remaining time: 3 days, 1:37:54

Epoch 1 | iter 576 step 9 | loss train: 11.920, val: n/a | iter time: 362.75 ms (step) remaining time: 3 days, 0:13:59

Epoch 1 | iter 640 step 10 | loss train: 11.900, val: n/a | iter time: 363.46 ms (step) remaining time: 2 days, 23:07:06

# ...

Epoch 1 | iter 643264 step 10051 | loss train: 2.834, val: 2.669 | iter time: 360.50 ms (step) remaining time: 0:03:59

Epoch 2 | iter 643328 step 10052 | loss train: 2.837, val: 2.669 | iter time: 359.53 ms (step) remaining time: 0:03:37

Epoch 2 | iter 643392 step 10053 | loss train: 2.768, val: 2.669 | iter time: 362.83 ms (step) remaining time: 0:03:15

Epoch 2 | iter 643456 step 10054 | loss train: 2.695, val: 2.669 | iter time: 363.85 ms (step) remaining time: 0:02:53

Epoch 2 | iter 643520 step 10055 | loss train: 2.768, val: 2.669 | iter time: 365.40 ms (step) remaining time: 0:02:30

Epoch 2 | iter 643584 step 10056 | loss train: 2.710, val: 2.669 | iter time: 364.72 ms (step) remaining time: 0:02:08

Epoch 2 | iter 643648 step 10057 | loss train: 2.749, val: 2.669 | iter time: 365.00 ms (step) remaining time: 0:01:46

Epoch 2 | iter 643712 step 10058 | loss train: 2.748, val: 2.669 | iter time: 363.42 ms (step) remaining time: 0:01:24

Epoch 2 | iter 643776 step 10059 | loss train: 2.710, val: 2.669 | iter time: 364.49 ms (step) remaining time: 0:01:02

Epoch 2 | iter 643840 step 10060 | loss train: 2.738, val: 2.669 | iter time: 364.43 ms (step) remaining time: 0:00:39

Epoch 2 | iter 643904 step 10061 | loss train: 2.734, val: 2.669 | iter time: 364.94 ms (step) remaining time: 0:00:17

Validating ...

Final evaluation | val loss: 2.669 | val ppl: 14.422

Saving checkpoint to '../out/pretrain-core-0/final/lit_model.pth'

----------------------------------------

| Performance

| - Total tokens  : 5,275,279,360

| - Training Time : 223314.37 s

| - Tok/sec       : 5541.09 tok/s

| ----------------------------------------

| Memory Usage

| - Memory Used   : 22.33 GB

----------------------------------------

```

Back up `wandb`:

```bash

mv wandb wandb-pretrain-core-0

```

Copy config:

```bash

cp ../config-0.json ../out/pretrain-core-0/final/config.json

```

Chat with model:

```bash

CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat ../out/pretrain-core-0/final

```

```bash

CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True time litgpt evaluate --tasks 'leaderboard' --out_dir '../evaluate/pretrain-core-0/leaderboard/' --batch_size '4' --dtype 'bfloat16' '../out/pretrain-core-0/final'

```

```

|                           Tasks                           |Version|Filter|n-shot|        Metric         |   |Value |   |Stderr|

|-----------------------------------------------------------|-------|------|-----:|-----------------------|---|-----:|---|------|

|leaderboard                                                |    N/A|      |      |                       |   |      |   |      |

| - leaderboard_bbh                                         |    N/A|      |      |                       |   |      |   |      |

|  - leaderboard_bbh_boolean_expressions                    |      1|none  |     3|acc_norm               |↑  |0.4600|±  |0.0316|

|  - leaderboard_bbh_causal_judgement                       |      1|none  |     3|acc_norm               |↑  |0.5134|±  |0.0366|

|  - leaderboard_bbh_date_understanding                     |      1|none  |     3|acc_norm               |↑  |0.1960|±  |0.0252|

|  - leaderboard_bbh_disambiguation_qa                      |      1|none  |     3|acc_norm               |↑  |0.3320|±  |0.0298|

|  - leaderboard_bbh_formal_fallacies                       |      1|none  |     3|acc_norm               |↑  |0.4680|±  |0.0316|

|  - leaderboard_bbh_geometric_shapes                       |      1|none  |     3|acc_norm               |↑  |0.2400|±  |0.0271|

|  - leaderboard_bbh_hyperbaton                             |      1|none  |     3|acc_norm               |↑  |0.5160|±  |0.0317|

|  - leaderboard_bbh_logical_deduction_five_objects         |      1|none  |     3|acc_norm               |↑  |0.2040|±  |0.0255|

|  - leaderboard_bbh_logical_deduction_seven_objects        |      1|none  |     3|acc_norm               |↑  |0.1320|±  |0.0215|

|  - leaderboard_bbh_logical_deduction_three_objects        |      1|none  |     3|acc_norm               |↑  |0.3440|±  |0.0301|

|  - leaderboard_bbh_movie_recommendation                   |      1|none  |     3|acc_norm               |↑  |0.2680|±  |0.0281|

|  - leaderboard_bbh_navigate                               |      1|none  |     3|acc_norm               |↑  |0.5720|±  |0.0314|

|  - leaderboard_bbh_object_counting                        |      1|none  |     3|acc_norm               |↑  |0.0680|±  |0.0160|

|  - leaderboard_bbh_penguins_in_a_table                    |      1|none  |     3|acc_norm               |↑  |0.2055|±  |0.0336|

|  - leaderboard_bbh_reasoning_about_colored_objects        |      1|none  |     3|acc_norm               |↑  |0.1760|±  |0.0241|

|  - leaderboard_bbh_ruin_names                             |      1|none  |     3|acc_norm               |↑  |0.2120|±  |0.0259|

|  - leaderboard_bbh_salient_translation_error_detection    |      1|none  |     3|acc_norm               |↑  |0.2240|±  |0.0264|

|  - leaderboard_bbh_snarks                                 |      1|none  |     3|acc_norm               |↑  |0.5393|±  |0.0375|

|  - leaderboard_bbh_sports_understanding                   |      1|none  |     3|acc_norm               |↑  |0.4600|±  |0.0316|

|  - leaderboard_bbh_temporal_sequences                     |      1|none  |     3|acc_norm               |↑  |0.2760|±  |0.0283|

|  - leaderboard_bbh_tracking_shuffled_objects_five_objects |      1|none  |     3|acc_norm               |↑  |0.1720|±  |0.0239|

|  - leaderboard_bbh_tracking_shuffled_objects_seven_objects|      1|none  |     3|acc_norm               |↑  |0.1360|±  |0.0217|

|  - leaderboard_bbh_tracking_shuffled_objects_three_objects|      1|none  |     3|acc_norm               |↑  |0.3320|±  |0.0298|

|  - leaderboard_bbh_web_of_lies                            |      1|none  |     3|acc_norm               |↑  |0.4880|±  |0.0317|

| - leaderboard_gpqa                                        |    N/A|      |      |                       |   |      |   |      |

|  - leaderboard_gpqa_diamond                               |      1|none  |     0|acc_norm               |↑  |0.2071|±  |0.0289|

|  - leaderboard_gpqa_extended                              |      1|none  |     0|acc_norm               |↑  |0.2637|±  |0.0189|

|  - leaderboard_gpqa_main                                  |      1|none  |     0|acc_norm               |↑  |0.2612|±  |0.0208|

| - leaderboard_ifeval                                      |      3|none  |     0|inst_level_loose_acc   |↑  |0.2770|±  |   N/A|

|                                                           |       |none  |     0|inst_level_strict_acc  |↑  |0.2710|±  |   N/A|

|                                                           |       |none  |     0|prompt_level_loose_acc |↑  |0.1534|±  |0.0155|

|                                                           |       |none  |     0|prompt_level_strict_acc|↑  |0.1497|±  |0.0154|

| - leaderboard_math_hard                                   |    N/A|      |      |                       |   |      |   |      |

|  - leaderboard_math_algebra_hard                          |      2|none  |     4|exact_match            |↑  |0.0017|±  |0.0012|

|  - leaderboard_math_counting_and_prob_hard                |      2|none  |     4|exact_match            |↑  |0.0000|±  |     0|

|  - leaderboard_math_geometry_hard                         |      2|none  |     4|exact_match            |↑  |0.0000|±  |     0|

|  - leaderboard_math_intermediate_algebra_hard             |      2|none  |     4|exact_match            |↑  |0.0033|±  |0.0019|

|  - leaderboard_math_num_theory_hard                       |      2|none  |     4|exact_match            |↑  |0.0037|±  |0.0026|

|  - leaderboard_math_prealgebra_hard                       |      2|none  |     4|exact_match            |↑  |0.0046|±  |0.0023|

|  - leaderboard_math_precalculus_hard                      |      2|none  |     4|exact_match            |↑  |0.0000|±  |     0|

| - leaderboard_mmlu_pro                                    |    0.1|none  |     5|acc                    |↑  |0.1068|±  |0.0028|

| - leaderboard_musr                                        |    N/A|      |      |                       |   |      |   |      |

|  - leaderboard_musr_murder_mysteries                      |      1|none  |     0|acc_norm               |↑  |0.5160|±  |0.0317|

|  - leaderboard_musr_object_placements                     |      1|none  |     0|acc_norm               |↑  |0.2344|±  |0.0265|

|  - leaderboard_musr_team_allocation                       |      1|none  |     0|acc_norm               |↑  |0.3200|±  |0.0296|

```

```bash

litgpt convert_pretrained_checkpoint ../out/pretrain-core-0/final ../out/pretrain-core-0/checkpoint

```

```bash

CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain_core_model_1.yaml

```

```

Seed set to 23

Time to instantiate model: 0.32 seconds.

Total parameters: 234,914,304

Validating ...

Measured TFLOPs: 27760.47

Epoch 1 | iter 128 step 1 | loss train: 3.205, val: 3.457 | iter time: 450.87 ms (step) remaining time: 10:13:33

Epoch 1 | iter 256 step 2 | loss train: 3.175, val: 3.457 | iter time: 386.03 ms (step) remaining time: 9:00:10

Epoch 1 | iter 384 step 3 | loss train: 3.144, val: 3.457 | iter time: 388.88 ms (step) remaining time: 8:35:53

Epoch 1 | iter 512 step 4 | loss train: 3.260, val: 3.457 | iter time: 390.66 ms (step) remaining time: 8:24:17

Epoch 1 | iter 640 step 5 | loss train: 3.247, val: 3.457 | iter time: 392.99 ms (step) remaining time: 8:17:54

Epoch 1 | iter 768 step 6 | loss train: 3.264, val: 3.457 | iter time: 397.01 ms (step) remaining time: 8:13:51

Epoch 1 | iter 896 step 7 | loss train: 3.232, val: 3.457 | iter time: 396.00 ms (step) remaining time: 8:11:00

Epoch 1 | iter 1024 step 8 | loss train: 3.287, val: 3.457 | iter time: 396.93 ms (step) remaining time: 8:08:43

Epoch 1 | iter 1152 step 9 | loss train: 3.236, val: 3.457 | iter time: 398.67 ms (step) remaining time: 8:06:49

Epoch 1 | iter 1280 step 10 | loss train: 3.274, val: 3.457 | iter time: 399.49 ms (step) remaining time: 8:05:09

# ...

Epoch 1 | iter 76928 step 601 | loss train: 3.177, val: 3.304 | iter time: 400.61 ms (step) remaining time: 0:03:35

Epoch 1 | iter 77056 step 602 | loss train: 3.191, val: 3.304 | iter time: 396.14 ms (step) remaining time: 0:02:46

Epoch 1 | iter 77184 step 603 | loss train: 3.173, val: 3.304 | iter time: 399.39 ms (step) remaining time: 0:01:58

Epoch 1 | iter 77312 step 604 | loss train: 3.211, val: 3.304 | iter time: 398.61 ms (step) remaining time: 0:01:09

Epoch 1 | iter 77440 step 605 | loss train: 3.203, val: 3.304 | iter time: 399.31 ms (step) remaining time: 0:00:21

Validating ...

Final evaluation | val loss: 3.304 | val ppl: 27.217

Saving checkpoint to '../out/pretrain-core-1/final/lit_model.pth'

----------------------------------------

| Performance

| - Total tokens  : 634,855,424

| - Training Time : 29361.39 s

| - Tok/sec       : 524.18 tok/s

| ----------------------------------------

| Memory Usage

| - Memory Used   : 22.33 GB

----------------------------------------

```

```bash

mv wandb wandb-pretrain-core-1

```

```bash

cp ../config-1.json ../out/pretrain-core-1/final/config.json

```

```bash

CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat ../out/pretrain-core-1/final

```

```bash

litgpt convert_pretrained_checkpoint ../out/pretrain-core-1/final ../out/pretrain-core-1/checkpoint

```

```bash

CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain_core_model_2.yaml

```

```

Seed set to 23

Time to instantiate model: 0.30 seconds.

Total parameters: 234,914,304

Validating ...

Measured TFLOPs: 13880.23

Epoch 1 | iter 256 step 1 | loss train: 2.857, val: 3.078 | iter time: 518.53 ms (step) remaining time: 8:49:46

Epoch 1 | iter 512 step 2 | loss train: 2.839, val: 3.078 | iter time: 461.04 ms (step) remaining time: 7:47:05

Epoch 1 | iter 768 step 3 | loss train: 2.835, val: 3.078 | iter time: 462.82 ms (step) remaining time: 7:27:16

Epoch 1 | iter 1024 step 4 | loss train: 2.872, val: 3.078 | iter time: 464.55 ms (step) remaining time: 7:16:22

Epoch 1 | iter 1280 step 5 | loss train: 2.867, val: 3.078 | iter time: 462.06 ms (step) remaining time: 7:09:02

Epoch 1 | iter 1536 step 6 | loss train: 2.899, val: 3.078 | iter time: 465.26 ms (step) remaining time: 7:03:30

Epoch 1 | iter 1792 step 7 | loss train: 2.878, val: 3.078 | iter time: 465.57 ms (step) remaining time: 6:59:00

Epoch 1 | iter 2048 step 8 | loss train: 2.919, val: 3.078 | iter time: 464.37 ms (step) remaining time: 6:55:10

Epoch 1 | iter 2304 step 9 | loss train: 2.922, val: 3.078 | iter time: 464.24 ms (step) remaining time: 6:51:45

Epoch 1 | iter 2560 step 10 | loss train: 2.924, val: 3.078 | iter time: 464.71 ms (step) remaining time: 6:48:39

# ...

Epoch 1 | iter 53760 step 210 | loss train: 2.904, val: 3.013 | iter time: 468.66 ms (step) remaining time: 0:23:26

Epoch 1 | iter 54016 step 211 | loss train: 2.903, val: 3.013 | iter time: 468.81 ms (step) remaining time: 0:21:32

Epoch 1 | iter 54272 step 212 | loss train: 2.951, val: 3.013 | iter time: 463.52 ms (step) remaining time: 0:19:39

Epoch 1 | iter 54528 step 213 | loss train: 2.941, val: 3.013 | iter time: 466.12 ms (step) remaining time: 0:17:45

Epoch 1 | iter 54784 step 214 | loss train: 2.950, val: 3.013 | iter time: 468.24 ms (step) remaining time: 0:15:52

Epoch 1 | iter 55040 step 215 | loss train: 2.943, val: 3.013 | iter time: 466.65 ms (step) remaining time: 0:13:59

Epoch 1 | iter 55296 step 216 | loss train: 2.903, val: 3.013 | iter time: 464.44 ms (step) remaining time: 0:12:05

Epoch 1 | iter 55552 step 217 | loss train: 2.954, val: 3.013 | iter time: 465.12 ms (step) remaining time: 0:10:12

Epoch 1 | iter 55808 step 218 | loss train: 2.907, val: 3.013 | iter time: 464.96 ms (step) remaining time: 0:08:19

Epoch 1 | iter 56064 step 219 | loss train: 2.909, val: 3.013 | iter time: 467.12 ms (step) remaining time: 0:06:25

Epoch 1 | iter 56320 step 220 | loss train: 2.908, val: 3.013 | iter time: 466.43 ms (step) remaining time: 0:04:32

Epoch 1 | iter 56576 step 221 | loss train: 2.894, val: 3.013 | iter time: 469.70 ms (step) remaining time: 0:02:38

Epoch 1 | iter 56832 step 222 | loss train: 2.809, val: 3.013 | iter time: 463.88 ms (step) remaining time: 0:00:45

Validating ...

Final evaluation | val loss: 3.011 | val ppl: 20.306

Saving checkpoint to '../out/pretrain-core-2/final/lit_model.pth'

----------------------------------------

| Performance

| - Total tokens  : 466,411,520

| - Training Time : 25263.31 s

| - Tok/sec       : 371.33 tok/s

| ----------------------------------------

| Memory Usage

| - Memory Used   : 22.33 GB

----------------------------------------

```

```bash

cp ../config-2.json ../out/pretrain-core-2/final/config.json

```

```bash

mv wandb wandb-pretrain-core-2

```

```bash

CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat ../out/pretrain-core-2/final

```

```bash

litgpt convert_pretrained_checkpoint ../out/pretrain-core-2/final ../out/pretrain-core-2/checkpoint

```

```bash

CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain_core_model_3.yaml

```

```

Seed set to 23

Time to instantiate model: 0.30 seconds.

Total parameters: 234,914,304

Validating ...

Measured TFLOPs: 6940.12

Epoch 1 | iter 512 step 1 | loss train: 2.698, val: 2.522 | iter time: 675.96 ms (step) remaining time: 9:49:31

Epoch 1 | iter 1024 step 2 | loss train: 2.627, val: 2.522 | iter time: 603.66 ms (step) remaining time: 9:19:41

Epoch 1 | iter 1536 step 3 | loss train: 2.653, val: 2.522 | iter time: 604.66 ms (step) remaining time: 9:06:15

Epoch 1 | iter 2048 step 4 | loss train: 2.608, val: 2.522 | iter time: 606.23 ms (step) remaining time: 8:57:08

Epoch 1 | iter 2560 step 5 | loss train: 2.604, val: 2.522 | iter time: 605.04 ms (step) remaining time: 8:49:43

Epoch 1 | iter 3072 step 6 | loss train: 2.578, val: 2.522 | iter time: 606.32 ms (step) remaining time: 8:43:08

Epoch 1 | iter 3584 step 7 | loss train: 2.692, val: 2.522 | iter time: 605.08 ms (step) remaining time: 8:37:01

Epoch 1 | iter 4096 step 8 | loss train: 2.570, val: 2.522 | iter time: 607.54 ms (step) remaining time: 8:31:20

Epoch 1 | iter 4608 step 9 | loss train: 2.646, val: 2.522 | iter time: 607.19 ms (step) remaining time: 8:25:47

Epoch 1 | iter 5120 step 10 | loss train: 2.565, val: 2.522 | iter time: 604.76 ms (step) remaining time: 8:20:23

# ...

Epoch 1 | iter 51712 step 101 | loss train: 2.562, val: 2.453 | iter time: 607.12 ms (step) remaining time: 0:48:29

Epoch 1 | iter 52224 step 102 | loss train: 2.637, val: 2.453 | iter time: 605.46 ms (step) remaining time: 0:43:31

Epoch 1 | iter 52736 step 103 | loss train: 2.629, val: 2.453 | iter time: 604.15 ms (step) remaining time: 0:38:34

Epoch 1 | iter 53248 step 104 | loss train: 2.629, val: 2.453 | iter time: 605.92 ms (step) remaining time: 0:33:36

Epoch 1 | iter 53760 step 105 | loss train: 2.606, val: 2.453 | iter time: 604.48 ms (step) remaining time: 0:28:38

Epoch 1 | iter 54272 step 106 | loss train: 2.581, val: 2.453 | iter time: 603.78 ms (step) remaining time: 0:23:41

Epoch 1 | iter 54784 step 107 | loss train: 2.580, val: 2.453 | iter time: 605.41 ms (step) remaining time: 0:18:43

Epoch 1 | iter 55296 step 108 | loss train: 2.602, val: 2.453 | iter time: 607.38 ms (step) remaining time: 0:13:46

Epoch 1 | iter 55808 step 109 | loss train: 2.633, val: 2.453 | iter time: 606.06 ms (step) remaining time: 0:08:49

Epoch 1 | iter 56320 step 110 | loss train: 2.631, val: 2.453 | iter time: 608.68 ms (step) remaining time: 0:03:51

Validating ...

iter 56320: val loss 2.4515, val time: 19303.40 ms

Saving checkpoint to '../out/pretrain-core-3/step-00000110/lit_model.pth'

Validating ...

Final evaluation | val loss: 2.451 | val ppl: 11.605

Saving checkpoint to '../out/pretrain-core-3/final/lit_model.pth'

----------------------------------------

| Performance

| - Total tokens  : 464,642,048

| - Training Time : 33018.19 s

| - Tok/sec       : 362.46 tok/s

| ----------------------------------------

| Memory Usage

| - Memory Used   : 22.33 GB

----------------------------------------

```

```bash

cp ../config-3.json ../out/pretrain-core-3/final/config.json

```

```bash

mv wandb wandb-pretrain-core-3

```

```bash

CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat ../out/pretrain-core-3/final

```

```bash

litgpt convert_pretrained_checkpoint ../out/pretrain-core-3/final ../out/pretrain-core-3/checkpoint

```

```bash

CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain_core_model_4.yaml

```

```

# ...

```