# Transformer ASR (Conformer encoder) hyperparameters, CommonVoice Uzbek.
# Seed needs to be set at the top of the yaml, before any object is built.
seed: 1101
__set_seed: !apply:speechbrain.utils.seed_everything [1101]

# Output folders
output_folder: uz_transformer_4000/model_saved
output_wer_folder: uz_transformer_4000/model_saved/
save_folder: uz_transformer_4000/model_saved/save
train_log: uz_transformer_4000/model_saved/train_log.txt

# Data files (labeled CommonVoice splits + unlabeled YouTube audio)
data_folder: /mnt/data/commonvoice/uz
data_folder_unlabeled: /mnt/data/youtube_audio
train_tsv_file: /mnt/data/commonvoice/uz/train.tsv
dev_tsv_file: /mnt/data/commonvoice/uz/dev.tsv
test_tsv_file: /mnt/data/commonvoice/uz/test.tsv
unlabeled_tsv_file: audio_data_loader/dataloader/youtube_gcp.tsv
accented_letters: false
language: uz
train_csv: uz_transformer_4000/model_saved/train.csv
valid_csv: uz_transformer_4000/model_saved/dev.csv
test_csv: uz_transformer_4000/model_saved/test.csv
unlabeled_csv: audio_data_loader/dataloader/unlabeled_ogg.csv
skip_prep: false
convert_to_wav: false

# Skip training utterances longer than this duration (in seconds);
# the validation/test limit is intentionally looser.
avoid_if_longer_than: 50
avoid_if_longer_than_val_test: 100.0

# Save an intra-epoch checkpoint every N minutes
ckpt_interval_minutes: 15

# Training parameters
number_of_epochs: 100
optimizer_step_limit: 30000
batch_size: 200
ctc_weight: 0.3
grad_accumulation_factor: 3
loss_reduction: batchmean
sorting: random
num_workers: 4
precision: fp16

# Optimizer / learning-rate schedule
lr_adam: 0.0008
weight_decay: 0.01
warmup_steps: 1000
augment_warmup: 6000

# BPE tokenizer
token_type: bpe
character_coverage: 1.0

# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 80

# Dynamic batching: batches are formed by total utterance length rather than
# by a fixed number of examples.
dynamic_batching: true
max_batch_length_train: 300
max_batch_length_val: 100
num_bucket: 200
shuffle: true
batch_ordering: random
max_batch_ex: 256

dynamic_batch_sampler_train:
    max_batch_length: 300
    num_buckets: 200
    shuffle: true
    batch_ordering: random
    max_batch_ex: 256

dynamic_batch_sampler_valid:
    max_batch_length: 100
    num_buckets: 200
    shuffle: true
    batch_ordering: random
    max_batch_ex: 256
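
# The two sampler configs above are typically unpacked into
# speechbrain.dataio.sampler.DynamicBatchSampler in the training script.
# A minimal sketch, assuming a DynamicItemDataset `train_data` with a
# "duration" key (both names are illustrative, not part of this recipe):
#
#     from speechbrain.dataio.sampler import DynamicBatchSampler
#     train_sampler = DynamicBatchSampler(
#         train_data,
#         length_func=lambda ex: ex["duration"],
#         **hparams["dynamic_batch_sampler_train"],
#     )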

# Dataloader options
train_dataloader_opts:
    batch_size: 200
    shuffle: true
    num_workers: 4

valid_dataloader_opts:
    batch_size: 200

test_dataloader_opts:
    batch_size: 200

# Model parameters
d_model: 384
nhead: 8
num_encoder_layers: 8
num_decoder_layers: 4
d_ffn: 1536
transformer_dropout: 0.1
activation: &id001 !name:torch.nn.GELU
output_neurons: 4000  # BPE vocabulary size

# Special token indices and label smoothing
blank_index: 0
label_smoothing: 0.1
pad_index: 0
bos_index: 1
eos_index: 2

# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
valid_search_interval: 1
valid_beam_size: 1
test_beam_size: 10
ctc_weight_decode: 0.3
scorer_beam_scale: 0.3

# Convolutional front-end: two conv blocks with stride 2, giving 4x
# downsampling before the Transformer.
CNN: &id002 !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    input_shape: (8, 10, 80)
    num_blocks: 2
    num_layers_per_block: 1
    out_channels: (64, 32)
    kernel_sizes: (3, 3)
    strides: (2, 2)
    residuals: (False, False)

# Conformer encoder + Transformer decoder
Transformer: &id003 !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR
    input_size: 640  # 32 channels x (80 mels / 4) after the CNN front-end
    tgt_vocab: 4000
    d_model: 384
    nhead: 8
    num_encoder_layers: 8
    num_decoder_layers: 4
    d_ffn: 1536
    dropout: 0.1
    conformer_activation: *id001
    activation: *id001
    encoder_module: conformer
    attention_type: RelPosMHAXL
    normalize_before: true
    causal: false

# CTC output head
ctc_lin: &id005 !new:speechbrain.nnet.linear.Linear
    input_size: 384
    n_neurons: 4000

# Seq2seq (decoder) output head
seq_lin: &id004 !new:speechbrain.nnet.linear.Linear
    input_size: 384
    n_neurons: 4000

modules:
    CNN: *id002
    Transformer: *id003
    seq_lin: *id004
    ctc_lin: *id005

model: &id008 !new:torch.nn.ModuleList
    - [*id002, *id003, *id004, *id005]

Adam: !name:torch.optim.AdamW
    lr: 0.0008
    weight_decay: 0.01
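
# `Adam` is a !name: factory (a partial), not an instantiated optimizer.
# A hedged sketch of how it is usually bound to the model parameters in the
# training script (variable names are illustrative):
#
#     optimizer = hparams["Adam"](hparams["model"].parameters())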

# CTC scorer used for joint CTC/attention beam search
ctc_scorer: &id006 !new:speechbrain.decoders.scorer.CTCScorer
    eos_index: 2
    blank_index: 0
    ctc_fc: *id005

scorer: &id007 !new:speechbrain.decoders.scorer.ScorerBuilder
    full_scorers: [*id006]
    weights:
        ctc: 0.3
    scorer_beam_scale: 0.3

# Beam size 1 during validation keeps the periodic WER check fast
valid_search: !new:speechbrain.decoders.S2STransformerBeamSearcher
    modules: [*id003, *id004]
    bos_index: 1
    eos_index: 2
    min_decode_ratio: 0.0
    max_decode_ratio: 1.0
    beam_size: 1
    using_eos_threshold: false
    length_normalization: true
    scorer: *id007

# Full beam search used at test time
test_search: !new:speechbrain.decoders.S2STransformerBeamSearcher
    modules: [*id003, *id004]
    bos_index: 1
    eos_index: 2
    min_decode_ratio: 0.0
    max_decode_ratio: 1.0
    beam_size: 10
    temperature: 1.15
    using_eos_threshold: true
    scorer: *id007
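
# Usage sketch for the searchers above, assuming `enc_out` and `wav_lens`
# come from the forward pass (the four-value return matches recent
# SpeechBrain releases; adjust the unpacking if your version differs):
#
#     hyps, _, _, _ = hparams["test_search"](enc_out.detach(), wav_lens)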

log_softmax: !new:torch.nn.LogSoftmax
    dim: -1

ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
    blank_index: 0
    reduction: batchmean

seq_cost: !name:speechbrain.nnet.losses.kldiv_loss
    label_smoothing: 0.1
    reduction: batchmean

# Noam learning-rate schedule: linear warmup followed by inverse-sqrt decay
noam_annealing: &id009 !new:speechbrain.nnet.schedulers.NoamScheduler
    lr_initial: 0.0008
    n_warmup_steps: 1000

# Checkpointer: the feature normalizer and epoch counter are defined here
# (anchors &id011 / &id010) and reused by the top-level keys below.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: uz_transformer_4000/model_saved/save
    recoverables:
        model: *id008
        noam_scheduler: *id009
        normalizer: &id011 !new:speechbrain.processing.features.InputNormalization
            norm_type: global
            update_until_epoch: 4
        counter: &id010 !new:speechbrain.utils.epoch_loop.EpochCounter
            limit: 100

epoch_counter: *id010
normalize: *id011

# SpecAugment-style feature augmentation
# Time drop: zero out chunks along the time axis (dim 1)
time_drop: &id012 !new:speechbrain.augment.freq_domain.SpectrogramDrop
    drop_length_low: 15
    drop_length_high: 25
    drop_count_low: 3
    drop_count_high: 3
    replace: zeros
    dim: 1

# Frequency drop: zero out bands along the frequency axis (dim 2)
freq_drop: &id013 !new:speechbrain.augment.freq_domain.SpectrogramDrop
    drop_length_low: 25
    drop_length_high: 35
    drop_count_low: 2
    drop_count_high: 2
    replace: zeros
    dim: 2

# Time warping
time_warp: &id014 !new:speechbrain.augment.freq_domain.Warping

fea_augment: !new:speechbrain.augment.augmenter.Augmenter
    min_augmentations: 3
    max_augmentations: 3
    augment_prob: 1.0
    augmentations: [*id012, *id013, *id014]

compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: 16000
    n_fft: 400
    n_mels: 80

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: uz_transformer_4000/model_saved/train_log.txt

# Metric computers: word error rate, decoder accuracy and character error rate
error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

acc_computer: !name:speechbrain.utils.Accuracy.AccuracyStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
    split_tokens: true
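
# A minimal sketch of how this file is typically loaded with HyperPyYAML
# (the filename is illustrative):
#
#     from hyperpyyaml import load_hyperpyyaml
#
#     with open("hyperparams.yaml") as fin:
#         hparams = load_hyperpyyaml(fin)
#
#     feature_extractor = hparams["compute_features"]   # Fbank module
#     model = hparams["model"]                          # CNN + Transformer + heads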