uz_asr_transformers_4000 / hyperparams.yaml
aslon1213's picture
Update hyperparams.yaml
6b8ade2 verified
# Generated 2025-02-12 from:
# /home/hamidovaslon1/speechbrain/recipes/CommonVoice/ASR/transformer/hparams/staging_5.yaml
# yamllint disable
seed: 1101
__set_seed: !apply:speechbrain.utils.seed_everything [1101]
output_folder: uz_transformer_4000/model_saved
output_wer_folder: uz_transformer_4000/model_saved/
save_folder: uz_transformer_4000/model_saved/save
train_log: uz_transformer_4000/model_saved/train_log.txt
# Data files
data_folder: /mnt/data/commonvoice/uz # e.g, /localscratch/cv-corpus-5.1-2020-06-22/fr
data_folder_unlabeled: /mnt/data/youtube_audio
train_tsv_file: /mnt/data/commonvoice/uz/train.tsv # Standard CommonVoice .tsv files
dev_tsv_file: /mnt/data/commonvoice/uz/dev.tsv # Standard CommonVoice .tsv files
test_tsv_file: /mnt/data/commonvoice/uz/test.tsv # Standard CommonVoice .tsv files
unlabeled_tsv_file: audio_data_loader/dataloader/youtube_gcp.tsv # Path to the youtube dataset
accented_letters: false
language: uz # use 'it' for Italian, 'rw' for Kinyarwanda, 'en' for english
train_csv: uz_transformer_4000/model_saved/train.csv
valid_csv: uz_transformer_4000/model_saved/dev.csv
test_csv: uz_transformer_4000/model_saved/test.csv
unlabeled_csv: audio_data_loader/dataloader/unlabeled_ogg.csv # CREATE IN THE DIRECTORY
skip_prep: false # Skip data preparation
convert_to_wav: false # Switch this to True to convert all mp3 files to wav.
# We remove utterances longer than `avoid_if_longer_than` seconds (set just below)
# from the train/dev/test sets, as longer sentences certainly correspond to "open microphones".
avoid_if_longer_than: 50
# THIS IS TERRIBLE BUT WE HAVE NO CHOICE.
# Some versions of the CV dataset may contain one or two files of more than
# 2 min in the validation and/or test sets. This is an error by design of the
# dataset, as these files contain 90% silence. We exclude them.
avoid_if_longer_than_val_test: 100.0
ckpt_interval_minutes: 15 # save checkpoint every N min
####################### Training Parameters ####################################
number_of_epochs: 100
optimizer_step_limit: 30000
batch_size: 200 # This works with a 32GB GPU ! (bs * nb_gpu * accum) > 128 !
ctc_weight: 0.3
grad_accumulation_factor: 3
loss_reduction: batchmean
sorting: random
num_workers: 4
precision: fp16 # bf16, fp16 or fp32
# stages related parameters
lr_adam: 0.0008
weight_decay: 0.01
warmup_steps: 1000
augment_warmup: 6000
# BPE parameters
token_type: bpe # ["unigram", "bpe", "char"]
character_coverage: 1.0
# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 80
# This setup works well for an A100 80GB GPU; adapt it to your needs.
# Or turn it off (but training speed will decrease).
dynamic_batching: true
max_batch_length_train: 300
max_batch_length_val: 100 # we reduce it as the beam is much wider (VRAM)
num_bucket: 200
shuffle: true # if true re-creates batches at each epoch shuffling examples.
batch_ordering: random
max_batch_ex: 256
dynamic_batch_sampler_train:
max_batch_length: 300
num_buckets: 200
shuffle: true
batch_ordering: random
max_batch_ex: 256
dynamic_batch_sampler_valid:
max_batch_length: 100
num_buckets: 200
shuffle: true
batch_ordering: random
max_batch_ex: 256
# Dataloader options
train_dataloader_opts:
batch_size: 200
shuffle: true
num_workers: 4
valid_dataloader_opts:
batch_size: 200
test_dataloader_opts:
batch_size: 200
####################### Model Parameters ###########################
# Transformer
d_model: 384
nhead: 8
num_encoder_layers: 8
num_decoder_layers: 4
d_ffn: 1536
transformer_dropout: 0.1
activation: &id001 !name:torch.nn.GELU
output_neurons: 4000
# Outputs
blank_index: 0
label_smoothing: 0.1
pad_index: 0
bos_index: 1
eos_index: 2
# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
valid_search_interval: 1
valid_beam_size: 1 # We do greedy here so it's faster to decode ...
test_beam_size: 10
ctc_weight_decode: 0.3
scorer_beam_scale: 0.3
############################## models ################################
CNN: &id002 !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
input_shape: (8, 10, 80)
num_blocks: 2
num_layers_per_block: 1
out_channels: (64, 32)
kernel_sizes: (3, 3)
strides: (2, 2)
residuals: (False, False)
Transformer: &id003 !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR
# yamllint disable-line rule:line-length
input_size: 640
tgt_vocab: 4000
d_model: 384
nhead: 8
num_encoder_layers: 8
num_decoder_layers: 4
d_ffn: 1536
dropout: 0.1
conformer_activation: *id001
activation: *id001
encoder_module: conformer
attention_type: RelPosMHAXL
normalize_before: true
causal: false
ctc_lin: &id005 !new:speechbrain.nnet.linear.Linear
input_size: 384
n_neurons: 4000
seq_lin: &id004 !new:speechbrain.nnet.linear.Linear
input_size: 384
n_neurons: 4000
modules:
CNN: *id002
Transformer: *id003
seq_lin: *id004
ctc_lin: *id005
model: &id008 !new:torch.nn.ModuleList
- [*id002, *id003, *id004, *id005]
Adam: !name:torch.optim.AdamW
lr: 0.0008
weight_decay: 0.01
# Scorer
ctc_scorer: &id006 !new:speechbrain.decoders.scorer.CTCScorer
eos_index: 2
blank_index: 0
ctc_fc: *id005
scorer: &id007 !new:speechbrain.decoders.scorer.ScorerBuilder
full_scorers: [*id006]
weights:
ctc: 0.3
scorer_beam_scale: 0.3
valid_search: !new:speechbrain.decoders.S2STransformerBeamSearcher
modules: [*id003, *id004]
bos_index: 1
eos_index: 2
min_decode_ratio: 0.0
max_decode_ratio: 1.0
beam_size: 1
using_eos_threshold: false
length_normalization: true
scorer: *id007
test_search: !new:speechbrain.decoders.S2STransformerBeamSearcher
modules: [*id003, *id004]
bos_index: 1
eos_index: 2
min_decode_ratio: 0.0
max_decode_ratio: 1.0
beam_size: 10
temperature: 1.15
using_eos_threshold: true
scorer: *id007
log_softmax: !new:torch.nn.LogSoftmax
dim: -1
ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
blank_index: 0
reduction: batchmean
seq_cost: !name:speechbrain.nnet.losses.kldiv_loss
label_smoothing: 0.1
reduction: batchmean
noam_annealing: &id009 !new:speechbrain.nnet.schedulers.NoamScheduler
lr_initial: 0.0008
n_warmup_steps: 1000
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
checkpoints_dir: uz_transformer_4000/model_saved/save
recoverables:
model: *id008
noam_scheduler: *id009
normalizer: &id011 !new:speechbrain.processing.features.InputNormalization
norm_type: global
update_until_epoch: 4
############################## Augmentations ###################################
# Time Drop
counter: &id010 !new:speechbrain.utils.epoch_loop.EpochCounter
limit: 100
epoch_counter: *id010
normalize: *id011
time_drop: &id012 !new:speechbrain.augment.freq_domain.SpectrogramDrop
drop_length_low: 15
drop_length_high: 25
drop_count_low: 3
drop_count_high: 3
replace: zeros
dim: 1
# Frequency Drop
freq_drop: &id013 !new:speechbrain.augment.freq_domain.SpectrogramDrop
drop_length_low: 25
drop_length_high: 35
drop_count_low: 2
drop_count_high: 2
replace: zeros
dim: 2
# Time warp
time_warp: &id014 !new:speechbrain.augment.freq_domain.Warping
fea_augment: !new:speechbrain.augment.augmenter.Augmenter
min_augmentations: 3
max_augmentations: 3
augment_prob: 1.0
augmentations: [*id012, *id013, *id014]
compute_features: !new:speechbrain.lobes.features.Fbank
sample_rate: 16000
n_fft: 400
n_mels: 80
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
save_file: uz_transformer_4000/model_saved/train_log.txt
error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
acc_computer: !name:speechbrain.utils.Accuracy.AccuracyStats
cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
split_tokens: true