aslon1213
/

uz_asr_transformers_4000

Model card Files Files and versions Community

uz_asr_transformers_4000 / hyperparams.yaml

aslon1213

Update hyperparams.yaml

6b8ade2 verified about 1 month ago

raw

history blame contribute delete

7.83 kB

	# Generated 2025-02-12 from:
	# /home/hamidovaslon1/speechbrain/recipes/CommonVoice/ASR/transformer/hparams/staging_5.yaml
	# yamllint disable
	# Random seed; the same value is passed to seed_everything on the next line.
	seed: 1101
	__set_seed: !apply:speechbrain.utils.seed_everything [1101]
	# Output locations; every path below sits under uz_transformer_4000/model_saved.
	output_folder: uz_transformer_4000/model_saved
	output_wer_folder: uz_transformer_4000/model_saved/
	save_folder: uz_transformer_4000/model_saved/save
	train_log: uz_transformer_4000/model_saved/train_log.txt

	# Data files
	# Absolute /mnt/data paths are machine-specific; the relative paths are
	# presumably resolved against the working directory — confirm before reuse.
	data_folder: /mnt/data/commonvoice/uz # e.g., /localscratch/cv-corpus-5.1-2020-06-22/fr
	data_folder_unlabeled: /mnt/data/youtube_audio
	train_tsv_file: /mnt/data/commonvoice/uz/train.tsv # Standard CommonVoice .tsv files
	dev_tsv_file: /mnt/data/commonvoice/uz/dev.tsv # Standard CommonVoice .tsv files
	test_tsv_file: /mnt/data/commonvoice/uz/test.tsv # Standard CommonVoice .tsv files
	unlabeled_tsv_file: audio_data_loader/dataloader/youtube_gcp.tsv # Path to the YouTube dataset .tsv
	accented_letters: false
	language: uz # use 'it' for Italian, 'rw' for Kinyarwanda, 'en' for English
	# CSV manifests written next to the model outputs.
	train_csv: uz_transformer_4000/model_saved/train.csv
	valid_csv: uz_transformer_4000/model_saved/dev.csv
	test_csv: uz_transformer_4000/model_saved/test.csv
	# NOTE(review): the original note read "CREATE IN THE DIRECTORY" — presumably
	# this CSV has to be created at this path by hand; confirm.
	unlabeled_csv: audio_data_loader/dataloader/unlabeled_ogg.csv # CREATE IN THE DIRECTORY
	skip_prep: false # Skip data preparation
	convert_to_wav: false # Set this to true to convert all mp3 files to wav.

	# We remove utterances longer than `avoid_if_longer_than` from the
	# train/dev/test sets, as longer sentences certainly correspond to
	# "open microphones".
	# NOTE(review): the original comment said 10s but the value set here is 50
	# (presumably seconds) — confirm which limit is intended.
	avoid_if_longer_than: 50

	# THIS IS TERRIBLE BUT WE HAVE NO CHOICE.
	# Some versions of the CV dataset may contain one or two files of more than
	# 2 min in the validation and/or test sets. This is an error by design of the
	# dataset, as these files contain 90% silence. We exclude them.
	avoid_if_longer_than_val_test: 100.0

	ckpt_interval_minutes: 15 # save checkpoint every N min

	####################### Training Parameters ####################################
	# number_of_epochs has the same value as the EpochCounter `limit` further down.
	number_of_epochs: 100
	optimizer_step_limit: 30000
	batch_size: 200 # This works with a 32GB GPU ! (bs * nb_gpu * accum) > 128 !
	ctc_weight: 0.3
	grad_accumulation_factor: 3
	# Same reduction string is repeated in the ctc_cost and seq_cost entries below.
	loss_reduction: batchmean
	sorting: random
	num_workers: 4
	precision: fp16 # bf16, fp16 or fp32

	# stages related parameters
	# lr_adam / weight_decay / warmup_steps are repeated verbatim in the Adam and
	# noam_annealing entries further down; keep them in sync when editing.
	lr_adam: 0.0008
	weight_decay: 0.01
	warmup_steps: 1000
	augment_warmup: 6000

	# BPE parameters
	token_type: bpe # ["unigram", "bpe", "char"]
	character_coverage: 1.0

	# Feature parameters
	# The same three values are repeated in the compute_features entry below.
	sample_rate: 16000
	n_fft: 400
	n_mels: 80

	# This setup works well for an A100 80GB GPU; adapt it to your needs.
	# Or turn it off (but training speed will decrease)
	dynamic_batching: true
	max_batch_length_train: 300
	max_batch_length_val: 100 # we reduce it as the beam is much wider (VRAM)
	# NOTE(review): spelled `num_bucket` here but `num_buckets` inside the
	# dynamic_batch_sampler_* entries below — confirm which name is consumed.
	num_bucket: 200
	shuffle: true # if true re-creates batches at each epoch shuffling examples.
	batch_ordering: random
	max_batch_ex: 256

	# Dynamic batch sampler settings for training. The values repeat the
	# top-level max_batch_length_train / shuffle / batch_ordering / max_batch_ex
	# scalars; the children are nested so they no longer collide with them.
	dynamic_batch_sampler_train:
	  max_batch_length: 300
	  num_buckets: 200
	  shuffle: true
	  batch_ordering: random
	  max_batch_ex: 256

	# Dynamic batch sampler settings for validation. max_batch_length matches the
	# top-level max_batch_length_val; children are nested under their parent key.
	dynamic_batch_sampler_valid:
	  max_batch_length: 100
	  num_buckets: 200
	  shuffle: true
	  batch_ordering: random
	  max_batch_ex: 256

	# Dataloader options
	# One mapping per split; batch_size and num_workers repeat the top-level
	# scalars of the same name.
	train_dataloader_opts:
	  batch_size: 200
	  shuffle: true
	  num_workers: 4

	valid_dataloader_opts:
	  batch_size: 200

	test_dataloader_opts:
	  batch_size: 200


	####################### Model Parameters ###########################
	# Transformer
	# These sizes are repeated verbatim in the Transformer, ctc_lin and seq_lin
	# entries below (d_model -> input_size 384, output_neurons -> 4000).
	d_model: 384
	nhead: 8
	num_encoder_layers: 8
	num_decoder_layers: 4
	d_ffn: 1536
	transformer_dropout: 0.1
	# Anchored as id001 so the Transformer entry can reuse the same reference.
	activation: &id001 !name:torch.nn.GELU
	output_neurons: 4000

	# Outputs
	# blank_index and pad_index are both 0 in this configuration.
	blank_index: 0
	label_smoothing: 0.1
	pad_index: 0
	bos_index: 1
	eos_index: 2

	# Decoding parameters
	# beam sizes and ratios are repeated in the valid_search / test_search entries.
	min_decode_ratio: 0.0
	max_decode_ratio: 1.0
	valid_search_interval: 1
	valid_beam_size: 1 # We do greedy here so it's faster to decode ...
	test_beam_size: 10
	ctc_weight_decode: 0.3
	scorer_beam_scale: 0.3

	############################## models ################################

	# Convolutional front-end, anchored as id002 for reuse in `modules`/`model`.
	# Constructor arguments are nested under the tag so they are passed to
	# ConvolutionFrontEnd instead of becoming stray top-level keys.
	CNN: &id002 !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
	  input_shape: (8, 10, 80)
	  num_blocks: 2
	  num_layers_per_block: 1
	  out_channels: (64, 32)
	  kernel_sizes: (3, 3)
	  strides: (2, 2)
	  residuals: (False, False)

	# TransformerASR instance, anchored as id003. Arguments are nested so they no
	# longer duplicate the top-level d_model / nhead / ... scalars.
	# yamllint disable-line rule:line-length
	Transformer: &id003 !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR
	  # presumably 32 CNN output channels x (80 mel bins / 4) — TODO confirm
	  input_size: 640
	  tgt_vocab: 4000
	  d_model: 384
	  nhead: 8
	  num_encoder_layers: 8
	  num_decoder_layers: 4
	  d_ffn: 1536
	  dropout: 0.1
	  # Both activations alias the torch.nn.GELU reference anchored as id001.
	  conformer_activation: *id001
	  activation: *id001
	  encoder_module: conformer
	  attention_type: RelPosMHAXL
	  normalize_before: true
	  causal: false

	# Linear output layers (384 -> 4000), anchored as id005 (CTC) and id004
	# (seq2seq) for reuse in `modules`, `model`, the scorer and the searchers.
	ctc_lin: &id005 !new:speechbrain.nnet.linear.Linear
	  input_size: 384
	  n_neurons: 4000

	seq_lin: &id004 !new:speechbrain.nnet.linear.Linear
	  input_size: 384
	  n_neurons: 4000

	# Short-name map of the module objects defined above (aliases, not copies).
	modules:
	  CNN: *id002
	  Transformer: *id003
	  seq_lin: *id004
	  ctc_lin: *id005

	# ModuleList over the same four objects, anchored as id008 for the
	# checkpointer. The entries must be aliases (*idNNN); without the `*` they are
	# plain strings and ModuleList would receive text instead of modules.
	model: &id008 !new:torch.nn.ModuleList
	  - [*id002, *id003, *id004, *id005]

	# AdamW optimizer reference; lr / weight_decay repeat lr_adam / weight_decay.
	Adam: !name:torch.optim.AdamW
	  lr: 0.0008
	  weight_decay: 0.01

	# Scorer
	# CTC scorer built on the ctc_lin layer (id005), anchored as id006.
	ctc_scorer: &id006 !new:speechbrain.decoders.scorer.CTCScorer
	  eos_index: 2
	  blank_index: 0
	  ctc_fc: *id005

	# ScorerBuilder wrapping the CTC scorer, anchored as id007 for both searchers.
	# `ctc` is the single entry of `weights`; its value repeats ctc_weight_decode.
	scorer: &id007 !new:speechbrain.decoders.scorer.ScorerBuilder
	  full_scorers: [*id006]
	  weights:
	    ctc: 0.3
	  scorer_beam_scale: 0.3

	# Beam searcher used for validation (beam_size 1 repeats valid_beam_size).
	# `modules` must hold aliases to the Transformer (id003) and seq_lin (id004)
	# objects; without the `*` the list would contain plain strings.
	valid_search: !new:speechbrain.decoders.S2STransformerBeamSearcher
	  modules: [*id003, *id004]
	  bos_index: 1
	  eos_index: 2
	  min_decode_ratio: 0.0
	  max_decode_ratio: 1.0
	  beam_size: 1
	  using_eos_threshold: false
	  length_normalization: true
	  scorer: *id007
	# Beam searcher used for testing (beam_size 10 repeats test_beam_size).
	# `modules` must hold aliases to the Transformer (id003) and seq_lin (id004)
	# objects; without the `*` the list would contain plain strings.
	test_search: !new:speechbrain.decoders.S2STransformerBeamSearcher
	  modules: [*id003, *id004]
	  bos_index: 1
	  eos_index: 2
	  min_decode_ratio: 0.0
	  max_decode_ratio: 1.0
	  beam_size: 10
	  temperature: 1.15
	  using_eos_threshold: true
	  scorer: *id007
	# LogSoftmax over the last dimension.
	log_softmax: !new:torch.nn.LogSoftmax
	  dim: -1

	# Loss function references; the keyword arguments are nested under each tag.
	# blank_index, label_smoothing and reduction repeat the top-level scalars.
	ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
	  blank_index: 0
	  reduction: batchmean

	seq_cost: !name:speechbrain.nnet.losses.kldiv_loss
	  label_smoothing: 0.1
	  reduction: batchmean

	# Noam learning-rate scheduler, anchored as id009 for the checkpointer.
	# lr_initial / n_warmup_steps repeat lr_adam / warmup_steps.
	noam_annealing: &id009 !new:speechbrain.nnet.schedulers.NoamScheduler
	  lr_initial: 0.0008
	  n_warmup_steps: 1000

	# Checkpointer writing to save_folder. `recoverables` lists the objects to
	# save/restore; the normalizer (id011) and epoch counter (id010) are defined
	# inline here and re-exported under their top-level names just below.
	checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
	  checkpoints_dir: uz_transformer_4000/model_saved/save
	  recoverables:
	    model: *id008
	    noam_scheduler: *id009
	    normalizer: &id011 !new:speechbrain.processing.features.InputNormalization
	      norm_type: global
	      update_until_epoch: 4
	    # limit repeats number_of_epochs.
	    counter: &id010 !new:speechbrain.utils.epoch_loop.EpochCounter
	      limit: 100

	# Top-level names for the two objects anchored inside `recoverables`.
	epoch_counter: *id010
	normalize: *id011

	############################## Augmentations ###################################

	# Time Drop
	# SpectrogramDrop along dim 1, anchored as id012 for fea_augment.
	time_drop: &id012 !new:speechbrain.augment.freq_domain.SpectrogramDrop
	  drop_length_low: 15
	  drop_length_high: 25
	  drop_count_low: 3
	  drop_count_high: 3
	  replace: zeros
	  dim: 1

	# Frequency Drop
	# SpectrogramDrop along dim 2, anchored as id013 for fea_augment.
	freq_drop: &id013 !new:speechbrain.augment.freq_domain.SpectrogramDrop
	  drop_length_low: 25
	  drop_length_high: 35
	  drop_count_low: 2
	  drop_count_high: 2
	  replace: zeros
	  dim: 2

	# Time warp
	# Warping instance with default arguments, anchored as id014.
	time_warp: &id014 !new:speechbrain.augment.freq_domain.Warping

	# Augmenter over the three augmentation objects (id012, id013, id014).
	# Each list entry must be an alias (*idNNN); without the `*` the entry is a
	# plain string rather than the augmentation object.
	fea_augment: !new:speechbrain.augment.augmenter.Augmenter
	  min_augmentations: 3
	  max_augmentations: 3
	  augment_prob: 1.0
	  augmentations: [*id012, *id013, *id014]

	# Fbank feature extractor; arguments repeat the top-level feature parameters.
	compute_features: !new:speechbrain.lobes.features.Fbank
	  sample_rate: 16000
	  n_fft: 400
	  n_mels: 80

	# File logger writing to the same path as the top-level train_log.
	train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
	  save_file: uz_transformer_4000/model_saved/train_log.txt

	# Metric references. error_rate_computer and cer_computer use the same
	# ErrorRateStats class; only cer_computer passes split_tokens: true.
	error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
	acc_computer: !name:speechbrain.utils.Accuracy.AccuracyStats
	cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
	  split_tokens: true