Update app.py
app.py CHANGED
@@ -46,6 +46,10 @@ logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

PITCH_CLASS = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']

+tonic_signatures = ["A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"]
+mode_signatures = ["major", "minor"] # Major and minor modes
+
+
pitch_num_dic = {
    'C': 0, 'C#': 1, 'D': 2, 'D#': 3, 'E': 4, 'F': 5,
    'F#': 6, 'G': 7, 'G#': 8, 'A': 9, 'A#': 10, 'B': 11
@@ -167,6 +171,8 @@ def split_audio(waveform, sample_rate):



+
+
class Music2emo:
    def __init__(
        self,
@@ -206,6 +212,37 @@ class Music2emo:
        self.music2emo_model.to(self.device)
        self.music2emo_model.eval()

+        self.config = HParams.load("./inference/data/run_config.yaml")
+        self.config.feature['large_voca'] = True
+        self.config.model['num_chords'] = 170
+        model_file = './inference/data/btc_model_large_voca.pt'
+        self.idx_to_voca = idx2voca_chord()
+        self.btc_model = BTC_model(config=self.config.model).to(self.device)
+
+        if os.path.isfile(model_file):
+            checkpoint = torch.load(model_file, map_location=self.device)
+            self.mean = checkpoint['mean']
+            self.std = checkpoint['std']
+            self.btc_model.load_state_dict(checkpoint['model'])
+
+
+        self.tonic_to_idx = {tonic: idx for idx, tonic in enumerate(tonic_signatures)}
+        self.mode_to_idx = {mode: idx for idx, mode in enumerate(mode_signatures)}
+        self.idx_to_tonic = {idx: tonic for tonic, idx in self.tonic_to_idx.items()}
+        self.idx_to_mode = {idx: mode for mode, idx in self.mode_to_idx.items()}
+
+        with open('inference/data/chord.json', 'r') as f:
+            self.chord_to_idx = json.load(f)
+        with open('inference/data/chord_inv.json', 'r') as f:
+            self.idx_to_chord = json.load(f)
+        self.idx_to_chord = {int(k): v for k, v in self.idx_to_chord.items()} # Ensure keys are ints
+        with open('inference/data/chord_root.json') as json_file:
+            self.chordRootDic = json.load(json_file)
+        with open('inference/data/chord_attr.json') as json_file:
+            self.chordAttrDic = json.load(json_file)
+
+
+
    def predict(self, audio, threshold = 0.5):

        feature_dir = Path("./inference/temp_out")
@@ -263,23 +300,11 @@ class Music2emo:
        final_embedding_mert.to(self.device)

        # --- Chord feature extract ---
-        config = HParams.load("./inference/data/run_config.yaml")
-        config.feature['large_voca'] = True
-        config.model['num_chords'] = 170
-        model_file = './inference/data/btc_model_large_voca.pt'
-        idx_to_chord = idx2voca_chord()
-        model = BTC_model(config=config.model).to(self.device)
-
-        if os.path.isfile(model_file):
-            checkpoint = torch.load(model_file, map_location=self.device)
-            mean = checkpoint['mean']
-            std = checkpoint['std']
-            model.load_state_dict(checkpoint['model'])

        audio_path = audio
        audio_id = audio_path.split("/")[-1][:-4]
        try:
-            feature, feature_per_second, song_length_second = audio_file_to_features(audio_path, config)
+            feature, feature_per_second, song_length_second = audio_file_to_features(audio_path, self.config)
        except:
            logger.info("audio file failed to load : %s" % audio_path)
            assert(False)
@@ -287,9 +312,9 @@ class Music2emo:
        logger.info("audio file loaded and feature computation success : %s" % audio_path)

        feature = feature.T
-        feature = (feature - mean) / std
+        feature = (feature - self.mean) / self.std
        time_unit = feature_per_second
-        n_timestep = config.model['timestep']
+        n_timestep = self.config.model['timestep']

        num_pad = n_timestep - (feature.shape[0] % n_timestep)
        feature = np.pad(feature, ((0, num_pad), (0, 0)), mode="constant", constant_values=0)
@@ -298,11 +323,11 @@ class Music2emo:
        start_time = 0.0
        lines = []
        with torch.no_grad():
-            model.eval()
+            self.btc_model.eval()
            feature = torch.tensor(feature, dtype=torch.float32).unsqueeze(0).to(self.device)
            for t in range(num_instance):
-                self_attn_output, _ = model.self_attn_layers(feature[:, n_timestep * t:n_timestep * (t + 1), :])
-                prediction, _ = model.output_layer(self_attn_output)
+                self_attn_output, _ = self.btc_model.self_attn_layers(feature[:, n_timestep * t:n_timestep * (t + 1), :])
+                prediction, _ = self.btc_model.output_layer(self_attn_output)
                prediction = prediction.squeeze()
                for i in range(n_timestep):
                    if t == 0 and i == 0:
@@ -310,12 +335,12 @@ class Music2emo:
                        continue
                    if prediction[i].item() != prev_chord:
                        lines.append(
-                            '%.3f %.3f %s\n' % (start_time, time_unit * (n_timestep * t + i), idx_to_chord[prev_chord]))
+                            '%.3f %.3f %s\n' % (start_time, time_unit * (n_timestep * t + i), self.idx_to_voca[prev_chord]))
                        start_time = time_unit * (n_timestep * t + i)
                        prev_chord = prediction[i].item()
                    if t == num_instance - 1 and i + num_pad == n_timestep:
                        if start_time != time_unit * (n_timestep * t + i):
-                            lines.append('%.3f %.3f %s\n' % (start_time, time_unit * (n_timestep * t + i), idx_to_chord[prev_chord]))
+                            lines.append('%.3f %.3f %s\n' % (start_time, time_unit * (n_timestep * t + i), self.idx_to_voca[prev_chord]))
                        break

        save_path = os.path.join(feature_dir, os.path.split(audio_path)[-1].replace('.mp3', '').replace('.wav', '') + '.lab')
@@ -356,24 +381,9 @@ class Music2emo:
        midi.instruments.append(instrument)
        midi.write(save_path.replace('.lab', '.midi'))

-        tonic_signatures = ["A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"]
-        mode_signatures = ["major", "minor"] # Major and minor modes

-        tonic_to_idx = {tonic: idx for idx, tonic in enumerate(tonic_signatures)}
-        mode_to_idx = {mode: idx for idx, mode in enumerate(mode_signatures)}
-        idx_to_tonic = {idx: tonic for tonic, idx in tonic_to_idx.items()}
-        idx_to_mode = {idx: mode for mode, idx in mode_to_idx.items()}
-
-        with open('inference/data/chord.json', 'r') as f:
-            chord_to_idx = json.load(f)
-        with open('inference/data/chord_inv.json', 'r') as f:
-            idx_to_chord = json.load(f)
-        idx_to_chord = {int(k): v for k, v in idx_to_chord.items()} # Ensure keys are ints
-        with open('inference/data/chord_root.json') as json_file:
-            chordRootDic = json.load(json_file)
-        with open('inference/data/chord_attr.json') as json_file:
-            chordAttrDic = json.load(json_file)

+
        try:
            midi_file = converter.parse(save_path.replace('.lab', '.midi'))
            key_signature = str(midi_file.analyze('key'))
@@ -390,7 +400,7 @@ class Music2emo:
        else:
            mode = key_signature.split()[-1]

-        encoded_mode = mode_to_idx.get(mode, 0)
+        encoded_mode = self.mode_to_idx.get(mode, 0)
        mode_tensor = torch.tensor([encoded_mode], dtype=torch.long).to(self.device)

        converted_lines = normalize_chord(save_path, key_signature, key_type)
@@ -419,19 +429,19 @@ class Music2emo:
        for start, end, chord in chords:
            chord_arr = chord.split(":")
            if len(chord_arr) == 1:
-                chordRootID = chordRootDic[chord_arr[0]]
+                chordRootID = self.chordRootDic[chord_arr[0]]
                if chord_arr[0] == "N" or chord_arr[0] == "X":
                    chordAttrID = 0
                else:
                    chordAttrID = 1
            elif len(chord_arr) == 2:
-                chordRootID = chordRootDic[chord_arr[0]]
-                chordAttrID = chordAttrDic[chord_arr[1]]
+                chordRootID = self.chordRootDic[chord_arr[0]]
+                chordAttrID = self.chordAttrDic[chord_arr[1]]
            encoded_root.append(chordRootID)
            encoded_attr.append(chordAttrID)

-            if chord in chord_to_idx:
-                encoded.append(chord_to_idx[chord])
+            if chord in self.chord_to_idx:
+                encoded.append(self.chord_to_idx[chord])
            else:
                print(f"Warning: Chord {chord} not found in chord.json. Skipping.")

@@ -585,14 +595,6 @@ with gr.Blocks(css=css) as demo:
                elem_id="output-text"
            )

-    # Add example usage
-    # gr.Examples(
-    #     examples=["inference/input/test.mp3"],
-    #     inputs=input_audio,
-    #     outputs=output_text,
-    #     fn=lambda x: format_prediction(music2emo.predict(x, 0.5)),
-    #     cache_examples=True
-    # )

    predict_btn.click(
        fn=lambda audio, thresh: format_prediction(music2emo.predict(audio, thresh)),
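
Net effect of the commit: the BTC chord model (btc_model_large_voca.pt), its run_config.yaml hyperparameters, and the chord/key lookup tables are loaded once in Music2emo.__init__ and reused by every predict() call, instead of being re-read from disk inside predict(). A minimal usage sketch with the names from app.py, assuming Music2emo's remaining constructor arguments (elided in the hunks above) keep their defaults:

# Sketch only; assumes the app.py definitions above are in scope and that
# Music2emo() is constructible with its default arguments.
music2emo = Music2emo()  # loads music2emo_model, btc_model, config, mean/std and the chord dicts once

# Repeated calls reuse the cached self.btc_model, self.config, self.mean/self.std,
# self.chord_to_idx, self.chordRootDic, self.chordAttrDic, ...
output = music2emo.predict("inference/input/test.mp3", threshold=0.5)
print(format_prediction(output))

This mirrors the Gradio wiring at the end of the diff, where a single shared music2emo instance serves every predict_btn.click call.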