Update README.md
Browse files
README.md
CHANGED
@@ -1,3 +1,220 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Multimodal Classification Model (Tamil, Malayalam, Telugu)
|
2 |
+
|
3 |
+
This repository contains deep learning models for **text and audio classification** in three languages: **Tamil, Malayalam, and Telugu**.
|
4 |
+
|
5 |
+
---
|
6 |
+
|
7 |
+
## 📌 Overview
|
8 |
+
|
9 |
+
The models accept **text and audio inputs** and classify them into predefined categories. Each language has dedicated trained models and label encoders:
|
10 |
+
|
11 |
+
- **Text Model:** Uses `xlm-roberta-large` to extract mean-pooled sentence embeddings, which are then passed to a deep learning classifier.
|
12 |
+
- **Audio Model:** Uses **MFCC feature extraction** and a CNN-based classifier.
|
13 |
+
|
14 |
+
---
|
15 |
+
|
16 |
+
## 🛠 1. Setup
|
17 |
+
|
18 |
+
### 1.1 Clone the Repository
|
19 |
+
|
20 |
+
```bash
|
21 |
+
git clone https://huggingface.co/<your-model-repo>
|
22 |
+
cd <your-model-repo>
|
23 |
+
```
|
24 |
+
|
25 |
+
### 1.2 Install Dependencies
|
26 |
+
|
27 |
+
Ensure Python is installed, then run:
|
28 |
+
|
29 |
+
```bash
|
30 |
+
pip install -r requirements.txt
|
31 |
+
```
|
32 |
+
|
33 |
+
---
|
34 |
+
|
35 |
+
## 📂 2. Directory Structure
|
36 |
+
|
37 |
+
```
|
38 |
+
├── audio_label_encoders/ # Label encoders for audio models
|
39 |
+
├── audio_models/ # Trained audio classification models
|
40 |
+
├── text_label_encoders/ # Label encoders for text models
|
41 |
+
└── text_models/ # Trained text classification models
|
42 |
+
```
|
43 |
+
|
44 |
+
Each folder contains three files, one per language (**Tamil, Malayalam, and Telugu**), for example `text_models/tamil_text_model.h5`.
|
45 |
+
|
46 |
+
---
|
47 |
+
|
48 |
+
## 🚀 3. How to Use
|
49 |
+
|
50 |
+
### 3.1 Load the Models
|
51 |
+
|
52 |
+
```python
|
53 |
+
import tensorflow as tf
|
54 |
+
import pickle
|
55 |
+
import numpy as np
|
56 |
+
import torch
|
57 |
+
from transformers import AutoTokenizer, AutoModel
|
58 |
+
|
59 |
+
# Load Label Encoders
|
60 |
+
with open("text_label_encoders/tamil_label_encoder.pkl", "rb") as f:
|
61 |
+
tamil_text_label_encoder = pickle.load(f)
|
62 |
+
|
63 |
+
with open("audio_label_encoders/tamil_audio_label_encoder.pkl", "rb") as f:
|
64 |
+
tamil_audio_label_encoder = pickle.load(f)
|
65 |
+
|
66 |
+
# Load Models
|
67 |
+
text_model = tf.keras.models.load_model("text_models/tamil_text_model.h5")
|
68 |
+
audio_model = tf.keras.models.load_model("audio_models/tamil_audio_model.keras")
|
69 |
+
```
|
70 |
+
|
71 |
+
---
|
72 |
+
|
73 |
+
## 📝 4. Text Classification
|
74 |
+
|
75 |
+
### 4.1 Preprocess Text
|
76 |
+
|
77 |
+
```python
|
78 |
+
from indicnlp.tokenize import indic_tokenize
|
79 |
+
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
|
80 |
+
import advertools as adv
|
81 |
+
|
82 |
+
stopwords = list(sorted(adv.stopwords["tamil"]))
|
83 |
+
|
84 |
+
def preprocess_tamil_text(text):
|
85 |
+
tokens = list(indic_tokenize.trivial_tokenize(text, lang="ta"))
|
86 |
+
tokens = [token for token in tokens if token not in stopwords]
|
87 |
+
return " ".join(tokens)
|
88 |
+
```
|
89 |
+
|
90 |
+
### 4.2 Extract Features and Predict
|
91 |
+
|
92 |
+
```python
|
93 |
+
def extract_embeddings(model_name, texts):
|
94 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
95 |
+
model = AutoModel.from_pretrained(model_name)
|
96 |
+
model.eval()
|
97 |
+
|
98 |
+
embeddings = []
|
99 |
+
batch_size = 16
|
100 |
+
with torch.no_grad():
|
101 |
+
for i in range(0, len(texts), batch_size):
|
102 |
+
batch_texts = texts[i:i + batch_size]
|
103 |
+
encoded_inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
|
104 |
+
outputs = model(**encoded_inputs)
|
105 |
+
batch_embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
|
106 |
+
embeddings.extend(batch_embeddings)
|
107 |
+
return np.array(embeddings)
|
108 |
+
|
109 |
+
feature_extractor = "xlm-roberta-large"
|
110 |
+
text = "உங்கள் உதவி மிகவும் பயனுள்ளதாக இருந்தது"
|
111 |
+
processed_text = preprocess_tamil_text(text)
|
112 |
+
text_embeddings = extract_embeddings(feature_extractor, [processed_text])
|
113 |
+
|
114 |
+
text_predictions = text_model.predict(text_embeddings)
|
115 |
+
predicted_label = tamil_text_label_encoder.inverse_transform(np.argmax(text_predictions, axis=1))
|
116 |
+
print("Predicted Label:", predicted_label[0])
|
117 |
+
```
|
118 |
+
|
119 |
+
---
|
120 |
+
|
121 |
+
## 🔊 5. Audio Classification
|
122 |
+
|
123 |
+
### 5.1 Preprocess Audio
|
124 |
+
|
125 |
+
```python
|
126 |
+
import librosa
|
127 |
+
|
128 |
+
def extract_audio_features(file_path, sr=22050, n_mfcc=40):
|
129 |
+
audio, _ = librosa.load(file_path, sr=sr)
|
130 |
+
mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
|
131 |
+
return np.mean(mfccs.T, axis=0)
|
132 |
+
```
|
133 |
+
|
134 |
+
### 5.2 Predict Audio Class
|
135 |
+
|
136 |
+
```python
|
137 |
+
def predict_audio(file_path):
|
138 |
+
features = extract_audio_features(file_path)
|
139 |
+
reshaped_features = features.reshape((1, 40, 1, 1))
|
140 |
+
predicted_class = np.argmax(audio_model.predict(reshaped_features), axis=1)
|
141 |
+
predicted_label = tamil_audio_label_encoder.inverse_transform(predicted_class)
|
142 |
+
return predicted_label[0]
|
143 |
+
|
144 |
+
audio_file = "test_audio.wav"
|
145 |
+
predicted_audio_label = predict_audio(audio_file)
|
146 |
+
print("Predicted Audio Label:", predicted_audio_label)
|
147 |
+
```
|
148 |
+
|
149 |
+
---
|
150 |
+
|
151 |
+
## 📊 6. Batch Processing for a Dataset
|
152 |
+
|
153 |
+
### 6.1 Load Dataset
|
154 |
+
|
155 |
+
```python
|
156 |
+
import os
|
157 |
+
import pandas as pd
|
158 |
+
|
159 |
+
def load_dataset(base_dir='../test', lang='tamil'):
|
160 |
+
dataset = []
|
161 |
+
lang_dir = os.path.join(base_dir, lang)
|
162 |
+
audio_dir = os.path.join(lang_dir, "audio")
|
163 |
+
text_dir = os.path.join(lang_dir, "text")
|
164 |
+
|
165 |
+
text_file = os.path.join(text_dir, [file for file in os.listdir(text_dir) if file.endswith(".xlsx")][0])
|
166 |
+
text_df = pd.read_excel(text_file)
|
167 |
+
|
168 |
+
for file in text_df["File Name"]:
|
169 |
+
if (file + ".wav") in os.listdir(audio_dir):
|
170 |
+
audio_path = os.path.join(audio_dir, file + ".wav")
|
171 |
+
transcript_row = text_df.loc[text_df["File Name"] == file]
|
172 |
+
transcript = transcript_row.iloc[0]["Transcript"] if not transcript_row.empty else ""
|
173 |
+
dataset.append({"File Name": audio_path, "Transcript": transcript})
|
174 |
+
else:
|
175 |
+
transcript_row = text_df.loc[text_df["File Name"] == file]
|
176 |
+
transcript = transcript_row.iloc[0]["Transcript"] if not transcript_row.empty else ""
|
177 |
+
dataset.append({"File Name": "Nil", "Transcript": transcript})
|
178 |
+
|
179 |
+
return pd.DataFrame(dataset)
|
180 |
+
|
181 |
+
dataset_df = load_dataset()
|
182 |
+
```
|
183 |
+
|
184 |
+
### 6.2 Predict Text and Audio in Bulk
|
185 |
+
|
186 |
+
```python
|
187 |
+
dataset_df["Transcript"] = dataset_df["Transcript"].apply(preprocess_tamil_text)
|
188 |
+
text_embeddings = extract_embeddings(feature_extractor, dataset_df["Transcript"].tolist())
|
189 |
+
text_predictions = text_model.predict(text_embeddings)
|
190 |
+
text_labels = tamil_text_label_encoder.inverse_transform(np.argmax(text_predictions, axis=1))
|
191 |
+
|
192 |
+
dataset_df["Predicted Text Label"] = text_labels
|
193 |
+
dataset_df["Predicted Audio Label"] = dataset_df["File Name"].apply(lambda x: predict_audio(x) if x != "Nil" else "No Audio")
|
194 |
+
dataset_df.to_csv("predictions.tsv", sep="\t", index=False)
|
195 |
+
```
|
196 |
+
|
197 |
+
---
|
198 |
+
|
199 |
+
## ☁️ 7. Deployment on Hugging Face
|
200 |
+
|
201 |
+
```bash
|
202 |
+
pip install huggingface_hub
|
203 |
+
huggingface-cli login
|
204 |
+
```
|
205 |
+
|
206 |
+
```python
|
207 |
+
from huggingface_hub import upload_file
|
208 |
+
|
209 |
+
upload_file(path_or_fileobj="text_models/tamil_text_model.h5", path_in_repo="text_models/tamil_text_model.h5", repo_id="<your-hf-repo>")
|
210 |
+
```
|
211 |
+
|
212 |
+
---
|
213 |
+
|
214 |
+
## 📬 Contact
|
215 |
+
|
216 |
+
For issues or suggested improvements, please open an issue on this repository or email [**[email protected]**](mailto:[email protected]).
|
217 |
+
|
218 |
+
---
|
219 |
+
|
220 |
+
**License:** CC BY-NC 4.0
|