import streamlit as st
from transformers import pipeline
from PIL import Image
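

# Load both models once and cache them across Streamlit reruns;
# rebuilding the pipelines on every widget interaction would reload
# the model weights and make the app unusably slow.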
@st.cache_resource
def load_pipelines():
    image_to_text = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
    text_to_speech = pipeline("text-to-speech", model="facebook/mms-tts-eng")
    return image_to_text, text_to_speech


image_to_text, text_to_speech = load_pipelines()

st.title("Image-to-Text and Text-to-Speech App")

uploaded_image = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"])

if uploaded_image:
    image = Image.open(uploaded_image).convert("RGB")  # normalize mode (e.g. RGBA PNGs)
    st.image(image, caption="Uploaded Image", use_container_width=True)
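
    # nlpconnect/vit-gpt2-image-captioning is a captioning model: the
    # output is a generated description of the image, not OCR'd text.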
    text_output = image_to_text(image)[0]["generated_text"]
    st.write("### Generated Caption:")
    st.write(text_output)
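
    # The text-to-speech pipeline returns a dict holding a raw waveform
    # (a NumPy array) and its sampling rate, not encoded WAV bytes, so
    # the samples go to st.audio directly instead of through BytesIO.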
    speech_output = text_to_speech(text_output)
    audio = speech_output["audio"].squeeze()  # flatten a possible (1, n_samples) shape

    st.write("### Listen to Speech Output:")
    st.audio(audio, sample_rate=speech_output["sampling_rate"])
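
# To try the app locally (assuming this file is saved as app.py; these
# package names are the usual dependencies for the two pipelines):
#   pip install streamlit transformers torch pillow
#   streamlit run app.py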