# Import part
import streamlit as st
from transformers import pipeline
from PIL import Image
import soundfile as sf
import io
import numpy as np


# Function part
@st.cache_resource(show_spinner=False)
def _get_pipeline(task, model):
    """Load a HF中国镜像站 pipeline once and reuse it across Streamlit reruns.

    Streamlit reruns the whole script on every widget interaction; without
    caching, each rerun re-initialises (and possibly re-downloads) the model.
    """
    return pipeline(task, model=model)


# img2text
def img2text(img):
    """Caption an image with BLIP and return the generated caption text."""
    image_to_text_model = _get_pipeline(
        "image-to-text", "Salesforce/blip-image-captioning-base"
    )
    return image_to_text_model(img)[0]["generated_text"]


# text2story (modified prompt, no exception handling)
def text2story(text):
    """Generate a 100-200 word story about *text* using a chat LLM.

    Returns the assistant's reply as a plain string.
    """
    # Create the messages list with the user-provided subject
    messages = [
        {
            "role": "user",
            "content": (
                f"write me a story of {text}. "
                "Story need to be at least 100 words and less than 200 words"
            ),
        }
    ]
    pipe = _get_pipeline("text-generation", "Qwen/Qwen2.5-1.5B-Instruct")
    # max_length / min_length are generation kwargs, so pass them at call
    # time (not at pipeline construction) to bound the story length.
    story = pipe(messages, max_length=260, min_length=130)
    # Chat pipelines echo the whole conversation in `generated_text`; the
    # assistant's reply is the LAST message. Using [-1] instead of the
    # hard-coded [1] keeps this working if the prompt ever gains turns.
    return story[0]["generated_text"][-1]["content"]


# text2audio
def text2audio(story_text):
    """Synthesise *story_text* to speech and return the WAV file path."""
    pipe = _get_pipeline("text-to-speech", "facebook/mms-tts-eng")

    # Generate audio
    audio_output = pipe(story_text)

    # Extract audio data and ensure it's float32, which soundfile expects.
    audio_data = audio_output["audio"]
    if not isinstance(audio_data, np.ndarray):
        audio_data = np.array(audio_data, dtype=np.float32)
    elif audio_data.dtype != np.float32:
        audio_data = audio_data.astype(np.float32)

    # Get sampling rate (default to 16000 if not provided)
    sampling_rate = audio_output.get("sampling_rate", 16000)

    # soundfile needs a mono 1-D signal here; flatten any extra dims.
    if len(audio_data.shape) > 1:
        audio_data = audio_data.flatten()

    # Save the audio to a file
    output_file = "story_output.wav"
    sf.write(output_file, audio_data, sampling_rate)
    return output_file


# Main part
st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
st.header("Turn Your Image to Audio Story")

uploaded_file = st.file_uploader("Select an Image...", type=["jpg", "png", "jpeg"])

if uploaded_file is not None:
    # Display the uploaded image
    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # Stage 1: Image to Text
    st.markdown("### *Processing img2text...*")
    scenario = img2text(image)
    st.write(scenario)

    # Stage 2: Text to Story.
    # Cache the story in session_state keyed on the caption: clicking the
    # "Generate Audio" button below reruns this whole script, and without
    # this guard a *new* story would be generated, so the narrated audio
    # would never match the story shown on screen.
    st.success("Generating a story...")
    if st.session_state.get("story_scenario") != scenario:
        st.session_state["story"] = text2story(scenario)
        st.session_state["story_scenario"] = scenario
    story = st.session_state["story"]
    st.write(story)

    # Stage 3: Story to Audio data
    if st.button("Generate Audio"):
        st.info("Generating audio data...")
        audio_file = text2audio(story)
        # Play the file text2audio actually wrote (the path was previously
        # hard-coded here, silently ignoring the function's return value).
        st.audio(audio_file)

# Footer
st.write("Powered by HF中国镜像站 Transformers and Streamlit")