# NOTE: non-Python page-scrape artifacts removed here (Hugging Face Spaces
# header, "Runtime error" banner, file size, commit hash, line-number gutter).
from transformers import pipeline
import streamlit as st
import requests
from bs4 import BeautifulSoup
import html
import time
from io import BytesIO
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.enums import TA_JUSTIFY
# Configure the Streamlit page first: st.set_page_config must be the first
# Streamlit command executed on each script run, or Streamlit raises an error.
st.set_page_config(layout="wide")


@st.cache_resource
def _load_summarizer():
    """Load and cache the facebook/bart-large-cnn summarization pipeline.

    Streamlit re-executes the whole script on every widget interaction;
    without caching, the (large) BART model would be reloaded on each rerun.
    """
    return pipeline("summarization", model="facebook/bart-large-cnn")


# Module-level handle used by main(); resolved once per process via the cache.
summarizer = _load_summarizer()
def create_pdf(text):
    """Render *text* as a single justified paragraph in an A4 PDF.

    Parameters
    ----------
    text : str
        Plain text to render. It is XML-escaped before being handed to
        reportlab, because Paragraph parses its input as XML-like markup
        and a bare ``&`` or ``<`` in the summary would otherwise raise.

    Returns
    -------
    io.BytesIO
        In-memory PDF buffer, rewound to position 0 so callers
        (e.g. st.download_button) can read it directly — nothing is
        written to disk.
    """
    pdf_buffer = BytesIO()
    doc = SimpleDocTemplate(pdf_buffer, pagesize=A4)

    # Justified body style derived from the stock BodyText style.
    styles = getSampleStyleSheet()
    justified_style = ParagraphStyle(
        name="JustifiedStyle",
        parent=styles["BodyText"],
        alignment=TA_JUSTIFY,
        fontSize=12,
        leading=15,  # line spacing in points
    )

    # Escape XML special characters so arbitrary summary text cannot break
    # reportlab's inline-markup parser.
    paragraph = Paragraph(html.escape(text), justified_style)
    doc.build([paragraph])

    # Rewind so the caller can read from the start of the buffer.
    pdf_buffer.seek(0)
    return pdf_buffer
def main():
    """Streamlit entry point: fetch an article URL, summarize it in chunks,
    display the result, and offer a justified-PDF download."""
    st.title("Article Extractor and Summarizer")
    url = st.text_input("Share an article URL:", key="url")
    max_chunk = 300  # maximum words per chunk fed to the summarizer

    if not url:
        return
    try:
        article = _fetch_article(url)

        # Show the raw extracted text in a scrollable window.
        st.subheader("Extracted Article Content")
        st.text_area("Article", article, height=300)
        st.markdown(f"**Article Length:** {len(article)} characters")

        chunks = _chunk_text(article, max_chunk)
        summaries = _summarize_chunks(chunks)
        summary_text = ' '.join(summaries)

        st.subheader("Summarized Article Content")
        st.text_area("Summary", summary_text, height=300)
        st.markdown(f"**Summary Length:** {len(summary_text)} characters")

        # Build the PDF in memory and expose it via a download button.
        pdf_buffer = create_pdf(summary_text)
        st.download_button(
            label="Download Summary as PDF",
            data=pdf_buffer,
            file_name="summarized_article.pdf",
            mime="application/pdf"
        )

        _show_compression_ratio(article, summary_text)
    except Exception as e:
        # Top-level boundary: surface any fetch/parse/summarize failure to
        # the user instead of crashing the app.
        st.warning(f"Error: {e}")


def _fetch_article(url):
    """Fetch *url* and return the concatenated text of its <h1>/<p> elements."""
    # timeout: don't hang the app forever on an unresponsive host;
    # raise_for_status: fail loudly instead of summarizing an HTTP error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all(['h1', 'p'])
    return ' '.join(html.unescape(result.get_text()) for result in results)


def _chunk_text(article, max_chunk):
    """Split *article* into chunks of roughly *max_chunk* words each.

    Splits on sentence boundaries (., ?, !) so a sentence is never cut in
    half; a single sentence longer than *max_chunk* becomes its own chunk.
    Returns a list of chunk strings.
    """
    # Mark sentence ends with a sentinel, then split on it.
    marked = (article.replace('.', '.<eos>')
                     .replace('?', '?<eos>')
                     .replace('!', '!<eos>'))
    sentences = marked.split('<eos>')

    chunks = [[]]  # list of word lists
    current = 0
    for sentence in sentences:
        words = sentence.split(' ')
        if len(chunks[current]) + len(words) <= max_chunk:
            chunks[current].extend(words)
        else:
            current += 1
            chunks.append(words)
    return [' '.join(chunk) for chunk in chunks]


def _summarize_chunks(chunks):
    """Summarize each chunk with the module-level *summarizer*, updating a
    Streamlit progress bar and an estimated-time-remaining readout."""
    progress_bar = st.progress(0)
    status_text = st.empty()  # placeholder for dynamic status updates
    summaries = []
    start_time = time.time()

    for i, chunk in enumerate(chunks):
        result = summarizer(chunk, max_length=120, min_length=30, do_sample=False)
        summaries.append(result[0]['summary_text'])

        # Linear ETA: total time scales with the fraction of chunks done.
        percent_complete = (i + 1) / len(chunks)
        elapsed = time.time() - start_time
        remaining = elapsed / percent_complete - elapsed

        progress_bar.progress(percent_complete)
        status_text.markdown(f"**Progress:** {percent_complete * 100:.2f}% - "
                             f"**Estimated time remaining:** {remaining:.2f} seconds")
    return summaries


def _show_compression_ratio(article, summary_text):
    """Display the summary/original word-count ratio with a verdict message."""
    original_length = len(article.split())
    summary_length = len(summary_text.split())
    if original_length == 0:
        return  # empty article: nothing to compare, avoid ZeroDivisionError
    compression_ratio = (summary_length / original_length) * 100

    if compression_ratio < 20:
        st.success(
            f"{round(compression_ratio)}% Great Compression!\nThe summary is succinct and effectively "
            f"highlights key points.")
    elif 20 <= compression_ratio <= 40:
        st.info(
            f"{round(compression_ratio)}% Well-balanced Summary.\nIt maintains essential details while being "
            f"brief.")
    else:
        st.warning(
            f"{round(compression_ratio)}% Compression may be excessive.\nThe summary could be too brief and "
            f"miss important details.")
# Script entry point: launch the Streamlit app when run directly.
if __name__ == '__main__':
    main()
# (end of file — stray extraction artifact removed)