Runtime error
Runtime error
Add application file
Browse files
@@ -0,0 +1,161 @@
1 |
from transformers import pipeline
2 |
import streamlit as st
3 |
import requests
4 |
from bs4 import BeautifulSoup
5 |
import html
6 |
import time
7 |
from io import BytesIO
8 |
from reportlab.lib.pagesizes import A4
9 |
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
10 |
from reportlab.platypus import SimpleDocTemplate, Paragraph
11 |
from reportlab.lib.enums import TA_JUSTIFY
12 |
13 |
# Module-level summarization pipeline shared by the whole app; it is built
# once at import time from the "facebook/bart-large-cnn" checkpoint.
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
15 |
16 |
# Set page layout to wide
17 |
18 |
19 |
20 |
# Function to create PDF with justified text
21 |
def create_pdf(text):
    """Render ``text`` as a single justified paragraph in an in-memory A4 PDF.

    Args:
        text: Plain text to place in the document. It is escaped before
            rendering, so ``&``, ``<`` and ``>`` appear literally instead of
            being parsed as ReportLab paragraph markup.

    Returns:
        A ``BytesIO`` buffer holding the finished PDF, positioned at offset 0
        so the caller can read it immediately.
    """
    # Keep the PDF in memory to avoid writing it to disk.
    pdf_buffer = BytesIO()

    # Define the PDF document layout and page size.
    doc = SimpleDocTemplate(pdf_buffer, pagesize=A4)

    # Define a style for justified text.
    # NOTE(review): every argument except ``leading`` was lost from the
    # available source. ``name``, ``parent`` and ``alignment`` are
    # reconstructed from the otherwise unused ``styles`` and ``TA_JUSTIFY``
    # names; confirm them (and any font size) against the original file.
    styles = getSampleStyleSheet()
    justified_style = ParagraphStyle(
        name="Justified",
        parent=styles["BodyText"],
        alignment=TA_JUSTIFY,
        leading=15,  # Adjust line spacing as needed
    )

    # Paragraph interprets its input as XML-like markup, so escape the text
    # to stop characters such as "&" or "<" from breaking the parser.
    paragraph = Paragraph(html.escape(text, quote=False), justified_style)

    # Build the PDF in the buffer.
    doc.build([paragraph])

    # Move the buffer to the beginning so Streamlit can read it.
    pdf_buffer.seek(0)
    return pdf_buffer
48 |
49 |
50 |
# Main application
51 |
def _fetch_article_text(url, timeout=30):
    """Download ``url`` and return the text of its <h1> and <p> elements."""
    # A timeout keeps the app from hanging forever on an unresponsive host.
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of summarizing an error page.
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all(['h1', 'p'])

    # Clean and concatenate text
    return ' '.join(html.unescape(result.get_text()) for result in results)


def _split_into_chunks(article, max_chunk):
    """Group the sentences of ``article`` into chunks of at most ``max_chunk`` words.

    Sentences are never split, so one sentence longer than ``max_chunk``
    words becomes an oversized chunk of its own. Empty chunks are dropped.
    """
    # Mark sentence boundaries after ".", "?" and "!" and split on the marker.
    marked = article.replace('.', '.<eos>').replace('?', '?<eos>').replace('!', '!<eos>')

    chunks = [[]]
    for sentence in marked.split('<eos>'):
        # split() without arguments discards the empty strings that
        # split(' ') would produce for leading or repeated whitespace.
        words = sentence.split()
        if not words:
            continue
        if len(chunks[-1]) + len(words) <= max_chunk:
            chunks[-1].extend(words)
        else:
            chunks.append(words)

    # Join words back to form full sentences for each chunk
    return [' '.join(words) for words in chunks if words]


def main():
    """Run the Streamlit page: fetch an article, summarize it, offer a PDF.

    Reads a URL from a text input, shows the extracted article text,
    summarizes it chunk by chunk with the module-level ``summarizer`` while
    updating a progress bar, then shows the summary, a PDF download button
    and a compression-ratio message. Any failure is shown as a warning.
    """
    st.title("Article Extractor and Summarizer")

    # Get URL from the user
    url = st.text_input("Share an article URL:", key="url")

    # Define max chunk size (in words) to split article into manageable parts
    max_chunk = 300

    if not url:
        return

    try:
        # Fetch and parse the article
        article = _fetch_article_text(url)
        if not article.strip():
            # Nothing to summarize; this also avoids dividing by zero when
            # the compression ratio is computed further down.
            st.warning("No article text could be extracted from this URL.")
            return

        # Display the extracted article text in a scrollable window
        st.subheader("Extracted Article Content")
        st.text_area("Article", article, height=300)
        st.markdown(f"**Article Length:** {len(article)} characters")

        # Split text into manageable chunks
        chunks = _split_into_chunks(article, max_chunk)

        # Streamlit progress bar, dynamic status display, and summaries list
        progress_bar = st.progress(0)
        status_text = st.empty()  # Placeholder for dynamic status updates
        summaries = []
        # A monotonic clock is unaffected by system clock adjustments.
        start_time = time.monotonic()

        # Summarize each chunk and update progress
        for i, chunk in enumerate(chunks):
            # truncation=True guards against a single over-long sentence
            # exceeding the model's maximum input length.
            summary = summarizer(chunk, max_length=120, min_length=30,
                                 do_sample=False, truncation=True)
            summaries.append(summary[0]['summary_text'])

            # Calculate and display percentage completed and estimated time
            percent_complete = (i + 1) / len(chunks)
            elapsed_time = time.monotonic() - start_time
            estimated_total_time = elapsed_time / percent_complete
            estimated_time_remaining = estimated_total_time - elapsed_time

            # Update progress bar and status text
            progress_bar.progress(percent_complete)
            status_text.markdown(f"**Progress:** {percent_complete * 100:.2f}% - "
                                 f"**Estimated time remaining:** {estimated_time_remaining:.2f} seconds")

        # Combine summaries into a single text output
        summary_text = ' '.join(summaries)

        # Display the summarized text
        st.subheader("Summarized Article Content")
        st.text_area("Summary", summary_text, height=300)
        st.markdown(f"**Summary Length:** {len(summary_text)} characters")

        # Create the PDF from the summary text with justified alignment and wrapping
        pdf_buffer = create_pdf(summary_text)

        # Display the download button for the PDF
        # NOTE(review): only ``label`` survived in the available source; the
        # ``data``, ``file_name`` and ``mime`` arguments are reconstructed.
        st.download_button(
            label="Download Summary as PDF",
            data=pdf_buffer,
            file_name="summary.pdf",
            mime="application/pdf",
        )

        # Display the compression ratio (summary words as a % of article words)
        original_length = len(article.split())
        summary_length = len(summary_text.split())
        compression_ratio = (summary_length / original_length) * 100

        # Evaluate if the compression ratio is good or bad
        # NOTE(review): the st.success / st.info / st.warning calls and the
        # tail of the middle message ("concise.") were lost from the available
        # source and are reconstructed; confirm against the original file.
        if compression_ratio < 20:
            st.success(
                f"{round(compression_ratio)}% Great Compression!\nThe summary is succinct and effectively "
                f"highlights key points.")
        elif 20 <= compression_ratio <= 40:
            st.info(
                f"{round(compression_ratio)}% Well-balanced Summary.\nIt maintains essential details while being "
                f"concise.")
        else:
            # NOTE(review): this branch runs when the summary is MORE than 40%
            # of the article, i.e. lightly compressed, yet the message says
            # the summary may be too brief. Wording kept as found; confirm
            # the intended text.
            st.warning(
                f"{round(compression_ratio)}% Compression may be excessive.\nThe summary could be too brief and "
                f"miss important details.")

    except Exception as e:
        # Top-level UI boundary: surface any failure (network, parsing,
        # model, PDF) to the user instead of crashing the page.
        st.warning(f"Error: {e}")
157 |
158 |
159 |
# Run the app
160 |
if __name__ == '__main__':
161 |