ojasrohatgi committed on
Commit
a3962d0
·
1 Parent(s): 64fa951

Add application file

Browse files
Files changed (1) hide show
  1. app.py +161 -0
app.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ import streamlit as st
3
+ import requests
4
+ from bs4 import BeautifulSoup
5
+ import html
6
+ import time
7
+ from io import BytesIO
8
+ from reportlab.lib.pagesizes import A4
9
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
10
+ from reportlab.platypus import SimpleDocTemplate, Paragraph
11
+ from reportlab.lib.enums import TA_JUSTIFY
12
+
13
# Set page layout to wide. Kept ahead of every other Streamlit call,
# including the cached model loader below, which may render a spinner.
st.set_page_config(layout="wide")


@st.cache_resource
def _load_summarizer():
    """Build the BART summarization pipeline once and reuse it.

    Streamlit re-executes this script from the top on every widget
    interaction; caching the pipeline avoids re-instantiating the model
    on each rerun.
    """
    return pipeline("summarization", model="facebook/bart-large-cnn")


# Initialize the summarization pipeline (same module-level name as before,
# so callers keep using ``summarizer(...)`` directly).
summarizer = _load_summarizer()
18
+
19
+
20
+ # Function to create PDF with justified text
21
def create_pdf(text):
    """Render *text* as a justified A4 PDF held entirely in memory.

    Args:
        text: Plain text to place in the document. It is treated as literal
            text, not as ReportLab paragraph markup; ``None`` is rendered
            as an empty paragraph.

    Returns:
        A ``BytesIO`` buffer containing the PDF, rewound to position 0 so it
        can be handed directly to a reader such as ``st.download_button``.
    """
    # Local import keeps this block self-contained (stdlib only).
    from xml.sax.saxutils import escape

    # Build into a BytesIO buffer to avoid saving the PDF to disk.
    pdf_buffer = BytesIO()
    doc = SimpleDocTemplate(pdf_buffer, pagesize=A4)

    # Justified body text derived from the stock "BodyText" style.
    justified_style = ParagraphStyle(
        name="JustifiedStyle",
        parent=getSampleStyleSheet()["BodyText"],
        alignment=TA_JUSTIFY,
        fontSize=12,
        leading=15,  # Adjust line spacing as needed
    )

    # Paragraph parses its input as XML-like markup, so raw '&', '<' or '>'
    # in free-form text must be escaped before it is laid out.
    paragraph = Paragraph(escape(text or ""), justified_style)

    # Build the PDF in the buffer.
    doc.build([paragraph])

    # Move the buffer to the beginning so the caller can read it.
    pdf_buffer.seek(0)
    return pdf_buffer
48
+
49
+
50
+ # Main application
51
def main():
    """Run the Streamlit page: fetch an article, summarize it, offer a PDF.

    Reads a URL from a text input, extracts the ``<h1>``/``<p>`` text of the
    page, summarizes it chunk by chunk with the module-level ``summarizer``,
    shows progress while doing so, and finally displays the summary, a PDF
    download button and a compression verdict. Failures are reported in the
    page instead of being raised.
    """
    st.title("Article Extractor and Summarizer")

    # Get URL from the user
    url = st.text_input("Share an article URL:", key="url")

    # Maximum number of words per chunk handed to the summarizer
    max_chunk = 300

    if not url:
        return

    try:
        article = _fetch_article_text(url.strip())

        # Display the extracted article text in a scrollable window
        st.subheader("Extracted Article Content")
        st.text_area("Article", article, height=300)
        st.markdown(f"**Article Length:** {len(article)} characters")

        chunks = _split_into_chunks(article, max_chunk)
        if not chunks:
            # Nothing to summarize; stop before dividing by a zero word count.
            st.warning("Error: no article text could be extracted from this URL.")
            return

        # Streamlit progress bar, dynamic status display, and summaries list
        progress_bar = st.progress(0)
        status_text = st.empty()  # Placeholder for dynamic status updates
        summaries = []
        start_time = time.monotonic()

        # Summarize each chunk and update progress
        for i, chunk in enumerate(chunks):
            # truncation=True guards against a single over-long sentence
            # producing more tokens than the model accepts.
            summary = summarizer(
                chunk,
                max_length=120,
                min_length=30,
                do_sample=False,
                truncation=True,
            )
            summaries.append(summary[0]['summary_text'])

            # Linear extrapolation of the remaining time from the work done
            percent_complete = (i + 1) / len(chunks)
            elapsed_time = time.monotonic() - start_time
            estimated_total_time = elapsed_time / percent_complete
            estimated_time_remaining = estimated_total_time - elapsed_time

            progress_bar.progress(percent_complete)
            status_text.markdown(f"**Progress:** {percent_complete * 100:.2f}% - "
                                 f"**Estimated time remaining:** {estimated_time_remaining:.2f} seconds")

        # Combine summaries into a single text output
        summary_text = ' '.join(summaries)

        # Display the summarized text
        st.subheader("Summarized Article Content")
        st.text_area("Summary", summary_text, height=300)
        st.markdown(f"**Summary Length:** {len(summary_text)} characters")

        # Create the PDF from the summary text and offer it for download
        pdf_buffer = create_pdf(summary_text)
        st.download_button(
            label="Download Summary as PDF",
            data=pdf_buffer,
            file_name="summarized_article.pdf",
            mime="application/pdf"
        )

        # Display the compression ratio (word counts; the article has at
        # least one word here because ``chunks`` is non-empty)
        _report_compression(len(article.split()), len(summary_text.split()))

    except requests.RequestException as e:
        # Network / HTTP problems: timeout, DNS failure, 4xx/5xx status, ...
        st.warning(f"Error: could not fetch the article ({e})")
    except Exception as e:
        # Top-level UI boundary: show the problem instead of a traceback.
        st.warning(f"Error: {e}")


def _fetch_article_text(url, timeout=15):
    """Download *url* and return the joined text of its <h1> and <p> tags."""
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # NOTE(review): decoding is forced to UTF-8 regardless of what the
    # server declares; kept from the original behavior.
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    return ' '.join(
        html.unescape(node.get_text()) for node in soup.find_all(['h1', 'p'])
    )


def _split_into_chunks(article, max_chunk):
    """Pack whole sentences of *article* into strings of <= *max_chunk* words.

    Sentences are split only where '.', '?' or '!' is followed by
    whitespace, so tokens such as "3.14" stay intact. A single sentence
    longer than *max_chunk* words becomes a chunk of its own. Returns an
    empty list when *article* contains no words.
    """
    import re

    chunks = []
    current = []
    for sentence in re.split(r'(?<=[.!?])\s+', article.strip()):
        words = sentence.split()
        if not words:
            continue
        # Close the running chunk before it would overflow; never emit an
        # empty chunk, even when the very first sentence is oversized.
        if current and len(current) + len(words) > max_chunk:
            chunks.append(' '.join(current))
            current = []
        current.extend(words)
    if current:
        chunks.append(' '.join(current))
    return chunks


def _report_compression(original_length, summary_length):
    """Show the summary/original word ratio with a matching verdict."""
    compression_ratio = (summary_length / original_length) * 100

    if compression_ratio < 20:
        st.success(
            f"{round(compression_ratio)}% Great Compression!\nThe summary is succinct and effectively "
            f"highlights key points.")
    elif compression_ratio <= 40:
        st.info(
            f"{round(compression_ratio)}% Well-balanced Summary.\nIt maintains essential details while being "
            f"brief.")
    else:
        # A high ratio means the summary kept a large share of the words,
        # i.e. it is long rather than too brief.
        st.warning(
            f"{round(compression_ratio)}% Limited compression.\nThe summary is still long relative to the "
            f"article and may include non-essential details.")
157
+
158
+
159
# Run the app only when this file is executed as a script; importing the
# module does not call main().
if __name__ == '__main__':
    main()