ojasrohatgi committed on
Commit 06ae4c0 · verified · 1 Parent(s): ebc9977

Update app.py

Files changed (1)
  1. app.py +33 -53
app.py CHANGED
@@ -9,6 +9,7 @@ from reportlab.lib.pagesizes import A4
 from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
 from reportlab.platypus import SimpleDocTemplate, Paragraph
 from reportlab.lib.enums import TA_JUSTIFY
+import pyttsx3
 
 # Initialize the summarization pipeline
 summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
@@ -19,68 +20,54 @@ st.set_page_config(layout="wide")
 
 # Function to create PDF with justified text
 def create_pdf(text):
-    # Create a BytesIO buffer to avoid saving the PDF to disk
     pdf_buffer = BytesIO()
-
-    # Define the PDF document layout and page size
     doc = SimpleDocTemplate(pdf_buffer, pagesize=A4)
-
-    # Define a style for justified text
     styles = getSampleStyleSheet()
     justified_style = ParagraphStyle(
         name="JustifiedStyle",
        parent=styles["BodyText"],
         alignment=TA_JUSTIFY,
         fontSize=12,
-        leading=15  # Adjust line spacing as needed
+        leading=15
     )
-
-    # Create a Paragraph object with justified text
     paragraph = Paragraph(text, justified_style)
-
-    # Build the PDF in the buffer
-    elements = [paragraph]
-    doc.build(elements)
-
-    # Move the buffer to the beginning so Streamlit can read it
+    doc.build([paragraph])
     pdf_buffer.seek(0)
     return pdf_buffer
 
 
+# Function to read aloud the summary
+def read_aloud(text):
+    engine = pyttsx3.init()
+    engine.say(text)
+    engine.runAndWait()
+
+
 # Main application
 def main():
-    st.title("Article Extractor and Summarizer")
-
-    # Get URL from the user
-    url = st.text_input("Share an article URL:", key="url")
+    st.title("Enhanced Article Extractor and Summarizer")
 
-    # Define max chunk size to split article into manageable parts
+    url = st.text_input("Enter the URL of an article:", key="url")
     max_chunk = 300
 
     if url:
         try:
-            # Fetch and parse the article
             response = requests.get(url)
             response.encoding = 'utf-8'
             soup = BeautifulSoup(response.text, 'html.parser')
             results = soup.find_all(['h1', 'p'])
-
-            # Clean and concatenate text
             text = [html.unescape(result.get_text()) for result in results]
             article = ' '.join(text)
 
-            # Display the extracted article text in a scrollable window
             st.subheader("Extracted Article Content")
             st.text_area("Article", article, height=300)
             st.markdown(f"**Article Length:** {len(article)} characters")
 
-            # Preprocess text for chunking
             article = article.replace('.', '.<eos>').replace('?', '?<eos>').replace('!', '!<eos>')
             sentences = article.split('<eos>')
             current_chunk = 0
             chunks = [[]]
 
-            # Split text into manageable chunks
             for sentence in sentences:
                 if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
                     chunks[current_chunk].extend(sentence.split(' '))
@@ -88,69 +75,62 @@ def main():
                     current_chunk += 1
                     chunks.append(sentence.split(' '))
 
-            # Join words back to form full sentences for each chunk
             for chunk_id in range(len(chunks)):
                 chunks[chunk_id] = ' '.join(chunks[chunk_id])
 
-            # Streamlit progress bar, dynamic status display, and summaries list
             progress_bar = st.progress(0)
-            status_text = st.empty()  # Placeholder for dynamic status updates
+            status_text = st.empty()
             summaries = []
             start_time = time.time()
 
-            # Summarize each chunk and update progress
             for i, chunk in enumerate(chunks):
                 summary = summarizer(chunk, max_length=120, min_length=30, do_sample=False)
                 summaries.append(summary[0]['summary_text'])
 
-                # Calculate and display percentage completed and estimated time
                 percent_complete = (i + 1) / len(chunks)
                 elapsed_time = time.time() - start_time
                 estimated_total_time = elapsed_time / percent_complete
                 estimated_time_remaining = estimated_total_time - elapsed_time
 
-                # Update progress bar and status text
                 progress_bar.progress(percent_complete)
                 status_text.markdown(f"**Progress:** {percent_complete * 100:.2f}% - "
                                      f"**Estimated time remaining:** {estimated_time_remaining:.2f} seconds")
 
-            # Combine summaries into a single text output
             summary_text = ' '.join(summaries)
 
-            # Display the summarized text
             st.subheader("Summarized Article Content")
             st.text_area("Summary", summary_text, height=300)
             st.markdown(f"**Summary Length:** {len(summary_text)} characters")
 
-            # Create the PDF from the summary text with justified alignment and wrapping
             pdf_buffer = create_pdf(summary_text)
 
-            # Display the download button for the PDF
-            st.download_button(
-                label="Download Summary as PDF",
-                data=pdf_buffer,
-                file_name="summarized_article.pdf",
-                mime="application/pdf"
-            )
-
-            # Display the compression ratio
+            # Compression Ratio
             original_length = len(article.split())
             summary_length = len(summary_text.split())
             compression_ratio = (summary_length / original_length) * 100
 
-            # Evaluate if the compression ratio is good or bad
+            st.markdown(f"### Compression Ratio: {round(compression_ratio)}%")
             if compression_ratio < 20:
-                st.success(
-                    f"{round(compression_ratio)}% Great Compression!\nThe summary is succinct and effectively "
-                    f"highlights key points.")
+                st.success(f"Great Compression!\nThe summary is succinct and effectively highlights key points.")
             elif 20 <= compression_ratio <= 40:
-                st.info(
-                    f"{round(compression_ratio)}% Well-balanced Summary.\nIt maintains essential details while being "
-                    f"brief.")
+                st.info(f"Well-balanced Summary.\nIt maintains essential details while being brief.")
             else:
-                st.warning(
-                    f"{round(compression_ratio)}% Compression may be excessive.\nThe summary could be too brief and "
-                    f"miss important details.")
+                st.warning(f"Compression may be excessive.\nThe summary could be too brief and miss important details.")
+
+            # Display buttons in columns
+            col1, col2 = st.columns([1, 1])
+
+            with col1:
+                st.download_button(
+                    label="Download Summary as PDF",
+                    data=pdf_buffer,
+                    file_name="summarized_article.pdf",
+                    mime="application/pdf"
+                )
+
+            with col2:
+                if st.button("Read Aloud Summary"):
+                    read_aloud(summary_text)
 
         except Exception as e:
             st.warning(f"Error: {e}")
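For reference, the new read-aloud path relies on pyttsx3, which drives the local text-to-speech engine of the machine running the Python process, so in a hosted deployment the audio would play on the server rather than in the visitor's browser. Below is a minimal standalone sketch of the same pattern; the rate setting and the sample sentence are illustrative assumptions, not values taken from this commit.

```python
# Minimal standalone sketch of the read_aloud pattern added in this commit.
# pyttsx3 synthesizes speech with the local TTS driver of the machine that
# runs this process. The rate value and sample text below are assumptions,
# not values from app.py.
import pyttsx3

def read_aloud(text: str) -> None:
    engine = pyttsx3.init()          # select the platform's default TTS driver
    engine.setProperty("rate", 150)  # speaking rate in words per minute (assumed)
    engine.say(text)                 # queue the utterance
    engine.runAndWait()              # block until playback finishes

if __name__ == "__main__":
    read_aloud("This summary was generated from the extracted article.")
```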
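The chunking step in main() can also be read in isolation: sentence ends are tagged with an <eos> marker, and words are packed greedily into chunks of at most max_chunk words so each piece stays within the summarizer's input budget. A self-contained sketch of that logic follows; the helper name split_into_chunks is hypothetical, while the algorithm mirrors the app.

```python
# Standalone re-expression of the chunking logic from main(); the function
# name split_into_chunks is hypothetical, the steps mirror the app's code.
def split_into_chunks(article: str, max_chunk: int = 300) -> list[str]:
    # Tag sentence boundaries so the text can be split on them.
    for mark in ('.', '?', '!'):
        article = article.replace(mark, mark + '<eos>')
    sentences = article.split('<eos>')

    # Greedily pack whole sentences into word-limited chunks.
    chunks: list[list[str]] = [[]]
    current = 0
    for sentence in sentences:
        words = sentence.split(' ')
        if len(chunks[current]) + len(words) <= max_chunk:
            chunks[current].extend(words)
        else:
            current += 1
            chunks.append(words)
    return [' '.join(chunk) for chunk in chunks]

if __name__ == "__main__":
    print(split_into_chunks("First sentence. Second one! A third?", max_chunk=4))
```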