Spaces:

ojasrohatgi
/

Abstractive-Article_Summarizer

Runtime error

App Files Files Community

ojasrohatgi commited on Nov 18, 2024

Commit

a3962d0

1 Parent(s): 64fa951

Add application file

Browse files

Files changed (1) hide show

app.py +161 -0

app.py ADDED Viewed

	@@ -0,0 +1,161 @@

+from transformers import pipeline
+import streamlit as st
+import requests
+from bs4 import BeautifulSoup
+import html
+import time
+from io import BytesIO
+from reportlab.lib.pagesizes import A4
+from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+from reportlab.platypus import SimpleDocTemplate, Paragraph
+from reportlab.lib.enums import TA_JUSTIFY
+# Initialize the summarization pipeline
+summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+# Set page layout to wide
+st.set_page_config(layout="wide")
+# Function to create PDF with justified text
+def create_pdf(text):
+    # Create a BytesIO buffer to avoid saving the PDF to disk
+    pdf_buffer = BytesIO()
+    # Define the PDF document layout and page size
+    doc = SimpleDocTemplate(pdf_buffer, pagesize=A4)
+    # Define a style for justified text
+    styles = getSampleStyleSheet()
+    justified_style = ParagraphStyle(
+        name="JustifiedStyle",
+        parent=styles["BodyText"],
+        alignment=TA_JUSTIFY,
+        fontSize=12,
+        leading=15  # Adjust line spacing as needed
+    )
+    # Create a Paragraph object with justified text
+    paragraph = Paragraph(text, justified_style)
+    # Build the PDF in the buffer
+    elements = [paragraph]
+    doc.build(elements)
+    # Move the buffer to the beginning so Streamlit can read it
+    pdf_buffer.seek(0)
+    return pdf_buffer
+# Main application
+def main():
+    st.title("Article Extractor and Summarizer")
+    # Get URL from the user
+    url = st.text_input("Share an article URL:", key="url")
+    # Define max chunk size to split article into manageable parts
+    max_chunk = 300
+    if url:
+        try:
+            # Fetch and parse the article
+            response = requests.get(url)
+            response.encoding = 'utf-8'
+            soup = BeautifulSoup(response.text, 'html.parser')
+            results = soup.find_all(['h1', 'p'])
+            # Clean and concatenate text
+            text = [html.unescape(result.get_text()) for result in results]
+            article = ' '.join(text)
+            # Display the extracted article text in a scrollable window
+            st.subheader("Extracted Article Content")
+            st.text_area("Article", article, height=300)
+            st.markdown(f"**Article Length:** {len(article)} characters")
+            # Preprocess text for chunking
+            article = article.replace('.', '.<eos>').replace('?', '?<eos>').replace('!', '!<eos>')
+            sentences = article.split('<eos>')
+            current_chunk = 0
+            chunks = [[]]
+            # Split text into manageable chunks
+            for sentence in sentences:
+                if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
+                    chunks[current_chunk].extend(sentence.split(' '))
+                else:
+                    current_chunk += 1
+                    chunks.append(sentence.split(' '))
+            # Join words back to form full sentences for each chunk
+            for chunk_id in range(len(chunks)):
+                chunks[chunk_id] = ' '.join(chunks[chunk_id])
+            # Streamlit progress bar, dynamic status display, and summaries list
+            progress_bar = st.progress(0)
+            status_text = st.empty()  # Placeholder for dynamic status updates
+            summaries = []
+            start_time = time.time()
+            # Summarize each chunk and update progress
+            for i, chunk in enumerate(chunks):
+                summary = summarizer(chunk, max_length=120, min_length=30, do_sample=False)
+                summaries.append(summary[0]['summary_text'])
+                # Calculate and display percentage completed and estimated time
+                percent_complete = (i + 1) / len(chunks)
+                elapsed_time = time.time() - start_time
+                estimated_total_time = elapsed_time / percent_complete
+                estimated_time_remaining = estimated_total_time - elapsed_time
+                # Update progress bar and status text
+                progress_bar.progress(percent_complete)
+                status_text.markdown(f"**Progress:** {percent_complete * 100:.2f}% - "
+                                     f"**Estimated time remaining:** {estimated_time_remaining:.2f} seconds")
+            # Combine summaries into a single text output
+            summary_text = ' '.join(summaries)
+            # Display the summarized text
+            st.subheader("Summarized Article Content")
+            st.text_area("Summary", summary_text, height=300)
+            st.markdown(f"**Summary Length:** {len(summary_text)} characters")
+            # Create the PDF from the summary text with justified alignment and wrapping
+            pdf_buffer = create_pdf(summary_text)
+            # Display the download button for the PDF
+            st.download_button(
+                label="Download Summary as PDF",
+                data=pdf_buffer,
+                file_name="summarized_article.pdf",
+                mime="application/pdf"
+            )
+            # Display the compression ratio
+            original_length = len(article.split())
+            summary_length = len(summary_text.split())
+            compression_ratio = (summary_length / original_length) * 100
+            # Evaluate if the compression ratio is good or bad
+            if compression_ratio < 20:
+                st.success(
+                    f"{round(compression_ratio)}% Great Compression!\nThe summary is succinct and effectively "
+                    f"highlights key points.")
+            elif 20 <= compression_ratio <= 40:
+                st.info(
+                    f"{round(compression_ratio)}% Well-balanced Summary.\nIt maintains essential details while being "
+                    f"brief.")
+            else:
+                st.warning(
+                    f"{round(compression_ratio)}% Compression may be excessive.\nThe summary could be too brief and "
+                    f"miss important details.")
+        except Exception as e:
+            st.warning(f"Error: {e}")
+# Run the app
+if __name__ == '__main__':
+    main()