File size: 6,111 Bytes
a3962d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
from transformers import pipeline
import streamlit as st
import requests
from bs4 import BeautifulSoup
import html
import time
from io import BytesIO
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.enums import TA_JUSTIFY

# Build the Hugging Face summarization pipeline at import time, using the
# "facebook/bart-large-cnn" checkpoint. main() calls this object once per
# text chunk.
# NOTE(review): because this runs at module level, the model is loaded every
# time this script is executed; if Streamlit re-runs the script on each
# interaction, consider wrapping the load in a cached loader — confirm against
# the installed Streamlit version before changing.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Set the Streamlit page layout to wide.
# NOTE(review): presumably this must stay ahead of any other Streamlit call
# (main() is only invoked further down) — confirm before reordering.
st.set_page_config(layout="wide")


# Function to create PDF with justified text
def create_pdf(text):
    """Render ``text`` as one justified paragraph in an in-memory A4 PDF.

    Args:
        text: Plain text to place in the document. It is treated as literal
            text, not as ReportLab paragraph markup. ``None`` is rendered as
            an empty paragraph.

    Returns:
        A ``BytesIO`` positioned at offset 0 that holds the finished PDF, so
        it can be handed directly to a reader such as a download widget.
    """
    # Build into a BytesIO buffer so nothing is written to disk.
    pdf_buffer = BytesIO()

    # Document template that writes A4 pages into the buffer.
    doc = SimpleDocTemplate(pdf_buffer, pagesize=A4)

    # Justified variant of the stock "BodyText" style.
    styles = getSampleStyleSheet()
    justified_style = ParagraphStyle(
        name="JustifiedStyle",
        parent=styles["BodyText"],
        alignment=TA_JUSTIFY,
        fontSize=12,
        leading=15,  # Line spacing in points
    )

    # Paragraph parses its input as XML-like markup, so a literal "<" or "&"
    # coming from arbitrary article text could break the build or be read as
    # a tag. Escape it so the text is rendered verbatim.
    plain_text = "" if text is None else str(text)
    safe_text = html.escape(plain_text, quote=False)
    paragraph = Paragraph(safe_text, justified_style)

    # Build the PDF in the buffer
    doc.build([paragraph])

    # Rewind so the caller reads the PDF from its first byte.
    pdf_buffer.seek(0)
    return pdf_buffer


def _split_into_chunks(article, max_chunk):
    """Group the sentences of ``article`` into chunks of about ``max_chunk`` words.

    Sentences are delimited by ".", "?" and "!". Consecutive sentences are
    packed into the same chunk while the running word count stays within
    ``max_chunk``. A single sentence longer than ``max_chunk`` words is kept
    whole in its own chunk. Whitespace-only sentences are dropped, so the
    result never contains an empty chunk; an empty article yields ``[]``.

    Args:
        article: The full article text.
        max_chunk: Maximum number of words to pack into one chunk.

    Returns:
        A list of non-empty chunk strings in document order.
    """
    # Mark sentence boundaries, then split on the marker so the punctuation
    # stays attached to the sentence it ends.
    marked = article.replace('.', '.<eos>').replace('?', '?<eos>').replace('!', '!<eos>')

    chunks = []
    current_words = []
    for sentence in marked.split('<eos>'):
        # split() without arguments ignores runs of whitespace, so blank
        # tokens are not counted against the word budget.
        words = sentence.split()
        if not words:
            continue
        if current_words and len(current_words) + len(words) > max_chunk:
            chunks.append(' '.join(current_words))
            current_words = []
        current_words.extend(words)

    if current_words:
        chunks.append(' '.join(current_words))
    return chunks


# Main application
def main():
    """Run the Streamlit page: fetch an article, summarize it, offer a PDF.

    Reads a URL from a text input, downloads the page, extracts the text of
    its ``<h1>`` and ``<p>`` elements, summarizes that text chunk by chunk
    with the module-level ``summarizer`` while reporting progress, shows the
    combined summary, offers it as a PDF download via ``create_pdf``, and
    reports the summary-to-article word ratio.

    Any exception raised while processing the URL is reported in the page as
    a warning instead of propagating.
    """
    st.title("Article Extractor and Summarizer")

    # Get URL from the user
    url = st.text_input("Share an article URL:", key="url")

    # Define max chunk size (in words) to split article into manageable parts
    max_chunk = 300

    # Seconds to wait for the remote server before giving up, so a stalled
    # host cannot hang the page indefinitely.
    request_timeout = 30

    if not url:
        return

    try:
        # Fetch and parse the article; HTTP error statuses are raised here so
        # an error page is never summarized as if it were the article.
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        results = soup.find_all(['h1', 'p'])

        # Clean and concatenate text
        text = [html.unescape(result.get_text()) for result in results]
        article = ' '.join(text)

        # Display the extracted article text in a scrollable window
        st.subheader("Extracted Article Content")
        st.text_area("Article", article, height=300)
        st.markdown(f"**Article Length:** {len(article)} characters")

        # Split text into manageable chunks
        chunks = _split_into_chunks(article, max_chunk)
        if not chunks:
            # Nothing to summarize; also avoids a division by zero in the
            # compression ratio below.
            st.warning("No article text could be extracted from this URL.")
            return

        # Streamlit progress bar, dynamic status display, and summaries list
        progress_bar = st.progress(0)
        status_text = st.empty()  # Placeholder for dynamic status updates
        summaries = []
        start_time = time.time()

        # Summarize each chunk and update progress
        for i, chunk in enumerate(chunks):
            # truncation=True keeps an unusually long chunk (e.g. one very
            # long sentence) within the model's input limit.
            summary = summarizer(
                chunk, max_length=120, min_length=30, do_sample=False, truncation=True
            )
            summaries.append(summary[0]['summary_text'])

            # Calculate and display percentage completed and estimated time.
            # percent_complete is a fraction in (0, 1], never zero.
            percent_complete = (i + 1) / len(chunks)
            elapsed_time = time.time() - start_time
            estimated_total_time = elapsed_time / percent_complete
            estimated_time_remaining = estimated_total_time - elapsed_time

            # Update progress bar and status text
            progress_bar.progress(percent_complete)
            status_text.markdown(f"**Progress:** {percent_complete * 100:.2f}% - "
                                 f"**Estimated time remaining:** {estimated_time_remaining:.2f} seconds")

        # Combine summaries into a single text output
        summary_text = ' '.join(summaries)

        # Display the summarized text
        st.subheader("Summarized Article Content")
        st.text_area("Summary", summary_text, height=300)
        st.markdown(f"**Summary Length:** {len(summary_text)} characters")

        # Create the PDF from the summary text with justified alignment and wrapping
        pdf_buffer = create_pdf(summary_text)

        # Display the download button for the PDF
        st.download_button(
            label="Download Summary as PDF",
            data=pdf_buffer,
            file_name="summarized_article.pdf",
            mime="application/pdf"
        )

        # Compression ratio: summary word count as a percentage of the
        # article word count. original_length is non-zero because at least
        # one non-empty chunk exists.
        original_length = len(article.split())
        summary_length = len(summary_text.split())
        compression_ratio = (summary_length / original_length) * 100

        # Evaluate the ratio. A LOW percentage means strong compression; a
        # HIGH percentage means the summary is still long.
        if compression_ratio < 20:
            st.success(
                f"{round(compression_ratio)}% Great Compression!\nThe summary is succinct and effectively "
                f"highlights key points.")
        elif compression_ratio <= 40:
            st.info(
                f"{round(compression_ratio)}% Well-balanced Summary.\nIt maintains essential details while being "
                f"brief.")
        else:
            # The previous wording called this case "excessive" compression
            # and "too brief", which is the opposite of what a high ratio
            # means.
            st.warning(
                f"{round(compression_ratio)}% Limited Compression.\nThe summary is still long relative to the "
                f"article and could be condensed further.")

    except Exception as e:
        # Top-level UI boundary: surface any failure (network, parsing,
        # model, PDF) to the user instead of crashing the page.
        st.warning(f"Error: {e}")


# Entry point: call main() only when this file is executed as the main
# module, not when it is imported by another module.
if __name__ == '__main__':
    main()