File size: 7,331 Bytes
22acb53
2130106
 
 
e29024a
2130106
 
 
fec291e
f2496ac
 
6bc398e
 
f2496ac
c450990
aaef8fd
d90cb9b
aaef8fd
879bc79
 
404f618
879bc79
 
 
 
 
 
 
 
 
 
 
 
 
2130106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
818643e
2130106
 
 
6c0af57
2130106
6c0af57
2130106
6c0af57
 
6bc398e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c0af57
6bc398e
 
 
 
fec291e
6bc398e
 
fec291e
6bc398e
 
 
 
 
 
 
 
 
 
 
45dbbbe
6bc398e
9c72e61
 
 
 
6bc398e
 
4693a2c
 
 
 
b7783e7
6c0af57
 
 
 
 
 
 
9c6f4a1
 
e94df39
5a17582
eae7f3b
 
 
e94df39
5c4531d
fec291e
 
eae7f3b
 
d79e81e
 
 
6e1bdf8
 
 
 
 
 
6bc398e
6e1bdf8
 
 
 
 
 
 
 
 
 
d79e81e
 
6bc398e
d79e81e
 
 
 
 
 
 
 
 
6e1bdf8
d79e81e
 
6e1bdf8
 
 
 
 
6bc398e
6e1bdf8
 
 
 
 
 
 
 
 
eae7f3b
 
6ad7f19
6e1bdf8
 
 
 
 
6bc398e
6e1bdf8
 
 
 
 
 
 
 
 
e94df39
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
import streamlit as st
import tensorflow as tf
import numpy as np
import pandas as pd
import json
from transformers import *
from tqdm import tqdm
from tensorflow.python.client import device_lib
import requests
from bs4 import BeautifulSoup
import time
import instaloader
from instaloader import Post

# Path to the fine-tuned BERT checkpoint loaded in create_sentiment_bert().
PATH = './checkpoint-18750/'
# Fixed token sequence length for all model inputs.
SEQ_LEN = 128
# KLUE BERT tokenizer, shared module-wide (read by sentence_convert_data).
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

def create_sentiment_bert():
  """Build the sentiment classifier: a pretrained BERT backbone plus a
  single-unit sigmoid head.

  Returns:
      A compiled tf.keras.Model taking [token ids, attention mask,
      segment ids] (each of shape (SEQ_LEN,)) and emitting one
      probability in [0, 1].
  """
  # Load the fine-tuned BERT backbone from the local checkpoint only.
  bert = TFAutoModel.from_pretrained(PATH, local_files_only=True)

  # Three integer inputs: token ids, attention mask, segment ids.
  ids_in = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_word_ids')
  mask_in = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_masks')
  seg_in = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_segment')

  # Index 1 of the BERT output is the pooled representation; it feeds
  # the binary classification head.
  pooled = bert([ids_in, mask_in, seg_in])[1]
  prob = tf.keras.layers.Dense(
      1,
      activation='sigmoid',
      kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
  )(pooled)

  classifier = tf.keras.Model([ids_in, mask_in, seg_in], prob)
  classifier.compile(loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
  return classifier

def sentence_convert_data(data):
    """Tokenize one sentence into the three BERT model inputs.

    Args:
        data: raw sentence string.

    Returns:
        [tokens, masks, segments] — three np.ndarrays of shape
        (1, SEQ_LEN): token ids, attention mask (1 for real tokens,
        0 for padding), and all-zero segment ids.
    """
    # Read-only access to the module-level tokenizer needs no `global`
    # statement (the original declared one unnecessarily).
    token = tokenizer.encode(data, max_length=SEQ_LEN, truncation=True, padding='max_length')

    # Padding ids are 0 with `padding='max_length'`, so counting zeros
    # gives the pad length. NOTE(review): assumes id 0 only ever appears
    # as [PAD] in the encoded sequence — confirm for the KLUE vocab.
    num_zeros = token.count(0)
    mask = [1] * (SEQ_LEN - num_zeros) + [0] * num_zeros
    segment = [0] * SEQ_LEN

    # Wrap each in a batch dimension of 1 directly instead of building
    # throwaway single-element lists via append.
    return [np.array([token]), np.array([mask]), np.array([segment])]

def evaluation_predict(sentence):
    """Classify one sentence with the module-level sentiment model.

    Requires the global `sentiment_model` built in main().

    Returns:
        1.0 for positive, 0.0 for negative (sigmoid output rounded to
        the nearer class).
    """
    model_inputs = sentence_convert_data(sentence)
    raw = sentiment_model.predict(model_inputs)
    # Flatten the (1, 1) prediction and round: 0 = negative, 1 = positive.
    return np.round(np.ravel(raw), 0).item()

def get_comments(news_url):
    """Scrape title, body text, and comments for a supported URL.

    Supports Naver news articles and Instagram posts. Any other URL
    returns ('', '', []) — the original fell off the end and returned
    None, which broke 3-tuple unpacking in callers.

    Args:
        news_url: article or post URL.

    Returns:
        (title, content, comments) where comments is a list of strings.
    """
    if 'naver' in news_url:
        # Extract oid/aid from the URL path (renamed from `list`, which
        # shadowed the builtin). The aid may carry extra characters.
        parts = news_url.split("/")
        oid = parts[-2]
        aid = parts[-1]
        if len(aid) > 10:
            aid = aid[:10]

        # Naver comment API (JSONP endpoint).
        api_url = "https://apis.naver.com/commentBox/cbox/web_naver_list_jsonp.json"
        params = {
            "ticket": "news",
            "templateId": "default_society",
            "pool": "cbox5",
            "lang": "ko",
            "country": "KR",
            "objectId": f"news{oid},{aid}",
            "pageSize": 100,
            "indexSize": 10,
            "page": 1,
            "sort": "FAVORITE"  # 'NEW' (latest), 'FAVORITE' (most upvoted)
        }

        headers = {
            "User-Agent": "Mozilla/5.0",
            "Referer": news_url
        }

        # Call the API and strip the JSONP wrapper to get plain JSON.
        response = requests.get(api_url, params=params, headers=headers)
        content = response.text.replace("_callback(", "").replace(");", "")
        json_data = json.loads(content)

        response = requests.get(news_url)
        article_soup = BeautifulSoup(response.text, "html.parser")

        # Title: current article layout first, then the legacy layout.
        title = article_soup.select_one("#ct > div.media_end_head.go_trans > div.media_end_head_title > h2")
        if title is None:
            title = article_soup.select_one("#content > div.end_ct > div > h2")

        # Body: current layout first, then the legacy layout.
        article = article_soup.select_one("#dic_area")
        if article is None:
            article = article_soup.select_one("#articeBody")

        # Guard against both selectors missing (site layout change)
        # instead of raising AttributeError on `.text`.
        title_text = title.text.strip() if title is not None else ''
        article_text = article.text.strip() if article is not None else ''
        return title_text, article_text, processing_data(json_data['result']['commentList'])

    if 'insta' in news_url:
        # Instagram shortcodes are 11 characters long; find one in the path.
        pid = ''
        for segment in news_url.split('/'):
            if len(segment) == 11:
                pid = segment
        loader = instaloader.Instaloader()
        post = Post.from_shortcode(loader.context, pid)
        try:
            comments = [c.text for c in post.get_comments()]
        except Exception:
            # Comment access is often blocked without a login session;
            # narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit.
            comments = ['로그인이 필요합니다']
        return '', post.caption, comments

    # Unsupported URL: keep the (title, content, comments) shape.
    return '', '', []

def processing_data(comments):
    """Pull comment bodies out of the API payload, dropping empty ones.

    Args:
        comments: iterable of comment dicts carrying a 'contents' key.

    Returns:
        List of the truthy 'contents' values, in original order.
    """
    return [body for body in (comment['contents'] for comment in comments) if body]


def _scrape_and_render(url):
    """Scrape `url` and render its title, body, and positive comments."""
    title, content, comments = get_comments(url)

    st.subheader("제목")
    st.write(title)

    st.subheader("본문 내용")
    st.write(content)

    st.subheader("댓글")
    for comment in comments:
        # Show only comments the model classifies as positive (class 1).
        if evaluation_predict(comment) == 1:
            st.write(comment)


def main():
    """Streamlit entry point: take a URL, scrape it, show filtered comments.

    The original duplicated the fetch-and-render sequence four times and,
    in the ?q= branch, rendered the results TWICE when the button was
    pressed (and fetched twice). This version renders exactly once per
    run via _scrape_and_render.
    """
    global sentiment_model
    sentiment_model = create_sentiment_bert()
    st.title("댓글 필터링 서비스")

    # Pre-fill the URL input from the ?q= query parameter when present.
    query_url = st.query_params['q'] if "q" in st.query_params else ''

    if query_url:
        url = st.text_input("url을 입력하세요", value=query_url)
        # Arriving via ?q= scrapes immediately; the button re-scrapes
        # (e.g. after the user edits the URL).
        if st.button("스크랩 시작"):
            if url:
                _scrape_and_render(url)
        else:
            _scrape_and_render(url)
    else:
        url = st.text_input("url을 입력하세요")
        if st.button("스크랩 시작"):
            if url:
                _scrape_and_render(url)
    return 0

if __name__ == "__main__":
    main()