import json

import instaloader
import numpy as np
import requests
import streamlit as st
import tensorflow as tf
from bs4 import BeautifulSoup
from instaloader import Post
from transformers import AutoTokenizer, TFAutoModel

# Path to the fine-tuned BERT checkpoint and the fixed input sequence length
PATH = './checkpoint-18750/'
SEQ_LEN = 128

# The tokenizer must match the vocabulary the model was fine-tuned with
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

def create_sentiment_bert():
    # Load the pretrained (fine-tuned) BERT model
    model = TFAutoModel.from_pretrained(PATH, local_files_only=True)
    # Define token, mask, and segment inputs
    token_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_word_ids')
    mask_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_masks')
    segment_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_segment')
    # Run BERT on [tokens, masks, segments]; index 1 is the pooled [CLS] output
    bert_outputs = model([token_inputs, mask_inputs, segment_inputs])
    bert_outputs = bert_outputs[1]
    # Single sigmoid unit for binary sentiment (0: negative, 1: positive)
    sentiment_first = tf.keras.layers.Dense(
        1, activation='sigmoid',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02)
    )(bert_outputs)
    sentiment_model = tf.keras.Model([token_inputs, mask_inputs, segment_inputs], sentiment_first)
    sentiment_model.compile(loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
    return sentiment_model
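
# Illustrative smoke test (a sketch; assumes the fine-tuned checkpoint exists
# under PATH and is TF-compatible):
#   model = create_sentiment_bert()
#   model.summary()  # three (None, 128) int32 inputs -> one (None, 1) sigmoid output
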
def sentence_convert_data(data):
    # Tokenize a single sentence into the three BERT input arrays
    global tokenizer
    tokens, masks, segments = [], [], []
    token = tokenizer.encode(data, max_length=SEQ_LEN, truncation=True, padding='max_length')
    # Attention mask: 1 for real tokens, 0 for padding
    num_zeros = token.count(0)
    mask = [1] * (SEQ_LEN - num_zeros) + [0] * num_zeros
    # Single-sentence input, so all segment ids are 0
    segment = [0] * SEQ_LEN
    tokens.append(token)
    segments.append(segment)
    masks.append(mask)
    return [np.array(tokens), np.array(masks), np.array(segments)]
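
# Note: building the mask by counting zeros assumes the pad token id is 0,
# which holds for klue/bert-base. A more direct sketch using the tokenizer's
# own attention mask would be:
#   enc = tokenizer(data, max_length=SEQ_LEN, truncation=True, padding='max_length')
#   tokens, masks = [enc['input_ids']], [enc['attention_mask']]
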
def evaluation_predict(sentence):
    data_x = sentence_convert_data(sentence)
    predict = sentiment_model.predict(data_x)
    predict_value = np.ravel(predict)
    # Round the sigmoid output: 0 = negative, 1 = positive
    predict_answer = np.round(predict_value, 0).item()
    return predict_answer
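
# Illustrative usage (sentiment_model must be built first, e.g. in main()):
#   evaluation_predict("기사 잘 봤습니다")  # returns 0.0 (negative) or 1.0 (positive)
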
def get_comments(news_url):
    if 'naver' in news_url:
        # Extract oid and aid from the article URL
        parts = news_url.split("/")
        oid = parts[-2]
        aid = parts[-1]
        if len(aid) > 10:
            aid = aid[:10]
        # Build the comment API request (JSONP endpoint)
        api_url = "https://apis.naver.com/commentBox/cbox/web_naver_list_jsonp.json"
        params = {
            "ticket": "news",
            "templateId": "default_society",
            "pool": "cbox5",
            "lang": "ko",
            "country": "KR",
            "objectId": f"news{oid},{aid}",
            "pageSize": 100,
            "indexSize": 10,
            "page": 1,
            "sort": "FAVORITE"  # 'NEW' (newest first) or 'FAVORITE' (most upvoted)
        }
        headers = {
            "User-Agent": "Mozilla/5.0",
            "Referer": news_url
        }
        # Call the API and strip the JSONP wrapper before parsing
        response = requests.get(api_url, params=params, headers=headers)
        content = response.text.replace("_callback(", "").replace(");", "")
        json_data = json.loads(content)
        # Fetch the article page itself
        response = requests.get(news_url)
        article_soup = BeautifulSoup(response.text, "html.parser")
        # Extract the title (fallback selector for the older page layout)
        title = article_soup.select_one("#ct > div.media_end_head.go_trans > div.media_end_head_title > h2")
        if title is None:
            title = article_soup.select_one("#content > div.end_ct > div > h2")
        # Extract the article body (same fallback pattern)
        article = article_soup.select_one("#dic_area")
        if article is None:
            article = article_soup.select_one("#articeBody")
        return title.text.strip(), article.text.strip(), processing_data(json_data['result']['commentList'])
    elif 'insta' in news_url:
        # An Instagram shortcode is the 11-character path segment of a post URL
        parts = news_url.split('/')
        pid = ''
        for part in parts:
            if len(part) == 11:
                pid = part
        L = instaloader.Instaloader()
        post = Post.from_shortcode(L.context, pid)
        try:
            comments = [c.text for c in post.get_comments()]
        except Exception:
            comments = ['로그인이 필요합니다']  # "Login required"
        return '', post.caption, comments
    # Unsupported URL: return empty results instead of None
    return '', '', []
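
# Illustrative inputs (hypothetical URLs, shown for shape only):
#   get_comments("https://n.news.naver.com/article/001/0000000000")
#   get_comments("https://www.instagram.com/p/ABCDEFGHIJK/")
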
def processing_data(comments):
    # Keep only the non-empty comment bodies
    comment_list = [comment['contents'] for comment in comments]
    return [x for x in comment_list if x]
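
# Example (illustrative of the comment objects the Naver API returns):
#   processing_data([{"contents": "좋은 기사"}, {"contents": ""}]) -> ["좋은 기사"]
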
def display_results(title, content, comments):
    # Render the scraped article and only the comments classified as positive
    st.subheader("제목")  # Title
    st.write(title)
    st.subheader("본문 내용")  # Article body
    st.write(content)
    st.subheader("댓글")  # Comments
    for comment in comments:
        if evaluation_predict(comment) == 1:
            st.write(comment)


def main():
    global sentiment_model
    sentiment_model = create_sentiment_bert()
    st.title("댓글 필터링 서비스")  # "Comment filtering service"
    # Pre-fill the URL field from the ?q= query parameter when present
    default_url = st.query_params['q'] if 'q' in st.query_params else ''
    url = st.text_input("url을 입력하세요", value=default_url)  # "Enter a URL"
    clicked = st.button("스크랩 시작")  # "Start scraping"
    # Scrape immediately when the URL arrived via ?q=, otherwise on button click
    if url and (default_url or clicked):
        title, content, comments = get_comments(url)
        display_results(title, content, comments)
    return 0


if __name__ == "__main__":
    main()