# comment_filter / app.py
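"""Streamlit comment-filtering app.

Scrapes the title, body, and comments of a Naver News article or an
Instagram post, runs each comment through a fine-tuned KLUE/BERT binary
sentiment classifier, and displays only the comments predicted positive.
"""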
import json

import instaloader
import numpy as np
import requests
import streamlit as st
import tensorflow as tf
from bs4 import BeautifulSoup
from instaloader import Post
from transformers import AutoTokenizer, TFAutoModel
# Path to the fine-tuned BERT checkpoint and the fixed input sequence length.
PATH = './checkpoint-18750/'
SEQ_LEN = 128

tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

def create_sentiment_bert():
    # Load the fine-tuned BERT model from the local checkpoint.
    model = TFAutoModel.from_pretrained(PATH, local_files_only=True)
    # Define the token, mask, and segment inputs.
    token_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_word_ids')
    mask_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_masks')
    segment_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_segment')
    # Wrap BERT in a model whose inputs are [tokens, masks, segments].
    bert_outputs = model([token_inputs, mask_inputs, segment_inputs])
    # Use the pooled [CLS] output (index 1) for classification.
    bert_outputs = bert_outputs[1]
    sentiment_first = tf.keras.layers.Dense(
        1, activation='sigmoid',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))(bert_outputs)
    sentiment_model = tf.keras.Model([token_inputs, mask_inputs, segment_inputs], sentiment_first)
    sentiment_model.compile(loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
    return sentiment_model
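
# The compiled model maps [tokens, masks, segments], each of shape
# (batch, SEQ_LEN), to a sigmoid probability of the positive class of
# shape (batch, 1); labels follow the convention 0 = negative, 1 = positive.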

def sentence_convert_data(data):
    # Tokenize a single sentence into BERT inputs of shape (1, SEQ_LEN).
    global tokenizer
    tokens, masks, segments = [], [], []
    token = tokenizer.encode(data, max_length=SEQ_LEN, truncation=True, padding='max_length')
    # klue/bert-base pads with token id 0, so the zero count gives the padding length.
    num_zeros = token.count(0)
    mask = [1] * (SEQ_LEN - num_zeros) + [0] * num_zeros
    segment = [0] * SEQ_LEN
    tokens.append(token)
    segments.append(segment)
    masks.append(mask)
    tokens = np.array(tokens)
    masks = np.array(masks)
    segments = np.array(segments)
    return [tokens, masks, segments]

def evaluation_predict(sentence):
    data_x = sentence_convert_data(sentence)
    predict = sentiment_model.predict(data_x)
    predict_value = np.ravel(predict)
    # 0: negative, 1: positive
    predict_answer = np.round(predict_value, 0).item()
    return predict_answer
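
# A minimal smoke test for the two functions above (illustrative only; it
# assumes the checkpoint under PATH exists, and the example sentence is made up):
#
#   sentiment_model = create_sentiment_bert()
#   label = evaluation_predict("영상 잘 봤습니다")  # -> 1.0 (positive) or 0.0 (negative)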

def get_comments(news_url):
    if 'naver' in news_url:
        # Extract the oid and aid identifiers from the article URL.
        parts = news_url.split("/")
        oid = parts[-2]
        aid = parts[-1]
        if len(aid) > 10:
            aid = aid[:10]
        # Build the comment API request.
        api_url = "https://apis.naver.com/commentBox/cbox/web_naver_list_jsonp.json"
        params = {
            "ticket": "news",
            "templateId": "default_society",
            "pool": "cbox5",
            "lang": "ko",
            "country": "KR",
            "objectId": f"news{oid},{aid}",
            "pageSize": 100,
            "indexSize": 10,
            "page": 1,
            "sort": "FAVORITE"  # 'NEW' (newest first), 'FAVORITE' (most upvoted)
        }
        headers = {
            "User-Agent": "Mozilla/5.0",
            "Referer": news_url
        }
        # Call the comment API and strip the JSONP wrapper before parsing.
        response = requests.get(api_url, params=params, headers=headers)
        content = response.text.replace("_callback(", "").replace(");", "")
        json_data = json.loads(content)
        response = requests.get(news_url)
        article_soup = BeautifulSoup(response.text, "html.parser")
        # Extract the title (the selector differs between page layouts).
        title = article_soup.select_one("#ct > div.media_end_head.go_trans > div.media_end_head_title > h2")
        if title is None:
            title = article_soup.select_one("#content > div.end_ct > div > h2")
        # Extract the article body.
        article = article_soup.select_one("#dic_area")
        if article is None:
            article = article_soup.select_one("#articeBody")
        return title.text.strip(), article.text.strip(), processing_data(json_data['result']['commentList'])
    elif 'insta' in news_url:
        # The Instagram shortcode is the 11-character path segment of the post URL.
        parts = news_url.split('/')
        pid = ''
        for part in parts:
            if len(part) == 11:
                pid = part
        L = instaloader.Instaloader()
        post = Post.from_shortcode(L.context, pid)
        try:
            comments = [x.text for x in post.get_comments()]
        except Exception:
            comments = ['로그인이 필요합니다']
        return '', post.caption, comments
    # Unsupported URL: return empty results instead of None.
    return '', '', []
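
# Example call (hypothetical URL; Naver's comment API and CSS selectors are
# undocumented and may change server-side):
#
#   title, content, comments = get_comments("https://n.news.naver.com/article/001/0001234567")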

def processing_data(comments):
    # Keep only the non-empty comment bodies.
    comment_list = []
    for comment in comments:
        comment_list.append(comment['contents'])
    comment_listR = [x for x in comment_list if x]
    return comment_listR

def display_results(title, content, comments):
    # Show the scraped article, then only the comments the model predicts as positive.
    st.subheader("제목")
    st.write(title)
    st.subheader("본문 내용")
    st.write(content)
    st.subheader("댓글")
    for comment in comments:
        if evaluation_predict(comment) == 1:
            st.write(comment)


def main():
    global sentiment_model
    sentiment_model = create_sentiment_bert()
    st.title("댓글 필터링 서비스")
    # Prefill the URL field from the "q" query parameter when present.
    default_url = st.query_params["q"] if "q" in st.query_params else ""
    url = st.text_input("url을 입력하세요", value=default_url)
    clicked = st.button("스크랩 시작")
    # Scrape when a URL arrived via the query parameter or the button was pressed.
    if url and (clicked or default_url):
        title, content, comments = get_comments(url)
        display_results(title, content, comments)

if __name__ == "__main__":
    main()
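
# To launch locally (assuming streamlit is installed and the checkpoint is in place):
#   streamlit run app.py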