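"""Streamlit app that scrapes an article page (body and comments) with
Selenium/BeautifulSoup and classifies movie-review sentiment with a
fine-tuned multilingual BERT checkpoint."""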
import time

import numpy as np
import streamlit as st
import tensorflow as tf
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from transformers import BertTokenizer, TFAutoModel
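
# Fine-tuned checkpoint directory and the sequence length it was trained with;
# SEQ_LEN must match the value used during fine-tuning.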
PATH = './checkpoint-7500/'
SEQ_LEN = 128
# The tokenizer must be the one the checkpoint was fine-tuned from.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def create_sentiment_bert():
    # Load the fine-tuned BERT model from the local checkpoint.
    model = TFAutoModel.from_pretrained(PATH, local_files_only=True)
    # Define token-id, attention-mask, and segment inputs.
    token_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_word_ids')
    mask_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_masks')
    segment_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_segment')
    # Run BERT on [tokens, masks, segments]; index 1 is the pooled [CLS] output.
    bert_outputs = model([token_inputs, mask_inputs, segment_inputs])
    bert_outputs = bert_outputs[1]
    # Single sigmoid unit on top of the pooled output for binary sentiment.
    sentiment_first = tf.keras.layers.Dense(
        1, activation='sigmoid',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))(bert_outputs)
    sentiment_model = tf.keras.Model([token_inputs, mask_inputs, segment_inputs], sentiment_first)
    # Only predict() is used below, so the default optimizer is irrelevant.
    sentiment_model.compile(loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
    return sentiment_model

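# New helper (not in the original script): builds and caches the model once
# per Streamlit session, providing the `sentiment_model` the predictor needs.
@st.cache_resource
def load_sentiment_model():
    return create_sentiment_bert()
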
def sentence_convert_data(data):
    # Encode to exactly SEQ_LEN ids, truncating and padding with 0 ([PAD]) on the right.
    token = tokenizer.encode(data, max_length=SEQ_LEN, truncation=True, padding='max_length')
    # Attention mask: 1 for real tokens, 0 for padding.
    num_zeros = token.count(0)
    mask = [1] * (SEQ_LEN - num_zeros) + [0] * num_zeros
    # Single sentence, so every position is segment 0.
    segment = [0] * SEQ_LEN
    tokens = np.array([token])
    masks = np.array([mask])
    segments = np.array([segment])
    return [tokens, masks, segments]

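# Each returned array has shape (1, SEQ_LEN), matching the three inputs of
# create_sentiment_bert().
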
def movie_evaluation_predict(sentence):
    data_x = sentence_convert_data(sentence)
    predict = load_sentiment_model().predict(data_x)
    # Sigmoid output in [0, 1]: the probability that the review is positive.
    predict_value = float(np.ravel(predict)[0])
    predict_answer = round(predict_value)
    if predict_answer == 0:
        st.write("(negative probability: %.2f) This is a negative movie review." % (1.0 - predict_value))
    else:
        st.write("(positive probability: %.2f) This is a positive movie review." % predict_value)

def setup_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # run without a visible browser window
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=chrome_options)
    return driver

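# Assumes a Chrome/Chromium binary is available; with Selenium 4.6+, Selenium
# Manager resolves a matching chromedriver automatically.
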
def scrape_content(url):
    driver = setup_driver()
    try:
        driver.get(url)
        # Fixed delay to let the page finish loading.
        time.sleep(3)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # Extract the article body; adjust the selector to the target site.
        content = soup.find('article')
        # Extract the comments; adjust the selector to the target site.
        comments = soup.find_all('span', class_='u_cbox_contents')
        return {
            'content': content.text if content else "Article body not found.",
            'comments': [comment.text for comment in comments]
        }
    finally:
        driver.quit()

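# Usage sketch (hypothetical URL); the 'u_cbox_contents' class matches
# Naver-style comment widgets:
#   result = scrape_content("https://example.com/news/1")
#   print(result['content'][:200], len(result['comments']))
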
def main():
    url = st.text_input("Enter a URL")
    if st.button("Start scraping"):
        if url:
            with st.spinner("Scraping..."):
                result = scrape_content(url)
                st.subheader("Article body")
                st.write(result['content'])
                st.subheader("Comments")
                for idx, comment in enumerate(result['comments'], 1):
                    st.write(f"{idx}. {comment}")
        else:
            st.error("Please enter a URL")
    # Disabled sentiment-check form, kept for reference:
    # test = st.form('test')
    # sentence = test.text_input("Your sentence")
    # submit = test.form_submit_button("Submit")
    # if submit:
    #     movie_evaluation_predict(sentence)

if __name__ == "__main__":
    main()