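"""Streamlit app that scrapes an article page (body and comments) with
Selenium/BeautifulSoup and classifies movie-review sentiment with a
fine-tuned multilingual BERT checkpoint."""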
import time

import numpy as np
import streamlit as st
import tensorflow as tf
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from transformers import BertTokenizer, TFAutoModel
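
# Fine-tuned checkpoint directory and the sequence length it was trained with;
# SEQ_LEN must match the value used during fine-tuning.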
PATH = './checkpoint-7500/'
SEQ_LEN = 128
# The tokenizer must be the one the checkpoint was fine-tuned from.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def create_sentiment_bert():
    # Load the fine-tuned BERT model from the local checkpoint.
    model = TFAutoModel.from_pretrained(PATH, local_files_only=True)
    # Define token-id, attention-mask, and segment inputs.
    token_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_word_ids')
    mask_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_masks')
    segment_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_segment')
    # Run BERT on [tokens, masks, segments]; index 1 is the pooled [CLS] output.
    bert_outputs = model([token_inputs, mask_inputs, segment_inputs])
    bert_outputs = bert_outputs[1]
    # Single sigmoid unit on top of the pooled output for binary sentiment.
    sentiment_first = tf.keras.layers.Dense(
        1, activation='sigmoid',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))(bert_outputs)
    sentiment_model = tf.keras.Model([token_inputs, mask_inputs, segment_inputs], sentiment_first)
    # Only predict() is used below, so the default optimizer is irrelevant.
    sentiment_model.compile(loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
    return sentiment_model

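# New helper (not in the original script): builds and caches the model once
# per Streamlit session, providing the `sentiment_model` the predictor needs.
@st.cache_resource
def load_sentiment_model():
    return create_sentiment_bert()
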
def sentence_convert_data(data):
    # Encode to exactly SEQ_LEN ids, truncating and padding with 0 ([PAD]) on the right.
    token = tokenizer.encode(data, max_length=SEQ_LEN, truncation=True, padding='max_length')
    # Attention mask: 1 for real tokens, 0 for padding.
    num_zeros = token.count(0)
    mask = [1] * (SEQ_LEN - num_zeros) + [0] * num_zeros
    # Single sentence, so every position is segment 0.
    segment = [0] * SEQ_LEN
    tokens = np.array([token])
    masks = np.array([mask])
    segments = np.array([segment])
    return [tokens, masks, segments]

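# Each returned array has shape (1, SEQ_LEN), matching the three inputs of
# create_sentiment_bert().
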
def movie_evaluation_predict(sentence):
    data_x = sentence_convert_data(sentence)
    predict = load_sentiment_model().predict(data_x)
    # Sigmoid output in [0, 1]: the probability that the review is positive.
    predict_value = float(np.ravel(predict)[0])
    predict_answer = round(predict_value)
    if predict_answer == 0:
        st.write("(negative probability: %.2f) This is a negative movie review." % (1.0 - predict_value))
    else:
        st.write("(positive probability: %.2f) This is a positive movie review." % predict_value)

def setup_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # run without a visible browser window
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=chrome_options)
    return driver

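# Assumes a Chrome/Chromium binary is available; with Selenium 4.6+, Selenium
# Manager resolves a matching chromedriver automatically.
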
def scrape_content(url):
    driver = setup_driver()
    try:
        driver.get(url)
        # Fixed delay to let the page finish loading.
        time.sleep(3)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # Extract the article body; adjust the selector to the target site.
        content = soup.find('article')
        # Extract the comments; adjust the selector to the target site.
        comments = soup.find_all('span', class_='u_cbox_contents')
        return {
            'content': content.text if content else "Article body not found.",
            'comments': [comment.text for comment in comments]
        }
    finally:
        driver.quit()

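# Usage sketch (hypothetical URL); the 'u_cbox_contents' class matches
# Naver-style comment widgets:
#   result = scrape_content("https://example.com/news/1")
#   print(result['content'][:200], len(result['comments']))
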
def main():
    url = st.text_input("Enter a URL")
    if st.button("Start scraping"):
        if url:
            with st.spinner("Scraping..."):
                result = scrape_content(url)
                st.subheader("Article body")
                st.write(result['content'])
                st.subheader("Comments")
                for idx, comment in enumerate(result['comments'], 1):
                    st.write(f"{idx}. {comment}")
        else:
            st.error("Please enter a URL")
    # Disabled sentiment-check form, kept for reference:
    # test = st.form('test')
    # sentence = test.text_input("Your sentence")
    # submit = test.form_submit_button("Submit")
    # if submit:
    #     movie_evaluation_predict(sentence)

if __name__ == "__main__":
    main()