Spaces:

sigmadream
/

ko-review

Sleeping

App Files Files Community

sigmadream committed on Sep 21, 2023

Commit

7eb81bd

1 Parent(s): 593a37b

v1

Browse files

Files changed (8) hide show

app.py +217 -0
examples.csv +51 -0
gitattributes.txt +34 -0
klue_roberta-small-2400.pt +3 -0
lid.176.ftz +3 -0
model-1900.pt +3 -0
requirements.txt +7 -0
roberta-base-1900.pt +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,217 @@

+import gradio as gr
+import fasttext
+from transformers import AutoModelForSequenceClassification
+from transformers import AutoTokenizer
+import numpy as np
+import pandas as pd
+import torch
+id2label = {0: "NEGATIVE", 1: "POSITIVE"}
+label2id = {"NEGATIVE": 0, "POSITIVE": 1}
+title = "Movie Review Score Discriminator"
+description = "It is a program that classifies whether it is positive or negative by entering movie reviews.  \
+                You can choose between the Korean version and the English version.  \
+                It also provides a version called ""Default"", which determines whether it is Korean or English and predicts it."
+class LanguageIdentification:
+    def __init__(self):
+        pretrained_lang_model = "./lid.176.ftz"
+        self.model = fasttext.load_model(pretrained_lang_model)
+    def predict_lang(self, text):
+        predictions = self.model.predict(text, k=200) # returns top 200 matching languages
+        return predictions
+# Module-level language detector, created once when the module is imported
+# (its constructor loads the fastText model file from disk).
+LANGUAGE = LanguageIdentification()
+def tokenized_data(tokenizer, inputs):
+    return tokenizer.batch_encode_plus(
+        [inputs],
+        return_tensors="pt",
+        padding="max_length",
+        max_length=64,
+        truncation=True)
+examples = []
+df = pd.read_csv('examples.csv', sep='\t', index_col='Unnamed: 0')
+np.random.seed(100)
+idx = np.random.choice(50, size=5, replace=False)
+eng_examples = [ ['Eng', df.iloc[i, 0]] for i in idx ]
+kor_examples = [ ['Kor', df.iloc[i, 1]] for i in idx ]
+examples = eng_examples + kor_examples
+eng_model_name = "roberta-base"
+eng_step = 1900
+eng_tokenizer = AutoTokenizer.from_pretrained(eng_model_name)
+eng_file_name = "{}-{}.pt".format(eng_model_name, eng_step)
+eng_state_dict = torch.load(eng_file_name)
+eng_model = AutoModelForSequenceClassification.from_pretrained(
+    eng_model_name, num_labels=2, id2label=id2label, label2id=label2id,
+    state_dict=eng_state_dict
+)
+kor_model_name = "klue/roberta-small"
+kor_step = 2400
+kor_tokenizer = AutoTokenizer.from_pretrained(kor_model_name)
+kor_file_name = "{}-{}.pt".format(kor_model_name.replace('/', '_'), kor_step)
+kor_state_dict = torch.load(kor_file_name)
+kor_model = AutoModelForSequenceClassification.from_pretrained(
+    kor_model_name, num_labels=2, id2label=id2label, label2id=label2id,
+    state_dict=kor_state_dict
+)
+def builder(Lang, Text):
+    percent_kor, percent_eng = 0, 0
+    text_list = Text.split(' ')
+    # [ output_1 ]
+    if Lang == '언어감지 기능 사용':
+        pred = LANGUAGE.predict_lang(Text)
+        if '__label__en' in pred[0]:
+            Lang = 'Eng'
+            idx = pred[0].index('__label__en')
+            p_eng = pred[1][idx]
+        if '__label__ko' in pred[0]:
+            Lang = 'Kor'
+            idx = pred[0].index('__label__ko')
+            p_kor = pred[1][idx]
+        # Normalize Percentage
+        percent_kor = p_kor / (p_kor+p_eng)
+        percent_eng = p_eng / (p_kor+p_eng)
+    if Lang == 'Eng':
+        model = eng_model
+        tokenizer = eng_tokenizer
+        if percent_eng==0: percent_eng=1
+    if Lang == 'Kor':
+        model = kor_model
+        tokenizer = kor_tokenizer
+        if percent_kor==0: percent_kor=1
+    # [ output_2 ]
+    inputs = tokenized_data(tokenizer, Text)
+    model.eval()
+    with torch.no_grad():
+        logits = model(input_ids=inputs['input_ids'],
+            attention_mask=inputs['attention_mask']).logits
+    m = torch.nn.Softmax(dim=1)
+    output = m(logits)
+    # print(logits, output)
+    # [ output_3 ]
+    output_analysis = []
+    for word in text_list:
+        tokenized_word = tokenized_data(tokenizer, word)
+        with torch.no_grad():
+            logit = model(input_ids=tokenized_word['input_ids'],
+                attention_mask=tokenized_word['attention_mask']).logits
+        word_output = m(logit)
+        if word_output[0][1] > 0.99:
+            output_analysis.append( (word, '+++') )
+        elif word_output[0][1] > 0.9:
+            output_analysis.append( (word, '++') )
+        elif word_output[0][1] > 0.8:
+            output_analysis.append( (word, '+') )
+        elif word_output[0][1] < 0.01:
+            output_analysis.append( (word, '---') )
+        elif word_output[0][1] < 0.1:
+            output_analysis.append( (word, '--') )
+        elif word_output[0][1] < 0.2:
+            output_analysis.append( (word, '-') )
+        else:
+            output_analysis.append( (word, None) )
+    return [ {'Kor': percent_kor, 'Eng': percent_eng},
+            {id2label[1]: output[0][1].item(), id2label[0]: output[0][0].item()},
+            output_analysis ]
+    # prediction = torch.argmax(logits, axis=1)
+    return id2label[prediction.item()]
+# demo3 = gr.Interface.load("models/mdj1412/movie_review_score_discriminator_eng", inputs="text", outputs="text",
+#                          title=title, theme="peach",
+#                          allow_flagging="auto",
+#                          description=description, examples=examples)
+# demo = gr.Interface(builder, inputs=[gr.inputs.Dropdown(['Default', 'Eng', 'Kor']), gr.Textbox(placeholder="리뷰를 입력하시오.")],
+#                     outputs=[ gr.Label(num_top_classes=3, label='Lang'),
+#                             gr.Label(num_top_classes=2, label='Result'),
+#                             gr.HighlightedText(label="Analysis", combine_adjacent=False)
+#                             .style(color_map={"+++": "#CF0000", "++": "#FF3232", "+": "#FFD4D4", "---": "#0004FE", "--": "#4C47FF", "-": "#BEBDFF"}) ],
+#                     # outputs='label',
+#                     title=title, description=description, examples=examples)
+with gr.Blocks() as demo1:
+    gr.Markdown(
+    """
+    <h1 align="center">
+    Movie Review Score Discriminator
+    </h1>
+    """)
+    gr.Markdown(
+    """
+    영화 리뷰를 입력하면, 리뷰가 긍정인지 부정인지 판별해주는 모델이다. \
+    영어와 한글을 지원하며, 언어를 직접 선택할수도, 혹은 모델이 언어감지를 직접 하도록 할 수 있다.
+    리뷰를 입력하면, (1) 감지된 언어, (2) 긍정 리뷰일 확률과 부정 리뷰일 확률, (3) 입력된 리뷰의 어느 단어가 긍정/부정 결정에 영향을 주었는지 \
+    (긍정일 경우 빨강색, 부정일 경우 파란색)를 확인할 수 있다.
+    """)
+    with gr.Accordion(label="모델에 대한 설명 ( 여기를 클릭 하시오. )", open=False):
+        gr.Markdown(
+        """
+        영어 모델은 bert-base-uncased 기반으로, 영어 영화 리뷰 분석 데이터셋인 SST-2로 학습 및 평가되었다.
+        한글 모델은 klue/roberta-base 기반이다. 기존 한글 영화 리뷰 분석 데이터셋이 존재하지 않아, 네이버 영화의 리뷰를 크롤링해서 영화 리뷰 분석 데이터셋을 제작하고, 이를 이용하여 모델을 학습 및 평가하였다.
+        영어 모델은 SST-2에서 92.8%, 한글 모델은 네이버 영화 리뷰 데이터셋에서 94%의 정확도를 가진다 (test set 기준).
+        언어감지는 fasttext의 language detector를 사용하였다. 리뷰의 단어별 영향력은, 단어 각각을 모델에 넣었을 때 결과가 긍정으로 나오는지 부정으로 나오는지를 바탕으로 측정하였다.
+        """)
+    with gr.Row():
+        with gr.Column():
+            inputs_1 = gr.Dropdown(choices=['언어감지 기능 사용', 'Eng', 'Kor'], value='언어감지 기능 사용', label='Lang')
+            inputs_2 = gr.Textbox(placeholder="리뷰를 입력하시오.", label='Text')
+            with gr.Row():
+                # btn2 = gr.Button("클리어")
+                btn = gr.Button("제출하기")
+        with gr.Column():
+            output_1 = gr.Label(num_top_classes=3, label='Lang')
+            output_2 = gr.Label(num_top_classes=2, label='Result')
+            output_3 = gr.HighlightedText(label="Analysis", combine_adjacent=False) \
+                .style(color_map={"+++": "#CF0000", "++": "#FF3232", "+": "#FFD4D4", "---": "#0004FE", "--": "#4C47FF", "-": "#BEBDFF"})
+    # btn2.click(fn=fn2, inputs=[None, None], output=[output_1, output_2, output_3])
+    btn.click(fn=builder, inputs=[inputs_1, inputs_2], outputs=[output_1, output_2, output_3])
+    gr.Examples(examples, inputs=[inputs_1, inputs_2])
+# Start the Gradio app only when this file is run as a script.
+if __name__ == "__main__":
+    # print(examples)
+    # demo.launch()
+    demo1.launch()

examples.csv ADDED Viewed

	@@ -0,0 +1,51 @@

+	eng	kor
+0	of saucy 	1점도아깝다4명보다재미없어서2명나감
+1	cold movie 	매트릭스?ㄴㄴ 짜장 묻은 존윅
+2	redundant concept 	개인의 선택으로 1점을 줬습니다
+3	in world cinema 	보는내내 니 생각만 났다.
+4	on all cylinders 	영화보다가 잠든적은 처음이네요
+5	sit through , 	따뜻한 영화에요~^^추천해요!
+6	heroes 	별로에요 생각보다 노잼임
+7	sharply 	좋아요 가족들과 보기 좋아요
+8	sometimes dry 	♡ 재밌게 잘봤습니다ㅎㅎ
+9	disappointments 	반제 호빗 사랑해요~
+10	the horrors 	똥도 이런 거대한 똥이 없었다..
+11	many pointless 	개지립니다 나만당할순없지
+12	a beautifully 	이게무슨...만화네 만화 ㅉㅉㅉ
+13	a doa 	7광구와 쌍벽을 이루는 망작
+14	no apparent joy 	영화 보다가 중간에 나왔습니다
+15	seem fresh 	최악 그냥 보지 마세요진짜 노잼
+16	weak and 	짱구 극장판은 언제나 최고에요
+17	skip this dreck , 	내 시간은 소중한 거다.
+18	generates 	겁나 재밌는디,,,,
+19	funny yet 	그냥 개재밌음 평점 믿으면 안됨
+20	in memory 	재밋게 잘봣습니다 너무좋습니다요
+21	hawaiian shirt 	밥 먹으면서 보기 좋은 영화
+22	grievous but 	재미와 감동을 겸비한 명작입니다!!
+23	hopeless 	재개봉 감사합니다.정말로
+24	bring tissues . 	끝더 이상 설명이 필요할까.
+25	just too silly 	역시 믿보 황.정.민 배우님~^^
+26	cinematic bon bons 	연출+연기+스토리+영상미+OST
+27	irritates and 	추억에 묻어두지 그랬냐
+28	collapse 	이시대 최고의 코미디 영화
+29	no lika da 	재미있게 관람하였습니다
+30	a welcome relief 	스마우그랑 있을땐 스릴이 많다.
+31	, compelling 	처음으로 극장에서 잤습니다
+32	infectiously 	너무나도 잘봤어요 굿입니댜
+33	imax in short 	ㅈㄹ게 웃기고 잼있네.ㅋ
+34	i hate it . 	연말에 보면 뭉클하다 정말
+35	a good one 	그냥 게임으로 내지 그랬냐.
+36	, plodding picture 	진짜 강추 최고의 한국영화
+37	inane and awful 	진짜최악입니다...명절에보세요
+38	whole mess 	대망작 보지마세요 돈 아까움
+39	enjoy the ride 	이거 볼 시간에 야동이나 봐라
+40	the horror 	너무너무 재밌음 버즈 최고
+41	a dim 	3시간이 전혀 아깝지 않은
+42	amazingly lame . 	졸작이다..
+43	to spare wildlife 	노우잼스ㅡ  이만잡 열자 채우기
+44	carnage and 	2022년 최고 한국영화
+45	second fiddle 	재미없다너무재미없다OST지겹다
+46	a stylish exercise 	나름 재밌게 봄 가볍게 보기 좋은듯
+47	than this mess 	와...감독판이 더좋다... 더긴데
+48	valuable messages 	갑자기 도게자 ㄹㅇㅋㅋ
+49	usual worst 	별점 1점도 주기가 아까운 영화..

gitattributes.txt ADDED Viewed

	@@ -0,0 +1,34 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

klue_roberta-small-2400.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b572a576888999c3696750507168b1ec8c194b93e3b0a5fb69d5932cb61a410
+size 272408049

lid.176.ftz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8f3472cfe8738a7b6099e8e999c3cbfae0dcd15696aac7d7738a8039db603e83
+size 938013

model-1900.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1f0dcb5d42751656f47868d0b1cd793c33bd2c497df57dde5514a2b15a791d05
+size 498658641

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+gradio
+datasets
+transformers
+torch
+pandas
+numpy
+fasttext

roberta-base-1900.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1f0dcb5d42751656f47868d0b1cd793c33bd2c497df57dde5514a2b15a791d05
+size 498658641