"""Gradio chat demo for the fnlp/moss-moon-003-sft-plugin conversational model."""

import gradio as gr
from transformers import AutoModel, AutoTokenizer

MODEL_NAME = "fnlp/moss-moon-003-sft-plugin"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Half precision on GPU; eval mode disables dropout for inference.
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True).half().cuda()
model = model.eval()


def predict(input, history=None):
    """Generate the model's next reply for *input* given the chat *history*.

    Parameters
    ----------
    input : str
        The user's new message.
    history : list[tuple[str, str]] | None
        Prior (user, assistant) turns; None is treated as an empty chat.

    Returns
    -------
    tuple[list, list]
        The updated history twice, so Gradio can refresh both the Chatbot
        widget and the State component from one callback.
    """
    if history is None:
        history = []

    # Rebuild the full MOSS-format conversation prompt from the history.
    prompt = ""
    for q, r in history:
        prompt += "<|Human|>: " + q + "\n"
        prompt += "<|MOSS|>: " + r + "\n"
    prompt += "<|Human|>: " + input + "\n<|MOSS|>:"

    inputs = tokenizer(prompt, return_tensors="pt")
    # BUG FIX: the model lives on the GPU but the tokenized tensors were left
    # on the CPU, so generate() would fail with a device mismatch. Move the
    # whole BatchEncoding to the model's device.
    inputs = inputs.to(model.device)

    outputs = model.generate(
        **inputs,
        do_sample=True,
        temperature=0.7,
        top_p=0.8,
        repetition_penalty=1.1,
        max_new_tokens=512,
    )

    # BUG FIX: the original sliced the decoded string with a magic character
    # offset (len(prompt) + 2), which breaks whenever decoding inserts special
    # tokens or retokenization shifts offsets. Decoding only the newly
    # generated token ids is offset-free and robust.
    prompt_len = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(outputs[0][prompt_len:])

    history = history + [(input, completion)]
    return history, history


with gr.Blocks() as demo:
    gr.Markdown("moss-moon-003-sft-plugin")
    state = gr.State([])
    chatbot = gr.Chatbot([], elem_id="chatbot").style(height=400)
    with gr.Row():
        with gr.Column(scale=4):
            txt = gr.Textbox(
                show_label=False,
                placeholder="Enter text and press enter",
            ).style(container=False)
        with gr.Column(scale=1):
            button = gr.Button("Generate")
    # Both pressing Enter in the textbox and clicking the button run predict.
    txt.submit(predict, [txt, state], [chatbot, state])
    button.click(predict, [txt, state], [chatbot, state])

demo.queue().launch()