import os
import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from htmlTemplates import css, bot_template, user_template
from langchain.llms import HuggingFaceHub
from langchain.vectorstores import Chroma
from gpt4all import GPT4All

os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets['huggingface_token']
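# The Hugging Face token is read from Streamlit secrets. A minimal sketch of the
# expected entry (assuming the usual .streamlit/secrets.toml location):
#
#   huggingface_token = "hf_..."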

def add_logo():
    """Inject CSS that shows a logo image as the sidebar background."""
    st.markdown(
        f"""
        <style>
            [data-testid="stSidebar"] {{
                background-image: url(https://smbk.s3.amazonaws.com/media/organization_logos/111579646d1241f4be17bd7394dcb238.jpg);
                background-repeat: no-repeat;
                padding-top: 80px;
                background-position: 20px 20px;
            }}
        </style>
        """,
        unsafe_allow_html=True,
    )

def get_pdf_text(pdf_docs: list) -> str:
    """Concatenate the extracted text of every page of the uploaded PDFs."""
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text

def get_pdf_pages(pdf_docs):
    """Write each uploaded PDF to a temporary file and load it as LangChain documents."""
    pages = []
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        for pdf in pdf_docs:
            tmp_path = os.path.join(tmp_dir, pdf.name)
            with open(tmp_path, "wb") as f:
                f.write(pdf.getbuffer())
            loader = UnstructuredPDFLoader(tmp_path)
            documents = loader.load_and_split()
            pages = pages + documents
    return pages

def get_text_chunks(pages):
    """Split the loaded documents into overlapping chunks for embedding."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
    chunks = text_splitter.split_documents(pages)
    print(str(len(chunks)))  # log the chunk count to the console
    return chunks

def get_vectorstore(chunks):
    """Embed the chunks and persist them in a local Chroma vector store."""
    # Embedding models tried previously:
    # "WhereIsAI/UAE-Large-V1", "sentence-transformers/all-MiniLM-L6-v2",
    # "intfloat/e5-mistral-7b-instruct", "avsolatorio/GIST-Embedding-v0"
    model_name = "BAAI/bge-large-en-v1.5"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    vectorstore = Chroma.from_documents(chunks, embeddings, persist_directory="db")
    return vectorstore
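# Since the store is persisted to "db", a later run could reopen it instead of
# re-embedding the documents, e.g. (sketch, assuming the same embedding model):
#
#   vectorstore = Chroma(persist_directory="db",
#                        embedding_function=HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5"))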

def get_conversation_chain(vectorstore: Chroma) -> ConversationalRetrievalChain:
    """Wire the vector store, a hosted Mixtral LLM and chat memory into a retrieval chain."""
    llm = HuggingFaceHub(
        repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
        model_kwargs={"temperature": 0.5, "max_new_tokens": 1024, "max_length": 1048,
                      "top_k": 3, "trust_remote_code": True, "torch_dtype": "auto"},
    )
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    return ConversationalRetrievalChain.from_llm(llm=llm, retriever=vectorstore.as_retriever(), memory=memory)

def handle_userinput(user_question):
    """Run the question through the conversation chain and render the chat history."""
    response = st.session_state.conversation({"question": user_question})
    st.session_state.chat_history = response["chat_history"]
    # Messages alternate: even indices are the user, odd indices are the bot.
    for i, message in enumerate(st.session_state.chat_history):
        if i % 2 == 0:
            st.write("//_^ User: " + message.content)
        else:
            st.write("🤖 ChatBot: " + message.content)

def main():
    st.set_page_config(
        page_title="Chat with a Bot that tries to answer questions about multiple PDFs",
        page_icon=":books:",
    )
    st.markdown("# Chat with TedCasBot")
    st.markdown("This Bot is a powerful AI tool designed to simplify the process of extracting information from PDF documents")
    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with the TedCasBot. It will help you with any doubt you may have about your documents:")

    user_question = st.text_input("Ask what you need!:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        add_logo()
        st.subheader("Your documents")
        pdf_docs = st.file_uploader("Upload your documents and press 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            with st.spinner("Processing"):
                raw_text = get_pdf_text(pdf_docs)  # raw text is extracted but not used downstream
                pages = get_pdf_pages(pdf_docs)
                text_chunks = get_text_chunks(pages)
                vectorstore = get_vectorstore(text_chunks)
                st.session_state.conversation = get_conversation_chain(vectorstore)


if __name__ == "__main__":
    main()
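# A minimal run sketch (assuming this file is saved as app.py next to htmlTemplates.py,
# with the 'huggingface_token' secret configured as above):
#
#   streamlit run app.py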