chat-PDF-demo

Sleeping

App Files Files Community

JPLTedCas committed on Apr 11, 2024

Commit

dce3048

verified ·

1 Parent(s): 93c3e21

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -243

app.py CHANGED Viewed

@@ -1,29 +1,22 @@
-import os
-import streamlit as st
-from dotenv import load_dotenv
-from PyPDF2 import PdfReader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.document_loaders import UnstructuredPDFLoader
-from langchain.text_splitter import CharacterTextSplitter
-from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.vectorstores import FAISS
-from langchain.chat_models import ChatOpenAI
-from langchain.memory import ConversationBufferMemory
-from langchain.chains import ConversationalRetrievalChain
-from htmlTemplates import css, bot_template, user_template
-from langchain.llms import HuggingFaceHub
-from langchain.vectorstores import Chroma
-from gpt4all import GPT4All
-# set this key as an environment variable
-os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets['huggingface_token']
-def add_logo():
-    st.markdown(
-        f"""
             <style>
                 [data-testid="stSidebar"] {{
                     background-image: url(https://smbk.s3.amazonaws.com/media/organization_logos/111579646d1241f4be17bd7394dcb238.jpg);
@@ -32,220 +25,80 @@ def add_logo():
                     background-position: 20px 20px;
                 }}
             </style>
-            """,
-        unsafe_allow_html=True,
-    )
-def get_pdf_text(pdf_docs : list) -> str:
-    text = ""
-    for pdf in pdf_docs:
-        pdf_reader = PdfReader(pdf)
-        for page in pdf_reader.pages:
-            text += page.extract_text()
-    return text
-def get_pdf_pages(pdf_docs):
-    """
-    Extract text from a list of PDF documents.
-    Parameters
-    ----------
-    pdf_docs : list
-        List of PDF documents to extract text from.
-    Returns
-    -------
-    str
-        Extracted text from all the PDF documents.
-    """
-    pages = []
-    import tempfile
-    with tempfile.TemporaryDirectory() as tmpdirname:
-        for pdf in pdf_docs:
-            pdf_path=os.path.join(tmpdirname,pdf.name)
-            with open(pdf_path, "wb") as f:
-               f.write(pdf.getbuffer())
-            pdf_loader = UnstructuredPDFLoader(pdf_path)
-            pdf_pages = pdf_loader.load_and_split()
-            pages=pages+pdf_pages
-    return pages
-#def get_text_chunks(text:str) ->list:
-#    text_splitter = CharacterTextSplitter(
-#        separator="\n", chunk_size=1500, chunk_overlap=300, length_function=len
-#    )
-#    chunks = text_splitter.split_text(text)
-#    return chunks
-def get_text_chunks(pages):
-    """
-    Split the input text into chunks.
-    Parameters
-    ----------
-    text : str
-        The input text to be split.
-    Returns
-    -------
-    list
-        List of text chunks.
-    """
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=1024, chunk_overlap=64
-    )
-    texts = text_splitter.split_documents(pages)
-    print(str(len(texts)))
-    return texts
-#def get_vectorstore(text_chunks : list) -> FAISS:
-#    model = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
-#    encode_kwargs = {
-#        "normalize_embeddings": True
-#    }  # set True to compute cosine similarity
-#    embeddings = HuggingFaceBgeEmbeddings(
-#        model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
-#    )
-#    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
-#    return vectorstore
-def get_vectorstore(text_chunks):
-    """
-    Generate a vector store from a list of text chunks using HuggingFace BgeEmbeddings.
-    Parameters
-    ----------
-    text_chunks : list
-        List of text chunks to be embedded.
-    Returns
-    -------
-    FAISS
-        A FAISS vector store containing the embeddings of the text chunks.
-    """
-    MODEL_NAME = "WhereIsAI/UAE-Large-V1"
-    MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
-    #MODEL_NAME = "avsolatorio/GIST-Embedding-v0"
-    MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
-    MODEL_NAME="avsolatorio/GIST-Embedding-v0"
-    #MODEL_NAME="intfloat/multilingual-e5-base"
-    #MODEL_NAME="BAAI/bge-base-en-v1.5" Alucina un poco
-    MODEL_NAME="BAAI/bge-large-en-v1.5"
-    hf_embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)
-    vectorstore = Chroma.from_documents(text_chunks, hf_embeddings, persist_directory="db")
-    return vectorstore
-def get_conversation_chain(vectorstore:FAISS) -> ConversationalRetrievalChain:
-    # llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
-    #llm = HuggingFaceHub(
-    #    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
-    #    #repo_id="clibrain/lince-mistral-7b-it-es",
-    #    #repo_id="TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"
-    #    model_kwargs={"temperature": 0.5, "max_length": 2096},#1048
-    #)
-    llm = HuggingFaceHub(
-        repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
-        model_kwargs={"temperature": 0.5, "max_new_tokens": 1024, "max_length": 1048, "top_k": 3, "trust_remote_code": True, "torch_dtype": "auto"},
-    )
-    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
-    conversation_chain = ConversationalRetrievalChain.from_llm(
-        llm=llm, retriever=vectorstore.as_retriever(), memory=memory
-    )
-    return conversation_chain
-#def handle_userinput(user_question:str):
-#    response = st.session_state.conversation({"pregunta": user_question})
-#    st.session_state.chat_history = response["chat_history"]
-#
-#    for i, message in enumerate(st.session_state.chat_history):
- #       if i % 2 == 0:
-#            st.write("   Usuario: " + message.content)
- #       else:
-#            st.write("🤖 ChatBot: " + message.content)
-def handle_userinput(user_question):
-    """
-    Handle user input and generate a response using the conversational retrieval chain.
-    Parameters
-    ----------
-    user_question : str
-        The user's question.
-    """
-    response = st.session_state.conversation({"question": user_question})
-    st.session_state.chat_history = response["chat_history"]
-    for i, message in enumerate(st.session_state.chat_history):
-        if i % 2 == 0:
-            st.write("//_^ User: " + message.content)
-        else:
-            st.write("🤖 ChatBot: " + message.content)
-def main():
-    st.set_page_config(
-        page_title="Chat with a Bot that tries to answer questions about multiple PDFs",
-        page_icon=":books:",
-    )
-    #st.markdown("# Charla con TedCasBot")
-    #st.markdown("Este Bot será tu aliado a la hora de buscar información en múltiples documentos pdf. Déjanos ayudarte! 🙏🏾")
-    st.markdown("# Chat with TedCasBot")
-    st.markdown("This Bot is a powerful AI tool designed to simplify the process of extracting information from PDF documents")
-    st.write(css, unsafe_allow_html=True)
-    if "conversation" not in st.session_state:
-        st.session_state.conversation = None
-    if "chat_history" not in st.session_state:
-        st.session_state.chat_history = None
-    #st.header("Charla con un Bot 🤖🦾 que te ayudará a responder preguntas sobre tus pdfs:")
-    st.header("Chat with the TedCasBot. He will help you with any doubt you may have with your documents:")
-    user_question = st.text_input("Ask what you need!:")
-    if user_question:
-        handle_userinput(user_question)
-    with st.sidebar:
-        add_logo()
-        st.subheader("Your documents")
-        pdf_docs = st.file_uploader(
-            "Upload your documents and ress 'Process'", accept_multiple_files=True
-        )
-        if st.button("Process"):
-            with st.spinner("Processing"):
-                # get pdf text
-                raw_text = get_pdf_text(pdf_docs)
-                pages = get_pdf_pages(pdf_docs)
-                # get the text chunks
-                #text_chunks = get_text_chunks(raw_text)
-                text_chunks = get_text_chunks(pages)
-                # create vector store
-                vectorstore = get_vectorstore(text_chunks)
-                # create conversation chain
-                st.session_state.conversation = get_conversation_chain(vectorstore)
-if __name__ == "__main__":
-    main()

+import os #line:1
+import streamlit as st #line:2
+from dotenv import load_dotenv #line:3
+from PyPDF2 import PdfReader #line:4
+from langchain .text_splitter import RecursiveCharacterTextSplitter #line:5
+from langchain .document_loaders import UnstructuredPDFLoader #line:6
+from langchain .text_splitter import CharacterTextSplitter #line:7
+from langchain .embeddings import HuggingFaceEmbeddings #line:8
+from langchain .vectorstores import FAISS #line:9
+from langchain .chat_models import ChatOpenAI #line:10
+from langchain .memory import ConversationBufferMemory #line:11
+from langchain .chains import ConversationalRetrievalChain #line:12
+from htmlTemplates import css ,bot_template ,user_template #line:13
+from langchain .llms import HuggingFaceHub #line:14
+from langchain .vectorstores import Chroma #line:15
+from gpt4all import GPT4All #line:16
+os .environ ["HUGGINGFACEHUB_API_TOKEN"]=st .secrets ['huggingface_token']#line:20
+def add_logo ():#line:23
+    st .markdown (f"""
             <style>
                 [data-testid="stSidebar"] {{
                     background-image: url(https://smbk.s3.amazonaws.com/media/organization_logos/111579646d1241f4be17bd7394dcb238.jpg);
                     background-position: 20px 20px;
                 }}
             </style>
+            """,unsafe_allow_html =True ,)#line:37
+def get_pdf_text (OOO0OO00OO0OOO0OO :list )->str :#line:43
+    OO0OOO000000O0OOO =""#line:44
+    for O0OO000O0OOO00O0O in OOO0OO00OO0OOO0OO :#line:45
+        O0O00OO0O00O0OOOO =PdfReader (O0OO000O0OOO00O0O )#line:46
+        for OO0OOO000O0000O00 in O0O00OO0O00O0OOOO .pages :#line:47
+            OO0OOO000000O0OOO +=OO0OOO000O0000O00 .extract_text ()#line:48
+    return OO0OOO000000O0OOO #line:49
+def get_pdf_pages (OOOO000000OOOO0O0 ):#line:51
+    ""#line:62
+    OO0OO0O0OO0OO000O =[]#line:63
+    import tempfile #line:64
+    with tempfile .TemporaryDirectory ()as OOO0000O000O00OOO :#line:66
+        for OO0OOO0O000OO0OO0 in OOOO000000OOOO0O0 :#line:67
+            OO0OOO00OOOOOOO0O =os .path .join (OOO0000O000O00OOO ,OO0OOO0O000OO0OO0 .name )#line:68
+            with open (OO0OOO00OOOOOOO0O ,"wb")as O0OOOOO0O0O0OO00O :#line:69
+               O0OOOOO0O0O0OO00O .write (OO0OOO0O000OO0OO0 .getbuffer ())#line:70
+            OOO000OO0OO00OOO0 =UnstructuredPDFLoader (OO0OOO00OOOOOOO0O )#line:72
+            OOOO0OOOOO000OOO0 =OOO000OO0OO00OOO0 .load_and_split ()#line:73
+            OO0OO0O0OO0OO000O =OO0OO0O0OO0OO000O +OOOO0OOOOO000OOO0 #line:74
+    return OO0OO0O0OO0OO000O #line:75
+def get_text_chunks (OOOO00OOOOO0O00OO ):#line:85
+    ""#line:96
+    OO0OOO00O000OO0OO =RecursiveCharacterTextSplitter (chunk_size =1024 ,chunk_overlap =64 )#line:99
+    O00O0OOOOOOOOO00O =OO0OOO00O000OO0OO .split_documents (OOOO00OOOOO0O00OO )#line:100
+    print (str (len (O00O0OOOOOOOOO00O )))#line:101
+    return O00O0OOOOOOOOO00O #line:102
+def get_vectorstore (O00000O0O0OOOO0OO ):#line:119
+    ""#line:130
+    O000O00OO00O00OO0 ="WhereIsAI/UAE-Large-V1"#line:131
+    O000O00OO00O00OO0 ="sentence-transformers/all-MiniLM-L6-v2"#line:132
+    O000O00OO00O00OO0 ="intfloat/e5-mistral-7b-instruct"#line:134
+    O000O00OO00O00OO0 ="avsolatorio/GIST-Embedding-v0"#line:135
+    O000O00OO00O00OO0 ="BAAI/bge-large-en-v1.5"#line:138
+    O0O0OO0O0O00O0O00 =HuggingFaceEmbeddings (model_name =O000O00OO00O00OO0 )#line:139
+    O00O0OOOO0O0000OO =Chroma .from_documents (O00000O0O0OOOO0OO ,O0O0OO0O0O00O0O00 ,persist_directory ="db")#line:140
+    return O00O0OOOO0O0000OO #line:141
+def get_conversation_chain (OOOOOOO0OOOO0000O :FAISS )->ConversationalRetrievalChain :#line:146
+    O000OO0O00000O0O0 =HuggingFaceHub (repo_id ="mistralai/Mixtral-8x7B-Instruct-v0.1",model_kwargs ={"temperature":0.5 ,"max_new_tokens":1024 ,"max_length":1048 ,"top_k":3 ,"trust_remote_code":True ,"torch_dtype":"auto"},)#line:157
+    OO0000OOO00000000 =ConversationBufferMemory (memory_key ="chat_history",return_messages =True )#line:162
+    OOO0OO0O00OO0O0O0 =ConversationalRetrievalChain .from_llm (llm =O000OO0O00000O0O0 ,retriever =OOOOOOO0OOOO0000O .as_retriever (),memory =OO0000OOO00000000 )#line:165
+    return OOO0OO0O00OO0O0O0 #line:166
+def handle_userinput (OO000OO000O0O0000 ):#line:180
+    ""#line:187
+    O0OOO0O0OOO0OO00O =st .session_state .conversation ({"question":OO000OO000O0O0000 })#line:188
+    st .session_state .chat_history =O0OOO0O0OOO0OO00O ["chat_history"]#line:189
+    for O0OOOOOOOO0OOOOOO ,O0O00OOOOOOOO0O00 in enumerate (st .session_state .chat_history ):#line:191
+        if O0OOOOOOOO0OOOOOO %2 ==0 :#line:192
+            st .write ("//_^ User: "+O0O00OOOOOOOO0O00 .content )#line:193
+        else :#line:194
+            st .write ("🤖 ChatBot: "+O0O00OOOOOOOO0O00 .content )#line:195
+def main ():#line:200
+    st .set_page_config (page_title ="Chat with a Bot that tries to answer questions about multiple PDFs",page_icon =":books:",)#line:204
+    st .markdown ("# Chat with TedCasBot")#line:208
+    st .markdown ("This Bot is a powerful AI tool designed to simplify the process of extracting information from PDF documents")#line:209
+    st .write (css ,unsafe_allow_html =True )#line:211
+    if "conversation"not in st .session_state :#line:214
+        st .session_state .conversation =None #line:215
+    if "chat_history"not in st .session_state :#line:216
+        st .session_state .chat_history =None #line:217
+    st .header ("Chat with the TedCasBot. He will help you with any doubt you may have with your documents:")#line:221
+    O00O00O00OO0000OO =st .text_input ("Ask what you need!:")#line:223
+    if O00O00O00OO0000OO :#line:224
+        handle_userinput (O00O00O00OO0000OO )#line:225
+    with st .sidebar :#line:228
+        add_logo ()#line:229
+        st .subheader ("Your documents")#line:230
+        O00O0O0O0O000000O =st .file_uploader ("Upload your documents and ress 'Process'",accept_multiple_files =True )#line:233
+        if st .button ("Process"):#line:234
+            with st .spinner ("Processing"):#line:235
+                O000000OOO00OO0O0 =get_pdf_text (O00O0O0O0O000000O )#line:237
+                OOOOOO000O000O00O =get_pdf_pages (O00O0O0O0O000000O )#line:238
+                O0000O00O0OOO0O00 =get_text_chunks (OOOOOO000O000O00O )#line:242
+                OO0O0OOO0O0000O0O =get_vectorstore (O0000O00O0OOO0O00 )#line:244
+                st .session_state .conversation =get_conversation_chain (OO0O0OOO0O0000O0O )#line:247
+if __name__ =="__main__":#line:250
+    main ()