JPLTedCas committed on
Commit
dce3048
·
verified ·
1 Parent(s): 93c3e21

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -243
app.py CHANGED
@@ -1,29 +1,22 @@
1
- import os
2
- import streamlit as st
3
- from dotenv import load_dotenv
4
- from PyPDF2 import PdfReader
5
- from langchain.text_splitter import RecursiveCharacterTextSplitter
6
- from langchain.document_loaders import UnstructuredPDFLoader
7
- from langchain.text_splitter import CharacterTextSplitter
8
- from langchain.embeddings import HuggingFaceEmbeddings
9
- from langchain.vectorstores import FAISS
10
- from langchain.chat_models import ChatOpenAI
11
- from langchain.memory import ConversationBufferMemory
12
- from langchain.chains import ConversationalRetrievalChain
13
- from htmlTemplates import css, bot_template, user_template
14
- from langchain.llms import HuggingFaceHub
15
- from langchain.vectorstores import Chroma
16
- from gpt4all import GPT4All
17
-
18
-
19
- # set this key as an environment variable
20
- os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets['huggingface_token']
21
-
22
-
23
- def add_logo():
24
-
25
- st.markdown(
26
- f"""
27
  <style>
28
  [data-testid="stSidebar"] {{
29
  background-image: url(https://smbk.s3.amazonaws.com/media/organization_logos/111579646d1241f4be17bd7394dcb238.jpg);
@@ -32,220 +25,80 @@ def add_logo():
32
  background-position: 20px 20px;
33
  }}
34
  </style>
35
- """,
36
- unsafe_allow_html=True,
37
- )
38
-
39
-
40
-
41
-
42
-
43
- def get_pdf_text(pdf_docs : list) -> str:
44
- text = ""
45
- for pdf in pdf_docs:
46
- pdf_reader = PdfReader(pdf)
47
- for page in pdf_reader.pages:
48
- text += page.extract_text()
49
- return text
50
-
51
- def get_pdf_pages(pdf_docs):
52
- """
53
- Extract text from a list of PDF documents.
54
- Parameters
55
- ----------
56
- pdf_docs : list
57
- List of PDF documents to extract text from.
58
- Returns
59
- -------
60
- str
61
- Extracted text from all the PDF documents.
62
- """
63
- pages = []
64
- import tempfile
65
-
66
- with tempfile.TemporaryDirectory() as tmpdirname:
67
- for pdf in pdf_docs:
68
- pdf_path=os.path.join(tmpdirname,pdf.name)
69
- with open(pdf_path, "wb") as f:
70
- f.write(pdf.getbuffer())
71
-
72
- pdf_loader = UnstructuredPDFLoader(pdf_path)
73
- pdf_pages = pdf_loader.load_and_split()
74
- pages=pages+pdf_pages
75
- return pages
76
-
77
-
78
- #def get_text_chunks(text:str) ->list:
79
- # text_splitter = CharacterTextSplitter(
80
- # separator="\n", chunk_size=1500, chunk_overlap=300, length_function=len
81
- # )
82
- # chunks = text_splitter.split_text(text)
83
- # return chunks
84
-
85
- def get_text_chunks(pages):
86
- """
87
- Split the input text into chunks.
88
- Parameters
89
- ----------
90
- text : str
91
- The input text to be split.
92
- Returns
93
- -------
94
- list
95
- List of text chunks.
96
- """
97
- text_splitter = RecursiveCharacterTextSplitter(
98
- chunk_size=1024, chunk_overlap=64
99
- )
100
- texts = text_splitter.split_documents(pages)
101
- print(str(len(texts)))
102
- return texts
103
-
104
-
105
-
106
-
107
- #def get_vectorstore(text_chunks : list) -> FAISS:
108
- # model = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
109
- # encode_kwargs = {
110
- # "normalize_embeddings": True
111
- # } # set True to compute cosine similarity
112
- # embeddings = HuggingFaceBgeEmbeddings(
113
- # model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
114
- # )
115
- # vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
116
- # return vectorstore
117
-
118
-
119
- def get_vectorstore(text_chunks):
120
- """
121
- Generate a vector store from a list of text chunks using HuggingFace BgeEmbeddings.
122
- Parameters
123
- ----------
124
- text_chunks : list
125
- List of text chunks to be embedded.
126
- Returns
127
- -------
128
- FAISS
129
- A FAISS vector store containing the embeddings of the text chunks.
130
- """
131
- MODEL_NAME = "WhereIsAI/UAE-Large-V1"
132
- MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
133
- #MODEL_NAME = "avsolatorio/GIST-Embedding-v0"
134
- MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
135
- MODEL_NAME="avsolatorio/GIST-Embedding-v0"
136
- #MODEL_NAME="intfloat/multilingual-e5-base"
137
- #MODEL_NAME="BAAI/bge-base-en-v1.5" Alucina un poco
138
- MODEL_NAME="BAAI/bge-large-en-v1.5"
139
- hf_embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)
140
- vectorstore = Chroma.from_documents(text_chunks, hf_embeddings, persist_directory="db")
141
- return vectorstore
142
-
143
-
144
-
145
-
146
- def get_conversation_chain(vectorstore:FAISS) -> ConversationalRetrievalChain:
147
- # llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
148
- #llm = HuggingFaceHub(
149
- # repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
150
- # #repo_id="clibrain/lince-mistral-7b-it-es",
151
- # #repo_id="TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"
152
- # model_kwargs={"temperature": 0.5, "max_length": 2096},#1048
153
- #)
154
- llm = HuggingFaceHub(
155
- repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
156
- model_kwargs={"temperature": 0.5, "max_new_tokens": 1024, "max_length": 1048, "top_k": 3, "trust_remote_code": True, "torch_dtype": "auto"},
157
- )
158
-
159
-
160
-
161
-
162
- memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
163
- conversation_chain = ConversationalRetrievalChain.from_llm(
164
- llm=llm, retriever=vectorstore.as_retriever(), memory=memory
165
- )
166
- return conversation_chain
167
-
168
-
169
- #def handle_userinput(user_question:str):
170
- # response = st.session_state.conversation({"pregunta": user_question})
171
- # st.session_state.chat_history = response["chat_history"]
172
- #
173
- # for i, message in enumerate(st.session_state.chat_history):
174
- # if i % 2 == 0:
175
- # st.write(" Usuario: " + message.content)
176
- # else:
177
- # st.write("🤖 ChatBot: " + message.content)
178
-
179
-
180
- def handle_userinput(user_question):
181
- """
182
- Handle user input and generate a response using the conversational retrieval chain.
183
- Parameters
184
- ----------
185
- user_question : str
186
- The user's question.
187
- """
188
- response = st.session_state.conversation({"question": user_question})
189
- st.session_state.chat_history = response["chat_history"]
190
-
191
- for i, message in enumerate(st.session_state.chat_history):
192
- if i % 2 == 0:
193
- st.write("//_^ User: " + message.content)
194
- else:
195
- st.write("🤖 ChatBot: " + message.content)
196
-
197
-
198
-
199
-
200
- def main():
201
- st.set_page_config(
202
- page_title="Chat with a Bot that tries to answer questions about multiple PDFs",
203
- page_icon=":books:",
204
- )
205
-
206
- #st.markdown("# Charla con TedCasBot")
207
- #st.markdown("Este Bot será tu aliado a la hora de buscar información en múltiples documentos pdf. Déjanos ayudarte! 🙏🏾")
208
- st.markdown("# Chat with TedCasBot")
209
- st.markdown("This Bot is a powerful AI tool designed to simplify the process of extracting information from PDF documents")
210
-
211
- st.write(css, unsafe_allow_html=True)
212
-
213
-
214
- if "conversation" not in st.session_state:
215
- st.session_state.conversation = None
216
- if "chat_history" not in st.session_state:
217
- st.session_state.chat_history = None
218
-
219
-
220
- #st.header("Charla con un Bot 🤖🦾 que te ayudará a responder preguntas sobre tus pdfs:")
221
- st.header("Chat with the TedCasBot. He will help you with any doubt you may have with your documents:")
222
-
223
- user_question = st.text_input("Ask what you need!:")
224
- if user_question:
225
- handle_userinput(user_question)
226
-
227
-
228
- with st.sidebar:
229
- add_logo()
230
- st.subheader("Your documents")
231
- pdf_docs = st.file_uploader(
232
- "Upload your documents and ress 'Process'", accept_multiple_files=True
233
- )
234
- if st.button("Process"):
235
- with st.spinner("Processing"):
236
- # get pdf text
237
- raw_text = get_pdf_text(pdf_docs)
238
- pages = get_pdf_pages(pdf_docs)
239
-
240
- # get the text chunks
241
- #text_chunks = get_text_chunks(raw_text)
242
- text_chunks = get_text_chunks(pages)
243
- # create vector store
244
- vectorstore = get_vectorstore(text_chunks)
245
-
246
- # create conversation chain
247
- st.session_state.conversation = get_conversation_chain(vectorstore)
248
-
249
-
250
- if __name__ == "__main__":
251
- main()
 
1
+ import os #line:1
2
+ import streamlit as st #line:2
3
+ from dotenv import load_dotenv #line:3
4
+ from PyPDF2 import PdfReader #line:4
5
+ from langchain .text_splitter import RecursiveCharacterTextSplitter #line:5
6
+ from langchain .document_loaders import UnstructuredPDFLoader #line:6
7
+ from langchain .text_splitter import CharacterTextSplitter #line:7
8
+ from langchain .embeddings import HuggingFaceEmbeddings #line:8
9
+ from langchain .vectorstores import FAISS #line:9
10
+ from langchain .chat_models import ChatOpenAI #line:10
11
+ from langchain .memory import ConversationBufferMemory #line:11
12
+ from langchain .chains import ConversationalRetrievalChain #line:12
13
+ from htmlTemplates import css ,bot_template ,user_template #line:13
14
+ from langchain .llms import HuggingFaceHub #line:14
15
+ from langchain .vectorstores import Chroma #line:15
16
+ from gpt4all import GPT4All #line:16
17
# Expose the Hugging Face token from Streamlit's secrets as an environment
# variable at import time, before any of the functions below are called.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["huggingface_token"]
18
+ def add_logo ():#line:23
19
+ st .markdown (f"""
 
 
 
 
 
 
 
20
  <style>
21
  [data-testid="stSidebar"] {{
22
  background-image: url(https://smbk.s3.amazonaws.com/media/organization_logos/111579646d1241f4be17bd7394dcb238.jpg);
 
25
  background-position: 20px 20px;
26
  }}
27
  </style>
28
+ """,unsafe_allow_html =True ,)#line:37
29
def get_pdf_text(pdf_docs: list) -> str:
    """Concatenate the extracted text of every page of every PDF.

    Parameters
    ----------
    pdf_docs : list
        PDF sources; each element is passed unchanged to ``PdfReader``.

    Returns
    -------
    str
        The page texts joined in document order with no separator. An empty
        string when ``pdf_docs`` is empty.
    """
    # ``or ""`` keeps the join safe if a page yields no text.
    # NOTE(review): assumes extract_text() can return None/"" for image-only
    # pages -- confirm against the installed PyPDF2 version.
    return "".join(
        page.extract_text() or ""
        for pdf in pdf_docs
        for page in PdfReader(pdf).pages
    )
36
def get_pdf_pages(pdf_docs):
    """Load uploaded PDFs through ``UnstructuredPDFLoader`` and collect the results.

    Each upload is first written to a temporary directory because the loader
    is constructed from a filesystem path.

    Parameters
    ----------
    pdf_docs : list
        Uploaded files; each must expose ``.name`` and ``.getbuffer()``.

    Returns
    -------
    list
        Everything returned by ``load_and_split()`` for each file, in upload
        order. Empty when ``pdf_docs`` is empty.
    """
    import tempfile

    pages = []
    with tempfile.TemporaryDirectory() as tmp_dir:
        for pdf in pdf_docs:
            # The name comes from the upload, so strip any directory part to
            # guarantee the file is written inside tmp_dir.
            pdf_path = os.path.join(tmp_dir, os.path.basename(pdf.name))
            with open(pdf_path, "wb") as out_file:
                out_file.write(pdf.getbuffer())

            # Load while the temporary file still exists.
            pages.extend(UnstructuredPDFLoader(pdf_path).load_and_split())
    return pages
49
def get_text_chunks(pages, chunk_size=1024, chunk_overlap=64):
    """Split loaded documents into overlapping chunks.

    Parameters
    ----------
    pages : list
        Documents to split, as produced by ``get_pdf_pages``.
    chunk_size : int, optional
        Forwarded to ``RecursiveCharacterTextSplitter``; defaults to the
        value that was previously hard-coded (1024).
    chunk_overlap : int, optional
        Forwarded to ``RecursiveCharacterTextSplitter``; defaults to the
        value that was previously hard-coded (64).

    Returns
    -------
    list
        The chunks returned by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = splitter.split_documents(pages)
    # Diagnostic output kept from the original: number of chunks produced.
    print(len(chunks))
    return chunks
55
def get_vectorstore(
    text_chunks,
    model_name="BAAI/bge-large-en-v1.5",
    persist_directory="db",
):
    """Embed the chunks and store them in a Chroma vector store.

    Parameters
    ----------
    text_chunks : list
        Document chunks to embed, as produced by ``get_text_chunks``.
    model_name : str, optional
        Model passed to ``HuggingFaceEmbeddings``. The default is the value
        the original code ended up using after several overwritten
        assignments.
    persist_directory : str, optional
        Directory handed to ``Chroma.from_documents``; defaults to ``"db"``.

    Returns
    -------
    Chroma
        The store built by ``Chroma.from_documents``.
    """
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return Chroma.from_documents(
        text_chunks, embeddings, persist_directory=persist_directory
    )
65
def get_conversation_chain(vectorstore: FAISS) -> ConversationalRetrievalChain:
    """Build a conversational retrieval chain on top of a vector store.

    Parameters
    ----------
    vectorstore
        Store whose ``as_retriever()`` supplies the chain's retriever.
        NOTE(review): the annotation says ``FAISS``, but ``main`` passes the
        Chroma store returned by ``get_vectorstore``; only ``as_retriever()``
        is used here.

    Returns
    -------
    ConversationalRetrievalChain
        Chain wired to the Mixtral model on the Hugging Face Hub, with a
        ``ConversationBufferMemory`` stored under the ``"chat_history"`` key.
    """
    llm = HuggingFaceHub(
        repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
        model_kwargs={
            "temperature": 0.5,
            "max_new_tokens": 1024,
            "max_length": 1048,
            "top_k": 3,
            "trust_remote_code": True,
            "torch_dtype": "auto",
        },
    )
    memory = ConversationBufferMemory(
        memory_key="chat_history", return_messages=True
    )
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
70
def handle_userinput(user_question):
    """Send a question to the conversation chain and render the chat history.

    Parameters
    ----------
    user_question : str
        The question typed by the user.

    Notes
    -----
    Reads ``st.session_state.conversation`` and overwrites
    ``st.session_state.chat_history`` with the chain's ``"chat_history"``.
    """
    conversation = st.session_state.conversation
    if conversation is None:
        # ``main`` initialises the chain to None until "Process" has run;
        # calling None would raise TypeError, so tell the user instead.
        st.warning(
            "Please upload and process your documents before asking a question."
        )
        return

    response = conversation({"question": user_question})
    st.session_state.chat_history = response["chat_history"]

    # Entries are rendered alternately: even positions as the user, odd
    # positions as the bot.
    for index, message in enumerate(st.session_state.chat_history):
        prefix = "//_^ User: " if index % 2 == 0 else "🤖 ChatBot: "
        st.write(prefix + message.content)
79
def main():
    """Render the Streamlit page: chat input in the body, PDF upload in the sidebar."""
    st.set_page_config(
        page_title="Chat with a Bot that tries to answer questions about multiple PDFs",
        page_icon=":books:",
    )

    st.markdown("# Chat with TedCasBot")
    st.markdown(
        "This Bot is a powerful AI tool designed to simplify the process of extracting information from PDF documents"
    )
    st.write(css, unsafe_allow_html=True)

    # Seed the session keys so later reads never hit a missing attribute.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header(
        "Chat with the TedCasBot. He will help you with any doubt you may have with your documents:"
    )

    user_question = st.text_input("Ask what you need!:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        add_logo()
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your documents and press 'Process'",
            accept_multiple_files=True,
        )
        if st.button("Process"):
            if not pdf_docs:
                # Nothing uploaded: skip the pipeline rather than building a
                # vector store from an empty document list.
                st.warning("Please upload at least one document first.")
            else:
                with st.spinner("Processing"):
                    # Pipeline: load pages -> split into chunks -> embed ->
                    # build the chain used by handle_userinput.
                    pages = get_pdf_pages(pdf_docs)
                    text_chunks = get_text_chunks(pages)
                    vectorstore = get_vectorstore(text_chunks)
                    st.session_state.conversation = get_conversation_chain(
                        vectorstore
                    )
103
# Script entry point: only start the app when this file is executed directly.
if __name__ == "__main__":
    main()