chore: init
- .gitignore +2 -0
- README.md +6 -5
- app.bak +511 -0
- app.py +367 -0
- requirements.txt +41 -0
- scripts/cookies.py +715 -0
- scripts/gaia_scorer.py +124 -0
- scripts/mdconvert.py +949 -0
- scripts/reformulator.py +86 -0
- scripts/run_agents.py +87 -0
- scripts/text_inspector_tool.py +122 -0
- scripts/text_web_browser.py +563 -0
- scripts/visual_qa.py +187 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
.idea/
.env
README.md
CHANGED
@@ -1,13 +1,14 @@
 ---
-title: Cata Deep
-emoji:
-colorFrom:
-colorTo:
+title: Cata Deep-Research
+emoji: 🏆
+colorFrom: yellow
+colorTo: purple
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.14.0
 app_file: app.py
 pinned: false
 license: apache-2.0
+short_description: Cata's Deep Research
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.bak
ADDED
@@ -0,0 +1,511 @@
import argparse
import json
import mimetypes
import os
import re
import shutil
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
from typing import List, Optional

import datasets
import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import login
import gradio as gr

from scripts.reformulator import prepare_response
from scripts.run_agents import (
    get_single_file_description,
    get_zip_description,
)
from scripts.text_inspector_tool import TextInspectorTool
from scripts.text_web_browser import (
    ArchiveSearchTool,
    FinderTool,
    FindNextTool,
    PageDownTool,
    PageUpTool,
    SimpleTextBrowser,
    VisitTool,
)
from scripts.visual_qa import visualizer
from tqdm import tqdm

from smolagents import (
    CodeAgent,
    HfApiModel,
    LiteLLMModel,
    Model,
    ToolCallingAgent,
)
from smolagents.agent_types import AgentText, AgentImage, AgentAudio
from smolagents.gradio_ui import pull_messages_from_step, handle_agent_output_types

from smolagents import Tool


class GoogleSearchTool(Tool):
    name = "web_search"
    description = """Performs a google web search for your query then returns a string of the top search results."""
    inputs = {
        "query": {"type": "string", "description": "The search query to perform."},
        "filter_year": {
            "type": "integer",
            "description": "Optionally restrict results to a certain year",
            "nullable": True,
        },
    }
    output_type = "string"

    def __init__(self):
        super().__init__()
        self.serpapi_key = os.getenv("SERPER_API_KEY")

    def forward(self, query: str, filter_year: Optional[int] = None) -> str:
        import requests

        if self.serpapi_key is None:
            raise ValueError("Missing SerpAPI key. Make sure you have 'SERPER_API_KEY' in your env variables.")

        params = {
            "engine": "google",
            "q": query,
            "api_key": self.serpapi_key,
            "google_domain": "google.com",
        }

        headers = {
            "X-API-KEY": self.serpapi_key,
            "Content-Type": "application/json",
        }

        if filter_year is not None:
            params["tbs"] = f"cdr:1,cd_min:01/01/{filter_year},cd_max:12/31/{filter_year}"

        response = requests.request("POST", "https://google.serper.dev/search", headers=headers, data=json.dumps(params))

        if response.status_code == 200:
            results = response.json()
        else:
            raise ValueError(response.json())

        if "organic" not in results.keys():
            print("Unexpected response keys:", results.keys())
            if filter_year is not None:
                raise Exception(
                    f"No results found for query: '{query}' with filtering on year={filter_year}. Use a less restrictive query or do not filter on year."
                )
            else:
                raise Exception(f"No results found for query: '{query}'. Use a less restrictive query.")
        if len(results["organic"]) == 0:
            year_filter_message = f" with filter year={filter_year}" if filter_year is not None else ""
            return f"No results found for '{query}'{year_filter_message}. Try with a more general query, or remove the year filter."

        web_snippets = []
        if "organic" in results:
            for idx, page in enumerate(results["organic"]):
                date_published = ""
                if "date" in page:
                    date_published = "\nDate published: " + page["date"]

                source = ""
                if "source" in page:
                    source = "\nSource: " + page["source"]

                snippet = ""
                if "snippet" in page:
                    snippet = "\n" + page["snippet"]

                redacted_version = f"{idx}. [{page['title']}]({page['link']}){date_published}{source}\n{snippet}"

                redacted_version = redacted_version.replace("Your browser can't play this video.", "")
                web_snippets.append(redacted_version)

        return "## Search Results\n" + "\n\n".join(web_snippets)


# web_search = GoogleSearchTool()
# print(web_search(query="Donald Trump news"))
# quit()

AUTHORIZED_IMPORTS = [
    "requests",
    "zipfile",
    "os",
    "pandas",
    "numpy",
    "sympy",
    "json",
    "bs4",
    "pubchempy",
    "xml",
    "yahoo_finance",
    "Bio",
    "sklearn",
    "scipy",
    "pydub",
    "io",
    "PIL",
    "chess",
    "PyPDF2",
    "pptx",
    "torch",
    "datetime",
    "fractions",
    "csv",
]

load_dotenv(override=True)
login(os.getenv("HF_TOKEN"))

append_answer_lock = threading.Lock()

custom_role_conversions = {"tool-call": "assistant", "tool-response": "user"}

user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"

BROWSER_CONFIG = {
    "viewport_size": 1024 * 5,
    "downloads_folder": "downloads_folder",
    "request_kwargs": {
        "headers": {"User-Agent": user_agent},
        "timeout": 300,
    },
    "serpapi_key": os.getenv("SERPAPI_API_KEY"),
}

os.makedirs(f"./{BROWSER_CONFIG['downloads_folder']}", exist_ok=True)

model = HfApiModel(
    custom_role_conversions=custom_role_conversions,
)

text_limit = 20000
ti_tool = TextInspectorTool(model, text_limit)

browser = SimpleTextBrowser(**BROWSER_CONFIG)

WEB_TOOLS = [
    GoogleSearchTool(),
    VisitTool(browser),
    PageUpTool(browser),
    PageDownTool(browser),
    FinderTool(browser),
    FindNextTool(browser),
    ArchiveSearchTool(browser),
    TextInspectorTool(model, text_limit),
]


# Agent creation in a factory function
def create_agent():
    """Creates a fresh agent instance for each session"""
    return CodeAgent(
        model=model,
        tools=[visualizer] + WEB_TOOLS,
        max_steps=10,
        verbosity_level=1,
        additional_authorized_imports=AUTHORIZED_IMPORTS,
        planning_interval=4,
    )


document_inspection_tool = TextInspectorTool(model, 20000)


def stream_to_gradio(
    agent,
    task: str,
    reset_agent_memory: bool = False,
    additional_args: Optional[dict] = None,
):
    """Runs an agent with the given task and streams the messages from the agent as gradio ChatMessages."""
    for step_log in agent.run(task, stream=True, reset=reset_agent_memory, additional_args=additional_args):
        for message in pull_messages_from_step(
            step_log,
        ):
            yield message

    final_answer = step_log  # Last log is the run's final_answer
    final_answer = handle_agent_output_types(final_answer)

    if isinstance(final_answer, AgentText):
        yield gr.ChatMessage(
            role="assistant",
            content=f"**Final answer:**\n{final_answer.to_string()}\n",
        )
    elif isinstance(final_answer, AgentImage):
        yield gr.ChatMessage(
            role="assistant",
            content={"path": final_answer.to_string(), "mime_type": "image/png"},
        )
    elif isinstance(final_answer, AgentAudio):
        yield gr.ChatMessage(
            role="assistant",
            content={"path": final_answer.to_string(), "mime_type": "audio/wav"},
        )
    else:
        yield gr.ChatMessage(role="assistant", content=f"**Final answer:** {str(final_answer)}")


class GradioUI:
    """A one-line interface to launch your agent in Gradio"""

    def __init__(self, file_upload_folder: str | None = None):
        self.file_upload_folder = file_upload_folder
        if self.file_upload_folder is not None:
            if not os.path.exists(file_upload_folder):
                os.mkdir(file_upload_folder)

    def interact_with_agent(self, prompt, messages, session_state):
        # Get or create session-specific agent
        if "agent" not in session_state:
            session_state["agent"] = create_agent()

        # Adding monitoring
        try:
            # log the existence of agent memory
            has_memory = hasattr(session_state["agent"], "memory")
            print(f"Agent has memory: {has_memory}")
            if has_memory:
                print(f"Memory type: {type(session_state['agent'].memory)}")

            messages.append(gr.ChatMessage(role="user", content=prompt))
            yield messages

            for msg in stream_to_gradio(session_state["agent"], task=prompt, reset_agent_memory=False):
                messages.append(msg)
                yield messages
            yield messages
        except Exception as e:
            print(f"Error in interaction: {str(e)}")
            raise

    def upload_file(
        self,
        file,
        file_uploads_log,
        allowed_file_types=[
            "application/pdf",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "text/plain",
        ],
    ):
        """
        Handle file uploads, default allowed types are .pdf, .docx, and .txt
        """
        if file is None:
            return gr.Textbox("No file uploaded", visible=True), file_uploads_log

        try:
            mime_type, _ = mimetypes.guess_type(file.name)
        except Exception as e:
            return gr.Textbox(f"Error: {e}", visible=True), file_uploads_log

        if mime_type not in allowed_file_types:
            return gr.Textbox("File type disallowed", visible=True), file_uploads_log

        # Sanitize file name
        original_name = os.path.basename(file.name)
        sanitized_name = re.sub(
            r"[^\w\-.]", "_", original_name
        )  # Replace any non-alphanumeric, non-dash, or non-dot characters with underscores

        type_to_ext = {}
        for ext, t in mimetypes.types_map.items():
            if t not in type_to_ext:
                type_to_ext[t] = ext

        # Ensure the extension correlates to the mime type
        sanitized_name = sanitized_name.split(".")[:-1]
        sanitized_name.append("" + type_to_ext[mime_type])
        sanitized_name = "".join(sanitized_name)

        # Save the uploaded file to the specified folder
        file_path = os.path.join(self.file_upload_folder, os.path.basename(sanitized_name))
        shutil.copy(file.name, file_path)

        return gr.Textbox(f"File uploaded: {file_path}", visible=True), file_uploads_log + [file_path]

    def log_user_message(self, text_input, file_uploads_log):
        return (
            text_input
            + (
                f"\nYou have been provided with these files, which might be helpful or not: {file_uploads_log}"
                if len(file_uploads_log) > 0
                else ""
            ),
            gr.Textbox(value="", interactive=False, placeholder="Please wait while Steps are getting populated"),
            gr.Button(interactive=False),
        )

    def detect_device(self, request: gr.Request):
        # Check whether the user device is a mobile or a computer
        if not request:
            return "Unknown device"
        # Method 1: Check sec-ch-ua-mobile header
        is_mobile_header = request.headers.get("sec-ch-ua-mobile")
        if is_mobile_header:
            return "Mobile" if "?1" in is_mobile_header else "Desktop"

        # Method 2: Check user-agent string
        user_agent = request.headers.get("user-agent", "").lower()
        mobile_keywords = ["android", "iphone", "ipad", "mobile", "phone"]

        if any(keyword in user_agent for keyword in mobile_keywords):
            return "Mobile"

        # Method 3: Check platform
        platform = request.headers.get("sec-ch-ua-platform", "").lower()
        if platform:
            if platform in ['"android"', '"ios"']:
                return "Mobile"
            elif platform in ['"windows"', '"macos"', '"linux"']:
                return "Desktop"

        # Default case if no clear indicators
        return "Desktop"

    def launch(self, **kwargs):
        with gr.Blocks(theme="ocean", fill_height=True) as demo:
            # Different layouts for mobile and computer devices
            @gr.render()
            def layout(request: gr.Request):
                device = self.detect_device(request)
                print(f"device - {device}")
                # Render layout with sidebar
                if device == "Desktop":
                    with gr.Blocks(fill_height=True) as sidebar_demo:
                        with gr.Sidebar():
                            gr.Markdown("""# open Deep Research - free the AI agents!

OpenAI just published [Deep Research](https://openai.com/index/introducing-deep-research/), a very nice assistant that can perform deep searches on the web to answer user questions.

However, their agent has a huge downside: it's not open. So we've started a 24-hour rush to replicate and open-source it. Our resulting [open-Deep-Research agent](https://github.com/huggingface/smolagents/tree/main/examples/open_deep_research) took the #1 rank of any open submission on the GAIA leaderboard! ✨

You can try a simplified version here.<br><br>""")
                            with gr.Group():
                                gr.Markdown("**Your request**", container=True)
                                text_input = gr.Textbox(lines=3, label="Your request", container=False, placeholder="Enter your prompt here and press Shift+Enter or press the button")
                                launch_research_btn = gr.Button("Run", variant="primary")

                            # If an upload folder is provided, enable the upload feature
                            if self.file_upload_folder is not None:
                                upload_file = gr.File(label="Upload a file")
                                upload_status = gr.Textbox(label="Upload Status", interactive=False, visible=False)
                                upload_file.change(
                                    self.upload_file,
                                    [upload_file, file_uploads_log],
                                    [upload_status, file_uploads_log],
                                )

                            gr.HTML("<br><br><h4><center>Powered by:</center></h4>")
                            with gr.Row():
                                gr.HTML("""<div style="display: flex; align-items: center; gap: 8px; font-family: system-ui, -apple-system, sans-serif;">
            <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png" style="width: 32px; height: 32px; object-fit: contain;" alt="logo">
            <a href="https://github.com/huggingface/smolagents"><b>huggingface/smolagents</b></a>
            </div>""")

                        # Add session state to store session-specific data
                        session_state = gr.State({})  # Initialize empty state for each session
                        stored_messages = gr.State([])
                        file_uploads_log = gr.State([])
                        chatbot = gr.Chatbot(
                            label="open-Deep-Research",
                            type="messages",
                            avatar_images=(
                                None,
                                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png",
                            ),
                            resizeable=False,
                            scale=1,
                            elem_id="my-chatbot",
                        )

                        text_input.submit(
                            self.log_user_message,
                            [text_input, file_uploads_log],
                            [stored_messages, text_input, launch_research_btn],
                        ).then(
                            self.interact_with_agent,
                            # Include session_state in function calls
                            [stored_messages, chatbot, session_state],
                            [chatbot],
                        ).then(
                            lambda: (gr.Textbox(interactive=True, placeholder="Enter your prompt here and press the button"), gr.Button(interactive=True)),
                            None,
                            [text_input, launch_research_btn],
                        )
                        launch_research_btn.click(
                            self.log_user_message,
                            [text_input, file_uploads_log],
                            [stored_messages, text_input, launch_research_btn],
                        ).then(
                            self.interact_with_agent,
                            # Include session_state in function calls
                            [stored_messages, chatbot, session_state],
                            [chatbot],
                        ).then(
                            lambda: (gr.Textbox(interactive=True, placeholder="Enter your prompt here and press the button"), gr.Button(interactive=True)),
                            None,
                            [text_input, launch_research_btn],
                        )

                # Render simple layout
                else:
                    with gr.Blocks(fill_height=True) as simple_demo:
                        gr.Markdown("""# open Deep Research - free the AI agents!
_Built with [smolagents](https://github.com/huggingface/smolagents)_

OpenAI just published [Deep Research](https://openai.com/index/introducing-deep-research/), a very nice assistant that can perform deep searches on the web to answer user questions.

However, their agent has a huge downside: it's not open. So we've started a 24-hour rush to replicate and open-source it. Our resulting [open-Deep-Research agent](https://github.com/huggingface/smolagents/tree/main/examples/open_deep_research) took the #1 rank of any open submission on the GAIA leaderboard! ✨

You can try a simplified version below (uses `Qwen-Coder-32B` instead of `o1`, so much less powerful than the original open-Deep-Research)👇""")
                        # Add session state to store session-specific data
                        session_state = gr.State({})  # Initialize empty state for each session
                        stored_messages = gr.State([])
                        file_uploads_log = gr.State([])
                        chatbot = gr.Chatbot(
                            label="open-Deep-Research",
                            type="messages",
                            avatar_images=(
                                None,
                                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png",
                            ),
                            resizeable=True,
                            scale=1,
                        )
                        # If an upload folder is provided, enable the upload feature
                        if self.file_upload_folder is not None:
                            upload_file = gr.File(label="Upload a file")
                            upload_status = gr.Textbox(label="Upload Status", interactive=False, visible=False)
                            upload_file.change(
                                self.upload_file,
                                [upload_file, file_uploads_log],
                                [upload_status, file_uploads_log],
                            )
                        text_input = gr.Textbox(lines=1, label="Your request", placeholder="Enter your prompt here and press the button")
                        launch_research_btn = gr.Button("Run", variant="primary")

                        text_input.submit(
                            self.log_user_message,
                            [text_input, file_uploads_log],
                            [stored_messages, text_input, launch_research_btn],
                        ).then(
                            self.interact_with_agent,
                            # Include session_state in function calls
                            [stored_messages, chatbot, session_state],
                            [chatbot],
                        ).then(
                            lambda: (gr.Textbox(interactive=True, placeholder="Enter your prompt here and press the button"), gr.Button(interactive=True)),
                            None,
                            [text_input, launch_research_btn],
                        )
                        launch_research_btn.click(
                            self.log_user_message,
                            [text_input, file_uploads_log],
                            [stored_messages, text_input, launch_research_btn],
                        ).then(
                            self.interact_with_agent,
                            # Include session_state in function calls
                            [stored_messages, chatbot, session_state],
                            [chatbot],
                        ).then(
                            lambda: (gr.Textbox(interactive=True, placeholder="Enter your prompt here and press the button"), gr.Button(interactive=True)),
                            None,
                            [text_input, launch_research_btn],
                        )

        demo.launch(debug=True, **kwargs)


GradioUI().launch()
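The `GoogleSearchTool` above posts the query to Serper's `https://google.serper.dev/search` endpoint and flattens the `organic` results into markdown snippets. A minimal standalone sketch of that same call pattern, assuming only a `SERPER_API_KEY` environment variable (the `serper_search` helper name is hypothetical, not part of the commit):

import json
import os

import requests

def serper_search(query: str, filter_year: int | None = None) -> list[dict]:
    """Sketch of the Serper call used by GoogleSearchTool.forward."""
    payload = {"q": query, "engine": "google", "google_domain": "google.com"}
    if filter_year is not None:
        # Serper forwards Google's custom date-range ("tbs") parameter.
        payload["tbs"] = f"cdr:1,cd_min:01/01/{filter_year},cd_max:12/31/{filter_year}"
    response = requests.post(
        "https://google.serper.dev/search",
        headers={"X-API-KEY": os.environ["SERPER_API_KEY"], "Content-Type": "application/json"},
        data=json.dumps(payload),
        timeout=30,
    )
    response.raise_for_status()
    # The tool only consumes the "organic" result list.
    return response.json().get("organic", [])

if __name__ == "__main__":
    for hit in serper_search("smolagents deep research", filter_year=2025)[:3]:
        print(hit.get("title"), "->", hit.get("link"))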
app.py
ADDED
@@ -0,0 +1,367 @@
import mimetypes
import os
import re
import shutil
import threading
from typing import Optional

import gradio as gr
from dotenv import load_dotenv
from huggingface_hub import login
from smolagents import (
    CodeAgent,
    HfApiModel,
)
from smolagents.agent_types import AgentText, AgentImage, AgentAudio
from smolagents.gradio_ui import pull_messages_from_step, handle_agent_output_types

from scripts.visual_qa import visualizer

AUTHORIZED_IMPORTS = [
    "requests",
    "zipfile",
    "os",
    "pandas",
    "numpy",
    "sympy",
    "json",
    "bs4",
    "pubchempy",
    "xml",
    "yahoo_finance",
    "Bio",
    "sklearn",
    "scipy",
    "pydub",
    "io",
    "PIL",
    "chess",
    "PyPDF2",
    "pptx",
    "torch",
    "datetime",
    "fractions",
    "csv",
]

load_dotenv(override=True)
login(os.getenv("HF_TOKEN"))

append_answer_lock = threading.Lock()

custom_role_conversions = {"tool-call": "assistant", "tool-response": "user"}

model = HfApiModel(
    custom_role_conversions=custom_role_conversions,
)


# Agent creation in a factory function
def create_agent():
    """Creates a fresh agent instance for each session"""
    return CodeAgent(
        model=model,
        tools=[visualizer],
        max_steps=10,
        verbosity_level=1,
        additional_authorized_imports=AUTHORIZED_IMPORTS,
        planning_interval=4,
    )


def stream_to_gradio(
    agent,
    task: str,
    reset_agent_memory: bool = False,
    additional_args: Optional[dict] = None,
):
    """Runs an agent with the given task and streams the messages from the agent as gradio ChatMessages."""
    for step_log in agent.run(task, stream=True, reset=reset_agent_memory, additional_args=additional_args):
        for message in pull_messages_from_step(
            step_log,
        ):
            yield message

    final_answer = step_log  # Last log is the run's final_answer
    final_answer = handle_agent_output_types(final_answer)

    if isinstance(final_answer, AgentText):
        yield gr.ChatMessage(
            role="assistant",
            content=f"**Final answer:**\n{final_answer.to_string()}\n",
        )
    elif isinstance(final_answer, AgentImage):
        yield gr.ChatMessage(
            role="assistant",
            content={"path": final_answer.to_string(), "mime_type": "image/png"},
        )
    elif isinstance(final_answer, AgentAudio):
        yield gr.ChatMessage(
            role="assistant",
            content={"path": final_answer.to_string(), "mime_type": "audio/wav"},
        )
    else:
        yield gr.ChatMessage(role="assistant", content=f"**Final answer:** {str(final_answer)}")


class GradioUI:
    """A one-line interface to launch your agent in Gradio"""

    def __init__(self, file_upload_folder: str | None = None):
        self.file_upload_folder = file_upload_folder
        if self.file_upload_folder is not None:
            if not os.path.exists(file_upload_folder):
                os.mkdir(file_upload_folder)

    def interact_with_agent(self, prompt, messages, session_state):
        # Get or create session-specific agent
        if "agent" not in session_state:
            session_state["agent"] = create_agent()

        # Adding monitoring
        try:
            # log the existence of agent memory
            has_memory = hasattr(session_state["agent"], "memory")
            print(f"Agent has memory: {has_memory}")
            if has_memory:
                print(f"Memory type: {type(session_state['agent'].memory)}")

            messages.append(gr.ChatMessage(role="user", content=prompt))
            yield messages

            for msg in stream_to_gradio(session_state["agent"], task=prompt, reset_agent_memory=False):
                messages.append(msg)
                yield messages
            yield messages
        except Exception as e:
            print(f"Error in interaction: {str(e)}")
            raise

    def upload_file(
        self,
        file,
        file_uploads_log,
        allowed_file_types=[
            "application/pdf",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "text/plain",
        ],
    ):
        """
        Handle file uploads, default allowed types are .pdf, .docx, and .txt
        """
        if file is None:
            return gr.Textbox("No file uploaded", visible=True), file_uploads_log

        try:
            mime_type, _ = mimetypes.guess_type(file.name)
        except Exception as e:
            return gr.Textbox(f"Error: {e}", visible=True), file_uploads_log

        if mime_type not in allowed_file_types:
            return gr.Textbox("File type disallowed", visible=True), file_uploads_log

        # Sanitize file name
        original_name = os.path.basename(file.name)
        sanitized_name = re.sub(
            r"[^\w\-.]", "_", original_name
        )  # Replace any non-alphanumeric, non-dash, or non-dot characters with underscores

        type_to_ext = {}
        for ext, t in mimetypes.types_map.items():
            if t not in type_to_ext:
                type_to_ext[t] = ext

        # Ensure the extension correlates to the mime type
        sanitized_name = sanitized_name.split(".")[:-1]
        sanitized_name.append("" + type_to_ext[mime_type])
        sanitized_name = "".join(sanitized_name)

        # Save the uploaded file to the specified folder
        file_path = os.path.join(self.file_upload_folder, os.path.basename(sanitized_name))
        shutil.copy(file.name, file_path)

        return gr.Textbox(f"File uploaded: {file_path}", visible=True), file_uploads_log + [file_path]

    def log_user_message(self, text_input, file_uploads_log):
        return (
            text_input
            + (
                f"\nYou have been provided with these files, which might be helpful or not: {file_uploads_log}"
                if len(file_uploads_log) > 0
                else ""
            ),
            gr.Textbox(value="", interactive=False, placeholder="Please wait while Steps are getting populated"),
            gr.Button(interactive=False),
        )

    def detect_device(self, request: gr.Request):
        # Check whether the user device is a mobile or a computer
        if not request:
            return "Unknown device"
        # Method 1: Check sec-ch-ua-mobile header
        is_mobile_header = request.headers.get("sec-ch-ua-mobile")
        if is_mobile_header:
            return "Mobile" if "?1" in is_mobile_header else "Desktop"

        # Method 2: Check user-agent string
        user_agent = request.headers.get("user-agent", "").lower()
        mobile_keywords = ["android", "iphone", "ipad", "mobile", "phone"]

        if any(keyword in user_agent for keyword in mobile_keywords):
            return "Mobile"

        # Method 3: Check platform
        platform = request.headers.get("sec-ch-ua-platform", "").lower()
        if platform:
            if platform in ['"android"', '"ios"']:
                return "Mobile"
            elif platform in ['"windows"', '"macos"', '"linux"']:
                return "Desktop"

        # Default case if no clear indicators
        return "Desktop"

    def launch(self, **kwargs):
        with gr.Blocks(theme="ocean", fill_height=True) as demo:
            # Different layouts for mobile and computer devices
            @gr.render()
            def layout(request: gr.Request):
                device = self.detect_device(request)
                print(f"device - {device}")
                # Render layout with sidebar
                if device == "Desktop":
                    with gr.Blocks(fill_height=True) as sidebar_demo:
                        with gr.Sidebar():
                            gr.Markdown("""# Cata Deep Research

OpenAI just published [Deep Research](https://openai.com/index/introducing-deep-research/), a very nice assistant that can perform deep searches on the web to answer user questions.

You can try a simplified version here.<br><br>""")
                            with gr.Group():
                                gr.Markdown("**Your request**", container=True)
                                text_input = gr.Textbox(lines=3, label="Your request", container=False,
                                                        placeholder="Enter your prompt here and press Shift+Enter or press the button")
                                launch_research_btn = gr.Button("Run", variant="primary")

                            # If an upload folder is provided, enable the upload feature
                            if self.file_upload_folder is not None:
                                upload_file = gr.File(label="Upload a file")
                                upload_status = gr.Textbox(label="Upload Status", interactive=False, visible=False)
                                upload_file.change(
                                    self.upload_file,
                                    [upload_file, file_uploads_log],
                                    [upload_status, file_uploads_log],
                                )

                            # gr.HTML("<br><br><h4><center>Powered by:</center></h4>")
                            # with gr.Row():
                            #     gr.HTML("""<div style="display: flex; align-items: center; gap: 8px; font-family: system-ui, -apple-system, sans-serif;">
                            #         <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png" style="width: 32px; height: 32px; object-fit: contain;" alt="logo">
                            #         <a href="https://github.com/huggingface/smolagents"><b>huggingface/smolagents</b></a>
                            #         </div>""")

                        # Add session state to store session-specific data
                        session_state = gr.State({})  # Initialize empty state for each session
                        stored_messages = gr.State([])
                        file_uploads_log = gr.State([])
                        chatbot = gr.Chatbot(
                            label="Cata-Deep-Research",
                            type="messages",
                            resizeable=False,
                            scale=1,
                            elem_id="my-chatbot",
                        )

                        text_input.submit(
                            self.log_user_message,
                            [text_input, file_uploads_log],
                            [stored_messages, text_input, launch_research_btn],
                        ).then(
                            self.interact_with_agent,
                            # Include session_state in function calls
                            [stored_messages, chatbot, session_state],
                            [chatbot],
                        ).then(
                            lambda: (
                                gr.Textbox(interactive=True, placeholder="Enter your prompt here and press the button"),
                                gr.Button(interactive=True)),
                            None,
                            [text_input, launch_research_btn],
                        )
                        launch_research_btn.click(
                            self.log_user_message,
                            [text_input, file_uploads_log],
                            [stored_messages, text_input, launch_research_btn],
                        ).then(
                            self.interact_with_agent,
                            # Include session_state in function calls
                            [stored_messages, chatbot, session_state],
                            [chatbot],
                        ).then(
                            lambda: (
                                gr.Textbox(interactive=True, placeholder="Enter your prompt here and press the button"),
                                gr.Button(interactive=True)),
                            None,
                            [text_input, launch_research_btn],
                        )

                # Render simple layout
                else:
                    with gr.Blocks(fill_height=True) as simple_demo:
                        gr.Markdown("""# Cata Deep Research
_Built with [smolagents](https://github.com/huggingface/smolagents)_

OpenAI just published [Deep Research](https://openai.com/index/introducing-deep-research/), a very nice assistant that can perform deep searches on the web to answer user questions.

You can try a simplified version below (uses `Qwen-Coder-32B` instead of `o1`, so much less powerful than the original open-Deep-Research)👇""")
                        # Add session state to store session-specific data
                        session_state = gr.State({})  # Initialize empty state for each session
                        stored_messages = gr.State([])
                        file_uploads_log = gr.State([])
                        chatbot = gr.Chatbot(
                            label="Cata-Deep-Research",
                            type="messages",
                            resizeable=True,
                            scale=1,
                        )
                        # If an upload folder is provided, enable the upload feature
                        if self.file_upload_folder is not None:
                            upload_file = gr.File(label="Upload a file")
                            upload_status = gr.Textbox(label="Upload Status", interactive=False, visible=False)
                            upload_file.change(
                                self.upload_file,
                                [upload_file, file_uploads_log],
                                [upload_status, file_uploads_log],
                            )
                        text_input = gr.Textbox(lines=1, label="Your request",
                                                placeholder="Enter your prompt here and press the button")
                        launch_research_btn = gr.Button("Run", variant="primary")

                        text_input.submit(
                            self.log_user_message,
                            [text_input, file_uploads_log],
                            [stored_messages, text_input, launch_research_btn],
                        ).then(
                            self.interact_with_agent,
                            # Include session_state in function calls
                            [stored_messages, chatbot, session_state],
                            [chatbot],
                        ).then(
                            lambda: (
                                gr.Textbox(interactive=True, placeholder="Enter your prompt here and press the button"),
                                gr.Button(interactive=True)),
                            None,
                            [text_input, launch_research_btn],
                        )
                        launch_research_btn.click(
                            self.log_user_message,
                            [text_input, file_uploads_log],
                            [stored_messages, text_input, launch_research_btn],
                        ).then(
                            self.interact_with_agent,
                            # Include session_state in function calls
                            [stored_messages, chatbot, session_state],
                            [chatbot],
                        ).then(
                            lambda: (
                                gr.Textbox(interactive=True, placeholder="Enter your prompt here and press the button"),
                                gr.Button(interactive=True)),
                            None,
                            [text_input, launch_research_btn],
                        )

        demo.launch(debug=True, **kwargs)


GradioUI().launch()
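The pattern worth noting in both app.bak and app.py is the per-session agent factory: each browser session gets its own `gr.State({})` dict, and `interact_with_agent` lazily calls `create_agent()` once per session, so conversation memory is isolated between visitors while persisting across turns within one session. A stripped-down sketch of that pattern (the `EchoAgent` class is a hypothetical stand-in for the real `CodeAgent`):

import gradio as gr

class EchoAgent:
    """Hypothetical stand-in for CodeAgent: keeps per-session history."""
    def __init__(self):
        self.history = []

    def run(self, task: str) -> str:
        self.history.append(task)
        return f"({len(self.history)} messages this session) {task}"

def create_agent():
    return EchoAgent()

def chat(prompt, messages, session_state):
    # Lazily create one agent per browser session, as in interact_with_agent.
    if "agent" not in session_state:
        session_state["agent"] = create_agent()
    return messages + [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": session_state["agent"].run(prompt)},
    ]

with gr.Blocks() as demo:
    session_state = gr.State({})  # Gradio copies this default per session
    chatbot = gr.Chatbot(type="messages")
    box = gr.Textbox()
    box.submit(chat, [box, chatbot, session_state], [chatbot])

if __name__ == "__main__":
    demo.launch()

Because `gr.State` defaults are copied per session, two concurrent visitors never share an agent, which is exactly what makes the module-level `model` safe to share while memory stays private.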
requirements.txt
ADDED
@@ -0,0 +1,41 @@
git+https://github.com/huggingface/smolagents.git@main#egg=smolagents
anthropic>=0.37.1
beautifulsoup4>=4.12.3
datasets>=2.21.0
google_search_results>=2.4.2
huggingface_hub>=0.23.4
mammoth>=1.8.0
markdownify>=0.13.1
numexpr>=2.10.1
numpy>=2.1.2
openai>=1.52.2
openpyxl
pandas>=2.2.3
pathvalidate>=3.2.1
pdfminer>=20191125
pdfminer.six>=20240706
Pillow>=11.0.0
puremagic>=1.28
pypdf>=5.1.0
python-dotenv>=1.0.1
python_pptx>=1.0.2
Requests>=2.32.3
serpapi>=0.1.5
tqdm>=4.66.4
torch>=2.2.2
torchvision>=0.17.2
transformers>=4.46.0
youtube_transcript_api>=0.6.2
chess
sympy
pubchempy
Bio
scikit-learn
scipy
pydub
PyPDF2
python-pptx
torch
xlrd
SpeechRecognition
litellm
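A quick way to spot-check that an environment satisfies a few of these minimums before launching the Space; a sketch using only the standard library (the distribution names come from the list above, and the check only prints rather than enforcing version comparisons):

from importlib.metadata import version, PackageNotFoundError

# Spot-check a few of the pinned minimums from requirements.txt.
MINIMUMS = {
    "transformers": "4.46.0",
    "huggingface_hub": "0.23.4",
    "python-dotenv": "1.0.1",
    "pypdf": "5.1.0",
}

for dist, minimum in MINIMUMS.items():
    try:
        installed = version(dist)
    except PackageNotFoundError:
        print(f"{dist}: MISSING")
        continue
    print(f"{dist}: {installed} (needs >= {minimum})")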
scripts/cookies.py
ADDED
@@ -0,0 +1,715 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from requests.cookies import RequestsCookieJar
|
2 |
+
|
3 |
+
|
4 |
+
COOKIES_LIST = [
|
5 |
+
{
|
6 |
+
"domain": ".youtube.com",
|
7 |
+
"expirationDate": 1718884961,
|
8 |
+
"hostOnly": False,
|
9 |
+
"httpOnly": False,
|
10 |
+
"name": "ST-xuwub9",
|
11 |
+
"path": "/",
|
12 |
+
"sameSite": None,
|
13 |
+
"secure": False,
|
14 |
+
"session": False,
|
15 |
+
"storeId": None,
|
16 |
+
"value": "session_logininfo=AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0%3AQUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"domain": ".youtube.com",
|
20 |
+
"expirationDate": 1753004444.745411,
|
21 |
+
"hostOnly": False,
|
22 |
+
"httpOnly": True,
|
23 |
+
"name": "__Secure-YEC",
|
24 |
+
"path": "/",
|
25 |
+
"sameSite": "lax",
|
26 |
+
"secure": True,
|
27 |
+
"session": False,
|
28 |
+
"storeId": None,
|
29 |
+
"value": "CgtRVnI5LW1zRHlQVSjbtNCzBjIhCgJGUhIbEhcSFRMLFBUWFwwYGRobHB0eHw4PIBAREiAk",
|
30 |
+
},
|
31 |
+
{
|
32 |
+
"domain": ".youtube.com",
|
33 |
+
"expirationDate": 1753434620.050824,
|
34 |
+
"hostOnly": False,
|
35 |
+
"httpOnly": True,
|
36 |
+
"name": "__Secure-3PSID",
|
37 |
+
"path": "/",
|
38 |
+
"sameSite": "no_restriction",
|
39 |
+
"secure": True,
|
40 |
+
"session": False,
|
41 |
+
"storeId": None,
|
42 |
+
"value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB4ezJ_bdWu46a7YwObVn44wACgYKAakSARQSFQHGX2MicJcTzecTKH6bHzqU6TMbTxoVAUF8yKqQYK-MoI6Ql3vI2oYTB3E-0076",
|
43 |
+
},
|
44 |
+
{
|
45 |
+
"domain": ".youtube.com",
|
46 |
+
"expirationDate": 1750420959.974642,
|
47 |
+
"hostOnly": False,
|
48 |
+
"httpOnly": False,
|
49 |
+
"name": "SIDCC",
|
50 |
+
"path": "/",
|
51 |
+
"sameSite": None,
|
52 |
+
"secure": False,
|
53 |
+
"session": False,
|
54 |
+
"storeId": None,
|
55 |
+
"value": "AKEyXzWQZauHKOo8t87zoEcjaVNIYUX54ohoWXT-tX4aAhEuZzIIptxZAcNkHuG2oDXYL6t-lw",
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"domain": ".youtube.com",
|
59 |
+
"expirationDate": 1753434620.050652,
|
60 |
+
"hostOnly": False,
|
61 |
+
"httpOnly": False,
|
62 |
+
"name": "SID",
|
63 |
+
"path": "/",
|
64 |
+
"sameSite": None,
|
65 |
+
"secure": False,
|
66 |
+
"session": False,
|
67 |
+
"storeId": None,
|
68 |
+
"value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB6VHrZcC3gBAsFPbCQ0gF5AACgYKAYkSARQSFQHGX2Mi9kt0gHg5CxCYSkLQGHWaeBoVAUF8yKre_V6r3jZVak6JV4o2Q0FL0076",
|
69 |
+
},
|
70 |
+
{
|
71 |
+
"domain": ".youtube.com",
|
72 |
+
"expirationDate": 1750420958.397534,
|
73 |
+
"hostOnly": False,
|
74 |
+
"httpOnly": True,
|
75 |
+
"name": "__Secure-1PSIDTS",
|
76 |
+
"path": "/",
|
77 |
+
"sameSite": None,
|
78 |
+
"secure": True,
|
79 |
+
"session": False,
|
80 |
+
"storeId": None,
|
81 |
+
"value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"domain": ".youtube.com",
|
85 |
+
"expirationDate": 1753433494.44729,
|
86 |
+
"hostOnly": False,
|
87 |
+
"httpOnly": False,
|
88 |
+
"name": "_ga_M0180HEFCY",
|
89 |
+
"path": "/",
|
90 |
+
"sameSite": None,
|
91 |
+
"secure": False,
|
92 |
+
"session": False,
|
93 |
+
"storeId": None,
|
94 |
+
"value": "GS1.1.1718871908.1.0.1718873494.0.0.0",
|
95 |
+
},
|
96 |
+
{
|
97 |
+
"domain": ".youtube.com",
|
98 |
+
"expirationDate": 1753434620.050933,
|
99 |
+
"hostOnly": False,
|
100 |
+
"httpOnly": False,
|
101 |
+
"name": "SAPISID",
|
102 |
+
"path": "/",
|
103 |
+
"sameSite": None,
|
104 |
+
"secure": True,
|
105 |
+
"session": False,
|
106 |
+
"storeId": None,
|
107 |
+
"value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"domain": ".youtube.com",
|
111 |
+
"expirationDate": 1750420959.974764,
|
112 |
+
"hostOnly": False,
|
113 |
+
"httpOnly": True,
|
114 |
+
"name": "__Secure-1PSIDCC",
|
115 |
+
"path": "/",
|
116 |
+
"sameSite": None,
|
117 |
+
"secure": True,
|
118 |
+
"session": False,
|
119 |
+
"storeId": None,
|
120 |
+
"value": "AKEyXzWHDSoXGCZpZhPxRrnC7B1s8zGIUjeMVyvgtQfsm1fs92lXPtFEI_td9LBUyqVUe0xK",
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"domain": ".youtube.com",
|
124 |
+
"expirationDate": 1753434620.050881,
|
125 |
+
"hostOnly": False,
|
126 |
+
"httpOnly": True,
|
127 |
+
"name": "SSID",
|
128 |
+
"path": "/",
|
129 |
+
"sameSite": None,
|
130 |
+
"secure": True,
|
131 |
+
"session": False,
|
132 |
+
"storeId": None,
|
133 |
+
"value": "AmlwXHnQvOQ10LVd-",
|
134 |
+
},
|
135 |
+
{
|
136 |
+
"domain": ".youtube.com",
|
137 |
+
"expirationDate": 1753434620.050959,
|
138 |
+
"hostOnly": False,
|
139 |
+
"httpOnly": False,
|
140 |
+
"name": "__Secure-1PAPISID",
|
141 |
+
"path": "/",
|
142 |
+
"sameSite": None,
|
143 |
+
"secure": True,
|
144 |
+
"session": False,
|
145 |
+
"storeId": None,
|
146 |
+
"value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"domain": ".youtube.com",
|
150 |
+
"expirationDate": 1753434620.050795,
|
151 |
+
"hostOnly": False,
|
152 |
+
"httpOnly": True,
|
153 |
+
"name": "__Secure-1PSID",
|
154 |
+
"path": "/",
|
155 |
+
"sameSite": None,
|
156 |
+
"secure": True,
|
157 |
+
"session": False,
|
158 |
+
"storeId": None,
|
159 |
+
"value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBBrlk7lRpKQGywAHEon7WGQAACgYKAQsSARQSFQHGX2MirAmnSRdZl6GPG6KLd4hOihoVAUF8yKoV17Tcj1a_OenIOkf2wBjO0076",
|
160 |
+
},
|
161 |
+
{
|
162 |
+
"domain": ".youtube.com",
|
163 |
+
"expirationDate": 1753434620.050993,
|
164 |
+
"hostOnly": False,
|
165 |
+
"httpOnly": False,
|
166 |
+
"name": "__Secure-3PAPISID",
|
167 |
+
"path": "/",
|
168 |
+
"sameSite": "no_restriction",
|
169 |
+
"secure": True,
|
170 |
+
"session": False,
|
171 |
+
"storeId": None,
|
172 |
+
"value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
|
173 |
+
},
|
174 |
+
{
|
175 |
+
"domain": ".youtube.com",
|
176 |
+
"expirationDate": 1750420959.974815,
|
177 |
+
"hostOnly": False,
|
178 |
+
"httpOnly": True,
|
179 |
+
"name": "__Secure-3PSIDCC",
|
180 |
+
"path": "/",
|
181 |
+
"sameSite": "no_restriction",
|
182 |
+
"secure": True,
|
183 |
+
"session": False,
|
184 |
+
"storeId": None,
|
185 |
+
"value": "AKEyXzXM5UjKUEXwSHVmRAIo6hGHA4G63adj3EE1VdNriD0f38jZQbsUKiD4LQbA3BValmTFDg",
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"domain": ".youtube.com",
|
189 |
+
"expirationDate": 1750420958.397647,
|
190 |
+
"hostOnly": False,
|
191 |
+
"httpOnly": True,
|
192 |
+
"name": "__Secure-3PSIDTS",
|
193 |
+
"path": "/",
|
194 |
+
"sameSite": "no_restriction",
|
195 |
+
"secure": True,
|
196 |
+
"session": False,
|
197 |
+
"storeId": None,
|
198 |
+
"value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"domain": ".youtube.com",
|
202 |
+
"expirationDate": 1753434620.050908,
|
203 |
+
"hostOnly": False,
|
204 |
+
"httpOnly": False,
|
205 |
+
"name": "APISID",
|
206 |
+
"path": "/",
|
207 |
+
"sameSite": None,
|
208 |
+
"secure": False,
|
209 |
+
"session": False,
|
210 |
+
"storeId": None,
|
211 |
+
"value": "IlQWLPjdNqziwCrV/ANG7Z4x5FF-IBxbZk",
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"domain": ".youtube.com",
|
215 |
+
"expirationDate": 1753434620.050855,
|
216 |
+
"hostOnly": False,
|
217 |
+
"httpOnly": True,
|
218 |
+
"name": "HSID",
|
219 |
+
"path": "/",
|
220 |
+
"sameSite": None,
|
221 |
+
"secure": False,
|
222 |
+
"session": False,
|
223 |
+
"storeId": None,
|
224 |
+
"value": "AasA7hmRuTFv7vjoq",
|
225 |
+
},
|
226 |
+
{
|
227 |
+
"domain": ".youtube.com",
|
228 |
+
"expirationDate": 1753435873.577793,
|
229 |
+
"hostOnly": False,
|
230 |
+
"httpOnly": True,
|
231 |
+
"name": "LOGIN_INFO",
|
232 |
+
"path": "/",
|
233 |
+
"sameSite": "no_restriction",
|
234 |
+
"secure": True,
|
235 |
+
"session": False,
|
236 |
+
"storeId": None,
|
237 |
+
"value": "AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0:QUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
|
238 |
+
},
|
239 |
+
{
|
240 |
+
"domain": ".youtube.com",
|
241 |
+
"expirationDate": 1753444956.555608,
|
242 |
+
"hostOnly": False,
|
243 |
+
"httpOnly": False,
|
244 |
+
"name": "PREF",
|
245 |
+
"path": "/",
|
246 |
+
"sameSite": None,
|
247 |
+
"secure": True,
|
248 |
+
"session": False,
|
249 |
+
"storeId": None,
|
250 |
+
"value": "f4=4000000&f6=40000000&tz=Europe.Paris&f5=30000&f7=100",
|
251 |
+
},
|
252 |
+
]
|
253 |
+
|
254 |
+
COOKIES_LIST += [
|
255 |
+
{
|
256 |
+
"domain": ".www.researchgate.net",
|
257 |
+
"hostOnly": False,
|
258 |
+
"httpOnly": True,
|
259 |
+
"name": "isInstIp",
|
260 |
+
"path": "/",
|
261 |
+
"sameSite": None,
|
262 |
+
"secure": True,
|
263 |
+
"session": True,
|
264 |
+
"storeId": None,
|
265 |
+
"value": "False",
|
266 |
+
},
|
267 |
+
{
|
268 |
+
"domain": ".researchgate.net",
|
269 |
+
"expirationDate": 1734423981,
|
270 |
+
"hostOnly": False,
|
271 |
+
"httpOnly": False,
|
272 |
+
"name": "__eoi",
        "path": "/",
        "sameSite": None,
        "secure": False,
        "session": False,
        "storeId": None,
        "value": "ID=c26f752377373146:T=1718871981:RT=1718884914:S=AA-AfjZw-T_OOX2kW2LLaFzXImgc",
    },
    {
        "domain": ".www.researchgate.net",
        "expirationDate": 1753444909.646103,
        "hostOnly": False,
        "httpOnly": True,
        "name": "ptc",
        "path": "/",
        "sameSite": None,
        "secure": True,
        "session": False,
        "storeId": None,
        "value": "RG1.8947708639250500550.1718872043",
    },
    {
        "domain": ".researchgate.net",
        "expirationDate": 1750507578,
        "hostOnly": False,
        "httpOnly": False,
        "name": "euconsent-v2-didomi",
        "path": "/",
        "sameSite": "lax",
        "secure": True,
        "session": False,
        "storeId": None,
        "value": "CQAgmoAQAgmoAAHABBENA5EsAP_gAEPgAAYgJ2pB5G5UTWlBIG53YMskIAUFhFBoQEAgAACAAwIBSBIAIIwEAGAAIAgAICACAAIAIBIAIABAGAAAAAAAYIAAIAAIAAAQIAAKIAAAAAAAAgBQAAgIAgggEAAAgEBEABAAgAAAEIIAQNgACgAAACCAAAAAAAABAAAAAAAAQAAAAAAAYCQAAAJIAAAAACAIABAIAAAAAAAAAAAAAAAABBAAIJ2wPIAFAAXABQAFQALgAcAA8ACAAEgALwAZAA0ACIAEcAJgAUgAqgBcADEAGgAPQAfgBEACOAE4AMMAZYA0QBsgDkAHOAO4AfsBBwEIAItARwBHQC6gHUAO2Ae0A_4CHQEXgJ2AUOAo8BT4CpQFqALYAXmAwQBkgDLAGXANjAhCBG8CbAE3gJ1gTtAA.f_wACHwAAAAA",
    },
    {
        "domain": ".researchgate.net",
        "expirationDate": 1718885236,
        "hostOnly": False,
        "httpOnly": False,
        "name": "_gat",
        "path": "/",
        "sameSite": None,
        "secure": False,
        "session": False,
        "storeId": None,
        "value": "1",
    },
    {
        "domain": "www.researchgate.net",
        "expirationDate": 1721477183,
        "hostOnly": True,
        "httpOnly": False,
        "name": "_pbjs_userid_consent_data",
        "path": "/",
        "sameSite": "lax",
        "secure": False,
        "session": False,
        "storeId": None,
        "value": "3524755945110770",
    },
    {
        "domain": ".researchgate.net",
        "expirationDate": 1752567981,
        "hostOnly": False,
        "httpOnly": False,
        "name": "__gads",
        "path": "/",
        "sameSite": None,
        "secure": False,
        "session": False,
        "storeId": None,
        "value": "ID=eca2adb88969c830:T=1718871981:RT=1718884914:S=ALNI_MY2qZchynrhWX6hWMlaI87Pcj9riQ",
    },
    {
        "domain": ".researchgate.net",
        "expirationDate": 1718886709.646173,
        "hostOnly": False,
        "httpOnly": True,
        "name": "__cf_bm",
        "path": "/",
        "sameSite": "no_restriction",
        "secure": True,
        "session": False,
        "storeId": None,
        "value": "IkQ_J4ciBzKQduRvjqsfSmQu8UygDWbHeROO5JVccfo-1718884909-1.0.1.1-qvNGEdbfI0HfhFP6kwe7R7mkTqODNhFuKhs72lLly6K2BOPMG3kbahpQFGvPK0U8FUfkznkq65gngd1sWj7sDA",
    },
    {
        "domain": ".researchgate.net",
        "expirationDate": 1752567981,
        "hostOnly": False,
        "httpOnly": False,
        "name": "__gpi",
        "path": "/",
        "sameSite": None,
        "secure": False,
        "session": False,
        "storeId": None,
        "value": "UID=00000e4e9aa2e6f2:T=1718871981:RT=1718884914:S=ALNI_MYFNrgzkKn7K6Bd2y8hC6GJCvDiSg",
    },
    {
        "domain": ".researchgate.net",
        "hostOnly": False,
        "httpOnly": True,
        "name": "_cfuvid",
        "path": "/",
        "sameSite": "no_restriction",
        "secure": True,
        "session": True,
        "storeId": None,
        "value": "_GPmGZkBymiH3UiqTqzakEpi98br3nfFUWC2_u_wqkc-1718884909785-0.0.1.1-604800000",
    },
    {
        "domain": ".researchgate.net",
        "expirationDate": 1753445177.271667,
        "hostOnly": False,
        "httpOnly": False,
        "name": "_ga",
        "path": "/",
        "sameSite": None,
        "secure": False,
        "session": False,
        "storeId": None,
        "value": "GA1.1.1525244793.1718885177",
    },
    {
        "domain": ".researchgate.net",
        "expirationDate": 1753445177.271482,
        "hostOnly": False,
        "httpOnly": False,
        "name": "_ga_4P31SJ70EJ",
        "path": "/",
        "sameSite": None,
        "secure": False,
        "session": False,
        "storeId": None,
        "value": "GS1.1.1718885177.1.0.1718885177.0.0.0",
    },
    {
        "domain": ".researchgate.net",
        "expirationDate": 1718971576,
        "hostOnly": False,
        "httpOnly": False,
        "name": "_gid",
        "path": "/",
        "sameSite": None,
        "secure": False,
        "session": False,
        "storeId": None,
        "value": "GA1.2.854907463.1718885177",
    },
    {
        "domain": ".www.researchgate.net",
        "expirationDate": 1750407982.506505,
        "hostOnly": False,
        "httpOnly": True,
        "name": "did",
        "path": "/",
        "sameSite": None,
        "secure": True,
        "session": False,
        "storeId": None,
        "value": "1dWLO3C6am8l667Q4VUlBo0O1LI49Qi2Vw21SJEXHavBDYT56DI9007W5rYGVFVH",
    },
    {
        "domain": ".researchgate.net",
        "expirationDate": 1750507578,
        "hostOnly": False,
        "httpOnly": False,
        "name": "didomi_token",
        "path": "/",
        "sameSite": "lax",
        "secure": True,
        "session": False,
        "storeId": None,
        "value": "eyJ1c2VyX2lkIjoiMTkwMzU4YTUtNWU2My02Y2UzLWJlNzAtZGFjNzVmYjdiY2ExIiwiY3JlYXRlZCI6IjIwMjQtMDYtMjBUMTI6MDY6MTYuODA2WiIsInVwZGF0ZWQiOiIyMDI0LTA2LTIwVDEyOjA2OjE4Ljc4MVoiLCJ2ZW5kb3JzIjp7ImVuYWJsZWQiOlsidHdpdHRlciIsImdvb2dsZSIsImM6bGlua2VkaW4tbWFya2V0aW5nLXNvbHV0aW9ucyIsImM6b3duZXJpcSIsImM6b21uaXR1cmUtYWRvYmUtYW5hbHl0aWNzIiwiYzp0ZWNobm9yYXRpLW1lZGlhIiwiYzppbnRlcmNvbSIsImM6aW50ZW50LWlxIiwiYzppcHJvbSIsImM6bGlua2VkaW4iLCJjOmFtYXpvbmFkdi16Y1hGTEI2WCIsImM6bWVkaWFuZXQtY1V3YUtFNnoiLCJjOmluZGV4ZXhjaC1OWkNRTTY4UCIsImM6emVvdGFwZ21iLWQ3YndtdGp3IiwiYzp0cmlwbGVsaWYtZGRKSDM0clkiLCJjOnJ0YmhvdXNlLWI4Y2RIOHRNIiwiYzptZHByaW1pcy1lYU4yOVdjUCIsImM6bG9vcG1lbGktVGRhWXRCUHEiLCJjOm1hZ25pdGVpbi05d1RZTHFSRCIsImM6Ymlkc3dpdGNoLWQ2N0V3N1c5IiwiYzpvcmFjbGVhZHYtcUhlREptQUwiLCJjOmdvb2dsZWFuYS00VFhuSmlnUiIsImM6bG90YW1lc29sLURIaTdMUmpNIiwiYzpuZXh0bWlsbGUtR0pyZlg4VWMiLCJjOm5yaWNodGVjLXFVVlEyUlFxIiwiYzpicml0ZXBvb2wtQldWeVdHeVUiLCJjOnRhcGFkaW5jLXFxY2tVN1BXIiwiYzppZDV0ZWNobi16Tk1KNGR3ZiIsImM6bWljcm9zb2Z0IiwiYzpwZXJtdXRpdmUtSjdpaHJlTWsiLCJjOm9wZXJhc29mdC1CY1hjRFZKTSIsImM6cG9zdGhvZy1Cakp4RmRGOSJdfSwicHVycG9zZXMiOnsiZW5hYmxlZCI6WyJnZW9sb2NhdGlvbl9kYXRhIiwiZGV2aWNlX2NoYXJhY3RlcmlzdGljcyJdfSwidmVuZG9yc19saSI6eyJlbmFibGVkIjpbImdvb2dsZSIsImM6b3BlcmFzb2Z0LUJjWGNEVkpNIl19LCJ2ZXJzaW9uIjoyLCJhYyI6IkRIU0FvQUZrQWNnQTVnSHFnUUhBeGdCNndEMTRJR0FRTkFqMEJJd0NTY0VyQUtCd1YtZ3MxQmgwREc0R09nQUEuREhTQW9BRmtBY2dBNWdIcWdRSEF4Z0I2d0QxNElHQVFOQWowQkl3Q1NjRXJBS0J3Vi1nczFCaDBERzRHT2dBQSJ9",
    },
    {
        "domain": ".www.researchgate.net",
        "hostOnly": False,
        "httpOnly": True,
        "name": "hasPdpNext",
        "path": "/",
        "sameSite": None,
        "secure": True,
        "session": True,
        "storeId": None,
        "value": "False",
    },
    {
        "domain": ".researchgate.net",
        "expirationDate": 1750421183,
        "hostOnly": False,
        "httpOnly": False,
        "name": "ph_phc_ma1XTQyee96N1GML6qUTgLQRiDifnRcE9STiHTZ0CfZ_posthog",
        "path": "/",
        "sameSite": "lax",
        "secure": True,
        "session": False,
        "storeId": None,
        "value": "%7B%22distinct_id%22%3A%220190358a-56a1-7313-83b0-d13dddeac787%22%2C%22%24sesid%22%3A%5B1718885183223%2C%220190358a-56a1-7313-83b0-d13b2b87778d%22%2C1718885176993%5D%2C%22%24session_is_sampled%22%3Atrue%7D",
    },
    {
        "domain": ".www.researchgate.net",
        "hostOnly": False,
        "httpOnly": True,
        "name": "sid",
        "path": "/",
        "sameSite": None,
        "secure": True,
        "session": True,
        "storeId": None,
        "value": "qmH5Lc4f0CUJ3zeaxORcV0S8I8V1MuCFZtcIQqPYtv1XPejrbSLAQRbT50PL40TqeKQ1XsQDWt9gtYVzuL80bRmPjw6jn3cQ0ikNqW40maHcQ3JL2Vfa8ZZf0j7p35eJ",
    },
]

COOKIES_LIST += [
    {
        "domain": "github.com",
        "hostOnly": True,
        "httpOnly": True,
        "name": "_gh_sess",
        "path": "/",
        "sameSite": "lax",
        "secure": True,
        "session": True,
        "storeId": None,
        "value": "P%2Fmof1avuqwHaUQUIJR%2FZYn7jqbT7lgGuTGjp1BGAFIG5UpNDusEE3b8dRjz0eATE5xPdPjLYFqMs%2FI9AOalKX4YuYfSEEnxCMawU01099b4o9Xzzcv%2BmecrmO0Q8q%2Bdq1h8SIv6nvPP7HzlFesl8ysafb9b%2F0q6dTArKdSOurasza8UgLSYD08ofA50Pcm0IG7CTzF8ZCizrGgGTMi%2F%2B7L3E17jav5PM1Sf2vQKg15Gbg1QIOppJJHzlufgQoZigqFv%2BWznaws0Tt7Y2lSFCw%3D%3D--CJRhqMXJnwOaJgk4--DhUErlL4GdROikEjKD4O9g%3D%3D",
    },
    {
        "domain": ".github.com",
        "expirationDate": 1750408875.763785,
        "hostOnly": False,
        "httpOnly": False,
        "name": "_octo",
        "path": "/",
        "sameSite": "lax",
        "secure": True,
        "session": False,
        "storeId": None,
        "value": "GH1.1.728652011.1718872875",
    },
    {
        "domain": ".github.com",
        "expirationDate": 1750408875.763926,
        "hostOnly": False,
        "httpOnly": True,
        "name": "logged_in",
        "path": "/",
        "sameSite": "lax",
        "secure": True,
        "session": False,
        "storeId": None,
        "value": "no",
    },
    {
        "domain": ".github.com",
        "hostOnly": False,
        "httpOnly": False,
        "name": "preferred_color_mode",
        "path": "/",
        "sameSite": "lax",
        "secure": True,
        "session": True,
        "storeId": None,
        "value": "dark",
    },
    {
        "domain": ".github.com",
        "hostOnly": False,
        "httpOnly": False,
        "name": "tz",
        "path": "/",
        "sameSite": "lax",
        "secure": True,
        "session": True,
        "storeId": None,
        "value": "Europe%2FParis",
    },
]

COOKIES_LIST += [
    {
        "domain": ".web.archive.org",
        "expirationDate": 1718886430,
        "hostOnly": False,
        "httpOnly": False,
        "name": "_gat",
        "path": "/web/20201123221659/http://orcid.org/",
        "sameSite": None,
        "secure": False,
        "session": False,
        "storeId": None,
        "value": "1",
    },
    {
        "domain": ".web.archive.org",
        "expirationDate": 1718972770,
        "hostOnly": False,
        "httpOnly": False,
        "name": "_gid",
        "path": "/web/20201123221659/http://orcid.org/",
        "sameSite": None,
        "secure": False,
        "session": False,
        "storeId": None,
        "value": "GA1.2.402246368.1606169825",
    },
    {
        "domain": ".web.archive.org",
        "expirationDate": 1753446370.315621,
        "hostOnly": False,
        "httpOnly": False,
        "name": "_ga",
        "path": "/web/20201123221659/http://orcid.org/",
        "sameSite": None,
        "secure": False,
        "session": False,
        "storeId": None,
        "value": "GA1.2.1301409987.1606169825",
    },
    {
        "domain": ".web.archive.org",
        "expirationDate": 1750422367,
        "hostOnly": False,
        "httpOnly": False,
        "name": "_hjid",
        "path": "/web/20201123221659/http://orcid.org/",
        "sameSite": "lax",
        "secure": False,
        "session": False,
        "storeId": None,
        "value": "07f80263-a631-4bf4-8ffd-8fc8912085e2",
    },
    {
        "domain": ".web.archive.org",
        "expirationDate": 1718888167,
        "hostOnly": False,
        "httpOnly": False,
        "name": "_hjFirstSeen",
        "path": "/web/20201123221659/http://orcid.org/",
        "sameSite": "lax",
        "secure": False,
        "session": False,
        "storeId": None,
        "value": "1",
    },
]
COOKIES_LIST += [
    {
        "domain": "orcid.org",
        "hostOnly": True,
        "httpOnly": False,
        "name": "AWSELBCORS",
        "path": "/",
        "sameSite": "no_restriction",
        "secure": True,
        "session": True,
        "storeId": None,
        "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
    },
    {
        "domain": ".orcid.org",
        "expirationDate": 1753452454.637671,
        "hostOnly": False,
        "httpOnly": False,
        "name": "_ga_9R61FWK9H5",
        "path": "/",
        "sameSite": None,
        "secure": False,
        "session": False,
        "storeId": None,
        "value": "GS1.1.1718892454.1.0.1718892454.0.0.0",
    },
    {
        "domain": ".orcid.org",
        "expirationDate": 1753452454.63421,
        "hostOnly": False,
        "httpOnly": False,
        "name": "_ga",
        "path": "/",
        "sameSite": None,
        "secure": False,
        "session": False,
        "storeId": None,
        "value": "GA1.1.2021310691.1718892455",
    },
    {
        "domain": "orcid.org",
        "hostOnly": True,
        "httpOnly": False,
        "name": "AWSELB",
        "path": "/",
        "sameSite": None,
        "secure": False,
        "session": True,
        "storeId": None,
        "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
    },
    {
        "domain": ".orcid.org",
        "expirationDate": 1750428454,
        "hostOnly": False,
        "httpOnly": False,
        "name": "OptanonAlertBoxClosed",
        "path": "/",
        "sameSite": "lax",
        "secure": False,
        "session": False,
        "storeId": None,
        "value": "2024-06-20T14:07:34.583Z",
    },
    {
        "domain": ".orcid.org",
        "expirationDate": 1750428454,
        "hostOnly": False,
        "httpOnly": False,
        "name": "OptanonConsent",
        "path": "/",
        "sameSite": "lax",
        "secure": False,
        "session": False,
        "storeId": None,
        "value": "isGpcEnabled=0&datestamp=Thu+Jun+20+2024+16%3A07%3A34+GMT%2B0200+(heure+d%E2%80%99%C3%A9t%C3%A9+d%E2%80%99Europe+centrale)&version=202310.2.0&browserGpcFlag=0&isIABGlobal=False&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0004%3A1",
    },
    {
        "domain": "orcid.org",
        "hostOnly": True,
        "httpOnly": False,
        "name": "XSRF-TOKEN",
        "path": "/",
        "sameSite": None,
        "secure": True,
        "session": True,
        "storeId": None,
        "value": "6957be7a-bcb4-4d59-a522-ea9b6b210ed9",
    },
]

# Create a RequestsCookieJar instance
COOKIES = RequestsCookieJar()

# Add cookies to the jar
for cookie in COOKIES_LIST:
    COOKIES.set(cookie["name"], cookie["value"], domain=cookie["domain"], path=cookie["path"])
scripts/gaia_scorer.py
ADDED
@@ -0,0 +1,124 @@
import re
import string
import warnings


def normalize_number_str(number_str: str) -> float:
    # we replace these common units and commas to allow
    # conversion to float
    for char in ["$", "%", ","]:
        number_str = number_str.replace(char, "")
    try:
        return float(number_str)
    except ValueError:
        print(f"String {number_str} cannot be normalized to number str.")
        return float("inf")


def split_string(
    s: str,
    char_list: list[str] = [",", ";"],
) -> list[str]:
    pattern = f"[{''.join(char_list)}]"
    return re.split(pattern, s)


def is_float(element: any) -> bool:
    try:
        float(element)
        return True
    except ValueError:
        return False


def question_scorer(
    model_answer: str,
    ground_truth: str,
) -> bool:
    # if gt is a number
    if is_float(ground_truth):
        normalized_answer = normalize_number_str(str(model_answer))
        return normalized_answer == float(ground_truth)

    # if gt is a list
    elif any(char in ground_truth for char in [",", ";"]):
        # question with the fish: normalization removes punct

        gt_elems = split_string(ground_truth)
        ma_elems = split_string(model_answer)

        # check length is the same
        if len(gt_elems) != len(ma_elems):
            warnings.warn("Answer lists have different lengths, returning False.", UserWarning)
            return False

        # compare each element as float or str
        comparisons = []
        for ma_elem, gt_elem in zip(ma_elems, gt_elems):
            if is_float(gt_elem):
                normalized_ma_elem = normalize_number_str(ma_elem)
                comparisons.append(normalized_ma_elem == float(gt_elem))
            else:
                # we do not remove punct since comparisons can include punct
                comparisons.append(
                    normalize_str(ma_elem, remove_punct=False) == normalize_str(gt_elem, remove_punct=False)
                )
        return all(comparisons)

    # if gt is a str
    else:
        return normalize_str(model_answer) == normalize_str(ground_truth)


def check_prediction_contains_answer_letters_in_order(prediction, true_answer):
    prediction = prediction.lower()
    true_answer = true_answer.lower()
    if len(prediction) > len(true_answer) * 3:
        return False
    i = 0
    for letter in true_answer:
        if letter in prediction[i:]:
            i += prediction[i:].index(letter)
        else:
            return False
    return True


def check_close_call(prediction, true_answer, is_correct):
    if is_correct:
        return True
    else:
        if is_float(true_answer):
            return is_correct
        else:
            if (
                check_prediction_contains_answer_letters_in_order(str(prediction), str(true_answer))
                and len(str(true_answer)) * 0.5 <= len(str(prediction)) <= len(str(true_answer)) * 2
            ):
                print(f"Close call: {prediction} vs {true_answer}")
                return True
            else:
                return False


def normalize_str(input_str, remove_punct=True) -> str:
    """
    Normalize a string by:
    - Removing all white spaces
    - Optionally removing punctuation (if remove_punct is True)
    - Converting to lowercase
    Parameters:
    - input_str: str, the string to normalize
    - remove_punct: bool, whether to remove punctuation (default: True)
    Returns:
    - str, the normalized string
    """
    # Remove all white spaces. Required e.g for seagull vs. sea gull
    no_spaces = re.sub(r"\s", "", input_str)

    # Remove punctuation, if specified.
    if remove_punct:
        translator = str.maketrans("", "", string.punctuation)
        return no_spaces.lower().translate(translator)
    else:
        return no_spaces.lower()
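
A quick illustration of the scoring rules implemented above: numeric ground truths are compared after stripping "$", "%", and ","; list ground truths (containing "," or ";") are compared element-wise; plain strings are compared after removing whitespace, punctuation, and case. A hypothetical driver, not part of the commit:

from scripts.gaia_scorer import question_scorer  # assumed import path

print(question_scorer("$1,234", "1234"))       # True: "$" and "," are stripped before the float comparison
print(question_scorer("Sea gull", "seagull"))  # True: whitespace and case are normalized away
print(question_scorer("3, 4", "3;4"))          # True: both sides split on "," or ";" and match element-wise
print(question_scorer("3, 4, 5", "3;4"))       # False: a length mismatch warns and returns False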
scripts/mdconvert.py
ADDED
@@ -0,0 +1,949 @@
1 |
+
# This is copied from Magentic-one's great repo: https://github.com/microsoft/autogen/blob/v0.4.4/python/packages/autogen-magentic-one/src/autogen_magentic_one/markdown_browser/mdconvert.py
|
2 |
+
# Thanks to Microsoft researchers for open-sourcing this!
|
3 |
+
# type: ignore
|
4 |
+
import base64
|
5 |
+
import copy
|
6 |
+
import html
|
7 |
+
import json
|
8 |
+
import mimetypes
|
9 |
+
import os
|
10 |
+
import re
|
11 |
+
import shutil
|
12 |
+
import subprocess
|
13 |
+
import sys
|
14 |
+
import tempfile
|
15 |
+
import traceback
|
16 |
+
from typing import Any, Dict, List, Optional, Union
|
17 |
+
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
18 |
+
|
19 |
+
import mammoth
|
20 |
+
import markdownify
|
21 |
+
import pandas as pd
|
22 |
+
import pdfminer
|
23 |
+
import pdfminer.high_level
|
24 |
+
import pptx
|
25 |
+
|
26 |
+
# File-format detection
|
27 |
+
import puremagic
|
28 |
+
import pydub
|
29 |
+
import requests
|
30 |
+
import speech_recognition as sr
|
31 |
+
from bs4 import BeautifulSoup
|
32 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
33 |
+
from youtube_transcript_api.formatters import SRTFormatter
|
34 |
+
|
35 |
+
|
36 |
+
class _CustomMarkdownify(markdownify.MarkdownConverter):
|
37 |
+
"""
|
38 |
+
A custom version of markdownify's MarkdownConverter. Changes include:
|
39 |
+
|
40 |
+
- Altering the default heading style to use '#', '##', etc.
|
41 |
+
- Removing javascript hyperlinks.
|
42 |
+
- Truncating images with large data:uri sources.
|
43 |
+
- Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
|
44 |
+
"""
|
45 |
+
|
46 |
+
def __init__(self, **options: Any):
|
47 |
+
options["heading_style"] = options.get("heading_style", markdownify.ATX)
|
48 |
+
# Explicitly cast options to the expected type if necessary
|
49 |
+
super().__init__(**options)
|
50 |
+
|
51 |
+
def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
|
52 |
+
"""Same as usual, but be sure to start with a new line"""
|
53 |
+
if not convert_as_inline:
|
54 |
+
if not re.search(r"^\n", text):
|
55 |
+
return "\n" + super().convert_hn(n, el, text, convert_as_inline) # type: ignore
|
56 |
+
|
57 |
+
return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
|
58 |
+
|
59 |
+
def convert_a(self, el: Any, text: str, convert_as_inline: bool):
|
60 |
+
"""Same as usual converter, but removes Javascript links and escapes URIs."""
|
61 |
+
prefix, suffix, text = markdownify.chomp(text) # type: ignore
|
62 |
+
if not text:
|
63 |
+
return ""
|
64 |
+
href = el.get("href")
|
65 |
+
title = el.get("title")
|
66 |
+
|
67 |
+
# Escape URIs and skip non-http or file schemes
|
68 |
+
if href:
|
69 |
+
try:
|
70 |
+
parsed_url = urlparse(href) # type: ignore
|
71 |
+
if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore
|
72 |
+
return "%s%s%s" % (prefix, text, suffix)
|
73 |
+
href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore
|
74 |
+
except ValueError: # It's not clear if this ever gets thrown
|
75 |
+
return "%s%s%s" % (prefix, text, suffix)
|
76 |
+
|
77 |
+
# For the replacement see #29: text nodes underscores are escaped
|
78 |
+
if (
|
79 |
+
self.options["autolinks"]
|
80 |
+
and text.replace(r"\_", "_") == href
|
81 |
+
and not title
|
82 |
+
and not self.options["default_title"]
|
83 |
+
):
|
84 |
+
# Shortcut syntax
|
85 |
+
return "<%s>" % href
|
86 |
+
if self.options["default_title"] and not title:
|
87 |
+
title = href
|
88 |
+
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
|
89 |
+
return "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix) if href else text
|
90 |
+
|
91 |
+
def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
|
92 |
+
"""Same as usual converter, but removes data URIs"""
|
93 |
+
|
94 |
+
alt = el.attrs.get("alt", None) or ""
|
95 |
+
src = el.attrs.get("src", None) or ""
|
96 |
+
title = el.attrs.get("title", None) or ""
|
97 |
+
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
|
98 |
+
if convert_as_inline and el.parent.name not in self.options["keep_inline_images_in"]:
|
99 |
+
return alt
|
100 |
+
|
101 |
+
# Remove dataURIs
|
102 |
+
if src.startswith("data:"):
|
103 |
+
src = src.split(",")[0] + "..."
|
104 |
+
|
105 |
+
return "" % (alt, src, title_part)
|
106 |
+
|
107 |
+
def convert_soup(self, soup: Any) -> str:
|
108 |
+
return super().convert_soup(soup) # type: ignore
|
109 |
+
|
110 |
+
|
111 |
+
class DocumentConverterResult:
|
112 |
+
"""The result of converting a document to text."""
|
113 |
+
|
114 |
+
def __init__(self, title: Union[str, None] = None, text_content: str = ""):
|
115 |
+
self.title: Union[str, None] = title
|
116 |
+
self.text_content: str = text_content
|
117 |
+
|
118 |
+
|
119 |
+
class DocumentConverter:
|
120 |
+
"""Abstract superclass of all DocumentConverters."""
|
121 |
+
|
122 |
+
def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
|
123 |
+
raise NotImplementedError()
|
124 |
+
|
125 |
+
|
126 |
+
class PlainTextConverter(DocumentConverter):
|
127 |
+
"""Anything with content type text/plain"""
|
128 |
+
|
129 |
+
def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
|
130 |
+
# Guess the content type from any file extension that might be around
|
131 |
+
content_type, _ = mimetypes.guess_type("__placeholder" + kwargs.get("file_extension", ""))
|
132 |
+
|
133 |
+
# Only accept text files
|
134 |
+
if content_type is None:
|
135 |
+
return None
|
136 |
+
# elif "text/" not in content_type.lower():
|
137 |
+
# return None
|
138 |
+
|
139 |
+
text_content = ""
|
140 |
+
with open(local_path, "rt", encoding="utf-8") as fh:
|
141 |
+
text_content = fh.read()
|
142 |
+
return DocumentConverterResult(
|
143 |
+
title=None,
|
144 |
+
text_content=text_content,
|
145 |
+
)
|
146 |
+
|
147 |
+
|
148 |
+
class HtmlConverter(DocumentConverter):
|
149 |
+
"""Anything with content type text/html"""
|
150 |
+
|
151 |
+
def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
|
152 |
+
# Bail if not html
|
153 |
+
extension = kwargs.get("file_extension", "")
|
154 |
+
if extension.lower() not in [".html", ".htm"]:
|
155 |
+
return None
|
156 |
+
|
157 |
+
result = None
|
158 |
+
with open(local_path, "rt", encoding="utf-8") as fh:
|
159 |
+
result = self._convert(fh.read())
|
160 |
+
|
161 |
+
return result
|
162 |
+
|
163 |
+
def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
|
164 |
+
"""Helper function that converts and HTML string."""
|
165 |
+
|
166 |
+
# Parse the string
|
167 |
+
soup = BeautifulSoup(html_content, "html.parser")
|
168 |
+
|
169 |
+
# Remove javascript and style blocks
|
170 |
+
for script in soup(["script", "style"]):
|
171 |
+
script.extract()
|
172 |
+
|
173 |
+
# Print only the main content
|
174 |
+
body_elm = soup.find("body")
|
175 |
+
webpage_text = ""
|
176 |
+
if body_elm:
|
177 |
+
webpage_text = _CustomMarkdownify().convert_soup(body_elm)
|
178 |
+
else:
|
179 |
+
webpage_text = _CustomMarkdownify().convert_soup(soup)
|
180 |
+
|
181 |
+
assert isinstance(webpage_text, str)
|
182 |
+
|
183 |
+
return DocumentConverterResult(
|
184 |
+
title=None if soup.title is None else soup.title.string, text_content=webpage_text
|
185 |
+
)
|
186 |
+
|
187 |
+
|
188 |
+
class WikipediaConverter(DocumentConverter):
|
189 |
+
"""Handle Wikipedia pages separately, focusing only on the main document content."""
|
190 |
+
|
191 |
+
def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
|
192 |
+
# Bail if not Wikipedia
|
193 |
+
extension = kwargs.get("file_extension", "")
|
194 |
+
if extension.lower() not in [".html", ".htm"]:
|
195 |
+
return None
|
196 |
+
url = kwargs.get("url", "")
|
197 |
+
if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
|
198 |
+
return None
|
199 |
+
|
200 |
+
# Parse the file
|
201 |
+
soup = None
|
202 |
+
with open(local_path, "rt", encoding="utf-8") as fh:
|
203 |
+
soup = BeautifulSoup(fh.read(), "html.parser")
|
204 |
+
|
205 |
+
# Remove javascript and style blocks
|
206 |
+
for script in soup(["script", "style"]):
|
207 |
+
script.extract()
|
208 |
+
|
209 |
+
# Print only the main content
|
210 |
+
body_elm = soup.find("div", {"id": "mw-content-text"})
|
211 |
+
title_elm = soup.find("span", {"class": "mw-page-title-main"})
|
212 |
+
|
213 |
+
webpage_text = ""
|
214 |
+
main_title = None if soup.title is None else soup.title.string
|
215 |
+
|
216 |
+
if body_elm:
|
217 |
+
# What's the title
|
218 |
+
if title_elm and len(title_elm) > 0:
|
219 |
+
main_title = title_elm.string # type: ignore
|
220 |
+
assert isinstance(main_title, str)
|
221 |
+
|
222 |
+
# Convert the page
|
223 |
+
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(body_elm)
|
224 |
+
else:
|
225 |
+
webpage_text = _CustomMarkdownify().convert_soup(soup)
|
226 |
+
|
227 |
+
return DocumentConverterResult(
|
228 |
+
title=main_title,
|
229 |
+
text_content=webpage_text,
|
230 |
+
)
|
231 |
+
|
232 |
+
|
233 |
+
class YouTubeConverter(DocumentConverter):
|
234 |
+
"""Handle YouTube specially, focusing on the video title, description, and transcript."""
|
235 |
+
|
236 |
+
def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
|
237 |
+
# Bail if not YouTube
|
238 |
+
extension = kwargs.get("file_extension", "")
|
239 |
+
if extension.lower() not in [".html", ".htm"]:
|
240 |
+
return None
|
241 |
+
url = kwargs.get("url", "")
|
242 |
+
if not url.startswith("https://www.youtube.com/watch?"):
|
243 |
+
return None
|
244 |
+
|
245 |
+
# Parse the file
|
246 |
+
soup = None
|
247 |
+
with open(local_path, "rt", encoding="utf-8") as fh:
|
248 |
+
soup = BeautifulSoup(fh.read(), "html.parser")
|
249 |
+
|
250 |
+
# Read the meta tags
|
251 |
+
assert soup.title is not None and soup.title.string is not None
|
252 |
+
metadata: Dict[str, str] = {"title": soup.title.string}
|
253 |
+
for meta in soup(["meta"]):
|
254 |
+
for a in meta.attrs:
|
255 |
+
if a in ["itemprop", "property", "name"]:
|
256 |
+
metadata[meta[a]] = meta.get("content", "")
|
257 |
+
break
|
258 |
+
|
259 |
+
# We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation
|
260 |
+
try:
|
261 |
+
for script in soup(["script"]):
|
262 |
+
content = script.text
|
263 |
+
if "ytInitialData" in content:
|
264 |
+
lines = re.split(r"\r?\n", content)
|
265 |
+
obj_start = lines[0].find("{")
|
266 |
+
obj_end = lines[0].rfind("}")
|
267 |
+
if obj_start >= 0 and obj_end >= 0:
|
268 |
+
data = json.loads(lines[0][obj_start : obj_end + 1])
|
269 |
+
attrdesc = self._findKey(data, "attributedDescriptionBodyText") # type: ignore
|
270 |
+
if attrdesc:
|
271 |
+
metadata["description"] = str(attrdesc["content"])
|
272 |
+
break
|
273 |
+
except Exception:
|
274 |
+
pass
|
275 |
+
|
276 |
+
# Start preparing the page
|
277 |
+
webpage_text = "# YouTube\n"
|
278 |
+
|
279 |
+
title = self._get(metadata, ["title", "og:title", "name"]) # type: ignore
|
280 |
+
assert isinstance(title, str)
|
281 |
+
|
282 |
+
if title:
|
283 |
+
webpage_text += f"\n## {title}\n"
|
284 |
+
|
285 |
+
stats = ""
|
286 |
+
views = self._get(metadata, ["interactionCount"]) # type: ignore
|
287 |
+
if views:
|
288 |
+
stats += f"- **Views:** {views}\n"
|
289 |
+
|
290 |
+
keywords = self._get(metadata, ["keywords"]) # type: ignore
|
291 |
+
if keywords:
|
292 |
+
stats += f"- **Keywords:** {keywords}\n"
|
293 |
+
|
294 |
+
runtime = self._get(metadata, ["duration"]) # type: ignore
|
295 |
+
if runtime:
|
296 |
+
stats += f"- **Runtime:** {runtime}\n"
|
297 |
+
|
298 |
+
if len(stats) > 0:
|
299 |
+
webpage_text += f"\n### Video Metadata\n{stats}\n"
|
300 |
+
|
301 |
+
description = self._get(metadata, ["description", "og:description"]) # type: ignore
|
302 |
+
if description:
|
303 |
+
webpage_text += f"\n### Description\n{description}\n"
|
304 |
+
|
305 |
+
transcript_text = ""
|
306 |
+
parsed_url = urlparse(url) # type: ignore
|
307 |
+
params = parse_qs(parsed_url.query) # type: ignore
|
308 |
+
if "v" in params:
|
309 |
+
assert isinstance(params["v"][0], str)
|
310 |
+
video_id = str(params["v"][0])
|
311 |
+
try:
|
312 |
+
# Must be a single transcript.
|
313 |
+
transcript = YouTubeTranscriptApi.get_transcript(video_id) # type: ignore
|
314 |
+
# transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore
|
315 |
+
# Alternative formatting:
|
316 |
+
transcript_text = SRTFormatter().format_transcript(transcript)
|
317 |
+
except Exception:
|
318 |
+
pass
|
319 |
+
if transcript_text:
|
320 |
+
webpage_text += f"\n### Transcript\n{transcript_text}\n"
|
321 |
+
|
322 |
+
title = title if title else soup.title.string
|
323 |
+
assert isinstance(title, str)
|
324 |
+
|
325 |
+
return DocumentConverterResult(
|
326 |
+
title=title,
|
327 |
+
text_content=webpage_text,
|
328 |
+
)
|
329 |
+
|
330 |
+
def _get(self, metadata: Dict[str, str], keys: List[str], default: Union[str, None] = None) -> Union[str, None]:
|
331 |
+
for k in keys:
|
332 |
+
if k in metadata:
|
333 |
+
return metadata[k]
|
334 |
+
return default
|
335 |
+
|
336 |
+
def _findKey(self, json: Any, key: str) -> Union[str, None]: # TODO: Fix json type
|
337 |
+
if isinstance(json, list):
|
338 |
+
for elm in json:
|
339 |
+
ret = self._findKey(elm, key)
|
340 |
+
if ret is not None:
|
341 |
+
return ret
|
342 |
+
elif isinstance(json, dict):
|
343 |
+
for k in json:
|
344 |
+
if k == key:
|
345 |
+
return json[k]
|
346 |
+
else:
|
347 |
+
ret = self._findKey(json[k], key)
|
348 |
+
if ret is not None:
|
349 |
+
return ret
|
350 |
+
return None
|
351 |
+
|
352 |
+
|
353 |
+
class PdfConverter(DocumentConverter):
|
354 |
+
"""
|
355 |
+
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
|
356 |
+
"""
|
357 |
+
|
358 |
+
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
359 |
+
# Bail if not a PDF
|
360 |
+
extension = kwargs.get("file_extension", "")
|
361 |
+
if extension.lower() != ".pdf":
|
362 |
+
return None
|
363 |
+
|
364 |
+
return DocumentConverterResult(
|
365 |
+
title=None,
|
366 |
+
text_content=pdfminer.high_level.extract_text(local_path),
|
367 |
+
)
|
368 |
+
|
369 |
+
|
370 |
+
class DocxConverter(HtmlConverter):
|
371 |
+
"""
|
372 |
+
Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
|
373 |
+
"""
|
374 |
+
|
375 |
+
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
376 |
+
# Bail if not a DOCX
|
377 |
+
extension = kwargs.get("file_extension", "")
|
378 |
+
if extension.lower() != ".docx":
|
379 |
+
return None
|
380 |
+
|
381 |
+
result = None
|
382 |
+
with open(local_path, "rb") as docx_file:
|
383 |
+
result = mammoth.convert_to_html(docx_file)
|
384 |
+
html_content = result.value
|
385 |
+
result = self._convert(html_content)
|
386 |
+
|
387 |
+
return result
|
388 |
+
|
389 |
+
|
390 |
+
class XlsxConverter(HtmlConverter):
|
391 |
+
"""
|
392 |
+
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
393 |
+
"""
|
394 |
+
|
395 |
+
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
396 |
+
# Bail if not a XLSX
|
397 |
+
extension = kwargs.get("file_extension", "")
|
398 |
+
if extension.lower() not in [".xlsx", ".xls"]:
|
399 |
+
return None
|
400 |
+
|
401 |
+
sheets = pd.read_excel(local_path, sheet_name=None)
|
402 |
+
md_content = ""
|
403 |
+
for s in sheets:
|
404 |
+
md_content += f"## {s}\n"
|
405 |
+
html_content = sheets[s].to_html(index=False)
|
406 |
+
md_content += self._convert(html_content).text_content.strip() + "\n\n"
|
407 |
+
|
408 |
+
return DocumentConverterResult(
|
409 |
+
title=None,
|
410 |
+
text_content=md_content.strip(),
|
411 |
+
)
|
412 |
+
|
413 |
+
|
414 |
+
class PptxConverter(HtmlConverter):
|
415 |
+
"""
|
416 |
+
Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
|
417 |
+
"""
|
418 |
+
|
419 |
+
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
420 |
+
# Bail if not a PPTX
|
421 |
+
extension = kwargs.get("file_extension", "")
|
422 |
+
if extension.lower() != ".pptx":
|
423 |
+
return None
|
424 |
+
|
425 |
+
md_content = ""
|
426 |
+
|
427 |
+
presentation = pptx.Presentation(local_path)
|
428 |
+
slide_num = 0
|
429 |
+
for slide in presentation.slides:
|
430 |
+
slide_num += 1
|
431 |
+
|
432 |
+
md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
|
433 |
+
|
434 |
+
title = slide.shapes.title
|
435 |
+
for shape in slide.shapes:
|
436 |
+
# Pictures
|
437 |
+
if self._is_picture(shape):
|
438 |
+
# https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
|
439 |
+
alt_text = ""
|
440 |
+
try:
|
441 |
+
alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
|
442 |
+
except Exception:
|
443 |
+
pass
|
444 |
+
|
445 |
+
# A placeholder name
|
446 |
+
filename = re.sub(r"\W", "", shape.name) + ".jpg"
|
447 |
+
md_content += "\n\n"
|
448 |
+
|
449 |
+
# Tables
|
450 |
+
if self._is_table(shape):
|
451 |
+
html_table = "<html><body><table>"
|
452 |
+
first_row = True
|
453 |
+
for row in shape.table.rows:
|
454 |
+
html_table += "<tr>"
|
455 |
+
for cell in row.cells:
|
456 |
+
if first_row:
|
457 |
+
html_table += "<th>" + html.escape(cell.text) + "</th>"
|
458 |
+
else:
|
459 |
+
html_table += "<td>" + html.escape(cell.text) + "</td>"
|
460 |
+
html_table += "</tr>"
|
461 |
+
first_row = False
|
462 |
+
html_table += "</table></body></html>"
|
463 |
+
md_content += "\n" + self._convert(html_table).text_content.strip() + "\n"
|
464 |
+
|
465 |
+
# Text areas
|
466 |
+
elif shape.has_text_frame:
|
467 |
+
if shape == title:
|
468 |
+
md_content += "# " + shape.text.lstrip() + "\n"
|
469 |
+
else:
|
470 |
+
md_content += shape.text + "\n"
|
471 |
+
|
472 |
+
md_content = md_content.strip()
|
473 |
+
|
474 |
+
if slide.has_notes_slide:
|
475 |
+
md_content += "\n\n### Notes:\n"
|
476 |
+
notes_frame = slide.notes_slide.notes_text_frame
|
477 |
+
if notes_frame is not None:
|
478 |
+
md_content += notes_frame.text
|
479 |
+
md_content = md_content.strip()
|
480 |
+
|
481 |
+
return DocumentConverterResult(
|
482 |
+
title=None,
|
483 |
+
text_content=md_content.strip(),
|
484 |
+
)
|
485 |
+
|
486 |
+
def _is_picture(self, shape):
|
487 |
+
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
|
488 |
+
return True
|
489 |
+
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
|
490 |
+
if hasattr(shape, "image"):
|
491 |
+
return True
|
492 |
+
return False
|
493 |
+
|
494 |
+
def _is_table(self, shape):
|
495 |
+
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
|
496 |
+
return True
|
497 |
+
return False
|
498 |
+
|
499 |
+
|
500 |
+
class MediaConverter(DocumentConverter):
|
501 |
+
"""
|
502 |
+
Abstract class for multi-modal media (e.g., images and audio)
|
503 |
+
"""
|
504 |
+
|
505 |
+
def _get_metadata(self, local_path):
|
506 |
+
exiftool = shutil.which("exiftool")
|
507 |
+
if not exiftool:
|
508 |
+
return None
|
509 |
+
else:
|
510 |
+
try:
|
511 |
+
result = subprocess.run([exiftool, "-json", local_path], capture_output=True, text=True).stdout
|
512 |
+
return json.loads(result)[0]
|
513 |
+
except Exception:
|
514 |
+
return None
|
515 |
+
|
516 |
+
|
517 |
+
class WavConverter(MediaConverter):
|
518 |
+
"""
|
519 |
+
Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
|
520 |
+
"""
|
521 |
+
|
522 |
+
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
523 |
+
# Bail if not a XLSX
|
524 |
+
extension = kwargs.get("file_extension", "")
|
525 |
+
if extension.lower() != ".wav":
|
526 |
+
return None
|
527 |
+
|
528 |
+
md_content = ""
|
529 |
+
|
530 |
+
# Add metadata
|
531 |
+
metadata = self._get_metadata(local_path)
|
532 |
+
if metadata:
|
533 |
+
for f in [
|
534 |
+
"Title",
|
535 |
+
"Artist",
|
536 |
+
"Author",
|
537 |
+
"Band",
|
538 |
+
"Album",
|
539 |
+
"Genre",
|
540 |
+
"Track",
|
541 |
+
"DateTimeOriginal",
|
542 |
+
"CreateDate",
|
543 |
+
"Duration",
|
544 |
+
]:
|
545 |
+
if f in metadata:
|
546 |
+
md_content += f"{f}: {metadata[f]}\n"
|
547 |
+
|
548 |
+
# Transcribe
|
549 |
+
try:
|
550 |
+
transcript = self._transcribe_audio(local_path)
|
551 |
+
md_content += "\n\n### Audio Transcript:\n" + ("[No speech detected]" if transcript == "" else transcript)
|
552 |
+
except Exception:
|
553 |
+
md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
|
554 |
+
|
555 |
+
return DocumentConverterResult(
|
556 |
+
title=None,
|
557 |
+
text_content=md_content.strip(),
|
558 |
+
)
|
559 |
+
|
560 |
+
def _transcribe_audio(self, local_path) -> str:
|
561 |
+
recognizer = sr.Recognizer()
|
562 |
+
with sr.AudioFile(local_path) as source:
|
563 |
+
audio = recognizer.record(source)
|
564 |
+
return recognizer.recognize_google(audio).strip()
|
565 |
+
|
566 |
+
|
567 |
+
class Mp3Converter(WavConverter):
|
568 |
+
"""
|
569 |
+
Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
|
570 |
+
"""
|
571 |
+
|
572 |
+
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
573 |
+
# Bail if not a MP3
|
574 |
+
extension = kwargs.get("file_extension", "")
|
575 |
+
if extension.lower() != ".mp3":
|
576 |
+
return None
|
577 |
+
|
578 |
+
md_content = ""
|
579 |
+
|
580 |
+
# Add metadata
|
581 |
+
metadata = self._get_metadata(local_path)
|
582 |
+
if metadata:
|
583 |
+
for f in [
|
584 |
+
"Title",
|
585 |
+
"Artist",
|
586 |
+
"Author",
|
587 |
+
"Band",
|
588 |
+
"Album",
|
589 |
+
"Genre",
|
590 |
+
"Track",
|
591 |
+
"DateTimeOriginal",
|
592 |
+
"CreateDate",
|
593 |
+
"Duration",
|
594 |
+
]:
|
595 |
+
if f in metadata:
|
596 |
+
md_content += f"{f}: {metadata[f]}\n"
|
597 |
+
|
598 |
+
# Transcribe
|
599 |
+
handle, temp_path = tempfile.mkstemp(suffix=".wav")
|
600 |
+
os.close(handle)
|
601 |
+
try:
|
602 |
+
sound = pydub.AudioSegment.from_mp3(local_path)
|
603 |
+
sound.export(temp_path, format="wav")
|
604 |
+
|
605 |
+
_args = dict()
|
606 |
+
_args.update(kwargs)
|
607 |
+
_args["file_extension"] = ".wav"
|
608 |
+
|
609 |
+
try:
|
610 |
+
transcript = super()._transcribe_audio(temp_path).strip()
|
611 |
+
md_content += "\n\n### Audio Transcript:\n" + (
|
612 |
+
"[No speech detected]" if transcript == "" else transcript
|
613 |
+
)
|
614 |
+
except Exception:
|
615 |
+
md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
|
616 |
+
|
617 |
+
finally:
|
618 |
+
os.unlink(temp_path)
|
619 |
+
|
620 |
+
# Return the result
|
621 |
+
return DocumentConverterResult(
|
622 |
+
title=None,
|
623 |
+
text_content=md_content.strip(),
|
624 |
+
)
|
625 |
+
|
626 |
+
|
627 |
+
class ImageConverter(MediaConverter):
|
628 |
+
"""
|
629 |
+
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
|
630 |
+
"""
|
631 |
+
|
632 |
+
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
633 |
+
# Bail if not a XLSX
|
634 |
+
extension = kwargs.get("file_extension", "")
|
635 |
+
if extension.lower() not in [".jpg", ".jpeg", ".png"]:
|
636 |
+
return None
|
637 |
+
|
638 |
+
md_content = ""
|
639 |
+
|
640 |
+
# Add metadata
|
641 |
+
metadata = self._get_metadata(local_path)
|
642 |
+
if metadata:
|
643 |
+
for f in [
|
644 |
+
"ImageSize",
|
645 |
+
"Title",
|
646 |
+
"Caption",
|
647 |
+
"Description",
|
648 |
+
"Keywords",
|
649 |
+
"Artist",
|
650 |
+
"Author",
|
651 |
+
"DateTimeOriginal",
|
652 |
+
"CreateDate",
|
653 |
+
"GPSPosition",
|
654 |
+
]:
|
655 |
+
if f in metadata:
|
656 |
+
md_content += f"{f}: {metadata[f]}\n"
|
657 |
+
|
658 |
+
# Try describing the image with GPTV
|
659 |
+
mlm_client = kwargs.get("mlm_client")
|
660 |
+
mlm_model = kwargs.get("mlm_model")
|
661 |
+
if mlm_client is not None and mlm_model is not None:
|
662 |
+
md_content += (
|
663 |
+
"\n# Description:\n"
|
664 |
+
+ self._get_mlm_description(
|
665 |
+
local_path, extension, mlm_client, mlm_model, prompt=kwargs.get("mlm_prompt")
|
666 |
+
).strip()
|
667 |
+
+ "\n"
|
668 |
+
)
|
669 |
+
|
670 |
+
return DocumentConverterResult(
|
671 |
+
title=None,
|
672 |
+
text_content=md_content,
|
673 |
+
)
|
674 |
+
|
675 |
+
def _get_mlm_description(self, local_path, extension, client, model, prompt=None):
|
676 |
+
if prompt is None or prompt.strip() == "":
|
677 |
+
prompt = "Write a detailed caption for this image."
|
678 |
+
|
679 |
+
sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
|
680 |
+
|
681 |
+
data_uri = ""
|
682 |
+
with open(local_path, "rb") as image_file:
|
683 |
+
content_type, encoding = mimetypes.guess_type("_dummy" + extension)
|
684 |
+
if content_type is None:
|
685 |
+
content_type = "image/jpeg"
|
686 |
+
image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
|
687 |
+
data_uri = f"data:{content_type};base64,{image_base64}"
|
688 |
+
|
689 |
+
messages = [
|
690 |
+
{
|
691 |
+
"role": "user",
|
692 |
+
"content": [
|
693 |
+
{"type": "text", "text": prompt},
|
694 |
+
{
|
695 |
+
"type": "image_url",
|
696 |
+
"image_url": {
|
697 |
+
"url": data_uri,
|
698 |
+
},
|
699 |
+
},
|
700 |
+
],
|
701 |
+
}
|
702 |
+
]
|
703 |
+
|
704 |
+
response = client.chat.completions.create(model=model, messages=messages)
|
705 |
+
return response.choices[0].message.content
|
706 |
+
|
707 |
+
|
708 |
+
class FileConversionException(BaseException):
|
709 |
+
pass
|
710 |
+
|
711 |
+
|
712 |
+
class UnsupportedFormatException(BaseException):
|
713 |
+
pass
|
714 |
+
|
715 |
+
|
716 |
+
class MarkdownConverter:
|
717 |
+
"""(In preview) An extremely simple text-based document reader, suitable for LLM use.
|
718 |
+
This reader will convert common file-types or webpages to Markdown."""
|
719 |
+
|
720 |
+
def __init__(
|
721 |
+
self,
|
722 |
+
requests_session: Optional[requests.Session] = None,
|
723 |
+
mlm_client: Optional[Any] = None,
|
724 |
+
mlm_model: Optional[Any] = None,
|
725 |
+
):
|
726 |
+
if requests_session is None:
|
727 |
+
self._requests_session = requests.Session()
|
728 |
+
else:
|
729 |
+
self._requests_session = requests_session
|
730 |
+
|
731 |
+
self._mlm_client = mlm_client
|
732 |
+
self._mlm_model = mlm_model
|
733 |
+
|
734 |
+
self._page_converters: List[DocumentConverter] = []
|
735 |
+
|
736 |
+
# Register converters for successful browsing operations
|
737 |
+
# Later registrations are tried first / take higher priority than earlier registrations
|
738 |
+
# To this end, the most specific converters should appear below the most generic converters
|
739 |
+
self.register_page_converter(PlainTextConverter())
|
740 |
+
self.register_page_converter(HtmlConverter())
|
741 |
+
self.register_page_converter(WikipediaConverter())
|
742 |
+
self.register_page_converter(YouTubeConverter())
|
743 |
+
self.register_page_converter(DocxConverter())
|
744 |
+
self.register_page_converter(XlsxConverter())
|
745 |
+
self.register_page_converter(PptxConverter())
|
746 |
+
self.register_page_converter(WavConverter())
|
747 |
+
self.register_page_converter(Mp3Converter())
|
748 |
+
self.register_page_converter(ImageConverter())
|
749 |
+
self.register_page_converter(PdfConverter())
|
750 |
+
|
751 |
+
def convert(
|
752 |
+
self, source: Union[str, requests.Response], **kwargs: Any
|
753 |
+
) -> DocumentConverterResult: # TODO: deal with kwargs
|
754 |
+
"""
|
755 |
+
Args:
|
756 |
+
- source: can be a string representing a path or url, or a requests.response object
|
757 |
+
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
|
758 |
+
"""
|
759 |
+
|
760 |
+
# Local path or url
|
761 |
+
if isinstance(source, str):
|
762 |
+
if source.startswith("http://") or source.startswith("https://") or source.startswith("file://"):
|
763 |
+
return self.convert_url(source, **kwargs)
|
764 |
+
else:
|
765 |
+
return self.convert_local(source, **kwargs)
|
766 |
+
# Request response
|
767 |
+
elif isinstance(source, requests.Response):
|
768 |
+
return self.convert_response(source, **kwargs)
|
769 |
+
|
770 |
+
def convert_local(self, path: str, **kwargs: Any) -> DocumentConverterResult: # TODO: deal with kwargs
|
771 |
+
# Prepare a list of extensions to try (in order of priority)
|
772 |
+
ext = kwargs.get("file_extension")
|
773 |
+
extensions = [ext] if ext is not None else []
|
774 |
+
|
775 |
+
# Get extension alternatives from the path and puremagic
|
776 |
+
base, ext = os.path.splitext(path)
|
777 |
+
self._append_ext(extensions, ext)
|
778 |
+
self._append_ext(extensions, self._guess_ext_magic(path))
|
779 |
+
|
780 |
+
# Convert
|
781 |
+
return self._convert(path, extensions, **kwargs)
|
782 |
+
|
783 |
+
# TODO what should stream's type be?
|
784 |
+
def convert_stream(self, stream: Any, **kwargs: Any) -> DocumentConverterResult: # TODO: deal with kwargs
|
785 |
+
# Prepare a list of extensions to try (in order of priority)
|
786 |
+
ext = kwargs.get("file_extension")
|
787 |
+
extensions = [ext] if ext is not None else []
|
788 |
+
|
789 |
+
# Save the file locally to a temporary file. It will be deleted before this method exits
|
790 |
+
handle, temp_path = tempfile.mkstemp()
|
791 |
+
fh = os.fdopen(handle, "wb")
|
792 |
+
result = None
|
793 |
+
try:
|
794 |
+
# Write to the temporary file
|
795 |
+
content = stream.read()
|
796 |
+
if isinstance(content, str):
|
797 |
+
fh.write(content.encode("utf-8"))
|
798 |
+
else:
|
799 |
+
fh.write(content)
|
800 |
+
fh.close()
|
801 |
+
|
802 |
+
# Use puremagic to check for more extension options
|
803 |
+
self._append_ext(extensions, self._guess_ext_magic(temp_path))
|
804 |
+
|
805 |
+
# Convert
|
806 |
+
result = self._convert(temp_path, extensions, **kwargs)
|
807 |
+
# Clean up
|
808 |
+
finally:
|
809 |
+
try:
|
810 |
+
fh.close()
|
811 |
+
except Exception:
|
812 |
+
pass
|
813 |
+
os.unlink(temp_path)
|
814 |
+
|
815 |
+
return result
|
816 |
+
|
817 |
+
def convert_url(self, url: str, **kwargs: Any) -> DocumentConverterResult: # TODO: fix kwargs type
|
818 |
+
# Send a HTTP request to the URL
|
819 |
+
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
|
820 |
+
response = self._requests_session.get(url, stream=True, headers={"User-Agent": user_agent})
|
821 |
+
response.raise_for_status()
|
822 |
+
return self.convert_response(response, **kwargs)
|
823 |
+
|
824 |
+
def convert_response(
|
825 |
+
self, response: requests.Response, **kwargs: Any
|
826 |
+
) -> DocumentConverterResult: # TODO fix kwargs type
|
827 |
+
# Prepare a list of extensions to try (in order of priority)
|
828 |
+
ext = kwargs.get("file_extension")
|
829 |
+
extensions = [ext] if ext is not None else []
|
830 |
+
|
831 |
+
# Guess from the mimetype
|
832 |
+
content_type = response.headers.get("content-type", "").split(";")[0]
|
833 |
+
self._append_ext(extensions, mimetypes.guess_extension(content_type))
|
834 |
+
|
835 |
+
# Read the content disposition if there is one
|
836 |
+
content_disposition = response.headers.get("content-disposition", "")
|
837 |
+
m = re.search(r"filename=([^;]+)", content_disposition)
|
838 |
+
if m:
|
839 |
+
base, ext = os.path.splitext(m.group(1).strip("\"'"))
|
840 |
+
self._append_ext(extensions, ext)
|
841 |
+
|
842 |
+
# Read from the extension from the path
|
843 |
+
base, ext = os.path.splitext(urlparse(response.url).path)
|
844 |
+
self._append_ext(extensions, ext)
|
845 |
+
|
846 |
+
        # Save the file locally to a temporary file. It will be deleted before this method exits
        handle, temp_path = tempfile.mkstemp()
        fh = os.fdopen(handle, "wb")
        result = None
        try:
            # Download the file
            for chunk in response.iter_content(chunk_size=512):
                fh.write(chunk)
            fh.close()

            # Use puremagic to check for more extension options
            self._append_ext(extensions, self._guess_ext_magic(temp_path))

            # Convert
            result = self._convert(temp_path, extensions, url=response.url)
        except Exception as e:
            print(f"Error in converting: {e}")

        # Clean up
        finally:
            try:
                fh.close()
            except Exception:
                pass
            os.unlink(temp_path)

        return result

    def _convert(self, local_path: str, extensions: List[Union[str, None]], **kwargs) -> DocumentConverterResult:
        error_trace = ""
        for ext in extensions + [None]:  # Try last with no extension
            for converter in self._page_converters:
                _kwargs = copy.deepcopy(kwargs)

                # Overwrite file_extension appropriately
                if ext is None:
                    if "file_extension" in _kwargs:
                        del _kwargs["file_extension"]
                else:
                    _kwargs.update({"file_extension": ext})

                # Copy any additional global options
                if "mlm_client" not in _kwargs and self._mlm_client is not None:
                    _kwargs["mlm_client"] = self._mlm_client

                if "mlm_model" not in _kwargs and self._mlm_model is not None:
                    _kwargs["mlm_model"] = self._mlm_model

                # If we hit an error, log it and keep trying. Reset `res` on each attempt
                # so a failing converter cannot raise an UnboundLocalError below.
                res = None
                try:
                    res = converter.convert(local_path, **_kwargs)
                except Exception:
                    error_trace = ("\n\n" + traceback.format_exc()).strip()

                if res is not None:
                    # Normalize the content
                    res.text_content = "\n".join([line.rstrip() for line in re.split(r"\r?\n", res.text_content)])
                    res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
                    return res

        # If we got this far without success, report any exceptions
        if len(error_trace) > 0:
            raise FileConversionException(
                f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
            )

        # Nothing can handle it!
        raise UnsupportedFormatException(
            f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
        )

    def _append_ext(self, extensions, ext):
        """Append a non-None, non-empty extension to a list of extensions."""
        if ext is None:
            return
        ext = ext.strip()
        if ext == "":
            return
        extensions.append(ext)

    def _guess_ext_magic(self, path):
        """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
        # Use puremagic to guess
        try:
            guesses = puremagic.magic_file(path)
            if len(guesses) > 0:
                ext = guesses[0].extension.strip()
                if len(ext) > 0:
                    return ext
        except (FileNotFoundError, IsADirectoryError, PermissionError):
            pass
        return None

    def register_page_converter(self, converter: DocumentConverter) -> None:
        """Register a page text converter. Converters registered later are tried first."""
        self._page_converters.insert(0, converter)
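For orientation, a minimal usage sketch of the converter above (the file name and import path are hypothetical; only the MarkdownConverter API shown in this diff is assumed):

from scripts.mdconvert import MarkdownConverter

converter = MarkdownConverter()
# _convert() tries each registered page converter in turn, first with each guessed
# extension and finally with no extension, until one returns a result.
result = converter.convert("report.pdf")  # "report.pdf" is a hypothetical local file
print(result.title)
print(result.text_content[:500])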
scripts/reformulator.py
ADDED
@@ -0,0 +1,86 @@
# Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource!
# https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
import copy

from smolagents.models import MessageRole, Model


def prepare_response(original_task: str, inner_messages, reformulation_model: Model) -> str:
    messages = [
        {
            "role": MessageRole.SYSTEM,
            "content": [
                {
                    "type": "text",
                    "text": f"""Earlier you were asked the following:

{original_task}

Your team then worked diligently to address that request. Read below a transcript of that conversation:""",
                }
            ],
        }
    ]

    # The first message just repeats the question, so remove it
    # if len(inner_messages) > 1:
    #     del inner_messages[0]

    # Copy the inner messages to this context
    try:
        for message in inner_messages:
            if not message.get("content"):
                continue
            message = copy.deepcopy(message)
            message["role"] = MessageRole.USER
            messages.append(message)
    except Exception:
        messages += [{"role": MessageRole.ASSISTANT, "content": str(inner_messages)}]

    # Ask for the final answer
    messages.append(
        {
            "role": MessageRole.USER,
            "content": [
                {
                    "type": "text",
                    "text": f"""
Read the above conversation and output a FINAL ANSWER to the question. The question is repeated here for convenience:

{original_task}

To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER]
Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and DO NOT INCLUDE UNITS such as $ or USD or percent signs unless specified otherwise.
If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
If you are unable to determine the final answer, output 'FINAL ANSWER: Unable to determine'
""",
                }
            ],
        }
    )

    response = reformulation_model(messages).content

    final_answer = response.split("FINAL ANSWER: ")[-1].strip()
    print("> Reformulated answer: ", final_answer)

    # if "unable to determine" in final_answer.lower():
    #     messages.append({"role": MessageRole.ASSISTANT, "content": response })
    #     messages.append({"role": MessageRole.USER, "content": [{"type": "text", "text": """
    # I understand that a definitive answer could not be determined. Please make a well-informed EDUCATED GUESS based on the conversation.

    # To output the educated guess, use the following template: EDUCATED GUESS: [YOUR EDUCATED GUESS]
    # Your EDUCATED GUESS should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. DO NOT OUTPUT 'I don't know', 'Unable to determine', etc.
    # ADDITIONALLY, your EDUCATED GUESS MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
    # If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise.
    # If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
    # If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
    # """.strip()}]})

    # response = model(messages).content
    # print("\n>>>Making an educated guess.\n", response)
    # final_answer = response.split("EDUCATED GUESS: ")[-1].strip()
    return final_answer
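A minimal sketch of how prepare_response is meant to be driven (the model choice and transcript contents are hypothetical; any smolagents Model should work):

from smolagents import LiteLLMModel

from scripts.reformulator import prepare_response

model = LiteLLMModel(model_id="gpt-4o")  # hypothetical choice of reformulation model
transcript = [
    {"role": "assistant", "content": [{"type": "text", "text": "The 2020 census count was 39538223."}]},
]
answer = prepare_response("What was the 2020 population of California?", transcript, reformulation_model=model)
print(answer)  # expected: just the bare number, per the FINAL ANSWER rules above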
scripts/run_agents.py
ADDED
@@ -0,0 +1,87 @@
import json
import os
import shutil
import textwrap
from pathlib import Path

# import tqdm.asyncio
from smolagents.utils import AgentError


def serialize_agent_error(obj):
    if isinstance(obj, AgentError):
        return {"error_type": obj.__class__.__name__, "message": obj.message}
    else:
        return str(obj)


def get_image_description(file_name: str, question: str, visual_inspection_tool) -> str:
    prompt = f"""Write a caption of 5 sentences for this image. Pay special attention to any details that might be useful for someone answering the following question:
{question}. But do not try to answer the question directly!
Do not add any information that is not present in the image."""
    return visual_inspection_tool(image_path=file_name, question=prompt)


def get_document_description(file_path: str, question: str, document_inspection_tool) -> str:
    prompt = f"""Write a caption of 5 sentences for this document. Pay special attention to any details that might be useful for someone answering the following question:
{question}. But do not try to answer the question directly!
Do not add any information that is not present in the document."""
    return document_inspection_tool.forward_initial_exam_mode(file_path=file_path, question=prompt)


def get_single_file_description(file_path: str, question: str, visual_inspection_tool, document_inspection_tool):
    file_extension = file_path.split(".")[-1]
    if file_extension in ["png", "jpg", "jpeg"]:
        file_description = f" - Attached image: {file_path}"
        file_description += (
            f"\n -> Image description: {get_image_description(file_path, question, visual_inspection_tool)}"
        )
        return file_description
    elif file_extension in ["pdf", "xls", "xlsx", "docx", "doc", "xml"]:
        file_description = f" - Attached document: {file_path}"
        image_path = file_path.split(".")[0] + ".png"
        if os.path.exists(image_path):
            description = get_image_description(image_path, question, visual_inspection_tool)
        else:
            description = get_document_description(file_path, question, document_inspection_tool)
        file_description += f"\n -> File description: {description}"
        return file_description
    elif file_extension in ["mp3", "m4a", "wav"]:
        return f" - Attached audio: {file_path}"
    else:
        return f" - Attached file: {file_path}"


def get_zip_description(file_path: str, question: str, visual_inspection_tool, document_inspection_tool):
    folder_path = file_path.replace(".zip", "")
    os.makedirs(folder_path, exist_ok=True)
    shutil.unpack_archive(file_path, folder_path)

    prompt_use_files = ""
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            prompt_use_files += "\n" + textwrap.indent(
                get_single_file_description(file_path, question, visual_inspection_tool, document_inspection_tool),
                prefix=" ",
            )
    return prompt_use_files


def get_tasks_to_run(data, total: int, base_filename: Path, tasks_ids: list[int]):
    f = base_filename.parent / f"{base_filename.stem}_answers.jsonl"
    done = set()
    if f.exists():
        with open(f, encoding="utf-8") as fh:
            done = {json.loads(line)["task_id"] for line in fh if line.strip()}

    tasks = []
    for i in range(total):
        task_id = int(data[i]["task_id"])
        if task_id not in done:
            if tasks_ids is not None:
                if task_id in tasks_ids:
                    tasks.append(data[i])
            else:
                tasks.append(data[i])
    return tasks
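A sketch of the resume behavior of get_tasks_to_run (data and paths are hypothetical): any task_id already recorded in the <stem>_answers.jsonl file is skipped on the next run.

from pathlib import Path

from scripts.run_agents import get_tasks_to_run

data = [{"task_id": "1", "question": "..."}, {"task_id": "2", "question": "..."}]  # hypothetical GAIA-style rows
todo = get_tasks_to_run(data, total=len(data), base_filename=Path("output/validation.jsonl"), tasks_ids=None)
print([row["task_id"] for row in todo])  # only the tasks with no answer recorded yet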
scripts/text_inspector_tool.py
ADDED
@@ -0,0 +1,122 @@
from typing import Optional

from smolagents import Tool
from smolagents.models import MessageRole, Model

from .mdconvert import MarkdownConverter


class TextInspectorTool(Tool):
    name = "inspect_file_as_text"
    description = """
You cannot load files yourself: instead call this tool to read a file as markdown text and ask questions about it.
This tool handles the following file extensions: [".html", ".htm", ".xlsx", ".pptx", ".wav", ".mp3", ".flac", ".pdf", ".docx"], and all other types of text files. IT DOES NOT HANDLE IMAGES."""

    inputs = {
        "file_path": {
            "description": "The path to the file you want to read as text. Must be a '.something' file, like '.pdf'. If it is an image, use the visualizer tool instead! DO NOT use this tool for an HTML webpage: use the web_search tool instead!",
            "type": "string",
        },
        "question": {
            "description": "[Optional]: Your question, as a natural language sentence. Provide as much context as possible. Do not pass this parameter if you just want to directly return the content of the file.",
            "type": "string",
            "nullable": True,
        },
    }
    output_type = "string"
    md_converter = MarkdownConverter()

    def __init__(self, model: Model, text_limit: int):
        super().__init__()
        self.model = model
        self.text_limit = text_limit

    def forward_initial_exam_mode(self, file_path, question):
        # Reject images up front, before spending time on conversion
        if file_path[-4:] in [".png", ".jpg"]:
            raise Exception("Cannot use inspect_file_as_text tool with images: use visualizer instead!")

        result = self.md_converter.convert(file_path)

        if ".zip" in file_path:
            return result.text_content

        if not question:
            return result.text_content

        if len(result.text_content) < 4000:
            return "Document content: " + result.text_content

        messages = [
            {
                "role": MessageRole.SYSTEM,
                "content": [
                    {
                        "type": "text",
                        "text": "Here is a file:\n### "
                        + str(result.title)
                        + "\n\n"
                        + result.text_content[: self.text_limit],
                    }
                ],
            },
            {
                "role": MessageRole.USER,
                "content": [
                    {
                        "type": "text",
                        "text": "Now please write a short, 5 sentence caption for this document, that could help someone asking this question: "
                        + question
                        + "\n\nDon't answer the question yourself! Just provide useful notes on the document",
                    }
                ],
            },
        ]
        return self.model(messages).content

    def forward(self, file_path, question: Optional[str] = None) -> str:
        # Reject images up front, before spending time on conversion
        if file_path[-4:] in [".png", ".jpg"]:
            raise Exception("Cannot use inspect_file_as_text tool with images: use visualizer instead!")

        result = self.md_converter.convert(file_path)

        if ".zip" in file_path:
            return result.text_content

        if not question:
            return result.text_content

        messages = [
            {
                "role": MessageRole.SYSTEM,
                "content": [
                    {
                        "type": "text",
                        "text": "You will have to write a short caption for this file, then answer this question: "
                        + question,
                    }
                ],
            },
            {
                "role": MessageRole.USER,
                "content": [
                    {
                        "type": "text",
                        "text": "Here is the complete file:\n### "
                        + str(result.title)
                        + "\n\n"
                        + result.text_content[: self.text_limit],
                    }
                ],
            },
            {
                "role": MessageRole.USER,
                "content": [
                    {
                        "type": "text",
                        "text": "Now answer the question below. Use these three headings: '1. Short answer', '2. Extremely detailed answer', '3. Additional Context on the document and question asked'.\n"
                        + question,
                    }
                ],
            },
        ]
        return self.model(messages).content
scripts/text_web_browser.py
ADDED
@@ -0,0 +1,563 @@
# Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource!
# https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
import mimetypes
import os
import pathlib
import re
import time
import uuid
from typing import Any, Dict, List, Optional, Tuple, Union
from urllib.parse import unquote, urljoin, urlparse

import pathvalidate
import requests
from serpapi import GoogleSearch

from smolagents import Tool

from .cookies import COOKIES
from .mdconvert import FileConversionException, MarkdownConverter, UnsupportedFormatException


class SimpleTextBrowser:
    """(In preview) An extremely simple text-based web browser, comparable to Lynx. Suitable for agentic use."""

    def __init__(
        self,
        start_page: Optional[str] = None,
        viewport_size: Optional[int] = 1024 * 8,
        downloads_folder: Optional[str] = None,
        serpapi_key: Optional[str] = None,
        request_kwargs: Optional[Dict[str, Any]] = None,
    ):
        self.start_page: str = start_page if start_page else "about:blank"
        self.viewport_size = viewport_size  # Applies only to the standard uri types
        self.downloads_folder = downloads_folder
        self.history: List[Tuple[str, float]] = list()
        self.page_title: Optional[str] = None
        self.viewport_current_page = 0
        self.viewport_pages: List[Tuple[int, int]] = list()
        # Initialize everything set_address() may rely on before visiting the start page
        self.serpapi_key = serpapi_key
        self.request_kwargs = request_kwargs if request_kwargs is not None else {}
        self.request_kwargs["cookies"] = COOKIES
        self._mdconvert = MarkdownConverter()
        self._page_content: str = ""

        self._find_on_page_query: Union[str, None] = None
        self._find_on_page_last_result: Union[int, None] = None  # Location of the last result

        self.set_address(self.start_page)

    @property
    def address(self) -> str:
        """Return the address of the current page."""
        return self.history[-1][0]

    def set_address(self, uri_or_path: str, filter_year: Optional[int] = None) -> None:
        # TODO: Handle anchors
        self.history.append((uri_or_path, time.time()))

        # Handle special URIs
        if uri_or_path == "about:blank":
            self._set_page_content("")
        elif uri_or_path.startswith("google:"):
            self._serpapi_search(uri_or_path[len("google:") :].strip(), filter_year=filter_year)
        else:
            if (
                not uri_or_path.startswith("http:")
                and not uri_or_path.startswith("https:")
                and not uri_or_path.startswith("file:")
            ):
                if len(self.history) > 1:
                    prior_address = self.history[-2][0]
                    uri_or_path = urljoin(prior_address, uri_or_path)
                    # Update the address with the fully-qualified path
                    self.history[-1] = (uri_or_path, self.history[-1][1])
            self._fetch_page(uri_or_path)

        self.viewport_current_page = 0
        # Reset the find-on-page state, using the same attribute names as the search methods below
        self._find_on_page_query = None
        self._find_on_page_last_result = None

    @property
    def viewport(self) -> str:
        """Return the content of the current viewport."""
        bounds = self.viewport_pages[self.viewport_current_page]
        return self.page_content[bounds[0] : bounds[1]]

    @property
    def page_content(self) -> str:
        """Return the full contents of the current page."""
        return self._page_content

    def _set_page_content(self, content: str) -> None:
        """Set the text content of the current page."""
        self._page_content = content
        self._split_pages()
        if self.viewport_current_page >= len(self.viewport_pages):
            self.viewport_current_page = len(self.viewport_pages) - 1

    def page_down(self) -> None:
        self.viewport_current_page = min(self.viewport_current_page + 1, len(self.viewport_pages) - 1)

    def page_up(self) -> None:
        self.viewport_current_page = max(self.viewport_current_page - 1, 0)

    def find_on_page(self, query: str) -> Union[str, None]:
        """Search for the query from the current viewport forward, looping back to the start if necessary."""

        # Did we get here via a previous find_on_page search with the same query?
        # If so, map to find_next
        if query == self._find_on_page_query and self.viewport_current_page == self._find_on_page_last_result:
            return self.find_next()

        # OK, it's a new search: start from the current viewport
        self._find_on_page_query = query
        viewport_match = self._find_next_viewport(query, self.viewport_current_page)
        if viewport_match is None:
            self._find_on_page_last_result = None
            return None
        else:
            self.viewport_current_page = viewport_match
            self._find_on_page_last_result = viewport_match
            return self.viewport

    def find_next(self) -> Union[str, None]:
        """Scroll to the next viewport that matches the query."""

        if self._find_on_page_query is None:
            return None

        starting_viewport = self._find_on_page_last_result
        if starting_viewport is None:
            starting_viewport = 0
        else:
            starting_viewport += 1
            if starting_viewport >= len(self.viewport_pages):
                starting_viewport = 0

        viewport_match = self._find_next_viewport(self._find_on_page_query, starting_viewport)
        if viewport_match is None:
            self._find_on_page_last_result = None
            return None
        else:
            self.viewport_current_page = viewport_match
            self._find_on_page_last_result = viewport_match
            return self.viewport

    def _find_next_viewport(self, query: str, starting_viewport: int) -> Union[int, None]:
        """Search for matches from the starting viewport onward, looping back when reaching the end."""

        if query is None:
            return None

        # Normalize the query, and convert to a regular expression
        nquery = re.sub(r"\*", "__STAR__", query)
        nquery = " " + (" ".join(re.split(r"\W+", nquery))).strip() + " "
        nquery = nquery.replace(" __STAR__ ", "__STAR__ ")  # Merge isolated stars with prior word
        nquery = nquery.replace("__STAR__", ".*").lower()

        if nquery.strip() == "":
            return None

        idxs = list()
        idxs.extend(range(starting_viewport, len(self.viewport_pages)))
        idxs.extend(range(0, starting_viewport))

        for i in idxs:
            bounds = self.viewport_pages[i]
            content = self.page_content[bounds[0] : bounds[1]]

            # TODO: Remove markdown links and images
            ncontent = " " + (" ".join(re.split(r"\W+", content))).strip().lower() + " "
            if re.search(nquery, ncontent):
                return i

        return None

    def visit_page(self, path_or_uri: str, filter_year: Optional[int] = None) -> str:
        """Update the address, visit the page, and return the content of the viewport."""
        self.set_address(path_or_uri, filter_year=filter_year)
        return self.viewport

    def _split_pages(self) -> None:
        # Do not split search results
        if self.address.startswith("google:"):
            self.viewport_pages = [(0, len(self._page_content))]
            return

        # Handle empty pages
        if len(self._page_content) == 0:
            self.viewport_pages = [(0, 0)]
            return

        # Break the viewport into pages
        self.viewport_pages = []
        start_idx = 0
        while start_idx < len(self._page_content):
            end_idx = min(start_idx + self.viewport_size, len(self._page_content))  # type: ignore[operator]
            # Adjust to end on a space
            while end_idx < len(self._page_content) and self._page_content[end_idx - 1] not in [" ", "\t", "\r", "\n"]:
                end_idx += 1
            self.viewport_pages.append((start_idx, end_idx))
            start_idx = end_idx

    def _serpapi_search(self, query: str, filter_year: Optional[int] = None) -> None:
        if self.serpapi_key is None:
            raise ValueError("Missing SerpAPI key.")

        params = {
            "engine": "google",
            "q": query,
            "api_key": self.serpapi_key,
        }
        if filter_year is not None:
            params["tbs"] = f"cdr:1,cd_min:01/01/{filter_year},cd_max:12/31/{filter_year}"

        search = GoogleSearch(params)
        results = search.get_dict()
        self.page_title = f"{query} - Search"
        if "organic_results" not in results.keys():
            raise Exception(f"No results found for query: '{query}'. Use a less specific query.")
        if len(results["organic_results"]) == 0:
            year_filter_message = f" with filter year={filter_year}" if filter_year is not None else ""
            self._set_page_content(
                f"No results found for '{query}'{year_filter_message}. Try with a more general query, or remove the year filter."
            )
            return

        def _prev_visit(url):
            for i in range(len(self.history) - 1, -1, -1):
                if self.history[i][0] == url:
                    return f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n"
            return ""

        web_snippets: List[str] = list()
        idx = 0
        if "organic_results" in results:
            for page in results["organic_results"]:
                idx += 1
                date_published = ""
                if "date" in page:
                    date_published = "\nDate published: " + page["date"]

                source = ""
                if "source" in page:
                    source = "\nSource: " + page["source"]

                snippet = ""
                if "snippet" in page:
                    snippet = "\n" + page["snippet"]

                redacted_version = f"{idx}. [{page['title']}]({page['link']}){date_published}{source}\n{_prev_visit(page['link'])}{snippet}"

                redacted_version = redacted_version.replace("Your browser can't play this video.", "")
                web_snippets.append(redacted_version)

        content = (
            f"A Google search for '{query}' found {len(web_snippets)} results:\n\n## Web Results\n"
            + "\n\n".join(web_snippets)
        )

        self._set_page_content(content)

    def _fetch_page(self, url: str) -> None:
        download_path = ""
        try:
            if url.startswith("file://"):
                download_path = os.path.normcase(os.path.normpath(unquote(url[7:])))
                res = self._mdconvert.convert_local(download_path)
                self.page_title = res.title
                self._set_page_content(res.text_content)
            else:
                # Prepare the request parameters
                request_kwargs = self.request_kwargs.copy() if self.request_kwargs is not None else {}
                request_kwargs["stream"] = True

                # Send an HTTP request to the URL
                response = requests.get(url, **request_kwargs)
                response.raise_for_status()

                # If the HTTP request was successful
                content_type = response.headers.get("content-type", "")

                # Text or HTML
                if "text/" in content_type.lower():
                    res = self._mdconvert.convert_response(response)
                    self.page_title = res.title
                    self._set_page_content(res.text_content)
                # A download
                else:
                    # Try producing a safe filename
                    fname = None
                    download_path = None
                    try:
                        fname = pathvalidate.sanitize_filename(os.path.basename(urlparse(url).path)).strip()
                        download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))

                        suffix = 0
                        while os.path.exists(download_path) and suffix < 1000:
                            suffix += 1
                            base, ext = os.path.splitext(fname)
                            new_fname = f"{base}__{suffix}{ext}"
                            download_path = os.path.abspath(os.path.join(self.downloads_folder, new_fname))

                    except NameError:
                        pass

                    # No suitable name, so make one
                    if fname is None:
                        extension = mimetypes.guess_extension(content_type)
                        if extension is None:
                            extension = ".download"
                        fname = str(uuid.uuid4()) + extension
                        download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))

                    # Open a file for writing
                    with open(download_path, "wb") as fh:
                        for chunk in response.iter_content(chunk_size=512):
                            fh.write(chunk)

                    # Render it
                    local_uri = pathlib.Path(download_path).as_uri()
                    self.set_address(local_uri)

        except UnsupportedFormatException as e:
            print(e)
            self.page_title = "Download complete."
            self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
        except FileConversionException as e:
            print(e)
            self.page_title = "Download complete."
            self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
        except FileNotFoundError:
            self.page_title = "Error 404"
            self._set_page_content(f"## Error 404\n\nFile not found: {download_path}")
        except requests.exceptions.RequestException as request_exception:
            try:
                self.page_title = f"Error {response.status_code}"

                # If the error was rendered in HTML we might as well render it
                content_type = response.headers.get("content-type", "")
                if content_type is not None and "text/html" in content_type.lower():
                    res = self._mdconvert.convert_response(response)
                    self.page_title = f"Error {response.status_code}"
                    self._set_page_content(f"## Error {response.status_code}\n\n{res.text_content}")
                else:
                    text = ""
                    for chunk in response.iter_content(chunk_size=512, decode_unicode=True):
                        text += chunk
                    self.page_title = f"Error {response.status_code}"
                    self._set_page_content(f"## Error {response.status_code}\n\n{text}")
            except NameError:
                self.page_title = "Error"
                self._set_page_content(f"## Error\n\n{str(request_exception)}")

    def _state(self) -> Tuple[str, str]:
        header = f"Address: {self.address}\n"
        if self.page_title is not None:
            header += f"Title: {self.page_title}\n"

        current_page = self.viewport_current_page
        total_pages = len(self.viewport_pages)

        address = self.address
        for i in range(len(self.history) - 2, -1, -1):  # Start from the second last
            if self.history[i][0] == address:
                header += f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n"
                break

        header += f"Viewport position: Showing page {current_page + 1} of {total_pages}.\n"
        return (header, self.viewport)


class SearchInformationTool(Tool):
    name = "web_search"
    description = "Perform a web search query (think a Google search) and return the search results."
    inputs = {"query": {"type": "string", "description": "The web search query to perform."}}
    inputs["filter_year"] = {
        "type": "string",
        "description": "[Optional parameter]: filter the search results to only include pages from a specific year. For example, '2020' will only include pages from 2020. Make sure to use this parameter if you're trying to search for articles from a specific date!",
        "nullable": True,
    }
    output_type = "string"

    def __init__(self, browser):
        super().__init__()
        self.browser = browser

    def forward(self, query: str, filter_year: Optional[int] = None) -> str:
        self.browser.visit_page(f"google: {query}", filter_year=filter_year)
        header, content = self.browser._state()
        return header.strip() + "\n=======================\n" + content


class VisitTool(Tool):
    name = "visit_page"
    description = "Visit a webpage at a given URL and return its text. Given a url to a YouTube video, this returns the transcript."
    inputs = {"url": {"type": "string", "description": "The relative or absolute url of the webpage to visit."}}
    output_type = "string"

    def __init__(self, browser):
        super().__init__()
        self.browser = browser

    def forward(self, url: str) -> str:
        self.browser.visit_page(url)
        header, content = self.browser._state()
        return header.strip() + "\n=======================\n" + content


class DownloadTool(Tool):
    name = "download_file"
    description = """
Download a file at a given URL. The file should be of this format: [".xlsx", ".pptx", ".wav", ".mp3", ".png", ".docx"]
After using this tool, for further inspection of this page you should return the download path to your manager via final_answer, and they will be able to inspect it.
DO NOT use this tool for .pdf or .txt or .htm files: for these types of files use visit_page with the file url instead."""
    inputs = {"url": {"type": "string", "description": "The relative or absolute url of the file to be downloaded."}}
    output_type = "string"

    def __init__(self, browser):
        super().__init__()
        self.browser = browser

    def forward(self, url: str) -> str:
        if "arxiv" in url:
            url = url.replace("abs", "pdf")
        response = requests.get(url)
        content_type = response.headers.get("content-type", "")
        extension = mimetypes.guess_extension(content_type)
        if extension and isinstance(extension, str):
            new_path = f"./downloads/file{extension}"
        else:
            new_path = "./downloads/file.object"

        os.makedirs("./downloads", exist_ok=True)  # the target folder may not exist yet
        with open(new_path, "wb") as f:
            f.write(response.content)

        # guess_extension() can return None, so guard before substring checks
        if extension and ("pdf" in extension or "txt" in extension or "htm" in extension):
            raise Exception("Do not use this tool for pdf or txt or html files: use visit_page instead.")

        return f"File was downloaded and saved under path {new_path}."


class ArchiveSearchTool(Tool):
    name = "find_archived_url"
    description = "Given a url, searches the Wayback Machine and returns the archived version of the url that's closest in time to the desired date."
    inputs = {
        "url": {"type": "string", "description": "The url you need the archive for."},
        "date": {
            "type": "string",
            "description": "The date that you want to find the archive for. Give this date in the format 'YYYYMMDD', for instance '27 June 2008' is written as '20080627'.",
        },
    }
    output_type = "string"

    def __init__(self, browser):
        super().__init__()
        self.browser = browser

    def forward(self, url, date) -> str:
        no_timestamp_url = f"https://archive.org/wayback/available?url={url}"
        archive_url = no_timestamp_url + f"&timestamp={date}"
        response = requests.get(archive_url).json()
        response_notimestamp = requests.get(no_timestamp_url).json()
        if "archived_snapshots" in response and "closest" in response["archived_snapshots"]:
            closest = response["archived_snapshots"]["closest"]
            print("Archive found!", closest)

        elif "archived_snapshots" in response_notimestamp and "closest" in response_notimestamp["archived_snapshots"]:
            closest = response_notimestamp["archived_snapshots"]["closest"]
            print("Archive found!", closest)
        else:
            raise Exception(f"Your {url=} was not archived on Wayback Machine, try a different url.")
        target_url = closest["url"]
        self.browser.visit_page(target_url)
        header, content = self.browser._state()
        return (
            f"Web archive for url {url}, snapshot taken at date {closest['timestamp'][:8]}:\n"
            + header.strip()
            + "\n=======================\n"
            + content
        )


class PageUpTool(Tool):
    name = "page_up"
    description = "Scroll the viewport UP one page-length in the current webpage and return the new viewport content."
    inputs = {}
    output_type = "string"

    def __init__(self, browser):
        super().__init__()
        self.browser = browser

    def forward(self) -> str:
        self.browser.page_up()
        header, content = self.browser._state()
        return header.strip() + "\n=======================\n" + content


class PageDownTool(Tool):
    name = "page_down"
    description = (
        "Scroll the viewport DOWN one page-length in the current webpage and return the new viewport content."
    )
    inputs = {}
    output_type = "string"

    def __init__(self, browser):
        super().__init__()
        self.browser = browser

    def forward(self) -> str:
        self.browser.page_down()
        header, content = self.browser._state()
        return header.strip() + "\n=======================\n" + content


class FinderTool(Tool):
    name = "find_on_page_ctrl_f"
    description = "Scroll the viewport to the first occurrence of the search string. This is equivalent to Ctrl+F."
    inputs = {
        "search_string": {
            "type": "string",
            "description": "The string to search for on the page. This search string supports wildcards like '*'",
        }
    }
    output_type = "string"

    def __init__(self, browser):
        super().__init__()
        self.browser = browser

    def forward(self, search_string: str) -> str:
        find_result = self.browser.find_on_page(search_string)
        header, content = self.browser._state()

        if find_result is None:
            return (
                header.strip()
                + f"\n=======================\nThe search string '{search_string}' was not found on this page."
            )
        else:
            return header.strip() + "\n=======================\n" + content


class FindNextTool(Tool):
    name = "find_next"
    description = "Scroll the viewport to the next occurrence of the search string. This is equivalent to finding the next match in a Ctrl+F search."
    inputs = {}
    output_type = "string"

    def __init__(self, browser):
        super().__init__()
        self.browser = browser

    def forward(self) -> str:
        find_result = self.browser.find_next()
        header, content = self.browser._state()

        if find_result is None:
            return header.strip() + "\n=======================\nThe search string was not found on this page."
        else:
            return header.strip() + "\n=======================\n" + content
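A minimal wiring sketch for the browser and its tools (the header values and the SERPAPI_API_KEY variable name are assumptions; adjust to your environment):

import os

from scripts.text_web_browser import SearchInformationTool, SimpleTextBrowser, VisitTool

browser = SimpleTextBrowser(
    viewport_size=1024 * 8,
    downloads_folder="downloads",
    serpapi_key=os.getenv("SERPAPI_API_KEY"),
    request_kwargs={"headers": {"User-Agent": "Mozilla/5.0"}, "timeout": 300},
)
search = SearchInformationTool(browser)
visit = VisitTool(browser)
print(search.forward("open deep research smolagents"))  # a Google results page rendered as text
print(visit.forward("https://huggingface.co/blog"))      # first viewport of the page as markdown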
scripts/visual_qa.py
ADDED
@@ -0,0 +1,187 @@
import base64
import json
import mimetypes
import os
import uuid
from io import BytesIO
from typing import Optional

import requests
from dotenv import load_dotenv
from huggingface_hub import InferenceClient
from PIL import Image
from transformers import AutoProcessor

from smolagents import Tool, tool


load_dotenv(override=True)

idefics_processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-chatty")


def process_images_and_text(image_path, query, client):
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": query},
            ],
        },
    ]

    prompt_with_template = idefics_processor.apply_chat_template(messages, add_generation_prompt=True)

    # Load the image from the local directory and encode it to a string
    # that can be sent to the endpoint
    def encode_local_image(image_path):
        # load image
        image = Image.open(image_path).convert("RGB")

        # Convert the image to a base64 string
        buffer = BytesIO()
        image.save(buffer, format="JPEG")  # Use the appropriate format (e.g., JPEG, PNG)
        base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")

        # add string formatting required by the endpoint
        image_string = f"data:image/jpeg;base64,{base64_image}"

        return image_string

    image_string = encode_local_image(image_path)
    # Substitute the encoded image into the template as a markdown image link
    prompt_with_images = prompt_with_template.replace("<image>", "![]({}) ").format(image_string)

    payload = {
        "inputs": prompt_with_images,
        "parameters": {
            "return_full_text": False,
            "max_new_tokens": 200,
        },
    }

    return json.loads(client.post(json=payload).decode())[0]


# Function to encode the image
def encode_image(image_path):
    if image_path.startswith("http"):
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
        request_kwargs = {
            "headers": {"User-Agent": user_agent},
            "stream": True,
        }

        # Send an HTTP request to the URL
        response = requests.get(image_path, **request_kwargs)
        response.raise_for_status()
        content_type = response.headers.get("content-type", "")

        extension = mimetypes.guess_extension(content_type)
        if extension is None:
            extension = ".download"

        fname = str(uuid.uuid4()) + extension
        os.makedirs("downloads", exist_ok=True)  # make sure the target folder exists
        download_path = os.path.abspath(os.path.join("downloads", fname))

        with open(download_path, "wb") as fh:
            for chunk in response.iter_content(chunk_size=512):
                fh.write(chunk)

        image_path = download_path

    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


headers = {"Content-Type": "application/json", "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"}


def resize_image(image_path):
    img = Image.open(image_path)
    width, height = img.size
    img = img.resize((int(width / 2), int(height / 2)))
    # Save the resized copy next to the original so paths with directories keep working
    new_image_path = os.path.join(os.path.dirname(image_path), f"resized_{os.path.basename(image_path)}")
    img.save(new_image_path)
    return new_image_path


class VisualQATool(Tool):
    name = "visualizer"
    description = "A tool that can answer questions about attached images."
    inputs = {
        "image_path": {
            "description": "The path to the image on which to answer the question",
            "type": "string",
        },
        "question": {"description": "the question to answer", "type": "string", "nullable": True},
    }
    output_type = "string"

    client = InferenceClient("HuggingFaceM4/idefics2-8b-chatty")

    def forward(self, image_path: str, question: Optional[str] = None) -> str:
        output = ""
        add_note = False
        if not question:
            add_note = True
            question = "Please write a detailed caption for this image."
        try:
            output = process_images_and_text(image_path, question, self.client)
        except Exception as e:
            print(e)
            if "Payload Too Large" in str(e):
                # Halve the image dimensions and retry once
                new_image_path = resize_image(image_path)
                output = process_images_and_text(new_image_path, question, self.client)

        if add_note:
            output = (
                f"You did not provide a particular question, so here is a detailed caption for the image: {output}"
            )

        return output


@tool
def visualizer(image_path: str, question: Optional[str] = None) -> str:
    """A tool that can answer questions about attached images.

    Args:
        image_path: The path to the image on which to answer the question. This should be a local path to a downloaded image.
        question: The question to answer.
    """

    add_note = False
    if not question:
        add_note = True
        question = "Please write a detailed caption for this image."
    if not isinstance(image_path, str):
        raise Exception("You should provide at least `image_path` string argument to this tool!")

    mime_type, _ = mimetypes.guess_type(image_path)
    base64_image = encode_image(image_path)

    payload = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}},
                ],
            }
        ],
        "max_tokens": 1000,
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    try:
        output = response.json()["choices"][0]["message"]["content"]
    except Exception:
        raise Exception(f"Response format unexpected: {response.json()}")

    if add_note:
        output = f"You did not provide a particular question, so here is a detailed caption for the image: {output}"

    return output
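A usage sketch for the visualizer tool above (the image path is hypothetical; the GPT-4o path requires OPENAI_API_KEY to be set):

from scripts.visual_qa import visualizer

# Without a question, the tool returns a detailed caption with a note prepended
print(visualizer(image_path="downloads/chart.png"))

# With a question, it answers from the image
print(visualizer(image_path="downloads/chart.png", question="What unit is on the y-axis?"))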