Zzrk committed on
Commit 1120ff7 · 1 Parent(s): c6c495d

chore: init
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .idea/
+ .env
README.md CHANGED
@@ -1,13 +1,14 @@
  ---
- title: Cata Deep Research
- emoji: 🦀
- colorFrom: gray
- colorTo: gray
+ title: Cata Deep-Research
+ emoji: 🏆
+ colorFrom: yellow
+ colorTo: purple
  sdk: gradio
- sdk_version: 5.16.2
+ sdk_version: 5.14.0
  app_file: app.py
  pinned: false
  license: apache-2.0
+ short_description: Cata's Deep Research
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
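For reference, the merged front matter after this change reads:

    ---
    title: Cata Deep-Research
    emoji: 🏆
    colorFrom: yellow
    colorTo: purple
    sdk: gradio
    sdk_version: 5.14.0
    app_file: app.py
    pinned: false
    license: apache-2.0
    short_description: Cata's Deep Research
    ---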
app.bak ADDED
@@ -0,0 +1,511 @@
+ import argparse
+ import json
+ import mimetypes  # needed by GradioUI.upload_file below
+ import os
+ import re  # needed by GradioUI.upload_file below
+ import shutil  # needed by GradioUI.upload_file below
+ import threading
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from datetime import datetime
+ from pathlib import Path
+ from typing import List, Optional
+
+ import datasets
+ import pandas as pd
+ from dotenv import load_dotenv
+ from huggingface_hub import login
+ import gradio as gr
+
+ from scripts.reformulator import prepare_response
+ from scripts.run_agents import (
+     get_single_file_description,
+     get_zip_description,
+ )
+ from scripts.text_inspector_tool import TextInspectorTool
+ from scripts.text_web_browser import (
+     ArchiveSearchTool,
+     FinderTool,
+     FindNextTool,
+     PageDownTool,
+     PageUpTool,
+     SimpleTextBrowser,
+     VisitTool,
+ )
+ from scripts.visual_qa import visualizer
+ from tqdm import tqdm
+
+ from smolagents import (
+     CodeAgent,
+     HfApiModel,
+     LiteLLMModel,
+     Model,
+     ToolCallingAgent,
+ )
+ from smolagents.agent_types import AgentText, AgentImage, AgentAudio
+ from smolagents.gradio_ui import pull_messages_from_step, handle_agent_output_types
+
+ from smolagents import Tool
+
+
+ class GoogleSearchTool(Tool):
+     name = "web_search"
+     description = """Performs a google web search for your query then returns a string of the top search results."""
+     inputs = {
+         "query": {"type": "string", "description": "The search query to perform."},
+         "filter_year": {
+             "type": "integer",
+             "description": "Optionally restrict results to a certain year",
+             "nullable": True,
+         },
+     }
+     output_type = "string"
+
+     def __init__(self):
+         super().__init__()  # fixed: Tool.__init__ takes no extra self argument
+         self.serpapi_key = os.getenv("SERPER_API_KEY")
+
+     def forward(self, query: str, filter_year: Optional[int] = None) -> str:
+         import requests
+
+         if self.serpapi_key is None:
+             raise ValueError("Missing SerpAPI key. Make sure you have 'SERPER_API_KEY' in your env variables.")
+
+         params = {
+             "engine": "google",
+             "q": query,
+             "api_key": self.serpapi_key,
+             "google_domain": "google.com",
+         }
+
+         headers = {
+             "X-API-KEY": self.serpapi_key,
+             "Content-Type": "application/json",
+         }
+
+         if filter_year is not None:
+             params["tbs"] = f"cdr:1,cd_min:01/01/{filter_year},cd_max:12/31/{filter_year}"
+
+         response = requests.request("POST", "https://google.serper.dev/search", headers=headers, data=json.dumps(params))
+
+         if response.status_code == 200:
+             results = response.json()
+         else:
+             raise ValueError(response.json())
+
+         if "organic" not in results.keys():
+             print("Unexpected Serper response keys:", results.keys())
+             if filter_year is not None:
+                 raise Exception(
+                     f"No results found for query: '{query}' with filtering on year={filter_year}. Use a less restrictive query or do not filter on year."
+                 )
+             else:
+                 raise Exception(f"No results found for query: '{query}'. Use a less restrictive query.")
+         if len(results["organic"]) == 0:
+             year_filter_message = f" with filter year={filter_year}" if filter_year is not None else ""
+             return f"No results found for '{query}'{year_filter_message}. Try with a more general query, or remove the year filter."
+
+         web_snippets = []
+         if "organic" in results:
+             for idx, page in enumerate(results["organic"]):
+                 date_published = ""
+                 if "date" in page:
+                     date_published = "\nDate published: " + page["date"]
+
+                 source = ""
+                 if "source" in page:
+                     source = "\nSource: " + page["source"]
+
+                 snippet = ""
+                 if "snippet" in page:
+                     snippet = "\n" + page["snippet"]
+
+                 redacted_version = f"{idx}. [{page['title']}]({page['link']}){date_published}{source}\n{snippet}"
+
+                 redacted_version = redacted_version.replace("Your browser can't play this video.", "")
+                 web_snippets.append(redacted_version)
+
+         return "## Search Results\n" + "\n\n".join(web_snippets)
+
+ # web_search = GoogleSearchTool()
+
+ # print(web_search(query="Donald Trump news"))
+ # quit()
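Note: the tool above is a thin wrapper over Serper's search endpoint. A minimal standalone sketch of the same request, assuming only that SERPER_API_KEY is set (the query string is an arbitrary example):

    import json, os, requests

    payload = {"engine": "google", "q": "smolagents", "google_domain": "google.com"}
    headers = {"X-API-KEY": os.environ["SERPER_API_KEY"], "Content-Type": "application/json"}
    resp = requests.post("https://google.serper.dev/search", headers=headers, data=json.dumps(payload))
    for page in resp.json().get("organic", [])[:3]:  # same "organic" field that forward() parses
        print(page["title"], "->", page["link"])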
+ AUTHORIZED_IMPORTS = [
+     "requests",
+     "zipfile",
+     "os",
+     "pandas",
+     "numpy",
+     "sympy",
+     "json",
+     "bs4",
+     "pubchempy",
+     "xml",
+     "yahoo_finance",
+     "Bio",
+     "sklearn",
+     "scipy",
+     "pydub",
+     "io",
+     "PIL",
+     "chess",
+     "PyPDF2",
+     "pptx",
+     "torch",
+     "datetime",
+     "fractions",
+     "csv",
+ ]
+ load_dotenv(override=True)
+ login(os.getenv("HF_TOKEN"))
+
+ append_answer_lock = threading.Lock()
+
+ custom_role_conversions = {"tool-call": "assistant", "tool-response": "user"}
+
+ user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
+
+ BROWSER_CONFIG = {
+     "viewport_size": 1024 * 5,
+     "downloads_folder": "downloads_folder",
+     "request_kwargs": {
+         "headers": {"User-Agent": user_agent},
+         "timeout": 300,
+     },
+     "serpapi_key": os.getenv("SERPAPI_API_KEY"),
+ }
+
+ os.makedirs(f"./{BROWSER_CONFIG['downloads_folder']}", exist_ok=True)
+
+ model = HfApiModel(
+     custom_role_conversions=custom_role_conversions,
+ )
+
+ text_limit = 20000
+ ti_tool = TextInspectorTool(model, text_limit)
+
+ browser = SimpleTextBrowser(**BROWSER_CONFIG)
+
+ WEB_TOOLS = [
+     GoogleSearchTool(),
+     VisitTool(browser),
+     PageUpTool(browser),
+     PageDownTool(browser),
+     FinderTool(browser),
+     FindNextTool(browser),
+     ArchiveSearchTool(browser),
+     TextInspectorTool(model, text_limit),
+ ]
+
+ # Agent creation in a factory function
+ def create_agent():
+     """Creates a fresh agent instance for each session"""
+     return CodeAgent(
+         model=model,
+         tools=[visualizer] + WEB_TOOLS,
+         max_steps=10,
+         verbosity_level=1,
+         additional_authorized_imports=AUTHORIZED_IMPORTS,
+         planning_interval=4,
+     )
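Note: the factory exists because module-level objects are shared across Gradio sessions; building the agent inside each session's state keeps conversation memory per user. A minimal sketch of the intended pattern (the two dicts stand in for two sessions' gr.State({}) values):

    session_a, session_b = {}, {}
    for state in (session_a, session_b):
        if "agent" not in state:
            state["agent"] = create_agent()
    assert session_a["agent"] is not session_b["agent"]  # separate memories per session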
+
+ document_inspection_tool = TextInspectorTool(model, 20000)
+
+ def stream_to_gradio(
+     agent,
+     task: str,
+     reset_agent_memory: bool = False,
+     additional_args: Optional[dict] = None,
+ ):
+     """Runs an agent with the given task and streams the messages from the agent as gradio ChatMessages."""
+     for step_log in agent.run(task, stream=True, reset=reset_agent_memory, additional_args=additional_args):
+         for message in pull_messages_from_step(
+             step_log,
+         ):
+             yield message
+
+     final_answer = step_log  # Last log is the run's final_answer
+     final_answer = handle_agent_output_types(final_answer)
+
+     if isinstance(final_answer, AgentText):
+         yield gr.ChatMessage(
+             role="assistant",
+             content=f"**Final answer:**\n{final_answer.to_string()}\n",
+         )
+     elif isinstance(final_answer, AgentImage):
+         yield gr.ChatMessage(
+             role="assistant",
+             content={"path": final_answer.to_string(), "mime_type": "image/png"},
+         )
+     elif isinstance(final_answer, AgentAudio):
+         yield gr.ChatMessage(
+             role="assistant",
+             content={"path": final_answer.to_string(), "mime_type": "audio/wav"},
+         )
+     else:
+         yield gr.ChatMessage(role="assistant", content=f"**Final answer:** {str(final_answer)}")
+
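Note: outside of Gradio the generator can also be consumed directly; a minimal sketch (the task text is an arbitrary example):

    agent = create_agent()
    for msg in stream_to_gradio(agent, task="Summarize today's top AI news."):
        print(msg.role, msg.content)  # each yielded item is a gr.ChatMessage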
+ class GradioUI:
+     """A one-line interface to launch your agent in Gradio"""
+
+     def __init__(self, file_upload_folder: str | None = None):
+         self.file_upload_folder = file_upload_folder
+         if self.file_upload_folder is not None:
+             if not os.path.exists(file_upload_folder):
+                 os.mkdir(file_upload_folder)
+
+     def interact_with_agent(self, prompt, messages, session_state):
+         # Get or create the session-specific agent
+         if "agent" not in session_state:
+             session_state["agent"] = create_agent()
+
+         # Adding monitoring
+         try:
+             # Log the existence of agent memory
+             has_memory = hasattr(session_state["agent"], "memory")
+             print(f"Agent has memory: {has_memory}")
+             if has_memory:
+                 print(f"Memory type: {type(session_state['agent'].memory)}")
+
+             messages.append(gr.ChatMessage(role="user", content=prompt))
+             yield messages
+
+             for msg in stream_to_gradio(session_state["agent"], task=prompt, reset_agent_memory=False):
+                 messages.append(msg)
+                 yield messages
+             yield messages
+         except Exception as e:
+             print(f"Error in interaction: {str(e)}")
+             raise
+
+     def upload_file(
+         self,
+         file,
+         file_uploads_log,
+         allowed_file_types=[
+             "application/pdf",
+             "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+             "text/plain",
+         ],
+     ):
+         """
+         Handle file uploads; default allowed types are .pdf, .docx, and .txt
+         """
+         if file is None:
+             return gr.Textbox("No file uploaded", visible=True), file_uploads_log
+
+         try:
+             mime_type, _ = mimetypes.guess_type(file.name)
+         except Exception as e:
+             return gr.Textbox(f"Error: {e}", visible=True), file_uploads_log
+
+         if mime_type not in allowed_file_types:
+             return gr.Textbox("File type disallowed", visible=True), file_uploads_log
+
+         # Sanitize file name
+         original_name = os.path.basename(file.name)
+         sanitized_name = re.sub(
+             r"[^\w\-.]", "_", original_name
+         )  # Replace any non-alphanumeric, non-dash, or non-dot characters with underscores
+
+         type_to_ext = {}
+         for ext, t in mimetypes.types_map.items():
+             if t not in type_to_ext:
+                 type_to_ext[t] = ext
+
+         # Ensure the extension correlates to the mime type
+         sanitized_name = sanitized_name.split(".")[:-1]
+         sanitized_name.append("" + type_to_ext[mime_type])
+         sanitized_name = "".join(sanitized_name)
+
+         # Save the uploaded file to the specified folder
+         file_path = os.path.join(self.file_upload_folder, os.path.basename(sanitized_name))
+         shutil.copy(file.name, file_path)
+
+         return gr.Textbox(f"File uploaded: {file_path}", visible=True), file_uploads_log + [file_path]
+
+     def log_user_message(self, text_input, file_uploads_log):
+         return (
+             text_input
+             + (
+                 f"\nYou have been provided with these files, which might be helpful or not: {file_uploads_log}"
+                 if len(file_uploads_log) > 0
+                 else ""
+             ),
+             gr.Textbox(value="", interactive=False, placeholder="Please wait while steps are getting populated"),
+             gr.Button(interactive=False),
+         )
+
+     def detect_device(self, request: gr.Request):
+         # Check whether the user device is a mobile or a computer
+         if not request:
+             return "Unknown device"
+         # Method 1: Check the sec-ch-ua-mobile header
+         is_mobile_header = request.headers.get("sec-ch-ua-mobile")
+         if is_mobile_header:
+             return "Mobile" if "?1" in is_mobile_header else "Desktop"
+
+         # Method 2: Check the user-agent string
+         user_agent = request.headers.get("user-agent", "").lower()
+         mobile_keywords = ["android", "iphone", "ipad", "mobile", "phone"]
+         if any(keyword in user_agent for keyword in mobile_keywords):
+             return "Mobile"
+
+         # Method 3: Check the platform
+         platform = request.headers.get("sec-ch-ua-platform", "").lower()
+         if platform:
+             if platform in ['"android"', '"ios"']:
+                 return "Mobile"
+             elif platform in ['"windows"', '"macos"', '"linux"']:
+                 return "Desktop"
+
+         # Default case if no clear indicators
+         return "Desktop"
+
+     def launch(self, **kwargs):
+         with gr.Blocks(theme="ocean", fill_height=True) as demo:
+             # Different layouts for mobile and computer devices
+             @gr.render()
+             def layout(request: gr.Request):
+                 device = self.detect_device(request)
+                 print(f"device - {device}")
+                 # Render the layout with a sidebar
+                 if device == "Desktop":
+                     # Session state is created before the upload handler below references it
+                     session_state = gr.State({})  # Initialize an empty state for each session
+                     stored_messages = gr.State([])
+                     file_uploads_log = gr.State([])
+                     with gr.Blocks(fill_height=True) as sidebar_demo:
+                         with gr.Sidebar():
+                             gr.Markdown("""# open Deep Research - free the AI agents!
+
+ OpenAI just published [Deep Research](https://openai.com/index/introducing-deep-research/), a very nice assistant that can perform deep searches on the web to answer user questions.
+
+ However, their agent has a huge downside: it's not open. So we've started a 24-hour rush to replicate and open-source it. Our resulting [open-Deep-Research agent](https://github.com/huggingface/smolagents/tree/main/examples/open_deep_research) took the #1 rank of any open submission on the GAIA leaderboard! ✨
+
+ You can try a simplified version here.<br><br>""")
+                             with gr.Group():
+                                 gr.Markdown("**Your request**", container=True)
+                                 text_input = gr.Textbox(lines=3, label="Your request", container=False, placeholder="Enter your prompt here and press Shift+Enter or press the button")
+                                 launch_research_btn = gr.Button("Run", variant="primary")
+
+                             # If an upload folder is provided, enable the upload feature
+                             if self.file_upload_folder is not None:
+                                 upload_file = gr.File(label="Upload a file")
+                                 upload_status = gr.Textbox(label="Upload Status", interactive=False, visible=False)
+                                 upload_file.change(
+                                     self.upload_file,
+                                     [upload_file, file_uploads_log],
+                                     [upload_status, file_uploads_log],
+                                 )
+
+                             gr.HTML("<br><br><h4><center>Powered by:</center></h4>")
+                             with gr.Row():
+                                 gr.HTML("""<div style="display: flex; align-items: center; gap: 8px; font-family: system-ui, -apple-system, sans-serif;">
+ <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png" style="width: 32px; height: 32px; object-fit: contain;" alt="logo">
+ <a href="https://github.com/huggingface/smolagents"><b>huggingface/smolagents</b></a>
+ </div>""")
+
+                         chatbot = gr.Chatbot(
+                             label="open-Deep-Research",
+                             type="messages",
+                             avatar_images=(
+                                 None,
+                                 "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png",
+                             ),
+                             resizeable=False,
+                             scale=1,
+                             elem_id="my-chatbot",
+                         )
+
+                         text_input.submit(
+                             self.log_user_message,
+                             [text_input, file_uploads_log],
+                             [stored_messages, text_input, launch_research_btn],
+                         ).then(
+                             self.interact_with_agent,
+                             # Include session_state in the function call
+                             [stored_messages, chatbot, session_state],
+                             [chatbot],
+                         ).then(
+                             lambda: (gr.Textbox(interactive=True, placeholder="Enter your prompt here and press the button"), gr.Button(interactive=True)),
+                             None,
+                             [text_input, launch_research_btn],
+                         )
+                         launch_research_btn.click(
+                             self.log_user_message,
+                             [text_input, file_uploads_log],
+                             [stored_messages, text_input, launch_research_btn],
+                         ).then(
+                             self.interact_with_agent,
+                             # Include session_state in the function call
+                             [stored_messages, chatbot, session_state],
+                             [chatbot],
+                         ).then(
+                             lambda: (gr.Textbox(interactive=True, placeholder="Enter your prompt here and press the button"), gr.Button(interactive=True)),
+                             None,
+                             [text_input, launch_research_btn],
+                         )
+
+                 # Render the simple layout
+                 else:
+                     with gr.Blocks(fill_height=True) as simple_demo:
+                         gr.Markdown("""# open Deep Research - free the AI agents!
+ _Built with [smolagents](https://github.com/huggingface/smolagents)_
+
+ OpenAI just published [Deep Research](https://openai.com/index/introducing-deep-research/), a very nice assistant that can perform deep searches on the web to answer user questions.
+
+ However, their agent has a huge downside: it's not open. So we've started a 24-hour rush to replicate and open-source it. Our resulting [open-Deep-Research agent](https://github.com/huggingface/smolagents/tree/main/examples/open_deep_research) took the #1 rank of any open submission on the GAIA leaderboard! ✨
+
+ You can try a simplified version below (uses `Qwen-Coder-32B` instead of `o1`, so much less powerful than the original open-Deep-Research)👇""")
+                         # Add session state to store session-specific data
+                         session_state = gr.State({})  # Initialize an empty state for each session
+                         stored_messages = gr.State([])
+                         file_uploads_log = gr.State([])
+                         chatbot = gr.Chatbot(
+                             label="open-Deep-Research",
+                             type="messages",
+                             avatar_images=(
+                                 None,
+                                 "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png",
+                             ),
+                             resizeable=True,
+                             scale=1,
+                         )
+                         # If an upload folder is provided, enable the upload feature
+                         if self.file_upload_folder is not None:
+                             upload_file = gr.File(label="Upload a file")
+                             upload_status = gr.Textbox(label="Upload Status", interactive=False, visible=False)
+                             upload_file.change(
+                                 self.upload_file,
+                                 [upload_file, file_uploads_log],
+                                 [upload_status, file_uploads_log],
+                             )
+                         text_input = gr.Textbox(lines=1, label="Your request", placeholder="Enter your prompt here and press the button")
+                         launch_research_btn = gr.Button("Run", variant="primary")
+
+                         text_input.submit(
+                             self.log_user_message,
+                             [text_input, file_uploads_log],
+                             [stored_messages, text_input, launch_research_btn],
+                         ).then(
+                             self.interact_with_agent,
+                             # Include session_state in the function call
+                             [stored_messages, chatbot, session_state],
+                             [chatbot],
+                         ).then(
+                             lambda: (gr.Textbox(interactive=True, placeholder="Enter your prompt here and press the button"), gr.Button(interactive=True)),
+                             None,
+                             [text_input, launch_research_btn],
+                         )
+                         launch_research_btn.click(
+                             self.log_user_message,
+                             [text_input, file_uploads_log],
+                             [stored_messages, text_input, launch_research_btn],
+                         ).then(
+                             self.interact_with_agent,
+                             # Include session_state in the function call
+                             [stored_messages, chatbot, session_state],
+                             [chatbot],
+                         ).then(
+                             lambda: (gr.Textbox(interactive=True, placeholder="Enter your prompt here and press the button"), gr.Button(interactive=True)),
+                             None,
+                             [text_input, launch_research_btn],
+                         )
+
+         demo.launch(debug=True, **kwargs)
+
+ GradioUI().launch()
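Note: with no arguments, file uploads stay disabled because file_upload_folder defaults to None. A sketch of enabling them (the folder name is an arbitrary example):

    GradioUI(file_upload_folder="./uploads").launch()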
app.py ADDED
@@ -0,0 +1,367 @@
+ import mimetypes
+ import os
+ import re
+ import shutil
+ import threading
+ from typing import Optional
+
+ import gradio as gr
+ from dotenv import load_dotenv
+ from huggingface_hub import login
+ from smolagents import (
+     CodeAgent,
+     HfApiModel,
+ )
+ from smolagents.agent_types import AgentText, AgentImage, AgentAudio
+ from smolagents.gradio_ui import pull_messages_from_step, handle_agent_output_types
+
+ from scripts.visual_qa import visualizer
+
+ AUTHORIZED_IMPORTS = [
+     "requests",
+     "zipfile",
+     "os",
+     "pandas",
+     "numpy",
+     "sympy",
+     "json",
+     "bs4",
+     "pubchempy",
+     "xml",
+     "yahoo_finance",
+     "Bio",
+     "sklearn",
+     "scipy",
+     "pydub",
+     "io",
+     "PIL",
+     "chess",
+     "PyPDF2",
+     "pptx",
+     "torch",
+     "datetime",
+     "fractions",
+     "csv",
+ ]
+ load_dotenv(override=True)
+ login(os.getenv("HF_TOKEN"))
+
+ append_answer_lock = threading.Lock()
+
+ custom_role_conversions = {"tool-call": "assistant", "tool-response": "user"}
+
+ model = HfApiModel(
+     custom_role_conversions=custom_role_conversions,
+ )
+
+
+ # Agent creation in a factory function
+ def create_agent():
+     """Creates a fresh agent instance for each session"""
+     return CodeAgent(
+         model=model,
+         tools=[visualizer],
+         max_steps=10,
+         verbosity_level=1,
+         additional_authorized_imports=AUTHORIZED_IMPORTS,
+         planning_interval=4,
+     )
+
+
+ def stream_to_gradio(
+     agent,
+     task: str,
+     reset_agent_memory: bool = False,
+     additional_args: Optional[dict] = None,
+ ):
+     """Runs an agent with the given task and streams the messages from the agent as gradio ChatMessages."""
+     for step_log in agent.run(task, stream=True, reset=reset_agent_memory, additional_args=additional_args):
+         for message in pull_messages_from_step(
+             step_log,
+         ):
+             yield message
+
+     final_answer = step_log  # Last log is the run's final_answer
+     final_answer = handle_agent_output_types(final_answer)
+
+     if isinstance(final_answer, AgentText):
+         yield gr.ChatMessage(
+             role="assistant",
+             content=f"**Final answer:**\n{final_answer.to_string()}\n",
+         )
+     elif isinstance(final_answer, AgentImage):
+         yield gr.ChatMessage(
+             role="assistant",
+             content={"path": final_answer.to_string(), "mime_type": "image/png"},
+         )
+     elif isinstance(final_answer, AgentAudio):
+         yield gr.ChatMessage(
+             role="assistant",
+             content={"path": final_answer.to_string(), "mime_type": "audio/wav"},
+         )
+     else:
+         yield gr.ChatMessage(role="assistant", content=f"**Final answer:** {str(final_answer)}")
+
+ class GradioUI:
+     """A one-line interface to launch your agent in Gradio"""
+
+     def __init__(self, file_upload_folder: str | None = None):
+         self.file_upload_folder = file_upload_folder
+         if self.file_upload_folder is not None:
+             if not os.path.exists(file_upload_folder):
+                 os.mkdir(file_upload_folder)
+
+     def interact_with_agent(self, prompt, messages, session_state):
+         # Get or create the session-specific agent
+         if "agent" not in session_state:
+             session_state["agent"] = create_agent()
+
+         # Adding monitoring
+         try:
+             # Log the existence of agent memory
+             has_memory = hasattr(session_state["agent"], "memory")
+             print(f"Agent has memory: {has_memory}")
+             if has_memory:
+                 print(f"Memory type: {type(session_state['agent'].memory)}")
+
+             messages.append(gr.ChatMessage(role="user", content=prompt))
+             yield messages
+
+             for msg in stream_to_gradio(session_state["agent"], task=prompt, reset_agent_memory=False):
+                 messages.append(msg)
+                 yield messages
+             yield messages
+         except Exception as e:
+             print(f"Error in interaction: {str(e)}")
+             raise
+
+     def upload_file(
+         self,
+         file,
+         file_uploads_log,
+         allowed_file_types=[
+             "application/pdf",
+             "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+             "text/plain",
+         ],
+     ):
+         """
+         Handle file uploads; default allowed types are .pdf, .docx, and .txt
+         """
+         if file is None:
+             return gr.Textbox("No file uploaded", visible=True), file_uploads_log
+
+         try:
+             mime_type, _ = mimetypes.guess_type(file.name)
+         except Exception as e:
+             return gr.Textbox(f"Error: {e}", visible=True), file_uploads_log
+
+         if mime_type not in allowed_file_types:
+             return gr.Textbox("File type disallowed", visible=True), file_uploads_log
+
+         # Sanitize file name
+         original_name = os.path.basename(file.name)
+         sanitized_name = re.sub(
+             r"[^\w\-.]", "_", original_name
+         )  # Replace any non-alphanumeric, non-dash, or non-dot characters with underscores
+
+         type_to_ext = {}
+         for ext, t in mimetypes.types_map.items():
+             if t not in type_to_ext:
+                 type_to_ext[t] = ext
+
+         # Ensure the extension correlates to the mime type
+         sanitized_name = sanitized_name.split(".")[:-1]
+         sanitized_name.append("" + type_to_ext[mime_type])
+         sanitized_name = "".join(sanitized_name)
+
+         # Save the uploaded file to the specified folder
+         file_path = os.path.join(self.file_upload_folder, os.path.basename(sanitized_name))
+         shutil.copy(file.name, file_path)
+
+         return gr.Textbox(f"File uploaded: {file_path}", visible=True), file_uploads_log + [file_path]
+
+     def log_user_message(self, text_input, file_uploads_log):
+         return (
+             text_input
+             + (
+                 f"\nYou have been provided with these files, which might be helpful or not: {file_uploads_log}"
+                 if len(file_uploads_log) > 0
+                 else ""
+             ),
+             gr.Textbox(value="", interactive=False, placeholder="Please wait while steps are getting populated"),
+             gr.Button(interactive=False),
+         )
+
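Note: a worked example of the sanitization and extension normalization above, using a hypothetical upload named "Report Final.PDF" (which mimetypes maps to application/pdf, and application/pdf maps back to ".pdf" in mimetypes.types_map):

    # re.sub:          "Report Final.PDF" -> "Report_Final.PDF"
    # split(".")[:-1]:  ["Report_Final"]; append type_to_ext["application/pdf"] == ".pdf"
    # "".join(...):     "Report_Final.pdf"  (saved under self.file_upload_folder)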
+     def detect_device(self, request: gr.Request):
+         # Check whether the user device is a mobile or a computer
+         if not request:
+             return "Unknown device"
+         # Method 1: Check the sec-ch-ua-mobile header
+         is_mobile_header = request.headers.get("sec-ch-ua-mobile")
+         if is_mobile_header:
+             return "Mobile" if "?1" in is_mobile_header else "Desktop"
+
+         # Method 2: Check the user-agent string
+         user_agent = request.headers.get("user-agent", "").lower()
+         mobile_keywords = ["android", "iphone", "ipad", "mobile", "phone"]
+         if any(keyword in user_agent for keyword in mobile_keywords):
+             return "Mobile"
+
+         # Method 3: Check the platform
+         platform = request.headers.get("sec-ch-ua-platform", "").lower()
+         if platform:
+             if platform in ['"android"', '"ios"']:
+                 return "Mobile"
+             elif platform in ['"windows"', '"macos"', '"linux"']:
+                 return "Desktop"
+
+         # Default case if no clear indicators
+         return "Desktop"
+
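Note: the first heuristic keys off the standard sec-ch-ua-mobile client hint; a tiny illustration with example header values (not captured traffic):

    headers = {"sec-ch-ua-mobile": "?1"}  # "?1" = mobile client, "?0" = desktop
    print("Mobile" if "?1" in headers.get("sec-ch-ua-mobile", "") else "Desktop")  # -> Mobile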
+     def launch(self, **kwargs):
+         with gr.Blocks(theme="ocean", fill_height=True) as demo:
+             # Different layouts for mobile and computer devices
+             @gr.render()
+             def layout(request: gr.Request):
+                 device = self.detect_device(request)
+                 print(f"device - {device}")
+                 # Render the layout with a sidebar
+                 if device == "Desktop":
+                     # Session state is created before the upload handler below references it
+                     session_state = gr.State({})  # Initialize an empty state for each session
+                     stored_messages = gr.State([])
+                     file_uploads_log = gr.State([])
+                     with gr.Blocks(fill_height=True) as sidebar_demo:
+                         with gr.Sidebar():
+                             gr.Markdown("""# Cata Deep Research
+
+ OpenAI just published [Deep Research](https://openai.com/index/introducing-deep-research/), a very nice assistant that can perform deep searches on the web to answer user questions.
+
+ You can try a simplified version here.<br><br>""")
+                             with gr.Group():
+                                 gr.Markdown("**Your request**", container=True)
+                                 text_input = gr.Textbox(lines=3, label="Your request", container=False,
+                                                         placeholder="Enter your prompt here and press Shift+Enter or press the button")
+                                 launch_research_btn = gr.Button("Run", variant="primary")
+
+                             # If an upload folder is provided, enable the upload feature
+                             if self.file_upload_folder is not None:
+                                 upload_file = gr.File(label="Upload a file")
+                                 upload_status = gr.Textbox(label="Upload Status", interactive=False, visible=False)
+                                 upload_file.change(
+                                     self.upload_file,
+                                     [upload_file, file_uploads_log],
+                                     [upload_status, file_uploads_log],
+                                 )
+
+                             # gr.HTML("<br><br><h4><center>Powered by:</center></h4>")
+                             # with gr.Row():
+                             #     gr.HTML("""<div style="display: flex; align-items: center; gap: 8px; font-family: system-ui, -apple-system, sans-serif;">
+                             #     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png" style="width: 32px; height: 32px; object-fit: contain;" alt="logo">
+                             #     <a href="https://github.com/huggingface/smolagents"><b>huggingface/smolagents</b></a>
+                             #     </div>""")
+
+                         chatbot = gr.Chatbot(
+                             label="Cata-Deep-Research",
+                             type="messages",
+                             resizeable=False,
+                             scale=1,
+                             elem_id="my-chatbot",
+                         )
+
+                         text_input.submit(
+                             self.log_user_message,
+                             [text_input, file_uploads_log],
+                             [stored_messages, text_input, launch_research_btn],
+                         ).then(
+                             self.interact_with_agent,
+                             # Include session_state in the function call
+                             [stored_messages, chatbot, session_state],
+                             [chatbot],
+                         ).then(
+                             lambda: (gr.Textbox(interactive=True, placeholder="Enter your prompt here and press the button"), gr.Button(interactive=True)),
+                             None,
+                             [text_input, launch_research_btn],
+                         )
+                         launch_research_btn.click(
+                             self.log_user_message,
+                             [text_input, file_uploads_log],
+                             [stored_messages, text_input, launch_research_btn],
+                         ).then(
+                             self.interact_with_agent,
+                             # Include session_state in the function call
+                             [stored_messages, chatbot, session_state],
+                             [chatbot],
+                         ).then(
+                             lambda: (gr.Textbox(interactive=True, placeholder="Enter your prompt here and press the button"), gr.Button(interactive=True)),
+                             None,
+                             [text_input, launch_research_btn],
+                         )
+
+                 # Render the simple layout
+                 else:
+                     with gr.Blocks(fill_height=True) as simple_demo:
+                         gr.Markdown("""# Cata Deep Research
+ _Built with [smolagents](https://github.com/huggingface/smolagents)_
+
+ OpenAI just published [Deep Research](https://openai.com/index/introducing-deep-research/), a very nice assistant that can perform deep searches on the web to answer user questions.
+
+ You can try a simplified version below (uses `Qwen-Coder-32B` instead of `o1`, so much less powerful than the original open-Deep-Research)👇""")
+                         # Add session state to store session-specific data
+                         session_state = gr.State({})  # Initialize an empty state for each session
+                         stored_messages = gr.State([])
+                         file_uploads_log = gr.State([])
+                         chatbot = gr.Chatbot(
+                             label="Cata-Deep-Research",
+                             type="messages",
+                             resizeable=True,
+                             scale=1,
+                         )
+                         # If an upload folder is provided, enable the upload feature
+                         if self.file_upload_folder is not None:
+                             upload_file = gr.File(label="Upload a file")
+                             upload_status = gr.Textbox(label="Upload Status", interactive=False, visible=False)
+                             upload_file.change(
+                                 self.upload_file,
+                                 [upload_file, file_uploads_log],
+                                 [upload_status, file_uploads_log],
+                             )
+                         text_input = gr.Textbox(lines=1, label="Your request",
+                                                 placeholder="Enter your prompt here and press the button")
+                         launch_research_btn = gr.Button("Run", variant="primary")
+
+                         text_input.submit(
+                             self.log_user_message,
+                             [text_input, file_uploads_log],
+                             [stored_messages, text_input, launch_research_btn],
+                         ).then(
+                             self.interact_with_agent,
+                             # Include session_state in the function call
+                             [stored_messages, chatbot, session_state],
+                             [chatbot],
+                         ).then(
+                             lambda: (gr.Textbox(interactive=True, placeholder="Enter your prompt here and press the button"), gr.Button(interactive=True)),
+                             None,
+                             [text_input, launch_research_btn],
+                         )
+                         launch_research_btn.click(
+                             self.log_user_message,
+                             [text_input, file_uploads_log],
+                             [stored_messages, text_input, launch_research_btn],
+                         ).then(
+                             self.interact_with_agent,
+                             # Include session_state in the function call
+                             [stored_messages, chatbot, session_state],
+                             [chatbot],
+                         ).then(
+                             lambda: (gr.Textbox(interactive=True, placeholder="Enter your prompt here and press the button"), gr.Button(interactive=True)),
+                             None,
+                             [text_input, launch_research_btn],
+                         )
+
+         demo.launch(debug=True, **kwargs)
+
+
+ GradioUI().launch()
requirements.txt ADDED
@@ -0,0 +1,41 @@
+ git+https://github.com/huggingface/smolagents.git@main#egg=smolagents
+ anthropic>=0.37.1
+ beautifulsoup4>=4.12.3
+ datasets>=2.21.0
+ google_search_results>=2.4.2
+ huggingface_hub>=0.23.4
+ mammoth>=1.8.0
+ markdownify>=0.13.1
+ numexpr>=2.10.1
+ numpy>=2.1.2
+ openai>=1.52.2
+ openpyxl
+ pandas>=2.2.3
+ pathvalidate>=3.2.1
+ pdfminer>=20191125
+ pdfminer.six>=20240706
+ Pillow>=11.0.0
+ puremagic>=1.28
+ pypdf>=5.1.0
+ python-dotenv>=1.0.1
+ python_pptx>=1.0.2
+ Requests>=2.32.3
+ serpapi>=0.1.5
+ tqdm>=4.66.4
+ torch>=2.2.2
+ torchvision>=0.17.2
+ transformers>=4.46.0
+ youtube_transcript_api>=0.6.2
+ chess
+ sympy
+ pubchempy
+ Bio
+ scikit-learn
+ scipy
+ pydub
+ PyPDF2
+ python-pptx
+ torch
+ xlrd
+ SpeechRecognition
+ litellm
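Note: the first requirement pins smolagents to its current main branch rather than a tagged release, so installs are not reproducible over time. A typical local setup step (standard pip usage):

    pip install -r requirements.txt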
scripts/cookies.py ADDED
@@ -0,0 +1,715 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from requests.cookies import RequestsCookieJar
2
+
3
+
4
+ COOKIES_LIST = [
5
+ {
6
+ "domain": ".youtube.com",
7
+ "expirationDate": 1718884961,
8
+ "hostOnly": False,
9
+ "httpOnly": False,
10
+ "name": "ST-xuwub9",
11
+ "path": "/",
12
+ "sameSite": None,
13
+ "secure": False,
14
+ "session": False,
15
+ "storeId": None,
16
+ "value": "session_logininfo=AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0%3AQUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
17
+ },
18
+ {
19
+ "domain": ".youtube.com",
20
+ "expirationDate": 1753004444.745411,
21
+ "hostOnly": False,
22
+ "httpOnly": True,
23
+ "name": "__Secure-YEC",
24
+ "path": "/",
25
+ "sameSite": "lax",
26
+ "secure": True,
27
+ "session": False,
28
+ "storeId": None,
29
+ "value": "CgtRVnI5LW1zRHlQVSjbtNCzBjIhCgJGUhIbEhcSFRMLFBUWFwwYGRobHB0eHw4PIBAREiAk",
30
+ },
31
+ {
32
+ "domain": ".youtube.com",
33
+ "expirationDate": 1753434620.050824,
34
+ "hostOnly": False,
35
+ "httpOnly": True,
36
+ "name": "__Secure-3PSID",
37
+ "path": "/",
38
+ "sameSite": "no_restriction",
39
+ "secure": True,
40
+ "session": False,
41
+ "storeId": None,
42
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB4ezJ_bdWu46a7YwObVn44wACgYKAakSARQSFQHGX2MicJcTzecTKH6bHzqU6TMbTxoVAUF8yKqQYK-MoI6Ql3vI2oYTB3E-0076",
43
+ },
44
+ {
45
+ "domain": ".youtube.com",
46
+ "expirationDate": 1750420959.974642,
47
+ "hostOnly": False,
48
+ "httpOnly": False,
49
+ "name": "SIDCC",
50
+ "path": "/",
51
+ "sameSite": None,
52
+ "secure": False,
53
+ "session": False,
54
+ "storeId": None,
55
+ "value": "AKEyXzWQZauHKOo8t87zoEcjaVNIYUX54ohoWXT-tX4aAhEuZzIIptxZAcNkHuG2oDXYL6t-lw",
56
+ },
57
+ {
58
+ "domain": ".youtube.com",
59
+ "expirationDate": 1753434620.050652,
60
+ "hostOnly": False,
61
+ "httpOnly": False,
62
+ "name": "SID",
63
+ "path": "/",
64
+ "sameSite": None,
65
+ "secure": False,
66
+ "session": False,
67
+ "storeId": None,
68
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB6VHrZcC3gBAsFPbCQ0gF5AACgYKAYkSARQSFQHGX2Mi9kt0gHg5CxCYSkLQGHWaeBoVAUF8yKre_V6r3jZVak6JV4o2Q0FL0076",
69
+ },
70
+ {
71
+ "domain": ".youtube.com",
72
+ "expirationDate": 1750420958.397534,
73
+ "hostOnly": False,
74
+ "httpOnly": True,
75
+ "name": "__Secure-1PSIDTS",
76
+ "path": "/",
77
+ "sameSite": None,
78
+ "secure": True,
79
+ "session": False,
80
+ "storeId": None,
81
+ "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
82
+ },
83
+ {
84
+ "domain": ".youtube.com",
85
+ "expirationDate": 1753433494.44729,
86
+ "hostOnly": False,
87
+ "httpOnly": False,
88
+ "name": "_ga_M0180HEFCY",
89
+ "path": "/",
90
+ "sameSite": None,
91
+ "secure": False,
92
+ "session": False,
93
+ "storeId": None,
94
+ "value": "GS1.1.1718871908.1.0.1718873494.0.0.0",
95
+ },
96
+ {
97
+ "domain": ".youtube.com",
98
+ "expirationDate": 1753434620.050933,
99
+ "hostOnly": False,
100
+ "httpOnly": False,
101
+ "name": "SAPISID",
102
+ "path": "/",
103
+ "sameSite": None,
104
+ "secure": True,
105
+ "session": False,
106
+ "storeId": None,
107
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
108
+ },
109
+ {
110
+ "domain": ".youtube.com",
111
+ "expirationDate": 1750420959.974764,
112
+ "hostOnly": False,
113
+ "httpOnly": True,
114
+ "name": "__Secure-1PSIDCC",
115
+ "path": "/",
116
+ "sameSite": None,
117
+ "secure": True,
118
+ "session": False,
119
+ "storeId": None,
120
+ "value": "AKEyXzWHDSoXGCZpZhPxRrnC7B1s8zGIUjeMVyvgtQfsm1fs92lXPtFEI_td9LBUyqVUe0xK",
121
+ },
122
+ {
123
+ "domain": ".youtube.com",
124
+ "expirationDate": 1753434620.050881,
125
+ "hostOnly": False,
126
+ "httpOnly": True,
127
+ "name": "SSID",
128
+ "path": "/",
129
+ "sameSite": None,
130
+ "secure": True,
131
+ "session": False,
132
+ "storeId": None,
133
+ "value": "AmlwXHnQvOQ10LVd-",
134
+ },
135
+ {
136
+ "domain": ".youtube.com",
137
+ "expirationDate": 1753434620.050959,
138
+ "hostOnly": False,
139
+ "httpOnly": False,
140
+ "name": "__Secure-1PAPISID",
141
+ "path": "/",
142
+ "sameSite": None,
143
+ "secure": True,
144
+ "session": False,
145
+ "storeId": None,
146
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
147
+ },
148
+ {
149
+ "domain": ".youtube.com",
150
+ "expirationDate": 1753434620.050795,
151
+ "hostOnly": False,
152
+ "httpOnly": True,
153
+ "name": "__Secure-1PSID",
154
+ "path": "/",
155
+ "sameSite": None,
156
+ "secure": True,
157
+ "session": False,
158
+ "storeId": None,
159
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBBrlk7lRpKQGywAHEon7WGQAACgYKAQsSARQSFQHGX2MirAmnSRdZl6GPG6KLd4hOihoVAUF8yKoV17Tcj1a_OenIOkf2wBjO0076",
160
+ },
161
+ {
162
+ "domain": ".youtube.com",
163
+ "expirationDate": 1753434620.050993,
164
+ "hostOnly": False,
165
+ "httpOnly": False,
166
+ "name": "__Secure-3PAPISID",
167
+ "path": "/",
168
+ "sameSite": "no_restriction",
169
+ "secure": True,
170
+ "session": False,
171
+ "storeId": None,
172
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
173
+ },
174
+ {
175
+ "domain": ".youtube.com",
176
+ "expirationDate": 1750420959.974815,
177
+ "hostOnly": False,
178
+ "httpOnly": True,
179
+ "name": "__Secure-3PSIDCC",
180
+ "path": "/",
181
+ "sameSite": "no_restriction",
182
+ "secure": True,
183
+ "session": False,
184
+ "storeId": None,
185
+ "value": "AKEyXzXM5UjKUEXwSHVmRAIo6hGHA4G63adj3EE1VdNriD0f38jZQbsUKiD4LQbA3BValmTFDg",
186
+ },
187
+ {
188
+ "domain": ".youtube.com",
189
+ "expirationDate": 1750420958.397647,
190
+ "hostOnly": False,
191
+ "httpOnly": True,
192
+ "name": "__Secure-3PSIDTS",
193
+ "path": "/",
194
+ "sameSite": "no_restriction",
195
+ "secure": True,
196
+ "session": False,
197
+ "storeId": None,
198
+ "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
199
+ },
200
+ {
201
+ "domain": ".youtube.com",
202
+ "expirationDate": 1753434620.050908,
203
+ "hostOnly": False,
204
+ "httpOnly": False,
205
+ "name": "APISID",
206
+ "path": "/",
207
+ "sameSite": None,
208
+ "secure": False,
209
+ "session": False,
210
+ "storeId": None,
211
+ "value": "IlQWLPjdNqziwCrV/ANG7Z4x5FF-IBxbZk",
212
+ },
213
+ {
214
+ "domain": ".youtube.com",
215
+ "expirationDate": 1753434620.050855,
216
+ "hostOnly": False,
217
+ "httpOnly": True,
218
+ "name": "HSID",
219
+ "path": "/",
220
+ "sameSite": None,
221
+ "secure": False,
222
+ "session": False,
223
+ "storeId": None,
224
+ "value": "AasA7hmRuTFv7vjoq",
225
+ },
226
+ {
227
+ "domain": ".youtube.com",
228
+ "expirationDate": 1753435873.577793,
229
+ "hostOnly": False,
230
+ "httpOnly": True,
231
+ "name": "LOGIN_INFO",
232
+ "path": "/",
233
+ "sameSite": "no_restriction",
234
+ "secure": True,
235
+ "session": False,
236
+ "storeId": None,
237
+ "value": "AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0:QUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
238
+ },
239
+ {
240
+ "domain": ".youtube.com",
241
+ "expirationDate": 1753444956.555608,
242
+ "hostOnly": False,
243
+ "httpOnly": False,
244
+ "name": "PREF",
245
+ "path": "/",
246
+ "sameSite": None,
247
+ "secure": True,
248
+ "session": False,
249
+ "storeId": None,
250
+ "value": "f4=4000000&f6=40000000&tz=Europe.Paris&f5=30000&f7=100",
251
+ },
252
+ ]
253
+
254
+ COOKIES_LIST += [
255
+ {
256
+ "domain": ".www.researchgate.net",
257
+ "hostOnly": False,
258
+ "httpOnly": True,
259
+ "name": "isInstIp",
260
+ "path": "/",
261
+ "sameSite": None,
262
+ "secure": True,
263
+ "session": True,
264
+ "storeId": None,
265
+ "value": "False",
266
+ },
267
+ {
268
+ "domain": ".researchgate.net",
269
+ "expirationDate": 1734423981,
270
+ "hostOnly": False,
271
+ "httpOnly": False,
272
+ "name": "__eoi",
273
+ "path": "/",
274
+ "sameSite": None,
275
+ "secure": False,
276
+ "session": False,
277
+ "storeId": None,
278
+ "value": "ID=c26f752377373146:T=1718871981:RT=1718884914:S=AA-AfjZw-T_OOX2kW2LLaFzXImgc",
279
+ },
280
+ {
281
+ "domain": ".www.researchgate.net",
282
+ "expirationDate": 1753444909.646103,
283
+ "hostOnly": False,
284
+ "httpOnly": True,
285
+ "name": "ptc",
286
+ "path": "/",
287
+ "sameSite": None,
288
+ "secure": True,
289
+ "session": False,
290
+ "storeId": None,
291
+ "value": "RG1.8947708639250500550.1718872043",
292
+ },
293
+ {
294
+ "domain": ".researchgate.net",
295
+ "expirationDate": 1750507578,
296
+ "hostOnly": False,
297
+ "httpOnly": False,
298
+ "name": "euconsent-v2-didomi",
299
+ "path": "/",
300
+ "sameSite": "lax",
301
+ "secure": True,
302
+ "session": False,
303
+ "storeId": None,
304
+ "value": "CQAgmoAQAgmoAAHABBENA5EsAP_gAEPgAAYgJ2pB5G5UTWlBIG53YMskIAUFhFBoQEAgAACAAwIBSBIAIIwEAGAAIAgAICACAAIAIBIAIABAGAAAAAAAYIAAIAAIAAAQIAAKIAAAAAAAAgBQAAgIAgggEAAAgEBEABAAgAAAEIIAQNgACgAAACCAAAAAAAABAAAAAAAAQAAAAAAAYCQAAAJIAAAAACAIABAIAAAAAAAAAAAAAAAABBAAIJ2wPIAFAAXABQAFQALgAcAA8ACAAEgALwAZAA0ACIAEcAJgAUgAqgBcADEAGgAPQAfgBEACOAE4AMMAZYA0QBsgDkAHOAO4AfsBBwEIAItARwBHQC6gHUAO2Ae0A_4CHQEXgJ2AUOAo8BT4CpQFqALYAXmAwQBkgDLAGXANjAhCBG8CbAE3gJ1gTtAA.f_wACHwAAAAA",
305
+ },
306
+ {
307
+ "domain": ".researchgate.net",
308
+ "expirationDate": 1718885236,
309
+ "hostOnly": False,
310
+ "httpOnly": False,
311
+ "name": "_gat",
312
+ "path": "/",
313
+ "sameSite": None,
314
+ "secure": False,
315
+ "session": False,
316
+ "storeId": None,
317
+ "value": "1",
318
+ },
319
+ {
320
+ "domain": "www.researchgate.net",
321
+ "expirationDate": 1721477183,
322
+ "hostOnly": True,
323
+ "httpOnly": False,
324
+ "name": "_pbjs_userid_consent_data",
325
+ "path": "/",
326
+ "sameSite": "lax",
327
+ "secure": False,
328
+ "session": False,
329
+ "storeId": None,
330
+ "value": "3524755945110770",
331
+ },
332
+ {
333
+ "domain": ".researchgate.net",
334
+ "expirationDate": 1752567981,
335
+ "hostOnly": False,
336
+ "httpOnly": False,
337
+ "name": "__gads",
338
+ "path": "/",
339
+ "sameSite": None,
340
+ "secure": False,
341
+ "session": False,
342
+ "storeId": None,
343
+ "value": "ID=eca2adb88969c830:T=1718871981:RT=1718884914:S=ALNI_MY2qZchynrhWX6hWMlaI87Pcj9riQ",
344
+ },
345
+ {
346
+ "domain": ".researchgate.net",
347
+ "expirationDate": 1718886709.646173,
348
+ "hostOnly": False,
349
+ "httpOnly": True,
350
+ "name": "__cf_bm",
351
+ "path": "/",
352
+ "sameSite": "no_restriction",
353
+ "secure": True,
354
+ "session": False,
355
+ "storeId": None,
356
+ "value": "IkQ_J4ciBzKQduRvjqsfSmQu8UygDWbHeROO5JVccfo-1718884909-1.0.1.1-qvNGEdbfI0HfhFP6kwe7R7mkTqODNhFuKhs72lLly6K2BOPMG3kbahpQFGvPK0U8FUfkznkq65gngd1sWj7sDA",
357
+ },
358
+ {
359
+ "domain": ".researchgate.net",
360
+ "expirationDate": 1752567981,
361
+ "hostOnly": False,
362
+ "httpOnly": False,
363
+ "name": "__gpi",
364
+ "path": "/",
365
+ "sameSite": None,
366
+ "secure": False,
367
+ "session": False,
368
+ "storeId": None,
369
+ "value": "UID=00000e4e9aa2e6f2:T=1718871981:RT=1718884914:S=ALNI_MYFNrgzkKn7K6Bd2y8hC6GJCvDiSg",
370
+ },
371
+ {
372
+ "domain": ".researchgate.net",
373
+ "hostOnly": False,
374
+ "httpOnly": True,
375
+ "name": "_cfuvid",
376
+ "path": "/",
377
+ "sameSite": "no_restriction",
378
+ "secure": True,
379
+ "session": True,
380
+ "storeId": None,
381
+ "value": "_GPmGZkBymiH3UiqTqzakEpi98br3nfFUWC2_u_wqkc-1718884909785-0.0.1.1-604800000",
382
+ },
383
+ {
384
+ "domain": ".researchgate.net",
385
+ "expirationDate": 1753445177.271667,
386
+ "hostOnly": False,
387
+ "httpOnly": False,
388
+ "name": "_ga",
389
+ "path": "/",
390
+ "sameSite": None,
391
+ "secure": False,
392
+ "session": False,
393
+ "storeId": None,
394
+ "value": "GA1.1.1525244793.1718885177",
395
+ },
396
+ {
397
+ "domain": ".researchgate.net",
398
+ "expirationDate": 1753445177.271482,
399
+ "hostOnly": False,
400
+ "httpOnly": False,
401
+ "name": "_ga_4P31SJ70EJ",
402
+ "path": "/",
403
+ "sameSite": None,
404
+ "secure": False,
405
+ "session": False,
406
+ "storeId": None,
407
+ "value": "GS1.1.1718885177.1.0.1718885177.0.0.0",
408
+ },
409
+ {
410
+ "domain": ".researchgate.net",
411
+ "expirationDate": 1718971576,
412
+ "hostOnly": False,
413
+ "httpOnly": False,
414
+ "name": "_gid",
415
+ "path": "/",
416
+ "sameSite": None,
417
+ "secure": False,
418
+ "session": False,
419
+ "storeId": None,
420
+ "value": "GA1.2.854907463.1718885177",
421
+ },
422
+ {
423
+ "domain": ".www.researchgate.net",
424
+ "expirationDate": 1750407982.506505,
425
+ "hostOnly": False,
426
+ "httpOnly": True,
427
+ "name": "did",
428
+ "path": "/",
429
+ "sameSite": None,
430
+ "secure": True,
431
+ "session": False,
432
+ "storeId": None,
433
+ "value": "1dWLO3C6am8l667Q4VUlBo0O1LI49Qi2Vw21SJEXHavBDYT56DI9007W5rYGVFVH",
434
+ },
435
+ {
436
+ "domain": ".researchgate.net",
437
+ "expirationDate": 1750507578,
438
+ "hostOnly": False,
439
+ "httpOnly": False,
440
+ "name": "didomi_token",
441
+ "path": "/",
442
+ "sameSite": "lax",
443
+ "secure": True,
444
+ "session": False,
445
+ "storeId": None,
446
+ "value": "eyJ1c2VyX2lkIjoiMTkwMzU4YTUtNWU2My02Y2UzLWJlNzAtZGFjNzVmYjdiY2ExIiwiY3JlYXRlZCI6IjIwMjQtMDYtMjBUMTI6MDY6MTYuODA2WiIsInVwZGF0ZWQiOiIyMDI0LTA2LTIwVDEyOjA2OjE4Ljc4MVoiLCJ2ZW5kb3JzIjp7ImVuYWJsZWQiOlsidHdpdHRlciIsImdvb2dsZSIsImM6bGlua2VkaW4tbWFya2V0aW5nLXNvbHV0aW9ucyIsImM6b3duZXJpcSIsImM6b21uaXR1cmUtYWRvYmUtYW5hbHl0aWNzIiwiYzp0ZWNobm9yYXRpLW1lZGlhIiwiYzppbnRlcmNvbSIsImM6aW50ZW50LWlxIiwiYzppcHJvbSIsImM6bGlua2VkaW4iLCJjOmFtYXpvbmFkdi16Y1hGTEI2WCIsImM6bWVkaWFuZXQtY1V3YUtFNnoiLCJjOmluZGV4ZXhjaC1OWkNRTTY4UCIsImM6emVvdGFwZ21iLWQ3YndtdGp3IiwiYzp0cmlwbGVsaWYtZGRKSDM0clkiLCJjOnJ0YmhvdXNlLWI4Y2RIOHRNIiwiYzptZHByaW1pcy1lYU4yOVdjUCIsImM6bG9vcG1lbGktVGRhWXRCUHEiLCJjOm1hZ25pdGVpbi05d1RZTHFSRCIsImM6Ymlkc3dpdGNoLWQ2N0V3N1c5IiwiYzpvcmFjbGVhZHYtcUhlREptQUwiLCJjOmdvb2dsZWFuYS00VFhuSmlnUiIsImM6bG90YW1lc29sLURIaTdMUmpNIiwiYzpuZXh0bWlsbGUtR0pyZlg4VWMiLCJjOm5yaWNodGVjLXFVVlEyUlFxIiwiYzpicml0ZXBvb2wtQldWeVdHeVUiLCJjOnRhcGFkaW5jLXFxY2tVN1BXIiwiYzppZDV0ZWNobi16Tk1KNGR3ZiIsImM6bWljcm9zb2Z0IiwiYzpwZXJtdXRpdmUtSjdpaHJlTWsiLCJjOm9wZXJhc29mdC1CY1hjRFZKTSIsImM6cG9zdGhvZy1Cakp4RmRGOSJdfSwicHVycG9zZXMiOnsiZW5hYmxlZCI6WyJnZW9sb2NhdGlvbl9kYXRhIiwiZGV2aWNlX2NoYXJhY3RlcmlzdGljcyJdfSwidmVuZG9yc19saSI6eyJlbmFibGVkIjpbImdvb2dsZSIsImM6b3BlcmFzb2Z0LUJjWGNEVkpNIl19LCJ2ZXJzaW9uIjoyLCJhYyI6IkRIU0FvQUZrQWNnQTVnSHFnUUhBeGdCNndEMTRJR0FRTkFqMEJJd0NTY0VyQUtCd1YtZ3MxQmgwREc0R09nQUEuREhTQW9BRmtBY2dBNWdIcWdRSEF4Z0I2d0QxNElHQVFOQWowQkl3Q1NjRXJBS0J3Vi1nczFCaDBERzRHT2dBQSJ9",
447
+ },
448
+ {
449
+ "domain": ".www.researchgate.net",
450
+ "hostOnly": False,
451
+ "httpOnly": True,
452
+ "name": "hasPdpNext",
453
+ "path": "/",
454
+ "sameSite": None,
455
+ "secure": True,
456
+ "session": True,
457
+ "storeId": None,
458
+ "value": "False",
459
+ },
460
+ {
461
+ "domain": ".researchgate.net",
462
+ "expirationDate": 1750421183,
463
+ "hostOnly": False,
464
+ "httpOnly": False,
465
+ "name": "ph_phc_ma1XTQyee96N1GML6qUTgLQRiDifnRcE9STiHTZ0CfZ_posthog",
466
+ "path": "/",
467
+ "sameSite": "lax",
468
+ "secure": True,
469
+ "session": False,
470
+ "storeId": None,
471
+ "value": "%7B%22distinct_id%22%3A%220190358a-56a1-7313-83b0-d13dddeac787%22%2C%22%24sesid%22%3A%5B1718885183223%2C%220190358a-56a1-7313-83b0-d13b2b87778d%22%2C1718885176993%5D%2C%22%24session_is_sampled%22%3Atrue%7D",
472
+ },
473
+ {
474
+ "domain": ".www.researchgate.net",
475
+ "hostOnly": False,
476
+ "httpOnly": True,
477
+ "name": "sid",
478
+ "path": "/",
479
+ "sameSite": None,
480
+ "secure": True,
481
+ "session": True,
482
+ "storeId": None,
483
+ "value": "qmH5Lc4f0CUJ3zeaxORcV0S8I8V1MuCFZtcIQqPYtv1XPejrbSLAQRbT50PL40TqeKQ1XsQDWt9gtYVzuL80bRmPjw6jn3cQ0ikNqW40maHcQ3JL2Vfa8ZZf0j7p35eJ",
484
+ },
485
+ ]
486
+
487
+ COOKIES_LIST += [
488
+ {
489
+ "domain": "github.com",
490
+ "hostOnly": True,
491
+ "httpOnly": True,
492
+ "name": "_gh_sess",
493
+ "path": "/",
494
+ "sameSite": "lax",
495
+ "secure": True,
496
+ "session": True,
497
+ "storeId": None,
498
+ "value": "P%2Fmof1avuqwHaUQUIJR%2FZYn7jqbT7lgGuTGjp1BGAFIG5UpNDusEE3b8dRjz0eATE5xPdPjLYFqMs%2FI9AOalKX4YuYfSEEnxCMawU01099b4o9Xzzcv%2BmecrmO0Q8q%2Bdq1h8SIv6nvPP7HzlFesl8ysafb9b%2F0q6dTArKdSOurasza8UgLSYD08ofA50Pcm0IG7CTzF8ZCizrGgGTMi%2F%2B7L3E17jav5PM1Sf2vQKg15Gbg1QIOppJJHzlufgQoZigqFv%2BWznaws0Tt7Y2lSFCw%3D%3D--CJRhqMXJnwOaJgk4--DhUErlL4GdROikEjKD4O9g%3D%3D",
499
+ },
500
+ {
501
+ "domain": ".github.com",
502
+ "expirationDate": 1750408875.763785,
503
+ "hostOnly": False,
504
+ "httpOnly": False,
505
+ "name": "_octo",
506
+ "path": "/",
507
+ "sameSite": "lax",
508
+ "secure": True,
509
+ "session": False,
510
+ "storeId": None,
511
+ "value": "GH1.1.728652011.1718872875",
512
+ },
513
+ {
514
+ "domain": ".github.com",
515
+ "expirationDate": 1750408875.763926,
516
+ "hostOnly": False,
517
+ "httpOnly": True,
518
+ "name": "logged_in",
519
+ "path": "/",
520
+ "sameSite": "lax",
521
+ "secure": True,
522
+ "session": False,
523
+ "storeId": None,
524
+ "value": "no",
525
+ },
526
+ {
527
+ "domain": ".github.com",
528
+ "hostOnly": False,
529
+ "httpOnly": False,
530
+ "name": "preferred_color_mode",
531
+ "path": "/",
532
+ "sameSite": "lax",
533
+ "secure": True,
534
+ "session": True,
535
+ "storeId": None,
536
+ "value": "dark",
537
+ },
538
+ {
539
+ "domain": ".github.com",
540
+ "hostOnly": False,
541
+ "httpOnly": False,
542
+ "name": "tz",
543
+ "path": "/",
544
+ "sameSite": "lax",
545
+ "secure": True,
546
+ "session": True,
547
+ "storeId": None,
548
+ "value": "Europe%2FParis",
549
+ },
550
+ ]
551
+
552
+ COOKIES_LIST += [
553
+ {
554
+ "domain": ".web.archive.org",
555
+ "expirationDate": 1718886430,
556
+ "hostOnly": False,
557
+ "httpOnly": False,
558
+ "name": "_gat",
559
+ "path": "/web/20201123221659/http://orcid.org/",
560
+ "sameSite": None,
561
+ "secure": False,
562
+ "session": False,
563
+ "storeId": None,
564
+ "value": "1",
565
+ },
566
+ {
567
+ "domain": ".web.archive.org",
568
+ "expirationDate": 1718972770,
569
+ "hostOnly": False,
570
+ "httpOnly": False,
571
+ "name": "_gid",
572
+ "path": "/web/20201123221659/http://orcid.org/",
573
+ "sameSite": None,
574
+ "secure": False,
575
+ "session": False,
576
+ "storeId": None,
577
+ "value": "GA1.2.402246368.1606169825",
578
+ },
579
+ {
580
+ "domain": ".web.archive.org",
581
+ "expirationDate": 1753446370.315621,
582
+ "hostOnly": False,
583
+ "httpOnly": False,
584
+ "name": "_ga",
585
+ "path": "/web/20201123221659/http://orcid.org/",
586
+ "sameSite": None,
587
+ "secure": False,
588
+ "session": False,
589
+ "storeId": None,
590
+ "value": "GA1.2.1301409987.1606169825",
591
+ },
592
+ {
593
+ "domain": ".web.archive.org",
594
+ "expirationDate": 1750422367,
595
+ "hostOnly": False,
596
+ "httpOnly": False,
597
+ "name": "_hjid",
598
+ "path": "/web/20201123221659/http://orcid.org/",
599
+ "sameSite": "lax",
600
+ "secure": False,
601
+ "session": False,
602
+ "storeId": None,
603
+ "value": "07f80263-a631-4bf4-8ffd-8fc8912085e2",
604
+ },
605
+ {
606
+ "domain": ".web.archive.org",
607
+ "expirationDate": 1718888167,
608
+ "hostOnly": False,
609
+ "httpOnly": False,
610
+ "name": "_hjFirstSeen",
611
+ "path": "/web/20201123221659/http://orcid.org/",
612
+ "sameSite": "lax",
613
+ "secure": False,
614
+ "session": False,
615
+ "storeId": None,
616
+ "value": "1",
617
+ },
618
+ ]
619
+ COOKIES_LIST += [
620
+ {
621
+ "domain": "orcid.org",
622
+ "hostOnly": True,
623
+ "httpOnly": False,
624
+ "name": "AWSELBCORS",
625
+ "path": "/",
626
+ "sameSite": "no_restriction",
627
+ "secure": True,
628
+ "session": True,
629
+ "storeId": None,
630
+ "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
631
+ },
632
+ {
633
+ "domain": ".orcid.org",
634
+ "expirationDate": 1753452454.637671,
635
+ "hostOnly": False,
636
+ "httpOnly": False,
637
+ "name": "_ga_9R61FWK9H5",
638
+ "path": "/",
639
+ "sameSite": None,
640
+ "secure": False,
641
+ "session": False,
642
+ "storeId": None,
643
+ "value": "GS1.1.1718892454.1.0.1718892454.0.0.0",
644
+ },
645
+ {
646
+ "domain": ".orcid.org",
647
+ "expirationDate": 1753452454.63421,
648
+ "hostOnly": False,
649
+ "httpOnly": False,
650
+ "name": "_ga",
651
+ "path": "/",
652
+ "sameSite": None,
653
+ "secure": False,
654
+ "session": False,
655
+ "storeId": None,
656
+ "value": "GA1.1.2021310691.1718892455",
657
+ },
658
+ {
659
+ "domain": "orcid.org",
660
+ "hostOnly": True,
661
+ "httpOnly": False,
662
+ "name": "AWSELB",
663
+ "path": "/",
664
+ "sameSite": None,
665
+ "secure": False,
666
+ "session": True,
667
+ "storeId": None,
668
+ "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
669
+ },
670
+ {
671
+ "domain": ".orcid.org",
672
+ "expirationDate": 1750428454,
673
+ "hostOnly": False,
674
+ "httpOnly": False,
675
+ "name": "OptanonAlertBoxClosed",
676
+ "path": "/",
677
+ "sameSite": "lax",
678
+ "secure": False,
679
+ "session": False,
680
+ "storeId": None,
681
+ "value": "2024-06-20T14:07:34.583Z",
682
+ },
683
+ {
684
+ "domain": ".orcid.org",
685
+ "expirationDate": 1750428454,
686
+ "hostOnly": False,
687
+ "httpOnly": False,
688
+ "name": "OptanonConsent",
689
+ "path": "/",
690
+ "sameSite": "lax",
691
+ "secure": False,
692
+ "session": False,
693
+ "storeId": None,
694
+ "value": "isGpcEnabled=0&datestamp=Thu+Jun+20+2024+16%3A07%3A34+GMT%2B0200+(heure+d%E2%80%99%C3%A9t%C3%A9+d%E2%80%99Europe+centrale)&version=202310.2.0&browserGpcFlag=0&isIABGlobal=False&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0004%3A1",
695
+ },
696
+ {
697
+ "domain": "orcid.org",
698
+ "hostOnly": True,
699
+ "httpOnly": False,
700
+ "name": "XSRF-TOKEN",
701
+ "path": "/",
702
+ "sameSite": None,
703
+ "secure": True,
704
+ "session": True,
705
+ "storeId": None,
706
+ "value": "6957be7a-bcb4-4d59-a522-ea9b6b210ed9",
707
+ },
708
+ ]
709
+
710
+ # Create a RequestsCookieJar instance
711
+ COOKIES = RequestsCookieJar()
712
+
713
+ # Add cookies to the jar
714
+ for cookie in COOKIES_LIST:
715
+ COOKIES.set(cookie["name"], cookie["value"], domain=cookie["domain"], path=cookie["path"])
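The jar assembled above is what the browser ultimately sends with every request. A minimal sketch of wiring it into a `requests` session directly; the `scripts.cookies` module path is an assumption inferred from the `from .cookies import COOKIES` import in `text_web_browser.py`:

```python
# Sketch only: attach the prebuilt jar to a requests session.
# The scripts.cookies module path is inferred from the
# `from .cookies import COOKIES` import in text_web_browser.py.
import requests

from scripts.cookies import COOKIES

session = requests.Session()
session.cookies.update(COOKIES)  # every request now carries the canned cookies
print(session.get("https://github.com/").status_code)
```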
scripts/gaia_scorer.py ADDED
@@ -0,0 +1,124 @@
+ import re
+ import string
+ import warnings
+ from typing import Any
+
+
+ def normalize_number_str(number_str: str) -> float:
+     # we replace these common units and commas to allow
+     # conversion to float
+     for char in ["$", "%", ","]:
+         number_str = number_str.replace(char, "")
+     try:
+         return float(number_str)
+     except ValueError:
+         print(f"String {number_str} cannot be normalized to number str.")
+         return float("inf")
+
+
+ def split_string(
+     s: str,
+     char_list: list[str] = [",", ";"],
+ ) -> list[str]:
+     pattern = f"[{''.join(char_list)}]"
+     return re.split(pattern, s)
+
+
+ def is_float(element: Any) -> bool:
+     try:
+         float(element)
+         return True
+     except ValueError:
+         return False
+
+
+ def question_scorer(
+     model_answer: str,
+     ground_truth: str,
+ ) -> bool:
+     # if gt is a number
+     if is_float(ground_truth):
+         normalized_answer = normalize_number_str(str(model_answer))
+         return normalized_answer == float(ground_truth)
+
+     # if gt is a list
+     elif any(char in ground_truth for char in [",", ";"]):
+         # question with the fish: normalization removes punct
+
+         gt_elems = split_string(ground_truth)
+         ma_elems = split_string(model_answer)
+
+         # check length is the same
+         if len(gt_elems) != len(ma_elems):
+             warnings.warn("Answer lists have different lengths, returning False.", UserWarning)
+             return False
+
+         # compare each element as float or str
+         comparisons = []
+         for ma_elem, gt_elem in zip(ma_elems, gt_elems):
+             if is_float(gt_elem):
+                 normalized_ma_elem = normalize_number_str(ma_elem)
+                 comparisons.append(normalized_ma_elem == float(gt_elem))
+             else:
+                 # we do not remove punct since comparisons can include punct
+                 comparisons.append(
+                     normalize_str(ma_elem, remove_punct=False) == normalize_str(gt_elem, remove_punct=False)
+                 )
+         return all(comparisons)
+
+     # if gt is a str
+     else:
+         return normalize_str(model_answer) == normalize_str(ground_truth)
+
+
+ def check_prediction_contains_answer_letters_in_order(prediction, true_answer):
+     prediction = prediction.lower()
+     true_answer = true_answer.lower()
+     if len(prediction) > len(true_answer) * 3:
+         return False
+     i = 0
+     for letter in true_answer:
+         if letter in prediction[i:]:
+             i += prediction[i:].index(letter)
+         else:
+             return False
+     return True
+
+
+ def check_close_call(prediction, true_answer, is_correct):
+     if is_correct:
+         return True
+     else:
+         if is_float(true_answer):
+             return is_correct
+         else:
+             if (
+                 check_prediction_contains_answer_letters_in_order(str(prediction), str(true_answer))
+                 and len(str(true_answer)) * 0.5 <= len(str(prediction)) <= len(str(true_answer)) * 2
+             ):
+                 print(f"Close call: {prediction} vs {true_answer}")
+                 return True
+             else:
+                 return False
+
+
+ def normalize_str(input_str, remove_punct=True) -> str:
+     """
+     Normalize a string by:
+     - Removing all white spaces
+     - Optionally removing punctuation (if remove_punct is True)
+     - Converting to lowercase
+     Parameters:
+     - input_str: str, the string to normalize
+     - remove_punct: bool, whether to remove punctuation (default: True)
+     Returns:
+     - str, the normalized string
+     """
+     # Remove all white spaces. Required e.g. for seagull vs. sea gull
+     no_spaces = re.sub(r"\s", "", input_str)
+
+     # Remove punctuation, if specified.
+     if remove_punct:
+         translator = str.maketrans("", "", string.punctuation)
+         return no_spaces.lower().translate(translator)
+     else:
+         return no_spaces.lower()
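A quick illustration of the three ground-truth shapes `question_scorer` distinguishes (number, comma-separated list, plain string); the values below are made up for the example:

```python
from scripts.gaia_scorer import question_scorer

# Number: "$1,234" is normalized to 1234.0 before comparison.
assert question_scorer("$1,234", "1234")

# List: split on , or ;, then compared element-wise (floats vs normalized strings).
assert question_scorer("3, sea gull", "3,seagull")

# String: lowercased, whitespace and (by default) punctuation stripped.
assert question_scorer("Sea-Gull!", "seagull")
```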
scripts/mdconvert.py ADDED
@@ -0,0 +1,949 @@
+ # This is copied from Magentic-one's great repo: https://github.com/microsoft/autogen/blob/v0.4.4/python/packages/autogen-magentic-one/src/autogen_magentic_one/markdown_browser/mdconvert.py
+ # Thanks to Microsoft researchers for open-sourcing this!
+ # type: ignore
+ import base64
+ import copy
+ import html
+ import json
+ import mimetypes
+ import os
+ import re
+ import shutil
+ import subprocess
+ import sys
+ import tempfile
+ import traceback
+ from typing import Any, Dict, List, Optional, Union
+ from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
+
+ import mammoth
+ import markdownify
+ import pandas as pd
+ import pdfminer
+ import pdfminer.high_level
+ import pptx
+
+ # File-format detection
+ import puremagic
+ import pydub
+ import requests
+ import speech_recognition as sr
+ from bs4 import BeautifulSoup
+ from youtube_transcript_api import YouTubeTranscriptApi
+ from youtube_transcript_api.formatters import SRTFormatter
+
+
+ class _CustomMarkdownify(markdownify.MarkdownConverter):
+     """
+     A custom version of markdownify's MarkdownConverter. Changes include:
+
+     - Altering the default heading style to use '#', '##', etc.
+     - Removing javascript hyperlinks.
+     - Truncating images with large data:uri sources.
+     - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
+     """
+
+     def __init__(self, **options: Any):
+         options["heading_style"] = options.get("heading_style", markdownify.ATX)
+         # Explicitly cast options to the expected type if necessary
+         super().__init__(**options)
+
+     def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
+         """Same as usual, but be sure to start with a new line"""
+         if not convert_as_inline:
+             if not re.search(r"^\n", text):
+                 return "\n" + super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
+
+         return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
+
+     def convert_a(self, el: Any, text: str, convert_as_inline: bool):
+         """Same as usual converter, but removes Javascript links and escapes URIs."""
+         prefix, suffix, text = markdownify.chomp(text)  # type: ignore
+         if not text:
+             return ""
+         href = el.get("href")
+         title = el.get("title")
+
+         # Escape URIs and skip non-http or file schemes
+         if href:
+             try:
+                 parsed_url = urlparse(href)  # type: ignore
+                 if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:  # type: ignore
+                     return "%s%s%s" % (prefix, text, suffix)
+                 href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path))))  # type: ignore
+             except ValueError:  # It's not clear if this ever gets thrown
+                 return "%s%s%s" % (prefix, text, suffix)
+
+         # For the replacement see #29: text nodes underscores are escaped
+         if (
+             self.options["autolinks"]
+             and text.replace(r"\_", "_") == href
+             and not title
+             and not self.options["default_title"]
+         ):
+             # Shortcut syntax
+             return "<%s>" % href
+         if self.options["default_title"] and not title:
+             title = href
+         title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
+         return "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix) if href else text
+
+     def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
+         """Same as usual converter, but removes data URIs"""
+
+         alt = el.attrs.get("alt", None) or ""
+         src = el.attrs.get("src", None) or ""
+         title = el.attrs.get("title", None) or ""
+         title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
+         if convert_as_inline and el.parent.name not in self.options["keep_inline_images_in"]:
+             return alt
+
+         # Remove dataURIs
+         if src.startswith("data:"):
+             src = src.split(",")[0] + "..."
+
+         return "![%s](%s%s)" % (alt, src, title_part)
+
+     def convert_soup(self, soup: Any) -> str:
+         return super().convert_soup(soup)  # type: ignore
+
+
+ class DocumentConverterResult:
+     """The result of converting a document to text."""
+
+     def __init__(self, title: Union[str, None] = None, text_content: str = ""):
+         self.title: Union[str, None] = title
+         self.text_content: str = text_content
+
+
+ class DocumentConverter:
+     """Abstract superclass of all DocumentConverters."""
+
+     def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
+         raise NotImplementedError()
+
+
+ class PlainTextConverter(DocumentConverter):
+     """Anything with content type text/plain"""
+
+     def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
+         # Guess the content type from any file extension that might be around
+         content_type, _ = mimetypes.guess_type("__placeholder" + kwargs.get("file_extension", ""))
+
+         # Only accept text files
+         if content_type is None:
+             return None
+         # elif "text/" not in content_type.lower():
+         #     return None
+
+         text_content = ""
+         with open(local_path, "rt", encoding="utf-8") as fh:
+             text_content = fh.read()
+         return DocumentConverterResult(
+             title=None,
+             text_content=text_content,
+         )
+
+
+ class HtmlConverter(DocumentConverter):
+     """Anything with content type text/html"""
+
+     def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
+         # Bail if not html
+         extension = kwargs.get("file_extension", "")
+         if extension.lower() not in [".html", ".htm"]:
+             return None
+
+         result = None
+         with open(local_path, "rt", encoding="utf-8") as fh:
+             result = self._convert(fh.read())
+
+         return result
+
+     def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
+         """Helper function that converts an HTML string."""
+
+         # Parse the string
+         soup = BeautifulSoup(html_content, "html.parser")
+
+         # Remove javascript and style blocks
+         for script in soup(["script", "style"]):
+             script.extract()
+
+         # Print only the main content
+         body_elm = soup.find("body")
+         webpage_text = ""
+         if body_elm:
+             webpage_text = _CustomMarkdownify().convert_soup(body_elm)
+         else:
+             webpage_text = _CustomMarkdownify().convert_soup(soup)
+
+         assert isinstance(webpage_text, str)
+
+         return DocumentConverterResult(
+             title=None if soup.title is None else soup.title.string, text_content=webpage_text
+         )
+
+
+ class WikipediaConverter(DocumentConverter):
+     """Handle Wikipedia pages separately, focusing only on the main document content."""
+
+     def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
+         # Bail if not Wikipedia
+         extension = kwargs.get("file_extension", "")
+         if extension.lower() not in [".html", ".htm"]:
+             return None
+         url = kwargs.get("url", "")
+         if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
+             return None
+
+         # Parse the file
+         soup = None
+         with open(local_path, "rt", encoding="utf-8") as fh:
+             soup = BeautifulSoup(fh.read(), "html.parser")
+
+         # Remove javascript and style blocks
+         for script in soup(["script", "style"]):
+             script.extract()
+
+         # Print only the main content
+         body_elm = soup.find("div", {"id": "mw-content-text"})
+         title_elm = soup.find("span", {"class": "mw-page-title-main"})
+
+         webpage_text = ""
+         main_title = None if soup.title is None else soup.title.string
+
+         if body_elm:
+             # What's the title
+             if title_elm and len(title_elm) > 0:
+                 main_title = title_elm.string  # type: ignore
+                 assert isinstance(main_title, str)
+
+             # Convert the page
+             webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(body_elm)
+         else:
+             webpage_text = _CustomMarkdownify().convert_soup(soup)
+
+         return DocumentConverterResult(
+             title=main_title,
+             text_content=webpage_text,
+         )
+
+
+ class YouTubeConverter(DocumentConverter):
+     """Handle YouTube specially, focusing on the video title, description, and transcript."""
+
+     def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
+         # Bail if not YouTube
+         extension = kwargs.get("file_extension", "")
+         if extension.lower() not in [".html", ".htm"]:
+             return None
+         url = kwargs.get("url", "")
+         if not url.startswith("https://www.youtube.com/watch?"):
+             return None
+
+         # Parse the file
+         soup = None
+         with open(local_path, "rt", encoding="utf-8") as fh:
+             soup = BeautifulSoup(fh.read(), "html.parser")
+
+         # Read the meta tags
+         assert soup.title is not None and soup.title.string is not None
+         metadata: Dict[str, str] = {"title": soup.title.string}
+         for meta in soup(["meta"]):
+             for a in meta.attrs:
+                 if a in ["itemprop", "property", "name"]:
+                     metadata[meta[a]] = meta.get("content", "")
+                     break
+
+         # We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation
+         try:
+             for script in soup(["script"]):
+                 content = script.text
+                 if "ytInitialData" in content:
+                     lines = re.split(r"\r?\n", content)
+                     obj_start = lines[0].find("{")
+                     obj_end = lines[0].rfind("}")
+                     if obj_start >= 0 and obj_end >= 0:
+                         data = json.loads(lines[0][obj_start : obj_end + 1])
+                         attrdesc = self._findKey(data, "attributedDescriptionBodyText")  # type: ignore
+                         if attrdesc:
+                             metadata["description"] = str(attrdesc["content"])
+                     break
+         except Exception:
+             pass
+
+         # Start preparing the page
+         webpage_text = "# YouTube\n"
+
+         title = self._get(metadata, ["title", "og:title", "name"])  # type: ignore
+         assert isinstance(title, str)
+
+         if title:
+             webpage_text += f"\n## {title}\n"
+
+         stats = ""
+         views = self._get(metadata, ["interactionCount"])  # type: ignore
+         if views:
+             stats += f"- **Views:** {views}\n"
+
+         keywords = self._get(metadata, ["keywords"])  # type: ignore
+         if keywords:
+             stats += f"- **Keywords:** {keywords}\n"
+
+         runtime = self._get(metadata, ["duration"])  # type: ignore
+         if runtime:
+             stats += f"- **Runtime:** {runtime}\n"
+
+         if len(stats) > 0:
+             webpage_text += f"\n### Video Metadata\n{stats}\n"
+
+         description = self._get(metadata, ["description", "og:description"])  # type: ignore
+         if description:
+             webpage_text += f"\n### Description\n{description}\n"
+
+         transcript_text = ""
+         parsed_url = urlparse(url)  # type: ignore
+         params = parse_qs(parsed_url.query)  # type: ignore
+         if "v" in params:
+             assert isinstance(params["v"][0], str)
+             video_id = str(params["v"][0])
+             try:
+                 # Must be a single transcript.
+                 transcript = YouTubeTranscriptApi.get_transcript(video_id)  # type: ignore
+                 # transcript_text = " ".join([part["text"] for part in transcript])  # type: ignore
+                 # Alternative formatting:
+                 transcript_text = SRTFormatter().format_transcript(transcript)
+             except Exception:
+                 pass
+         if transcript_text:
+             webpage_text += f"\n### Transcript\n{transcript_text}\n"
+
+         title = title if title else soup.title.string
+         assert isinstance(title, str)
+
+         return DocumentConverterResult(
+             title=title,
+             text_content=webpage_text,
+         )
+
+     def _get(self, metadata: Dict[str, str], keys: List[str], default: Union[str, None] = None) -> Union[str, None]:
+         for k in keys:
+             if k in metadata:
+                 return metadata[k]
+         return default
+
+     def _findKey(self, json: Any, key: str) -> Union[str, None]:  # TODO: Fix json type
+         if isinstance(json, list):
+             for elm in json:
+                 ret = self._findKey(elm, key)
+                 if ret is not None:
+                     return ret
+         elif isinstance(json, dict):
+             for k in json:
+                 if k == key:
+                     return json[k]
+                 else:
+                     ret = self._findKey(json[k], key)
+                     if ret is not None:
+                         return ret
+         return None
+
+
+ class PdfConverter(DocumentConverter):
+     """
+     Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
+     """
+
+     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+         # Bail if not a PDF
+         extension = kwargs.get("file_extension", "")
+         if extension.lower() != ".pdf":
+             return None
+
+         return DocumentConverterResult(
+             title=None,
+             text_content=pdfminer.high_level.extract_text(local_path),
+         )
+
+
+ class DocxConverter(HtmlConverter):
+     """
+     Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
+     """
+
+     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+         # Bail if not a DOCX
+         extension = kwargs.get("file_extension", "")
+         if extension.lower() != ".docx":
+             return None
+
+         result = None
+         with open(local_path, "rb") as docx_file:
+             result = mammoth.convert_to_html(docx_file)
+             html_content = result.value
+             result = self._convert(html_content)
+
+         return result
+
+
+ class XlsxConverter(HtmlConverter):
+     """
+     Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
+     """
+
+     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+         # Bail if not an XLSX
+         extension = kwargs.get("file_extension", "")
+         if extension.lower() not in [".xlsx", ".xls"]:
+             return None
+
+         sheets = pd.read_excel(local_path, sheet_name=None)
+         md_content = ""
+         for s in sheets:
+             md_content += f"## {s}\n"
+             html_content = sheets[s].to_html(index=False)
+             md_content += self._convert(html_content).text_content.strip() + "\n\n"
+
+         return DocumentConverterResult(
+             title=None,
+             text_content=md_content.strip(),
+         )
+
+
+ class PptxConverter(HtmlConverter):
+     """
+     Converts PPTX files to Markdown. Supports headings, tables and images with alt text.
+     """
+
+     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+         # Bail if not a PPTX
+         extension = kwargs.get("file_extension", "")
+         if extension.lower() != ".pptx":
+             return None
+
+         md_content = ""
+
+         presentation = pptx.Presentation(local_path)
+         slide_num = 0
+         for slide in presentation.slides:
+             slide_num += 1
+
+             md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
+
+             title = slide.shapes.title
+             for shape in slide.shapes:
+                 # Pictures
+                 if self._is_picture(shape):
+                     # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
+                     alt_text = ""
+                     try:
+                         alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
+                     except Exception:
+                         pass
+
+                     # A placeholder name
+                     filename = re.sub(r"\W", "", shape.name) + ".jpg"
+                     md_content += "\n![" + (alt_text if alt_text else shape.name) + "](" + filename + ")\n"
+
+                 # Tables
+                 if self._is_table(shape):
+                     html_table = "<html><body><table>"
+                     first_row = True
+                     for row in shape.table.rows:
+                         html_table += "<tr>"
+                         for cell in row.cells:
+                             if first_row:
+                                 html_table += "<th>" + html.escape(cell.text) + "</th>"
+                             else:
+                                 html_table += "<td>" + html.escape(cell.text) + "</td>"
+                         html_table += "</tr>"
+                         first_row = False
+                     html_table += "</table></body></html>"
+                     md_content += "\n" + self._convert(html_table).text_content.strip() + "\n"
+
+                 # Text areas
+                 elif shape.has_text_frame:
+                     if shape == title:
+                         md_content += "# " + shape.text.lstrip() + "\n"
+                     else:
+                         md_content += shape.text + "\n"
+
+             md_content = md_content.strip()
+
+             if slide.has_notes_slide:
+                 md_content += "\n\n### Notes:\n"
+                 notes_frame = slide.notes_slide.notes_text_frame
+                 if notes_frame is not None:
+                     md_content += notes_frame.text
+                 md_content = md_content.strip()
+
+         return DocumentConverterResult(
+             title=None,
+             text_content=md_content.strip(),
+         )
+
+     def _is_picture(self, shape):
+         if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
+             return True
+         if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
+             if hasattr(shape, "image"):
+                 return True
+         return False
+
+     def _is_table(self, shape):
+         if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
+             return True
+         return False
+
+
+ class MediaConverter(DocumentConverter):
+     """
+     Abstract class for multi-modal media (e.g., images and audio)
+     """
+
+     def _get_metadata(self, local_path):
+         exiftool = shutil.which("exiftool")
+         if not exiftool:
+             return None
+         else:
+             try:
+                 result = subprocess.run([exiftool, "-json", local_path], capture_output=True, text=True).stdout
+                 return json.loads(result)[0]
+             except Exception:
+                 return None
+
+
+ class WavConverter(MediaConverter):
+     """
+     Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
+     """
+
+     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+         # Bail if not a WAV
+         extension = kwargs.get("file_extension", "")
+         if extension.lower() != ".wav":
+             return None
+
+         md_content = ""
+
+         # Add metadata
+         metadata = self._get_metadata(local_path)
+         if metadata:
+             for f in [
+                 "Title",
+                 "Artist",
+                 "Author",
+                 "Band",
+                 "Album",
+                 "Genre",
+                 "Track",
+                 "DateTimeOriginal",
+                 "CreateDate",
+                 "Duration",
+             ]:
+                 if f in metadata:
+                     md_content += f"{f}: {metadata[f]}\n"
+
+         # Transcribe
+         try:
+             transcript = self._transcribe_audio(local_path)
+             md_content += "\n\n### Audio Transcript:\n" + ("[No speech detected]" if transcript == "" else transcript)
+         except Exception:
+             md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
+
+         return DocumentConverterResult(
+             title=None,
+             text_content=md_content.strip(),
+         )
+
+     def _transcribe_audio(self, local_path) -> str:
+         recognizer = sr.Recognizer()
+         with sr.AudioFile(local_path) as source:
+             audio = recognizer.record(source)
+             return recognizer.recognize_google(audio).strip()
+
+
+ class Mp3Converter(WavConverter):
+     """
+     Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
+     """
+
+     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+         # Bail if not an MP3
+         extension = kwargs.get("file_extension", "")
+         if extension.lower() != ".mp3":
+             return None
+
+         md_content = ""
+
+         # Add metadata
+         metadata = self._get_metadata(local_path)
+         if metadata:
+             for f in [
+                 "Title",
+                 "Artist",
+                 "Author",
+                 "Band",
+                 "Album",
+                 "Genre",
+                 "Track",
+                 "DateTimeOriginal",
+                 "CreateDate",
+                 "Duration",
+             ]:
+                 if f in metadata:
+                     md_content += f"{f}: {metadata[f]}\n"
+
+         # Transcribe
+         handle, temp_path = tempfile.mkstemp(suffix=".wav")
+         os.close(handle)
+         try:
+             sound = pydub.AudioSegment.from_mp3(local_path)
+             sound.export(temp_path, format="wav")
+
+             _args = dict()
+             _args.update(kwargs)
+             _args["file_extension"] = ".wav"
+
+             try:
+                 transcript = super()._transcribe_audio(temp_path).strip()
+                 md_content += "\n\n### Audio Transcript:\n" + (
+                     "[No speech detected]" if transcript == "" else transcript
+                 )
+             except Exception:
+                 md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
+
+         finally:
+             os.unlink(temp_path)
+
+         # Return the result
+         return DocumentConverterResult(
+             title=None,
+             text_content=md_content.strip(),
+         )
+
+
+ class ImageConverter(MediaConverter):
+     """
+     Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
+     """
+
+     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+         # Bail if not an image
+         extension = kwargs.get("file_extension", "")
+         if extension.lower() not in [".jpg", ".jpeg", ".png"]:
+             return None
+
+         md_content = ""
+
+         # Add metadata
+         metadata = self._get_metadata(local_path)
+         if metadata:
+             for f in [
+                 "ImageSize",
+                 "Title",
+                 "Caption",
+                 "Description",
+                 "Keywords",
+                 "Artist",
+                 "Author",
+                 "DateTimeOriginal",
+                 "CreateDate",
+                 "GPSPosition",
+             ]:
+                 if f in metadata:
+                     md_content += f"{f}: {metadata[f]}\n"
+
+         # Try describing the image with GPTV
+         mlm_client = kwargs.get("mlm_client")
+         mlm_model = kwargs.get("mlm_model")
+         if mlm_client is not None and mlm_model is not None:
+             md_content += (
+                 "\n# Description:\n"
+                 + self._get_mlm_description(
+                     local_path, extension, mlm_client, mlm_model, prompt=kwargs.get("mlm_prompt")
+                 ).strip()
+                 + "\n"
+             )
+
+         return DocumentConverterResult(
+             title=None,
+             text_content=md_content,
+         )
+
+     def _get_mlm_description(self, local_path, extension, client, model, prompt=None):
+         if prompt is None or prompt.strip() == "":
+             prompt = "Write a detailed caption for this image."
+
+         sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
+
+         data_uri = ""
+         with open(local_path, "rb") as image_file:
+             content_type, encoding = mimetypes.guess_type("_dummy" + extension)
+             if content_type is None:
+                 content_type = "image/jpeg"
+             image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
+             data_uri = f"data:{content_type};base64,{image_base64}"
+
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "text", "text": prompt},
+                     {
+                         "type": "image_url",
+                         "image_url": {
+                             "url": data_uri,
+                         },
+                     },
+                 ],
+             }
+         ]
+
+         response = client.chat.completions.create(model=model, messages=messages)
+         return response.choices[0].message.content
+
+
+ class FileConversionException(BaseException):
+     pass
+
+
+ class UnsupportedFormatException(BaseException):
+     pass
+
+
+ class MarkdownConverter:
+     """(In preview) An extremely simple text-based document reader, suitable for LLM use.
+     This reader will convert common file-types or webpages to Markdown."""
+
+     def __init__(
+         self,
+         requests_session: Optional[requests.Session] = None,
+         mlm_client: Optional[Any] = None,
+         mlm_model: Optional[Any] = None,
+     ):
+         if requests_session is None:
+             self._requests_session = requests.Session()
+         else:
+             self._requests_session = requests_session
+
+         self._mlm_client = mlm_client
+         self._mlm_model = mlm_model
+
+         self._page_converters: List[DocumentConverter] = []
+
+         # Register converters for successful browsing operations
+         # Later registrations are tried first / take higher priority than earlier registrations
+         # To this end, the most specific converters should appear below the most generic converters
+         self.register_page_converter(PlainTextConverter())
+         self.register_page_converter(HtmlConverter())
+         self.register_page_converter(WikipediaConverter())
+         self.register_page_converter(YouTubeConverter())
+         self.register_page_converter(DocxConverter())
+         self.register_page_converter(XlsxConverter())
+         self.register_page_converter(PptxConverter())
+         self.register_page_converter(WavConverter())
+         self.register_page_converter(Mp3Converter())
+         self.register_page_converter(ImageConverter())
+         self.register_page_converter(PdfConverter())
+
+     def convert(
+         self, source: Union[str, requests.Response], **kwargs: Any
+     ) -> DocumentConverterResult:  # TODO: deal with kwargs
+         """
+         Args:
+             - source: can be a string representing a path or url, or a requests.response object
+             - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
+         """
+
+         # Local path or url
+         if isinstance(source, str):
+             if source.startswith("http://") or source.startswith("https://") or source.startswith("file://"):
+                 return self.convert_url(source, **kwargs)
+             else:
+                 return self.convert_local(source, **kwargs)
+         # Request response
+         elif isinstance(source, requests.Response):
+             return self.convert_response(source, **kwargs)
+
+     def convert_local(self, path: str, **kwargs: Any) -> DocumentConverterResult:  # TODO: deal with kwargs
+         # Prepare a list of extensions to try (in order of priority)
+         ext = kwargs.get("file_extension")
+         extensions = [ext] if ext is not None else []
+
+         # Get extension alternatives from the path and puremagic
+         base, ext = os.path.splitext(path)
+         self._append_ext(extensions, ext)
+         self._append_ext(extensions, self._guess_ext_magic(path))
+
+         # Convert
+         return self._convert(path, extensions, **kwargs)
+
+     # TODO what should stream's type be?
+     def convert_stream(self, stream: Any, **kwargs: Any) -> DocumentConverterResult:  # TODO: deal with kwargs
+         # Prepare a list of extensions to try (in order of priority)
+         ext = kwargs.get("file_extension")
+         extensions = [ext] if ext is not None else []
+
+         # Save the file locally to a temporary file. It will be deleted before this method exits
+         handle, temp_path = tempfile.mkstemp()
+         fh = os.fdopen(handle, "wb")
+         result = None
+         try:
+             # Write to the temporary file
+             content = stream.read()
+             if isinstance(content, str):
+                 fh.write(content.encode("utf-8"))
+             else:
+                 fh.write(content)
+             fh.close()
+
+             # Use puremagic to check for more extension options
+             self._append_ext(extensions, self._guess_ext_magic(temp_path))
+
+             # Convert
+             result = self._convert(temp_path, extensions, **kwargs)
+         # Clean up
+         finally:
+             try:
+                 fh.close()
+             except Exception:
+                 pass
+             os.unlink(temp_path)
+
+         return result
+
+     def convert_url(self, url: str, **kwargs: Any) -> DocumentConverterResult:  # TODO: fix kwargs type
+         # Send a HTTP request to the URL
+         user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
+         response = self._requests_session.get(url, stream=True, headers={"User-Agent": user_agent})
+         response.raise_for_status()
+         return self.convert_response(response, **kwargs)
+
+     def convert_response(
+         self, response: requests.Response, **kwargs: Any
+     ) -> DocumentConverterResult:  # TODO fix kwargs type
+         # Prepare a list of extensions to try (in order of priority)
+         ext = kwargs.get("file_extension")
+         extensions = [ext] if ext is not None else []
+
+         # Guess from the mimetype
+         content_type = response.headers.get("content-type", "").split(";")[0]
+         self._append_ext(extensions, mimetypes.guess_extension(content_type))
+
+         # Read the content disposition if there is one
+         content_disposition = response.headers.get("content-disposition", "")
+         m = re.search(r"filename=([^;]+)", content_disposition)
+         if m:
+             base, ext = os.path.splitext(m.group(1).strip("\"'"))
+             self._append_ext(extensions, ext)
+
+         # Read the extension from the path
+         base, ext = os.path.splitext(urlparse(response.url).path)
+         self._append_ext(extensions, ext)
+
+         # Save the file locally to a temporary file. It will be deleted before this method exits
+         handle, temp_path = tempfile.mkstemp()
+         fh = os.fdopen(handle, "wb")
+         result = None
+         try:
+             # Download the file
+             for chunk in response.iter_content(chunk_size=512):
+                 fh.write(chunk)
+             fh.close()
+
+             # Use puremagic to check for more extension options
+             self._append_ext(extensions, self._guess_ext_magic(temp_path))
+
+             # Convert
+             result = self._convert(temp_path, extensions, url=response.url)
+         except Exception as e:
+             print(f"Error in converting: {e}")
+
+         # Clean up
+         finally:
+             try:
+                 fh.close()
+             except Exception:
+                 pass
+             os.unlink(temp_path)
+
+         return result
+
+     def _convert(self, local_path: str, extensions: List[Union[str, None]], **kwargs) -> DocumentConverterResult:
+         error_trace = ""
+         for ext in extensions + [None]:  # Try last with no extension
+             for converter in self._page_converters:
+                 _kwargs = copy.deepcopy(kwargs)
+
+                 # Overwrite file_extension appropriately
+                 if ext is None:
+                     if "file_extension" in _kwargs:
+                         del _kwargs["file_extension"]
+                 else:
+                     _kwargs.update({"file_extension": ext})
+
+                 # Copy any additional global options
+                 if "mlm_client" not in _kwargs and self._mlm_client is not None:
+                     _kwargs["mlm_client"] = self._mlm_client
+
+                 if "mlm_model" not in _kwargs and self._mlm_model is not None:
+                     _kwargs["mlm_model"] = self._mlm_model
+
+                 # If we hit an error log it and keep trying
+                 res = None  # reset each attempt so a converter that raises doesn't leave a stale or unbound result
+                 try:
+                     res = converter.convert(local_path, **_kwargs)
+                 except Exception:
+                     error_trace = ("\n\n" + traceback.format_exc()).strip()
+
+                 if res is not None:
+                     # Normalize the content
+                     res.text_content = "\n".join([line.rstrip() for line in re.split(r"\r?\n", res.text_content)])
+                     res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
+
+                     # Todo
+                     return res
+
+         # If we got this far without success, report any exceptions
+         if len(error_trace) > 0:
+             raise FileConversionException(
+                 f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
+             )
+
+         # Nothing can handle it!
+         raise UnsupportedFormatException(
+             f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
+         )
+
+     def _append_ext(self, extensions, ext):
+         """Append a unique non-None, non-empty extension to a list of extensions."""
+         if ext is None:
+             return
+         ext = ext.strip()
+         if ext == "":
+             return
+         # if ext not in extensions:
+         if True:
+             extensions.append(ext)
+
+     def _guess_ext_magic(self, path):
+         """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
+         # Use puremagic to guess
+         try:
+             guesses = puremagic.magic_file(path)
+             if len(guesses) > 0:
+                 ext = guesses[0].extension.strip()
+                 if len(ext) > 0:
+                     return ext
+         except FileNotFoundError:
+             pass
+         except IsADirectoryError:
+             pass
+         except PermissionError:
+             pass
+         return None
+
+     def register_page_converter(self, converter: DocumentConverter) -> None:
+         """Register a page text converter."""
+         self._page_converters.insert(0, converter)
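A minimal usage sketch for the converter registry above; the file name is a placeholder, and `convert` also accepts `http(s)://` URLs or a `requests.Response`:

```python
# Sketch: convert a local document to Markdown-ish text. report.pdf is a
# placeholder path; any of the registered formats would work the same way.
from scripts.mdconvert import MarkdownConverter

converter = MarkdownConverter()
result = converter.convert("report.pdf")
print(result.title)
print(result.text_content[:500])
```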
scripts/reformulator.py ADDED
@@ -0,0 +1,86 @@
+ # Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource!
+ # https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
+ import copy
+
+ from smolagents.models import MessageRole, Model
+
+
+ def prepare_response(original_task: str, inner_messages, reformulation_model: Model) -> str:
+     messages = [
+         {
+             "role": MessageRole.SYSTEM,
+             "content": [
+                 {
+                     "type": "text",
+                     "text": f"""Earlier you were asked the following:
+
+ {original_task}
+
+ Your team then worked diligently to address that request. Read below a transcript of that conversation:""",
+                 }
+             ],
+         }
+     ]
+
+     # The first message just repeats the question, so remove it
+     # if len(inner_messages) > 1:
+     #     del inner_messages[0]
+
+     # copy them to this context
+     try:
+         for message in inner_messages:
+             if not message.get("content"):
+                 continue
+             message = copy.deepcopy(message)
+             message["role"] = MessageRole.USER
+             messages.append(message)
+     except Exception:
+         messages += [{"role": MessageRole.ASSISTANT, "content": str(inner_messages)}]
+
+     # ask for the final answer
+     messages.append(
+         {
+             "role": MessageRole.USER,
+             "content": [
+                 {
+                     "type": "text",
+                     "text": f"""
+ Read the above conversation and output a FINAL ANSWER to the question. The question is repeated here for convenience:
+
+ {original_task}
+
+ To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER]
+ Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
+ ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
+ If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and DO NOT INCLUDE UNITS such as $ or USD or percent signs unless specified otherwise.
+ If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
+ If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
+ If you are unable to determine the final answer, output 'FINAL ANSWER: Unable to determine'
+ """,
+                 }
+             ],
+         }
+     )
+
+     response = reformulation_model(messages).content
+
+     final_answer = response.split("FINAL ANSWER: ")[-1].strip()
+     print("> Reformulated answer: ", final_answer)
+
+     # if "unable to determine" in final_answer.lower():
+     #     messages.append({"role": MessageRole.ASSISTANT, "content": response})
+     #     messages.append({"role": MessageRole.USER, "content": [{"type": "text", "text": """
+     # I understand that a definitive answer could not be determined. Please make a well-informed EDUCATED GUESS based on the conversation.
+
+     # To output the educated guess, use the following template: EDUCATED GUESS: [YOUR EDUCATED GUESS]
+     # Your EDUCATED GUESS should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. DO NOT OUTPUT 'I don't know', 'Unable to determine', etc.
+     # ADDITIONALLY, your EDUCATED GUESS MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
+     # If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise.
+     # If you are asked for a string, don't use articles or abbreviations (e.g. cit for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
+     # If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
+     # """.strip()}]})
+
+     #     response = model(messages).content
+     #     print("\n>>>Making an educated guess.\n", response)
+     #     final_answer = response.split("EDUCATED GUESS: ")[-1].strip()
+     return final_answer
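A hedged sketch of invoking the reformulator at the end of an agent run; the model id is an arbitrary choice and the one-message transcript stands in for a real agent memory:

```python
# Sketch: squeeze an agent transcript into a GAIA-style short answer.
# HfApiModel and the model id are assumptions; any smolagents Model works.
from smolagents import HfApiModel

from scripts.reformulator import prepare_response

model = HfApiModel("Qwen/Qwen2.5-72B-Instruct")
transcript = [{"role": "assistant", "content": "Mount Everest is the tallest mountain on Earth."}]
answer = prepare_response("What is the tallest mountain?", transcript, reformulation_model=model)
print(answer)  # whatever follows "FINAL ANSWER: " in the model's reply
```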
scripts/run_agents.py ADDED
@@ -0,0 +1,87 @@
+ import json
+ import os
+ import shutil
+ import textwrap
+ from pathlib import Path
+
+ # import tqdm.asyncio
+ from smolagents.utils import AgentError
+
+
+ def serialize_agent_error(obj):
+     if isinstance(obj, AgentError):
+         return {"error_type": obj.__class__.__name__, "message": obj.message}
+     else:
+         return str(obj)
+
+
+ def get_image_description(file_name: str, question: str, visual_inspection_tool) -> str:
+     prompt = f"""Write a caption of 5 sentences for this image. Pay special attention to any details that might be useful for someone answering the following question:
+ {question}. But do not try to answer the question directly!
+ Do not add any information that is not present in the image."""
+     return visual_inspection_tool(image_path=file_name, question=prompt)
+
+
+ def get_document_description(file_path: str, question: str, document_inspection_tool) -> str:
+     prompt = f"""Write a caption of 5 sentences for this document. Pay special attention to any details that might be useful for someone answering the following question:
+ {question}. But do not try to answer the question directly!
+ Do not add any information that is not present in the document."""
+     return document_inspection_tool.forward_initial_exam_mode(file_path=file_path, question=prompt)
+
+
+ def get_single_file_description(file_path: str, question: str, visual_inspection_tool, document_inspection_tool):
+     file_extension = file_path.split(".")[-1]
+     if file_extension in ["png", "jpg", "jpeg"]:
+         file_description = f" - Attached image: {file_path}"
+         file_description += (
+             f"\n     -> Image description: {get_image_description(file_path, question, visual_inspection_tool)}"
+         )
+         return file_description
+     elif file_extension in ["pdf", "xls", "xlsx", "docx", "doc", "xml"]:
+         file_description = f" - Attached document: {file_path}"
+         image_path = file_path.split(".")[0] + ".png"
+         if os.path.exists(image_path):
+             description = get_image_description(image_path, question, visual_inspection_tool)
+         else:
+             description = get_document_description(file_path, question, document_inspection_tool)
+         file_description += f"\n     -> File description: {description}"
+         return file_description
+     elif file_extension in ["mp3", "m4a", "wav"]:
+         return f" - Attached audio: {file_path}"
+     else:
+         return f" - Attached file: {file_path}"
+
+
+ def get_zip_description(file_path: str, question: str, visual_inspection_tool, document_inspection_tool):
+     folder_path = file_path.replace(".zip", "")
+     os.makedirs(folder_path, exist_ok=True)
+     shutil.unpack_archive(file_path, folder_path)
+
+     prompt_use_files = ""
+     for root, dirs, files in os.walk(folder_path):
+         for file in files:
+             file_path = os.path.join(root, file)
+             prompt_use_files += "\n" + textwrap.indent(
+                 get_single_file_description(file_path, question, visual_inspection_tool, document_inspection_tool),
+                 prefix=" ",
+             )
+     return prompt_use_files
+
+
+ def get_tasks_to_run(data, total: int, base_filename: Path, tasks_ids: list[int]):
+     f = base_filename.parent / f"{base_filename.stem}_answers.jsonl"
+     done = set()
+     if f.exists():
+         with open(f, encoding="utf-8") as fh:
+             done = {json.loads(line)["task_id"] for line in fh if line.strip()}
+
+     tasks = []
+     for i in range(total):
+         task_id = int(data[i]["task_id"])
+         if task_id not in done:
+             if tasks_ids is not None:
+                 if task_id in tasks_ids:
+                     tasks.append(data[i])
+             else:
+                 tasks.append(data[i])
+     return tasks
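A brief sketch of the attachment-description helper in use; the file path, question, and `model` instance are placeholders:

```python
# Sketch: describe a task attachment before handing it to the agent.
# The path and question are placeholders; `model` is any smolagents Model
# instance created elsewhere.
from scripts.run_agents import get_single_file_description
from scripts.text_inspector_tool import TextInspectorTool
from scripts.visual_qa import visualizer

inspector = TextInspectorTool(model=model, text_limit=4000)
print(
    get_single_file_description(
        "data/attachment.xlsx",
        question="What was the 2023 revenue?",
        visual_inspection_tool=visualizer,
        document_inspection_tool=inspector,
    )
)
```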
scripts/text_inspector_tool.py ADDED
@@ -0,0 +1,122 @@
+ from typing import Optional
+
+ from smolagents import Tool
+ from smolagents.models import MessageRole, Model
+
+ from .mdconvert import MarkdownConverter
+
+
+ class TextInspectorTool(Tool):
+     name = "inspect_file_as_text"
+     description = """
+ You cannot load files yourself: instead call this tool to read a file as markdown text and ask questions about it.
+ This tool handles the following file extensions: [".html", ".htm", ".xlsx", ".pptx", ".wav", ".mp3", ".flac", ".pdf", ".docx"], and all other types of text files. IT DOES NOT HANDLE IMAGES."""
+
+     inputs = {
+         "file_path": {
+             "description": "The path to the file you want to read as text. Must be a '.something' file, like '.pdf'. If it is an image, use the visualizer tool instead! DO NOT use this tool for an HTML webpage: use the web_search tool instead!",
+             "type": "string",
+         },
+         "question": {
+             "description": "[Optional]: Your question, as a natural language sentence. Provide as much context as possible. Do not pass this parameter if you just want to directly return the content of the file.",
+             "type": "string",
+             "nullable": True,
+         },
+     }
+     output_type = "string"
+     md_converter = MarkdownConverter()
+
+     def __init__(self, model: Model, text_limit: int):
+         super().__init__()
+         self.model = model
+         self.text_limit = text_limit
+
+     def forward_initial_exam_mode(self, file_path, question):
+         # Reject images up front, before spending any work on conversion
+         if file_path[-4:] in [".png", ".jpg"]:
+             raise Exception("Cannot use inspect_file_as_text tool with images: use visualizer instead!")
+
+         result = self.md_converter.convert(file_path)
+
+         if ".zip" in file_path:
+             return result.text_content
+
+         if not question:
+             return result.text_content
+
+         if len(result.text_content) < 4000:
+             return "Document content: " + result.text_content
+
+         messages = [
+             {
+                 "role": MessageRole.SYSTEM,
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": "Here is a file:\n### "
+                         + str(result.title)
+                         + "\n\n"
+                         + result.text_content[: self.text_limit],
+                     }
+                 ],
+             },
+             {
+                 "role": MessageRole.USER,
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": "Now please write a short, 5 sentence caption for this document, that could help someone asking this question: "
+                         + question
+                         + "\n\nDon't answer the question yourself! Just provide useful notes on the document",
+                     }
+                 ],
+             },
+         ]
+         return self.model(messages).content
+
+     def forward(self, file_path, question: Optional[str] = None) -> str:
+         # Reject images up front, before spending any work on conversion
+         if file_path[-4:] in [".png", ".jpg"]:
+             raise Exception("Cannot use inspect_file_as_text tool with images: use visualizer instead!")
+
+         result = self.md_converter.convert(file_path)
+
+         if ".zip" in file_path:
+             return result.text_content
+
+         if not question:
+             return result.text_content
+
+         messages = [
+             {
+                 "role": MessageRole.SYSTEM,
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": "You will have to write a short caption for this file, then answer this question: "
+                         + question,
+                     }
+                 ],
+             },
+             {
+                 "role": MessageRole.USER,
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": "Here is the complete file:\n### "
+                         + str(result.title)
+                         + "\n\n"
+                         + result.text_content[: self.text_limit],
+                     }
+                 ],
+             },
+             {
+                 "role": MessageRole.USER,
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": "Now answer the question below. Use these three headings: '1. Short answer', '2. Extremely detailed answer', '3. Additional Context on the document and question asked'.\n"
+                         + question,
+                     }
+                 ],
+             },
+         ]
+         return self.model(messages).content
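A matching sketch for calling the inspector directly; the path and question are placeholders, and `HfApiModel()` is just one possible `smolagents` Model:

```python
# Sketch: ask a question about a local document. paper.pdf is a placeholder;
# any smolagents Model instance can replace HfApiModel().
from smolagents import HfApiModel

from scripts.text_inspector_tool import TextInspectorTool

inspector = TextInspectorTool(model=HfApiModel(), text_limit=100000)
print(inspector.forward("paper.pdf", question="Which datasets does the paper evaluate on?"))
```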
scripts/text_web_browser.py ADDED
@@ -0,0 +1,563 @@
1
+ # Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource!
2
+ # https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
3
+ import mimetypes
4
+ import os
5
+ import pathlib
6
+ import re
7
+ import time
8
+ import uuid
9
+ from typing import Any, Dict, List, Optional, Tuple, Union
10
+ from urllib.parse import unquote, urljoin, urlparse
11
+
12
+ import pathvalidate
13
+ import requests
14
+ from serpapi import GoogleSearch
15
+
16
+ from smolagents import Tool
17
+
18
+ from .cookies import COOKIES
19
+ from .mdconvert import FileConversionException, MarkdownConverter, UnsupportedFormatException
20
+
21
+
22
+ class SimpleTextBrowser:
23
+ """(In preview) An extremely simple text-based web browser comparable to Lynx. Suitable for Agentic use."""
24
+
25
+ def __init__(
26
+ self,
27
+ start_page: Optional[str] = None,
28
+ viewport_size: Optional[int] = 1024 * 8,
29
+ downloads_folder: Optional[Union[str, None]] = None,
30
+ serpapi_key: Optional[Union[str, None]] = None,
31
+ request_kwargs: Optional[Union[Dict[str, Any], None]] = None,
32
+ ):
33
+ self.start_page: str = start_page if start_page else "about:blank"
34
+ self.viewport_size = viewport_size # Applies only to the standard uri types
35
+ self.downloads_folder = downloads_folder
36
+ self.history: List[Tuple[str, float]] = list()
37
+ self.page_title: Optional[str] = None
38
+ self.viewport_current_page = 0
39
+ self.viewport_pages: List[Tuple[int, int]] = list()
40
+ self.set_address(self.start_page)
41
+ self.serpapi_key = serpapi_key
42
+ self.request_kwargs = request_kwargs
43
+ self.request_kwargs["cookies"] = COOKIES
44
+ self._mdconvert = MarkdownConverter()
45
+ self._page_content: str = ""
46
+
47
+ self._find_on_page_query: Union[str, None] = None
48
+ self._find_on_page_last_result: Union[int, None] = None # Location of the last result
+
+     @property
+     def address(self) -> str:
+         """Return the address of the current page."""
+         return self.history[-1][0]
+
+     def set_address(self, uri_or_path: str, filter_year: Optional[int] = None) -> None:
+         # TODO: Handle anchors
+         self.history.append((uri_or_path, time.time()))
+
+         # Handle special URIs
+         if uri_or_path == "about:blank":
+             self._set_page_content("")
+         elif uri_or_path.startswith("google:"):
+             self._serpapi_search(uri_or_path[len("google:") :].strip(), filter_year=filter_year)
+         else:
+             if (
+                 not uri_or_path.startswith("http:")
+                 and not uri_or_path.startswith("https:")
+                 and not uri_or_path.startswith("file:")
+             ):
+                 if len(self.history) > 1:
+                     prior_address = self.history[-2][0]
+                     uri_or_path = urljoin(prior_address, uri_or_path)
+                     # Update the address with the fully-qualified path
+                     self.history[-1] = (uri_or_path, self.history[-1][1])
+             self._fetch_page(uri_or_path)
+
+         self.viewport_current_page = 0
+         # Reset the find-on-page state on navigation; the original assigned to
+         # unused public attributes, so stale search state survived page changes
+         self._find_on_page_query = None
+         self._find_on_page_last_result = None
+
+     @property
+     def viewport(self) -> str:
+         """Return the content of the current viewport."""
+         bounds = self.viewport_pages[self.viewport_current_page]
+         return self.page_content[bounds[0] : bounds[1]]
+
+     @property
+     def page_content(self) -> str:
+         """Return the full contents of the current page."""
+         return self._page_content
+
+     def _set_page_content(self, content: str) -> None:
+         """Sets the text content of the current page."""
+         self._page_content = content
+         self._split_pages()
+         if self.viewport_current_page >= len(self.viewport_pages):
+             self.viewport_current_page = len(self.viewport_pages) - 1
+
+     def page_down(self) -> None:
+         self.viewport_current_page = min(self.viewport_current_page + 1, len(self.viewport_pages) - 1)
+
+     def page_up(self) -> None:
+         self.viewport_current_page = max(self.viewport_current_page - 1, 0)
+
+     def find_on_page(self, query: str) -> Union[str, None]:
+         """Searches for the query from the current viewport forward, looping back to the start if necessary."""
+
+         # Did we get here via a previous find_on_page search with the same query?
+         # If so, map to find_next
+         if query == self._find_on_page_query and self.viewport_current_page == self._find_on_page_last_result:
+             return self.find_next()
+
+         # Ok, it's a new search: start from the current viewport
+         self._find_on_page_query = query
+         viewport_match = self._find_next_viewport(query, self.viewport_current_page)
+         if viewport_match is None:
+             self._find_on_page_last_result = None
+             return None
+         else:
+             self.viewport_current_page = viewport_match
+             self._find_on_page_last_result = viewport_match
+             return self.viewport
+
+     def find_next(self) -> Union[str, None]:
+         """Scroll to the next viewport that matches the query"""
+
+         if self._find_on_page_query is None:
+             return None
+
+         starting_viewport = self._find_on_page_last_result
+         if starting_viewport is None:
+             starting_viewport = 0
+         else:
+             starting_viewport += 1
+             if starting_viewport >= len(self.viewport_pages):
+                 starting_viewport = 0
+
+         viewport_match = self._find_next_viewport(self._find_on_page_query, starting_viewport)
+         if viewport_match is None:
+             self._find_on_page_last_result = None
+             return None
+         else:
+             self.viewport_current_page = viewport_match
+             self._find_on_page_last_result = viewport_match
+             return self.viewport
+
+     def _find_next_viewport(self, query: str, starting_viewport: int) -> Union[int, None]:
+         """Search for matches between the starting viewport looping when reaching the end."""
+
+         if query is None:
+             return None
+
+         # Normalize the query, and convert to a regular expression
+         nquery = re.sub(r"\*", "__STAR__", query)
+         nquery = " " + (" ".join(re.split(r"\W+", nquery))).strip() + " "
+         nquery = nquery.replace(" __STAR__ ", "__STAR__ ")  # Merge isolated stars with prior word
+         nquery = nquery.replace("__STAR__", ".*").lower()
+
+         if nquery.strip() == "":
+             return None
+
+         idxs = list()
+         idxs.extend(range(starting_viewport, len(self.viewport_pages)))
+         idxs.extend(range(0, starting_viewport))
+
+         for i in idxs:
+             bounds = self.viewport_pages[i]
+             content = self.page_content[bounds[0] : bounds[1]]
+
+             # TODO: Remove markdown links and images
+             ncontent = " " + (" ".join(re.split(r"\W+", content))).strip().lower() + " "
+             if re.search(nquery, ncontent):
+                 return i
+
+         return None
+
+     def visit_page(self, path_or_uri: str, filter_year: Optional[int] = None) -> str:
+         """Update the address, visit the page, and return the content of the viewport."""
+         self.set_address(path_or_uri, filter_year=filter_year)
+         return self.viewport
+
+     def _split_pages(self) -> None:
+         # Do not split search results
+         if self.address.startswith("google:"):
+             self.viewport_pages = [(0, len(self._page_content))]
+             return
+
+         # Handle empty pages
+         if len(self._page_content) == 0:
+             self.viewport_pages = [(0, 0)]
+             return
+
+         # Break the viewport into pages
+         self.viewport_pages = []
+         start_idx = 0
+         while start_idx < len(self._page_content):
+             end_idx = min(start_idx + self.viewport_size, len(self._page_content))  # type: ignore[operator]
+             # Adjust to end on a space
+             while end_idx < len(self._page_content) and self._page_content[end_idx - 1] not in [" ", "\t", "\r", "\n"]:
+                 end_idx += 1
+             self.viewport_pages.append((start_idx, end_idx))
+             start_idx = end_idx
+
+     def _serpapi_search(self, query: str, filter_year: Optional[int] = None) -> None:
+         if self.serpapi_key is None:
+             raise ValueError("Missing SerpAPI key.")
+
+         params = {
+             "engine": "google",
+             "q": query,
+             "api_key": self.serpapi_key,
+         }
+         if filter_year is not None:
+             params["tbs"] = f"cdr:1,cd_min:01/01/{filter_year},cd_max:12/31/{filter_year}"
+
+         search = GoogleSearch(params)
+         results = search.get_dict()
+         self.page_title = f"{query} - Search"
+         if "organic_results" not in results.keys():
+             raise Exception(f"No results found for query: '{query}'. Use a less specific query.")
+         if len(results["organic_results"]) == 0:
+             year_filter_message = f" with filter year={filter_year}" if filter_year is not None else ""
+             self._set_page_content(
+                 f"No results found for '{query}'{year_filter_message}. Try with a more general query, or remove the year filter."
+             )
+             return
+
+         def _prev_visit(url):
+             for i in range(len(self.history) - 1, -1, -1):
+                 if self.history[i][0] == url:
+                     return f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n"
+             return ""
+
+         web_snippets: List[str] = list()
+         idx = 0
+         if "organic_results" in results:
+             for page in results["organic_results"]:
+                 idx += 1
+                 date_published = ""
+                 if "date" in page:
+                     date_published = "\nDate published: " + page["date"]
+
+                 source = ""
+                 if "source" in page:
+                     source = "\nSource: " + page["source"]
+
+                 snippet = ""
+                 if "snippet" in page:
+                     snippet = "\n" + page["snippet"]
+
+                 redacted_version = f"{idx}. [{page['title']}]({page['link']}){date_published}{source}\n{_prev_visit(page['link'])}{snippet}"
+
+                 redacted_version = redacted_version.replace("Your browser can't play this video.", "")
+                 web_snippets.append(redacted_version)
+
+         content = (
+             f"A Google search for '{query}' found {len(web_snippets)} results:\n\n## Web Results\n"
+             + "\n\n".join(web_snippets)
+         )
+
+         self._set_page_content(content)
+
+     def _fetch_page(self, url: str) -> None:
+         download_path = ""
+         try:
+             if url.startswith("file://"):
+                 download_path = os.path.normcase(os.path.normpath(unquote(url[7:])))
+                 res = self._mdconvert.convert_local(download_path)
+                 self.page_title = res.title
+                 self._set_page_content(res.text_content)
+             else:
+                 # Prepare the request parameters
+                 request_kwargs = self.request_kwargs.copy() if self.request_kwargs is not None else {}
+                 request_kwargs["stream"] = True
+
+                 # Send an HTTP request to the URL
+                 response = requests.get(url, **request_kwargs)
+                 response.raise_for_status()
+
+                 # If the HTTP request was successful
+                 content_type = response.headers.get("content-type", "")
+
+                 # Text or HTML
+                 if "text/" in content_type.lower():
+                     res = self._mdconvert.convert_response(response)
+                     self.page_title = res.title
+                     self._set_page_content(res.text_content)
+                 # A download
+                 else:
+                     # Try producing a safe filename
+                     fname = None
+                     download_path = None
+                     try:
+                         fname = pathvalidate.sanitize_filename(os.path.basename(urlparse(url).path)).strip()
+                         download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))
+
+                         suffix = 0
+                         while os.path.exists(download_path) and suffix < 1000:
+                             suffix += 1
+                             base, ext = os.path.splitext(fname)
+                             new_fname = f"{base}__{suffix}{ext}"
+                             download_path = os.path.abspath(os.path.join(self.downloads_folder, new_fname))
+
+                     except NameError:
+                         pass
+
+                     # No suitable name, so make one
+                     if fname is None:
+                         extension = mimetypes.guess_extension(content_type)
+                         if extension is None:
+                             extension = ".download"
+                         fname = str(uuid.uuid4()) + extension
+                         download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))
+
+                     # Open a file for writing
+                     with open(download_path, "wb") as fh:
+                         for chunk in response.iter_content(chunk_size=512):
+                             fh.write(chunk)
+
+                     # Render it
+                     local_uri = pathlib.Path(download_path).as_uri()
+                     self.set_address(local_uri)
+
+         except UnsupportedFormatException as e:
+             print(e)
+             self.page_title = "Download complete."  # was a stray one-element tuple
+             self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
+         except FileConversionException as e:
+             print(e)
+             self.page_title = "Download complete."  # was a stray one-element tuple
+             self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
+         except FileNotFoundError:
+             self.page_title = "Error 404"
+             self._set_page_content(f"## Error 404\n\nFile not found: {download_path}")
+         except requests.exceptions.RequestException as request_exception:
+             try:
+                 self.page_title = f"Error {response.status_code}"
+
+                 # If the error was rendered in HTML we might as well render it
+                 content_type = response.headers.get("content-type", "")
+                 if content_type is not None and "text/html" in content_type.lower():
+                     res = self._mdconvert.convert(response)
+                     self.page_title = f"Error {response.status_code}"
+                     self._set_page_content(f"## Error {response.status_code}\n\n{res.text_content}")
+                 else:
+                     text = ""
+                     for chunk in response.iter_content(chunk_size=512, decode_unicode=True):
+                         text += chunk
+                     self.page_title = f"Error {response.status_code}"
+                     self._set_page_content(f"## Error {response.status_code}\n\n{text}")
+             except NameError:
+                 self.page_title = "Error"
+                 self._set_page_content(f"## Error\n\n{str(request_exception)}")
+
+     def _state(self) -> Tuple[str, str]:
+         header = f"Address: {self.address}\n"
+         if self.page_title is not None:
+             header += f"Title: {self.page_title}\n"
+
+         current_page = self.viewport_current_page
+         total_pages = len(self.viewport_pages)
+
+         address = self.address
+         for i in range(len(self.history) - 2, -1, -1):  # Start from the second last
+             if self.history[i][0] == address:
+                 header += f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n"
+                 break
+
+         header += f"Viewport position: Showing page {current_page + 1} of {total_pages}.\n"
+         return (header, self.viewport)
+
+
+ class SearchInformationTool(Tool):
+     name = "web_search"
+     description = "Performs a web search query (think a google search) and returns the search results."
+     inputs = {
+         "query": {"type": "string", "description": "The web search query to perform."},
+         "filter_year": {
+             # Declared as an integer to match the forward() signature below
+             "type": "integer",
+             "description": "[Optional parameter]: filter the search results to only include pages from a specific year. For example, '2020' will only include pages from 2020. Make sure to use this parameter if you're trying to search for articles from a specific date!",
+             "nullable": True,
+         },
+     }
+     output_type = "string"
+
+     def __init__(self, browser):
+         super().__init__()
+         self.browser = browser
+
+     def forward(self, query: str, filter_year: Optional[int] = None) -> str:
+         self.browser.visit_page(f"google: {query}", filter_year=filter_year)
+         header, content = self.browser._state()
+         return header.strip() + "\n=======================\n" + content
+
+
+ class VisitTool(Tool):
+     name = "visit_page"
+     description = "Visit a webpage at a given URL and return its text. Given a url to a YouTube video, this returns the transcript."
+     inputs = {"url": {"type": "string", "description": "The relative or absolute url of the webpage to visit."}}
+     output_type = "string"
+
+     def __init__(self, browser):
+         super().__init__()
+         self.browser = browser
+
+     def forward(self, url: str) -> str:
+         self.browser.visit_page(url)
+         header, content = self.browser._state()
+         return header.strip() + "\n=======================\n" + content
+
+
+ class DownloadTool(Tool):
+     name = "download_file"
+     description = """
+ Download a file at a given URL. The file should be of this format: [".xlsx", ".pptx", ".wav", ".mp3", ".png", ".docx"]
+ After using this tool, for further inspection of this page you should return the download path to your manager via final_answer, and they will be able to inspect it.
+ DO NOT use this tool for .pdf or .txt or .htm files: for these types of files use visit_page with the file url instead."""
+     inputs = {"url": {"type": "string", "description": "The relative or absolute url of the file to be downloaded."}}
+     output_type = "string"
+
+     def __init__(self, browser):
+         super().__init__()
+         self.browser = browser
+
+     def forward(self, url: str) -> str:
+         if "arxiv" in url:
+             url = url.replace("abs", "pdf")
+         response = requests.get(url)
+         content_type = response.headers.get("content-type", "")
+         extension = mimetypes.guess_extension(content_type)
+         if extension and isinstance(extension, str):
+             new_path = f"./downloads/file{extension}"
+         else:
+             extension = ""  # guess_extension can return None; guard the substring checks below
+             new_path = "./downloads/file.object"
+
+         # Refuse disallowed types before writing anything to disk
+         if "pdf" in extension or "txt" in extension or "htm" in extension:
+             raise Exception("Do not use this tool for pdf or txt or html files: use visit_page instead.")
+
+         os.makedirs("./downloads", exist_ok=True)  # make sure the downloads folder exists
+         with open(new_path, "wb") as f:
+             f.write(response.content)
+
+         return f"File was downloaded and saved under path {new_path}."
+
+
+ class ArchiveSearchTool(Tool):
+     name = "find_archived_url"
+     description = "Given a url, searches the Wayback Machine and returns the archived version of the url that's closest in time to the desired date."
+     inputs = {
+         "url": {"type": "string", "description": "The url you need the archive for."},
+         "date": {
+             "type": "string",
+             "description": "The date that you want to find the archive for. Give this date in the format 'YYYYMMDD', for instance '27 June 2008' is written as '20080627'.",
+         },
+     }
+     output_type = "string"
+
+     def __init__(self, browser):
+         super().__init__()
+         self.browser = browser
+
+     def forward(self, url, date) -> str:
+         no_timestamp_url = f"https://archive.org/wayback/available?url={url}"
+         archive_url = no_timestamp_url + f"&timestamp={date}"
+         response = requests.get(archive_url).json()
+         response_notimestamp = requests.get(no_timestamp_url).json()
+         if "archived_snapshots" in response and "closest" in response["archived_snapshots"]:
+             closest = response["archived_snapshots"]["closest"]
+             print("Archive found!", closest)
+
+         elif "archived_snapshots" in response_notimestamp and "closest" in response_notimestamp["archived_snapshots"]:
+             closest = response_notimestamp["archived_snapshots"]["closest"]
+             print("Archive found!", closest)
+         else:
+             raise Exception(f"Your {url=} was not archived on Wayback Machine, try a different url.")
+         target_url = closest["url"]
+         self.browser.visit_page(target_url)
+         header, content = self.browser._state()
+         return (
+             f"Web archive for url {url}, snapshot taken at date {closest['timestamp'][:8]}:\n"
+             + header.strip()
+             + "\n=======================\n"
+             + content
+         )
+
+
+ class PageUpTool(Tool):
+     name = "page_up"
+     description = "Scroll the viewport UP one page-length in the current webpage and return the new viewport content."
+     inputs = {}
+     output_type = "string"
+
+     def __init__(self, browser):
+         super().__init__()
+         self.browser = browser
+
+     def forward(self) -> str:
+         self.browser.page_up()
+         header, content = self.browser._state()
+         return header.strip() + "\n=======================\n" + content
+
+
+ class PageDownTool(Tool):
+     name = "page_down"
+     description = (
+         "Scroll the viewport DOWN one page-length in the current webpage and return the new viewport content."
+     )
+     inputs = {}
+     output_type = "string"
+
+     def __init__(self, browser):
+         super().__init__()
+         self.browser = browser
+
+     def forward(self) -> str:
+         self.browser.page_down()
+         header, content = self.browser._state()
+         return header.strip() + "\n=======================\n" + content
+
+
+ class FinderTool(Tool):
+     name = "find_on_page_ctrl_f"
+     description = "Scroll the viewport to the first occurrence of the search string. This is equivalent to Ctrl+F."
+     inputs = {
+         "search_string": {
+             "type": "string",
+             "description": "The string to search for on the page. This search string supports wildcards like '*'",
+         }
+     }
+     output_type = "string"
+
+     def __init__(self, browser):
+         super().__init__()
+         self.browser = browser
+
+     def forward(self, search_string: str) -> str:
+         find_result = self.browser.find_on_page(search_string)
+         header, content = self.browser._state()
+
+         if find_result is None:
+             return (
+                 header.strip()
+                 + f"\n=======================\nThe search string '{search_string}' was not found on this page."
+             )
+         else:
+             return header.strip() + "\n=======================\n" + content
+
+
+ class FindNextTool(Tool):
+     name = "find_next"
+     description = "Scroll the viewport to the next occurrence of the search string. This is equivalent to finding the next match in a Ctrl+F search."
+     inputs = {}
+     output_type = "string"
+
+     def __init__(self, browser):
+         super().__init__()
+         self.browser = browser
+
+     def forward(self) -> str:
+         find_result = self.browser.find_next()
+         header, content = self.browser._state()
+
+         if find_result is None:
+             return header.strip() + "\n=======================\nThe search string was not found on this page."
+         else:
+             return header.strip() + "\n=======================\n" + content
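
Below is a minimal usage sketch for the browser and tools added above (an editor's illustration, not part of the commit: the `scripts.text_web_browser` import path and the SERPAPI_API_KEY environment variable are assumptions based on this repo's layout):

import os
from scripts.text_web_browser import PageDownTool, SearchInformationTool, SimpleTextBrowser, VisitTool

# A single shared browser instance backs every tool, so viewport position
# and find-on-page state carry over between tool calls.
browser = SimpleTextBrowser(
    viewport_size=1024 * 8,
    downloads_folder="downloads",
    serpapi_key=os.getenv("SERPAPI_API_KEY"),
    request_kwargs={"headers": {"User-Agent": "Mozilla/5.0"}},
)

search_tool = SearchInformationTool(browser)
visit_tool = VisitTool(browser)
page_down = PageDownTool(browser)

print(search_tool.forward("smolagents deep research"))    # search results rendered as markdown
print(visit_tool.forward("https://huggingface.co/blog"))  # first viewport of the page
print(page_down.forward())                                # scroll down one viewport

In practice these tools are handed to a smolagents agent rather than called directly; calling forward() by hand is just the quickest way to check the wiring.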
scripts/visual_qa.py ADDED
@@ -0,0 +1,187 @@
+ import base64
+ import json
+ import mimetypes
+ import os
+ import uuid
+ from io import BytesIO
+ from typing import Optional
+
+ import requests
+ from dotenv import load_dotenv
+ from huggingface_hub import InferenceClient
+ from PIL import Image
+ from transformers import AutoProcessor
+
+ from smolagents import Tool, tool
+
+
+ load_dotenv(override=True)
+
+ idefics_processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-chatty")
+
+
+ def process_images_and_text(image_path, query, client):
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image"},
+                 {"type": "text", "text": query},
+             ],
+         },
+     ]
+
+     prompt_with_template = idefics_processor.apply_chat_template(messages, add_generation_prompt=True)
+
+     # load images from local directory
+
+     # encode images to strings which can be sent to the endpoint
+     def encode_local_image(image_path):
+         # load image
+         image = Image.open(image_path).convert("RGB")
+
+         # Convert the image to a base64 string
+         buffer = BytesIO()
+         image.save(buffer, format="JPEG")  # Use the appropriate format (e.g., JPEG, PNG)
+         base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
+
+         # add string formatting required by the endpoint
+         image_string = f"data:image/jpeg;base64,{base64_image}"
+
+         return image_string
+
+     image_string = encode_local_image(image_path)
+     prompt_with_images = prompt_with_template.replace("<image>", "![]({}) ").format(image_string)
+
+     payload = {
+         "inputs": prompt_with_images,
+         "parameters": {
+             "return_full_text": False,
+             "max_new_tokens": 200,
+         },
+     }
+
+     return json.loads(client.post(json=payload).decode())[0]
+
+
+ # Function to encode the image
+ def encode_image(image_path):
+     if image_path.startswith("http"):
+         user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
+         request_kwargs = {
+             "headers": {"User-Agent": user_agent},
+             "stream": True,
+         }
+
+         # Send an HTTP request to the URL
+         response = requests.get(image_path, **request_kwargs)
+         response.raise_for_status()
+         content_type = response.headers.get("content-type", "")
+
+         extension = mimetypes.guess_extension(content_type)
+         if extension is None:
+             extension = ".download"
+
+         fname = str(uuid.uuid4()) + extension
+         os.makedirs("downloads", exist_ok=True)  # ensure the target folder exists before writing
+         download_path = os.path.abspath(os.path.join("downloads", fname))
+
+         with open(download_path, "wb") as fh:
+             for chunk in response.iter_content(chunk_size=512):
+                 fh.write(chunk)
+
+         image_path = download_path
+
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode("utf-8")
+
+
+ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"}
+
+
+ def resize_image(image_path):
+     img = Image.open(image_path)
+     width, height = img.size
+     img = img.resize((int(width / 2), int(height / 2)))
+     # Build the resized filename next to the original, so paths that include directories keep working
+     new_image_path = os.path.join(os.path.dirname(image_path), f"resized_{os.path.basename(image_path)}")
+     img.save(new_image_path)
+     return new_image_path
+
+
+ class VisualQATool(Tool):
+     name = "visualizer"
+     description = "A tool that can answer questions about attached images."
+     inputs = {
+         "image_path": {
+             "description": "The path to the image on which to answer the question",
+             "type": "string",
+         },
+         "question": {"description": "the question to answer", "type": "string", "nullable": True},
+     }
+     output_type = "string"
+
+     client = InferenceClient("HuggingFaceM4/idefics2-8b-chatty")
+
+     def forward(self, image_path: str, question: Optional[str] = None) -> str:
+         output = ""
+         add_note = False
+         if not question:
+             add_note = True
+             question = "Please write a detailed caption for this image."
+         try:
+             output = process_images_and_text(image_path, question, self.client)
+         except Exception as e:
+             print(e)
+             if "Payload Too Large" in str(e):
+                 new_image_path = resize_image(image_path)
+                 output = process_images_and_text(new_image_path, question, self.client)
+             else:
+                 raise  # don't silently return an empty answer on unrelated errors
+
+         if add_note:
+             output = (
+                 f"You did not provide a particular question, so here is a detailed caption for the image: {output}"
+             )
+
+         return output
+
+
+ @tool
+ def visualizer(image_path: str, question: Optional[str] = None) -> str:
+     """A tool that can answer questions about attached images.
+
+     Args:
+         image_path: The path to the image on which to answer the question. This should be a local path to a downloaded image.
+         question: The question to answer.
+     """
+
+     add_note = False
+     if not question:
+         add_note = True
+         question = "Please write a detailed caption for this image."
+     if not isinstance(image_path, str):
+         raise Exception("You should provide at least `image_path` string argument to this tool!")
+
+     mime_type, _ = mimetypes.guess_type(image_path)
+     base64_image = encode_image(image_path)
+
+     payload = {
+         "model": "gpt-4o",
+         "messages": [
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "text", "text": question},
+                     {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}},
+                 ],
+             }
+         ],
+         "max_tokens": 1000,
+     }
+     response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
+     try:
+         output = response.json()["choices"][0]["message"]["content"]
+     except Exception:
+         raise Exception(f"Response format unexpected: {response.json()}")
+
+     if add_note:
+         output = f"You did not provide a particular question, so here is a detailed caption for the image: {output}"
+
+     return output
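
A quick usage sketch for the visualizer tool above (an editor's illustration, not part of the commit: it assumes OPENAI_API_KEY is set and that sample.png exists locally; the `scripts.visual_qa` import path mirrors this repo's layout):

from scripts.visual_qa import visualizer

# With a question, the tool answers it; without one, it falls back to a detailed caption.
answer = visualizer(image_path="sample.png", question="What text appears in this image?")
print(answer)

caption = visualizer(image_path="sample.png")
print(caption)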