Update README.md
Browse files
README.md
CHANGED
@@ -31,12 +31,11 @@ pipeline_tag: image-text-to-text
|
|
31 |
- 🔲 **OCR with Bounding Boxes** – OCR regions using a bounding box.
|
32 |
- 📂 **General Document Processing** – Trained for both scientific and non-scientific documents.
|
33 |
- 🔄 **Seamless Docling Integration** – Import into **Docling** and export in multiple formats.
|
34 |
-
- 📚 **Multi-Page & Full Document Conversion** – Coming Soon.
|
35 |
-
- 🧪 **Chemical Recognition** – Coming Soon.
|
36 |
|
37 |
### 🚧 *Coming soon!*
|
38 |
- 📊 **Better chart recognition 🛠️**
|
39 |
- 📚 **One shot multi-page inference ⏱️**
|
|
|
40 |
|
41 |
## ⌨️ Get started (code examples)
|
42 |
|
@@ -49,6 +48,7 @@ You can use transformers or docling to perform inference:
|
|
49 |
# Prerequisites:
|
50 |
# pip install torch
|
51 |
# pip install docling_core
|
|
|
52 |
|
53 |
import torch
|
54 |
from docling_core.types.doc import DoclingDocument
|
@@ -96,15 +96,14 @@ doctags = processor.batch_decode(
|
|
96 |
|
97 |
# Populate document
|
98 |
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
|
|
|
99 |
# create a docling document
|
100 |
doc = DoclingDocument(name="Document")
|
101 |
doc.load_from_doctags(doctags_doc)
|
102 |
|
103 |
# export as any format
|
104 |
# HTML
|
105 |
-
#
|
106 |
-
# with open(output_file, "w", encoding="utf-8") as f:
|
107 |
-
# f.write(doc.export_to_html())
|
108 |
# MD
|
109 |
print(doc.export_to_markdown())
|
110 |
```
|
@@ -118,7 +117,7 @@ print(doc.export_to_markdown())
|
|
118 |
# Prerequisites:
|
119 |
# pip install vllm
|
120 |
# pip install docling_core
|
121 |
-
# place page images you want to convert into img/ dir
|
122 |
|
123 |
import time
|
124 |
import os
|
@@ -129,8 +128,7 @@ from docling_core.types.doc.document import DocTagsDocument
|
|
129 |
|
130 |
# Configuration
|
131 |
MODEL_PATH = "ds4sd/SmolDocling-256M-preview"
|
132 |
-
|
133 |
-
IMAGE_DIR = "img/"
|
134 |
OUTPUT_DIR = "out/"
|
135 |
PROMPT_TEXT = "Convert page to Docling."
|
136 |
|
@@ -172,15 +170,11 @@ for idx, img_file in enumerate(image_files, 1):
|
|
172 |
doc.load_from_doctags(doctags_doc)
|
173 |
# export as any format
|
174 |
# HTML
|
175 |
-
#
|
176 |
-
# with open(output_file, "w", encoding="utf-8") as f:
|
177 |
-
# f.write(doc.export_to_html())
|
178 |
# MD
|
179 |
output_filename_md = img_fn + ".md"
|
180 |
output_path_md = os.path.join(OUTPUT_DIR, output_filename_md)
|
181 |
-
|
182 |
-
with open(output_path_md, "w", encoding="utf-8") as f:
|
183 |
-
f.write(markdown)
|
184 |
|
185 |
print(f"Total time: {time.time() - start_time:.2f} sec")
|
186 |
```
|
@@ -198,42 +192,49 @@ DocTags are integrated with Docling, which allows export to HTML, Markdown, and
|
|
198 |
<tr>
|
199 |
<td><b>Description</b></td>
|
200 |
<td><b>Instruction</b></td>
|
|
|
201 |
</tr>
|
202 |
<tr>
|
203 |
<td>Full conversion</td>
|
204 |
<td>Convert this page to docling.</td>
|
|
|
205 |
</tr>
|
206 |
<tr>
|
207 |
<td>Chart</td>
|
208 |
-
<td>Convert chart to table
|
|
|
209 |
</tr>
|
210 |
<tr>
|
211 |
<td>Formula</td>
|
212 |
-
<td>Convert formula to LaTeX
|
|
|
213 |
</tr>
|
214 |
<tr>
|
215 |
<td>Code</td>
|
216 |
-
<td>Convert code to text
|
|
|
217 |
</tr>
|
218 |
<tr>
|
219 |
<td>Table</td>
|
220 |
-
<td>Convert table to OTSL
|
|
|
221 |
</tr>
|
222 |
<tr>
|
223 |
-
<td>No-Code Actions/Pipelines</td>
|
224 |
<td>OCR the text in a specific location: <loc_155><loc_233><loc_206><loc_237></td>
|
|
|
225 |
</tr>
|
226 |
<tr>
|
227 |
-
<td></td>
|
228 |
<td>Identify element at: <loc_247><loc_482><loc_252><loc_486></td>
|
|
|
229 |
</tr>
|
230 |
<tr>
|
231 |
-
<td></td>
|
232 |
<td>Find all 'text' elements on the page, retrieve all section headers.</td>
|
|
|
233 |
</tr>
|
234 |
<tr>
|
235 |
-
<td></td>
|
236 |
<td>Detect footer elements on the page.</td>
|
|
|
237 |
</tr>
|
238 |
</table>
|
239 |
|
|
|
31 |
- 🔲 **OCR with Bounding Boxes** – OCR regions using a bounding box.
|
32 |
- 📂 **General Document Processing** – Trained for both scientific and non-scientific documents.
|
33 |
- 🔄 **Seamless Docling Integration** – Import into **Docling** and export in multiple formats.
|
|
|
|
|
34 |
|
35 |
### 🚧 *Coming soon!*
|
36 |
- 📊 **Better chart recognition 🛠️**
|
37 |
- 📚 **One shot multi-page inference ⏱️**
|
38 |
+
- 🧪 **Chemical Recognition**
|
39 |
|
40 |
## ⌨️ Get started (code examples)
|
41 |
|
|
|
48 |
# Prerequisites:
|
49 |
# pip install torch
|
50 |
# pip install docling_core
|
51 |
+
# pip install transformers
|
52 |
|
53 |
import torch
|
54 |
from docling_core.types.doc import DoclingDocument
|
|
|
96 |
|
97 |
# Populate document
|
98 |
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
|
99 |
+
print(doctags)
|
100 |
# create a docling document
|
101 |
doc = DoclingDocument(name="Document")
|
102 |
doc.load_from_doctags(doctags_doc)
|
103 |
|
104 |
# export as any format
|
105 |
# HTML
|
106 |
+
# doc.save_as_html(output_file)
|
|
|
|
|
107 |
# MD
|
108 |
print(doc.export_to_markdown())
|
109 |
```
|
|
|
117 |
# Prerequisites:
|
118 |
# pip install vllm
|
119 |
# pip install docling_core
|
120 |
+
# place page images you want to convert into "img/" dir
|
121 |
|
122 |
import time
|
123 |
import os
|
|
|
128 |
|
129 |
# Configuration
|
130 |
MODEL_PATH = "ds4sd/SmolDocling-256M-preview"
|
131 |
+
IMAGE_DIR = "img/" # Place your page images here
|
|
|
132 |
OUTPUT_DIR = "out/"
|
133 |
PROMPT_TEXT = "Convert page to Docling."
|
134 |
|
|
|
170 |
doc.load_from_doctags(doctags_doc)
|
171 |
# export as any format
|
172 |
# HTML
|
173 |
+
# doc.save_as_html(output_file)
|
|
|
|
|
174 |
# MD
|
175 |
output_filename_md = img_fn + ".md"
|
176 |
output_path_md = os.path.join(OUTPUT_DIR, output_filename_md)
|
177 |
+
doc.save_as_markdown(output_path_md)
|
|
|
|
|
178 |
|
179 |
print(f"Total time: {time.time() - start_time:.2f} sec")
|
180 |
```
|
|
|
192 |
<tr>
|
193 |
<td><b>Description</b></td>
|
194 |
<td><b>Instruction</b></td>
|
195 |
+
<td><b>Comment</b></td>
|
196 |
</tr>
|
197 |
<tr>
|
198 |
<td>Full conversion</td>
|
199 |
<td>Convert this page to docling.</td>
|
200 |
+
<td></td>
|
201 |
</tr>
|
202 |
<tr>
|
203 |
<td>Chart</td>
|
204 |
+
<td>Convert chart to table.</td>
|
205 |
+
<td>(e.g., <chart>)</td>
|
206 |
</tr>
|
207 |
<tr>
|
208 |
<td>Formula</td>
|
209 |
+
<td>Convert formula to LaTeX.</td>
|
210 |
+
<td>(e.g., <formula>)</td>
|
211 |
</tr>
|
212 |
<tr>
|
213 |
<td>Code</td>
|
214 |
+
<td>Convert code to text.</td>
|
215 |
+
<td>(e.g., <code>)</td>
|
216 |
</tr>
|
217 |
<tr>
|
218 |
<td>Table</td>
|
219 |
+
<td>Convert table to OTSL.</td>
|
220 |
+
<td>(e.g., <otsl>) OTSL: <a href="https://arxiv.org/pdf/2305.03393">Lysak et al., 2023</a></td>
|
221 |
</tr>
|
222 |
<tr>
|
223 |
+
<td rowspan=4>No-Code Actions/Pipelines</td>
|
224 |
<td>OCR the text in a specific location: <loc_155><loc_233><loc_206><loc_237></td>
|
225 |
+
<td></td>
|
226 |
</tr>
|
227 |
<tr>
|
|
|
228 |
<td>Identify element at: <loc_247><loc_482><loc_252><loc_486></td>
|
229 |
+
<td></td>
|
230 |
</tr>
|
231 |
<tr>
|
|
|
232 |
<td>Find all 'text' elements on the page, retrieve all section headers.</td>
|
233 |
+
<td></td>
|
234 |
</tr>
|
235 |
<tr>
|
|
|
236 |
<td>Detect footer elements on the page.</td>
|
237 |
+
<td></td>
|
238 |
</tr>
|
239 |
</table>
|
240 |
|