Update README.md
Browse files
README.md
CHANGED
@@ -31,12 +31,11 @@ pipeline_tag: image-text-to-text
|
|
31 |
- 🔲 **OCR with Bounding Boxes** – OCR regions using a bounding box.
|
32 |
- 📂 **General Document Processing** – Trained for both scientific and non-scientific documents.
|
33 |
- 🔄 **Seamless Docling Integration** – Import into **Docling** and export in multiple formats.
|
34 |
-
- 📚 **Multi-Page & Full Document Conversion** – Coming Soon.
|
35 |
-
- 🧪 **Chemical Recognition** – Coming Soon.
|
36 |
|
37 |
### 🚧 *Coming soon!*
|
38 |
- 📊 **Better chart recognition 🛠️**
|
39 |
- 📚 **One shot multi-page inference ⏱️**
|
|
|
40 |
|
41 |
## ⌨️ Get started (code examples)
|
42 |
|
@@ -49,6 +48,7 @@ You can use transformers or docling to perform inference:
|
|
49 |
# Prerequisites:
|
50 |
# pip install torch
|
51 |
# pip install docling_core
|
|
|
52 |
|
53 |
import torch
|
54 |
from docling_core.types.doc import DoclingDocument
|
@@ -96,15 +96,14 @@ doctags = processor.batch_decode(
|
|
96 |
|
97 |
# Populate document
|
98 |
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
|
|
|
99 |
# create a docling document
|
100 |
doc = DoclingDocument(name="Document")
|
101 |
doc.load_from_doctags(doctags_doc)
|
102 |
|
103 |
# export as any format
|
104 |
# HTML
|
105 |
-
#
|
106 |
-
# with open(output_file, "w", encoding="utf-8") as f:
|
107 |
-
# f.write(doc.export_to_html())
|
108 |
# MD
|
109 |
print(doc.export_to_markdown())
|
110 |
```
|
@@ -118,7 +117,7 @@ print(doc.export_to_markdown())
|
|
118 |
# Prerequisites:
|
119 |
# pip install vllm
|
120 |
# pip install docling_core
|
121 |
-
# place page images you want to convert into img/ dir
|
122 |
|
123 |
import time
|
124 |
import os
|
@@ -129,8 +128,7 @@ from docling_core.types.doc.document import DocTagsDocument
|
|
129 |
|
130 |
# Configuration
|
131 |
MODEL_PATH = "ds4sd/SmolDocling-256M-preview"
|
132 |
-
|
133 |
-
IMAGE_DIR = "img/"
|
134 |
OUTPUT_DIR = "out/"
|
135 |
PROMPT_TEXT = "Convert page to Docling."
|
136 |
|
@@ -172,15 +170,11 @@ for idx, img_file in enumerate(image_files, 1):
|
|
172 |
doc.load_from_doctags(doctags_doc)
|
173 |
# export as any format
|
174 |
# HTML
|
175 |
-
#
|
176 |
-
# with open(output_file, "w", encoding="utf-8") as f:
|
177 |
-
# f.write(doc.export_to_html())
|
178 |
# MD
|
179 |
output_filename_md = img_fn + ".md"
|
180 |
output_path_md = os.path.join(OUTPUT_DIR, output_filename_md)
|
181 |
-
|
182 |
-
with open(output_path_md, "w", encoding="utf-8") as f:
|
183 |
-
f.write(markdown)
|
184 |
|
185 |
print(f"Total time: {time.time() - start_time:.2f} sec")
|
186 |
```
|
@@ -198,42 +192,49 @@ DocTags are integrated with Docling, which allows export to HTML, Markdown, and
|
|
198 |
<tr>
|
199 |
<td><b>Description</b></td>
|
200 |
<td><b>Instruction</b></td>
|
|
|
201 |
</tr>
|
202 |
<tr>
|
203 |
<td>Full conversion</td>
|
204 |
<td>Convert this page to docling.</td>
|
|
|
205 |
</tr>
|
206 |
<tr>
|
207 |
<td>Chart</td>
|
208 |
-
<td>Convert chart to table
|
|
|
209 |
</tr>
|
210 |
<tr>
|
211 |
<td>Formula</td>
|
212 |
-
<td>Convert formula to LaTeX
|
|
|
213 |
</tr>
|
214 |
<tr>
|
215 |
<td>Code</td>
|
216 |
-
<td>Convert code to text
|
|
|
217 |
</tr>
|
218 |
<tr>
|
219 |
<td>Table</td>
|
220 |
-
<td>Convert table to OTSL
|
|
|
221 |
</tr>
|
222 |
<tr>
|
223 |
-
<td>No-Code Actions/Pipelines</td>
|
224 |
<td>OCR the text in a specific location: <loc_155><loc_233><loc_206><loc_237></td>
|
|
|
225 |
</tr>
|
226 |
<tr>
|
227 |
-
<td></td>
|
228 |
<td>Identify element at: <loc_247><loc_482><loc_252><loc_486></td>
|
|
|
229 |
</tr>
|
230 |
<tr>
|
231 |
-
<td></td>
|
232 |
<td>Find all 'text' elements on the page, retrieve all section headers.</td>
|
|
|
233 |
</tr>
|
234 |
<tr>
|
235 |
-
<td></td>
|
236 |
<td>Detect footer elements on the page.</td>
|
|
|
237 |
</tr>
|
238 |
</table>
|
239 |
|
|
|
31 |
- 🔲 **OCR with Bounding Boxes** – OCR regions using a bounding box.
|
32 |
- 📂 **General Document Processing** – Trained for both scientific and non-scientific documents.
|
33 |
- 🔄 **Seamless Docling Integration** – Import into **Docling** and export in multiple formats.
|
|
|
|
|
34 |
|
35 |
### 🚧 *Coming soon!*
|
36 |
- 📊 **Better chart recognition 🛠️**
|
37 |
- 📚 **One shot multi-page inference ⏱️**
|
38 |
+
- 🧪 **Chemical Recognition**
|
39 |
|
40 |
## ⌨️ Get started (code examples)
|
41 |
|
|
|
48 |
# Prerequisites:
|
49 |
# pip install torch
|
50 |
# pip install docling_core
|
51 |
+
# pip install transformers
|
52 |
|
53 |
import torch
|
54 |
from docling_core.types.doc import DoclingDocument
|
|
|
96 |
|
97 |
# Populate document
|
98 |
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
|
99 |
+
print(doctags)
|
100 |
# create a docling document
|
101 |
doc = DoclingDocument(name="Document")
|
102 |
doc.load_from_doctags(doctags_doc)
|
103 |
|
104 |
# export as any format
|
105 |
# HTML
|
106 |
+
# doc.save_as_html(output_file)
|
|
|
|
|
107 |
# MD
|
108 |
print(doc.export_to_markdown())
|
109 |
```
|
|
|
117 |
# Prerequisites:
|
118 |
# pip install vllm
|
119 |
# pip install docling_core
|
120 |
+
# place page images you want to convert into "img/" dir
|
121 |
|
122 |
import time
|
123 |
import os
|
|
|
128 |
|
129 |
# Configuration
|
130 |
MODEL_PATH = "ds4sd/SmolDocling-256M-preview"
|
131 |
+
IMAGE_DIR = "img/" # Place your page images here
|
|
|
132 |
OUTPUT_DIR = "out/"
|
133 |
PROMPT_TEXT = "Convert page to Docling."
|
134 |
|
|
|
170 |
doc.load_from_doctags(doctags_doc)
|
171 |
# export as any format
|
172 |
# HTML
|
173 |
+
# doc.save_as_html(output_file)
|
|
|
|
|
174 |
# MD
|
175 |
output_filename_md = img_fn + ".md"
|
176 |
output_path_md = os.path.join(OUTPUT_DIR, output_filename_md)
|
177 |
+
doc.save_as_markdown(output_path_md)
|
|
|
|
|
178 |
|
179 |
print(f"Total time: {time.time() - start_time:.2f} sec")
|
180 |
```
|
|
|
192 |
<tr>
|
193 |
<td><b>Description</b></td>
|
194 |
<td><b>Instruction</b></td>
|
195 |
+
<td><b>Comment</b></td>
|
196 |
</tr>
|
197 |
<tr>
|
198 |
<td>Full conversion</td>
|
199 |
<td>Convert this page to docling.</td>
|
200 |
+
<td></td>
|
201 |
</tr>
|
202 |
<tr>
|
203 |
<td>Chart</td>
|
204 |
+
<td>Convert chart to table.</td>
|
205 |
+
<td>(e.g., <chart>)</td>
|
206 |
</tr>
|
207 |
<tr>
|
208 |
<td>Formula</td>
|
209 |
+
<td>Convert formula to LaTeX.</td>
|
210 |
+
<td>(e.g., <formula>)</td>
|
211 |
</tr>
|
212 |
<tr>
|
213 |
<td>Code</td>
|
214 |
+
<td>Convert code to text.</td>
|
215 |
+
<td>(e.g., <code>)</td>
|
216 |
</tr>
|
217 |
<tr>
|
218 |
<td>Table</td>
|
219 |
+
<td>Convert table to OTSL.</td>
|
220 |
+
<td>(e.g., <otsl>) OTSL: <a href="https://arxiv.org/pdf/2305.03393">Lysak et al., 2023</a></td>
|
221 |
</tr>
|
222 |
<tr>
|
223 |
+
<td rowspan=4>No-Code Actions/Pipelines</td>
|
224 |
<td>OCR the text in a specific location: <loc_155><loc_233><loc_206><loc_237></td>
|
225 |
+
<td></td>
|
226 |
</tr>
|
227 |
<tr>
|
|
|
228 |
<td>Identify element at: <loc_247><loc_482><loc_252><loc_486></td>
|
229 |
+
<td></td>
|
230 |
</tr>
|
231 |
<tr>
|
|
|
232 |
<td>Find all 'text' elements on the page, retrieve all section headers.</td>
|
233 |
+
<td></td>
|
234 |
</tr>
|
235 |
<tr>
|
|
|
236 |
<td>Detect footer elements on the page.</td>
|
237 |
+
<td></td>
|
238 |
</tr>
|
239 |
</table>
|
240 |
|