Update README.md
Browse files
README.md
CHANGED
@@ -47,7 +47,17 @@ SmolDocling is a multimodal Image-Text-to-Text model designed for efficient docu
|
|
47 |
|
48 |
You can use transformers or docling to perform inference:
|
49 |
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
```python
|
53 |
import torch
|
@@ -93,9 +103,59 @@ generated_texts = processor.batch_decode(
|
|
93 |
|
94 |
print(generated_texts[0])
|
95 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
-
|
|
|
|
|
|
|
|
|
98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
|
101 |
## Supported Instructions
|
|
|
47 |
|
48 |
You can use transformers or docling to perform inference:
|
49 |
|
50 |
+
<details>
|
51 |
+
<summary>Inference using Docling</summary>
|
52 |
+
|
53 |
+
```python
|
54 |
+
|
55 |
+
print(generated_texts[0])
|
56 |
+
```
|
57 |
+
</details>
|
58 |
+
|
59 |
+
<details>
|
60 |
+
<summary>Single image inference using Transformers</summary>
|
61 |
|
62 |
```python
|
63 |
import torch
|
|
|
103 |
|
104 |
print(generated_texts[0])
|
105 |
```
|
106 |
+
</details>
|
107 |
+
|
108 |
+
<details>
|
109 |
+
<summary> 🚀 Fast Batch Inference Using vLLM</summary>
|
110 |
+
|
111 |
+
```python
|
112 |
+
# Install dependency first (shell, not Python):  pip install vllm
# NOTE: the original snippet used IPython magic `!pip install vllm`, which is
# a syntax error in a plain Python script — kept here as a comment instead.

import os
import time

from PIL import Image
from vllm import LLM, SamplingParams

# Configuration
MODEL_PATH = "ds4sd/SmolDocling-256M-preview"
IMAGE_DIR = "images_dir"        # directory containing input page images
OUTPUT_DIR = "output_pred_dir"  # predictions are written here as .dt files
PROMPT_TEXT = "Convert page to Docling."

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Initialize LLM; each prompt carries exactly one image
llm = LLM(model=MODEL_PATH, limit_mm_per_prompt={"image": 1})

# Greedy decoding; document pages can require long outputs
sampling_params = SamplingParams(temperature=0.0, max_tokens=8192)

chat_template = f"<|im_start|>User:<image>{PROMPT_TEXT}<end_of_utterance>\nAssistant:"

# Sorted for a deterministic processing order
image_files = sorted(
    f for f in os.listdir(IMAGE_DIR)
    if f.lower().endswith((".png", ".jpg", ".jpeg"))
)

start_time = time.time()

for img_file in image_files:
    img_path = os.path.join(IMAGE_DIR, img_file)
    image = Image.open(img_path).convert("RGB")

    llm_input = {"prompt": chat_template, "multi_modal_data": {"image": image}}
    output = llm.generate([llm_input], sampling_params=sampling_params)[0]

    output_text = output.outputs[0].text
    output_filename = os.path.splitext(img_file)[0] + ".dt"
    output_path = os.path.join(OUTPUT_DIR, output_filename)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(output_text)

print(f"Total time: {time.time() - start_time:.2f} sec")
|
157 |
+
```
|
158 |
+
</details>
|
159 |
|
160 |
|
161 |
## Supported Instructions
|