asnassar committed on
Commit
af4ca08
·
verified ·
1 Parent(s): c1a43cf

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +96 -0
README.md CHANGED
@@ -93,3 +93,99 @@ generated_texts = processor.batch_decode(
93
 
94
  print(generated_texts[0])
95
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
  print(generated_texts[0])
95
  """
96
+
97
+ #### Transformers:
98
+
99
+
100
+ ```python
+ import json
101
+ import time
102
+ from pathlib import Path
103
+
104
+ import yaml
105
+
106
+ from docling.datamodel.base_models import InputFormat
107
+ from docling.datamodel.pipeline_options import SmolDoclingOptions, VlmPipelineOptions
108
+ from docling.document_converter import DocumentConverter, PdfFormatOption
109
+ from docling.pipeline.vlm_pipeline import VlmPipeline
110
+
111
+ sources = [
112
+ # "https://arxiv.org/pdf/2408.09869",
113
+ "tests/data/2305.03393v1-pg9-img.png",
114
+ # "tests/data/2305.03393v1-pg9.pdf",
115
+ ]
116
+
117
+ pipeline_options = VlmPipelineOptions() # artifacts_path="~/local_model_artifacts/"
118
+ pipeline_options.generate_page_images = True
119
+ # If force_backend_text = True, text from backend will be used instead of generated text
120
+ pipeline_options.force_backend_text = False
121
+
122
+
123
+ vlm_options = SmolDoclingOptions(
124
+ # question="Convert this page to docling.",
125
+ # load_in_8bit=True,
126
+ # llm_int8_threshold=6.0,
127
+ # quantized=False,
128
+ )
129
+
130
+ pipeline_options.vlm_options = vlm_options
131
+
132
+ from docling_core.types.doc import DocItemLabel, ImageRefMode
133
+ from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
134
+
135
+ converter = DocumentConverter(
136
+ format_options={
137
+ InputFormat.PDF: PdfFormatOption(
138
+ pipeline_cls=VlmPipeline,
139
+ pipeline_options=pipeline_options,
140
+ ),
141
+ InputFormat.IMAGE: PdfFormatOption(
142
+ pipeline_cls=VlmPipeline,
143
+ pipeline_options=pipeline_options,
144
+ ),
145
+ }
146
+ )
147
+
148
+ out_path = Path("scratch")
149
+ out_path.mkdir(parents=True, exist_ok=True)
150
+
151
+ for source in sources:
152
+ start_time = time.time()
153
+ print("================================================")
154
+ print("Processing... {}".format(source))
155
+ print("================================================")
156
+ print("")
157
+
158
+ res = converter.convert(source)
159
+
160
+ print("------------------------------------------------")
161
+ print("MD:")
162
+ print("------------------------------------------------")
163
+ print("")
164
+ print(res.document.export_to_markdown())
165
+
166
+ # with (out_path / f"{res.input.file.stem}.html").open("w") as fp:
167
+ # fp.write(res.document.export_to_html())
168
+
169
+ res.document.save_as_html(
170
+ filename=Path("{}/{}.html".format(out_path, res.input.file.stem)),
171
+ image_mode=ImageRefMode.REFERENCED,
172
+ labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
173
+ )
174
+
175
+ with (out_path / f"{res.input.file.stem}.json").open("w") as fp:
176
+ fp.write(json.dumps(res.document.export_to_dict()))
177
+
178
+ with (out_path / f"{res.input.file.stem}.yaml").open("w") as fp:
179
+ fp.write(yaml.safe_dump(res.document.export_to_dict()))
180
+
181
+ pg_num = res.document.num_pages()
182
+
183
+ print("")
184
+ inference_time = time.time() - start_time
185
+ print(
186
+ f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}"
187
+ )
188
+
189
+ print("================================================")
190
+ print("done!")
191
+ print("================================================")
+ ```