Commit
·
cea8feb
1
Parent(s):
25f9e19
model
Browse files- .gitattributes +1 -0
- README.md +0 -0
- config.json +3 -0
- conversion_config.json +3 -0
- convert.py +70 -0
- onnx/model.onnx +3 -0
- onnx/model_int8.onnx +3 -0
- onnx/model_uint8.onnx +3 -0
- special_tokens_map.json +3 -0
- test_local.py +49 -0
- test_teradata.py +106 -0
- tokenizer.json +3 -0
- tokenizer_config.json +3 -0
.gitattributes
CHANGED
@@ -7,6 +7,7 @@
|
|
7 |
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
*.joblib filter=lfs diff=lfs merge=lfs -text
|
|
|
10 |
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
*.model filter=lfs diff=lfs merge=lfs -text
|
|
|
7 |
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.json filter=lfs diff=lfs merge=lfs -text
|
11 |
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
12 |
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
13 |
*.model filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bb68c77b52830c2223b9ad69b8a0473f49f06c930cf8fc34279eeddfb8230b47
|
3 |
+
size 1159
|
conversion_config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1916c336ce8c725cb58e53cc53da3511fc4404781b5db05666206426d2505c30
|
3 |
+
size 303
|
convert.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
import shutil
|
4 |
+
|
5 |
+
from optimum.exporters.onnx import main_export
|
6 |
+
import onnx
|
7 |
+
from onnxconverter_common import float16
|
8 |
+
import onnxruntime as rt
|
9 |
+
from onnxruntime.tools.onnx_model_utils import *
|
10 |
+
from onnxruntime.quantization import quantize_dynamic, QuantType
|
11 |
+
from huggingface_hub import hf_hub_download
|
12 |
+
|
13 |
+
|
14 |
+
with open('conversion_config.json') as json_file:
|
15 |
+
conversion_config = json.load(json_file)
|
16 |
+
|
17 |
+
|
18 |
+
model_id = conversion_config["model_id"]
|
19 |
+
number_of_generated_embeddings = conversion_config["number_of_generated_embeddings"]
|
20 |
+
precision_to_filename_map = conversion_config["precision_to_filename_map"]
|
21 |
+
opset = conversion_config["opset"]
|
22 |
+
IR = conversion_config["IR"]
|
23 |
+
|
24 |
+
|
25 |
+
op = onnx.OperatorSetIdProto()
|
26 |
+
op.version = opset
|
27 |
+
|
28 |
+
|
29 |
+
if not os.path.exists("onnx"):
|
30 |
+
os.makedirs("onnx")
|
31 |
+
|
32 |
+
if "fp32" in precision_to_filename_map:
|
33 |
+
print("Exporting the fp32 onnx file...")
|
34 |
+
|
35 |
+
filename = precision_to_filename_map['fp32']
|
36 |
+
|
37 |
+
hf_hub_download(repo_id=model_id, filename=filename, local_dir = "./")
|
38 |
+
model = onnx.load(filename)
|
39 |
+
model_fixed = onnx.helper.make_model(model.graph, ir_version = IR, opset_imports = [op]) #to be sure that we have compatible opset and IR version
|
40 |
+
onnx.save(model_fixed, filename)
|
41 |
+
|
42 |
+
print("Done\n\n")
|
43 |
+
|
44 |
+
if "int8" in precision_to_filename_map:
|
45 |
+
print("Exporting the int8 onnx file...")
|
46 |
+
|
47 |
+
|
48 |
+
filename = precision_to_filename_map['int8']
|
49 |
+
|
50 |
+
hf_hub_download(repo_id=model_id, filename=filename, local_dir = "./")
|
51 |
+
model = onnx.load(filename)
|
52 |
+
model_fixed = onnx.helper.make_model(model.graph, ir_version = IR, opset_imports = [op]) #to be sure that we have compatible opset and IR version
|
53 |
+
onnx.save(model_fixed, filename)
|
54 |
+
|
55 |
+
|
56 |
+
print("Done\n\n")
|
57 |
+
|
58 |
+
if "uint8" in precision_to_filename_map:
|
59 |
+
print("Exporting the uint8 onnx file...")
|
60 |
+
|
61 |
+
filename = precision_to_filename_map['uint8']
|
62 |
+
|
63 |
+
hf_hub_download(repo_id=model_id, filename=filename, local_dir = "./")
|
64 |
+
model = onnx.load(filename)
|
65 |
+
model_fixed = onnx.helper.make_model(model.graph, ir_version = IR, opset_imports = [op]) #to be sure that we have compatible opset and IR version
|
66 |
+
onnx.save(model_fixed, filename)
|
67 |
+
|
68 |
+
|
69 |
+
print("Done\n\n")
|
70 |
+
|
onnx/model.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:74e517b6cb7efdd20e7c310ce49c851ff81c2a61684133f2f7a229e4b1e2b94c
|
3 |
+
size 1226099897
|
onnx/model_int8.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7e1dc692af82258063ff777f3151e86d412bfcfe0edb2047605ed92f4dccf486
|
3 |
+
size 310916005
|
onnx/model_uint8.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fbbcbce0ee6d3e0e059162b12258cd16d48311ab6aea23b7f8c72515a908763d
|
3 |
+
size 310916005
|
special_tokens_map.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8c785abebea9ae3257b61681b4e6fd8365ceafde980c21970d001e834cf10835
|
3 |
+
size 964
|
test_local.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import onnxruntime as rt

from sentence_transformers.util import cos_sim
from sentence_transformers import SentenceTransformer

import transformers

import gc
import json


# Pull the shared conversion settings (model id, output size, file map).
with open('conversion_config.json') as json_file:
    conversion_config = json.load(json_file)


model_id = conversion_config["model_id"]
number_of_generated_embeddings = conversion_config["number_of_generated_embeddings"]
precision_to_filename_map = conversion_config["precision_to_filename_map"]

sentences_1 = 'How is the weather today?'
sentences_2 = 'What is the current weather like today?'

print(f"Testing on cosine similiarity between sentences: \n'{sentences_1}'\n'{sentences_2}'\n\n\n")

# Tokenize both probe sentences once; the encodings are reused for every
# exported precision below.
tokenizer = transformers.AutoTokenizer.from_pretrained("./")
enc1 = tokenizer(sentences_1)
enc2 = tokenizer(sentences_2)


def _embed(session, encoding):
    """Run one tokenized sentence through an ONNX session and return the
    pooled sentence embedding (output index 1, first batch element)."""
    feed = {
        "input_ids": [encoding.input_ids],
        "attention_mask": [encoding.attention_mask],
    }
    return session.run(None, feed)[1][0]


# Sanity-check every exported precision: embeddings of two paraphrases
# should have high cosine similarity.
for precision, file_name in precision_to_filename_map.items():
    onnx_session = rt.InferenceSession(file_name)
    embeddings_1_onnx = _embed(onnx_session, enc1)
    embeddings_2_onnx = _embed(onnx_session, enc2)

    # Free the session eagerly — the fp32 model alone is >1 GB.
    del onnx_session
    gc.collect()
    print(f'Cosine similiarity for ONNX model with precision "{precision}" is {str(cos_sim(embeddings_1_onnx, embeddings_2_onnx))}')


# Reference value from the original (non-ONNX) sentence-transformers model.
# NOTE(review): `trust_remote_code=True` is also passed to .encode() here —
# presumably forwarded kwargs; verify the installed sentence-transformers
# version accepts it.
model = SentenceTransformer(model_id, trust_remote_code=True)
embeddings_1_sentence_transformer = model.encode(sentences_1, normalize_embeddings=True, trust_remote_code=True)
embeddings_2_sentence_transformer = model.encode(sentences_2, normalize_embeddings=True, trust_remote_code=True)
print('Cosine similiarity for original sentence transformer model is '+str(cos_sim(embeddings_1_sentence_transformer, embeddings_2_sentence_transformer)))
test_teradata.py
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import sys
import teradataml as tdml
from tabulate import tabulate

import json


# Shared conversion settings (model id, embedding width, precision->file map).
with open('conversion_config.json') as json_file:
    conversion_config = json.load(json_file)


model_id = conversion_config["model_id"]
number_of_generated_embeddings = conversion_config["number_of_generated_embeddings"]
precision_to_filename_map = conversion_config["precision_to_filename_map"]

# Connection parameters are taken positionally: host, username, password.
host = sys.argv[1]
username = sys.argv[2]
password = sys.argv[3]

print("Setting up connection to teradata...")
tdml.create_context(host = host, username = username, password = password)
print("Done\n\n")


print("Deploying tokenizer...")
# Fix: the original used a bare `except:` which also swallows
# KeyboardInterrupt/SystemExit; narrowed to Exception.  A failed drop is
# expected on the first run (table does not exist yet).
try:
    tdml.db_drop_table('tokenizer_table')
except Exception:
    print("Can't drop tokenizers table - it's not existing")
tdml.save_byom('tokenizer',
               'tokenizer.json',
               'tokenizer_table')
print("Done\n\n")

print("Testing models...")
try:
    tdml.db_drop_table('model_table')
except Exception:
    print("Can't drop models table - it's not existing")

for precision, file_name in precision_to_filename_map.items():
    print(f"Deploying {precision} model...")
    tdml.save_byom(precision,
                   file_name,
                   'model_table')
    print(f"Model {precision} is deployed\n")

    print(f"Calculating embeddings with {precision} model...")
    try:
        tdml.db_drop_table('emails_embeddings_store')
    except Exception:
        print("Can't drop embeddings table - it's not existing")

    # Score the demo emails table with the in-database ONNX model.
    tdml.execute_sql(f"""
    create volatile table emails_embeddings_store as (
        select
            *
        from mldb.ONNXEmbeddings(
                on emails.emails as InputTable
                on (select * from model_table where model_id = '{precision}') as ModelTable DIMENSION
                on (select model as tokenizer from tokenizer_table where model_id = 'tokenizer') as TokenizerTable DIMENSION

                using
                    Accumulate('id', 'txt')
                    ModelOutputTensor('sentence_embedding')
                    EnableMemoryCheck('false')
                    OutputFormat('FLOAT32({number_of_generated_embeddings})')
                    OverwriteCachedModel('true')
            ) a
    ) with data on commit preserve rows

    """)
    print("Embeddings calculated")
    print(f"Testing semantic search with cosine similiarity on the output of the model with precision '{precision}'...")
    tdf_embeddings_store = tdml.DataFrame('emails_embeddings_store')
    # Email id 3 is the query; all other rows are the reference corpus.
    tdf_embeddings_store_tgt = tdf_embeddings_store[tdf_embeddings_store.id == 3]

    tdf_embeddings_store_ref = tdf_embeddings_store[tdf_embeddings_store.id != 3]

    cos_sim_pd = tdml.DataFrame.from_query(f"""
    SELECT
        dt.target_id,
        dt.reference_id,
        e_tgt.txt as target_txt,
        e_ref.txt as reference_txt,
        (1.0 - dt.distance) as similiarity
    FROM
        TD_VECTORDISTANCE (
            ON ({tdf_embeddings_store_tgt.show_query()}) AS TargetTable
            ON ({tdf_embeddings_store_ref.show_query()}) AS ReferenceTable DIMENSION
            USING
                TargetIDColumn('id')
                TargetFeatureColumns('[emb_0:emb_{number_of_generated_embeddings - 1}]')
                RefIDColumn('id')
                RefFeatureColumns('[emb_0:emb_{number_of_generated_embeddings - 1}]')
                DistanceMeasure('cosine')
                topk(3)
        ) AS dt
    JOIN emails.emails e_tgt on e_tgt.id = dt.target_id
    JOIN emails.emails e_ref on e_ref.id = dt.reference_id;
    """).to_pandas()
    print(tabulate(cos_sim_pd, headers='keys', tablefmt='fancy_grid'))
    print("Done\n\n")


tdml.remove_context()
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:883b037111086fd4dfebbbc9b7cee11e1517b5e0c0514879478661440f137085
|
3 |
+
size 17082987
|
tokenizer_config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c826b85ddb981faae7251de7bd9195140665c52ac6c313a1cd9c4d9ba040140c
|
3 |
+
size 1370
|