Upload 9 files

- README.md +198 -7
- config.json +13 -0
- custom_gpt_config.py +19 -0
- handler.py +131 -0
- merges.txt +0 -0
- nova_model.py +131 -0
- requirements.txt +6 -0
- special_tokens_map.json +5 -0
- tokenizer.json +0 -0
README.md
CHANGED
@@ -1,11 +1,202 @@
 ---
-
-emoji: 📚
-colorFrom: purple
-colorTo: indigo
-sdk: docker
-pinned: false
+library_name: transformers
 license: mit
+language:
+- en
+pipeline_tag: text-generation
 ---
 
-
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
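The card's quickstart section is still a template placeholder. A minimal sketch of how this repository is meant to be driven, based on the handler.py added in this same commit (it assumes the repo root as the working directory and a pytorch_model.bin checkpoint, which this upload does not include):

# Minimal sketch, assuming pytorch_model.bin is present in the repo root
# (not part of this upload) and a CUDA device is available.
from handler import EndpointHandler

handler = EndpointHandler("./", device="cuda")
outputs = handler({
    "inputs": "Hello, I'm a language model,",
    "parameters": {"max_length": 32, "num_return_sequences": 2, "top_k": 50},
})
for result in outputs:
    print(result["generated_text"])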
config.json
ADDED
{
  "architectures": [
    "HuggingFaceGPTModel"
  ],
  "block_size": 1024,
  "hidden_size": 1024,
  "model_type": "custom_gpt",
  "n_head": 16,
  "n_layer": 24,
  "torch_dtype": "float32",
  "transformers_version": "4.44.2",
  "vocab_size": 50304
}
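Two details worth noting: the architectures entry names HuggingFaceGPTModel while the wrapper class in handler.py is called HuggingFaceGPT (the handler instantiates the class directly, so the mismatch is inert), and hidden_size/block_size are remapped onto GPT-2-style fields (n_embd, n_positions) by CustomGPTConfig. A small sketch of loading it, assuming the repo root as the model directory:

# Sketch: load config.json through the custom config class (local path assumed).
from custom_gpt_config import CustomGPTConfig

cfg = CustomGPTConfig.from_pretrained("./")
print(cfg.model_type, cfg.n_layer, cfg.n_head, cfg.n_embd)  # custom_gpt 24 16 1024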
custom_gpt_config.py
ADDED
from transformers import GPT2Config
from transformers.models.auto.configuration_auto import CONFIG_MAPPING

class CustomGPTConfig(GPT2Config):
    model_type = "custom_gpt"

    def __init__(self, vocab_size=50304, n_layer=24, n_head=16, hidden_size=1024, block_size=1024, **kwargs):
        super().__init__(
            vocab_size=vocab_size,
            n_positions=block_size,
            n_ctx=block_size,
            n_embd=hidden_size,
            n_layer=n_layer,
            n_head=n_head,
            **kwargs,
        )
        self.block_size = block_size  # Mirrors handler.py's copy of this class; GPT reads config.block_size directly

# Register the custom configuration
CONFIG_MAPPING.register("custom_gpt", CustomGPTConfig)
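Because CONFIG_MAPPING.register runs at import time, merely importing this module makes the custom_gpt model type resolvable through the Auto classes. A sketch of that side effect (the local path is an assumption):

# Sketch: after importing custom_gpt_config, AutoConfig can resolve the
# model_type "custom_gpt" recorded in config.json (hypothetical local path).
import custom_gpt_config  # side effect: registers CustomGPTConfig
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("./")
print(type(cfg).__name__)  # CustomGPTConfig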
handler.py
ADDED
import torch
import torch.nn.functional as F
from transformers import GPT2Tokenizer, PreTrainedModel, PretrainedConfig

# Custom Configuration
from transformers import GPT2Config
from transformers.models.auto.configuration_auto import CONFIG_MAPPING


class CustomGPTConfig(GPT2Config):
    model_type = "custom_gpt"

    def __init__(self, vocab_size=50304, n_layer=24, n_head=16, hidden_size=1024, block_size=1024, **kwargs):
        super().__init__(
            vocab_size=vocab_size,
            n_positions=block_size,
            n_ctx=block_size,
            n_embd=hidden_size,
            n_layer=n_layer,
            n_head=n_head,
            **kwargs,
        )
        self.block_size = block_size  # Ensure block_size is properly set


# Register the custom configuration
CONFIG_MAPPING.register("custom_gpt", CustomGPTConfig)


# Wrapper for GPT to make it compatible with Hugging Face
class HuggingFaceGPT(PreTrainedModel):
    config_class = CustomGPTConfig

    def __init__(self, config):
        super().__init__(config)
        from nova_model import GPT  # Replace with your actual model import
        self.transformer = GPT(config)

    def forward(self, input_ids, **kwargs):
        targets = kwargs.get("labels", None)
        logits, loss = self.transformer(input_ids, targets=targets)
        return {"logits": logits, "loss": loss}


class EndpointHandler:
    def __init__(self, model_dir, device="cuda"):
        print(f"Initializing model from directory: {model_dir}")
        # Load custom configuration and model
        self.config = CustomGPTConfig.from_pretrained(model_dir)
        self.model = HuggingFaceGPT(self.config)
        state_dict = torch.load(f"{model_dir}/pytorch_model.bin", map_location=torch.device(device))
        self.model.load_state_dict(state_dict)
        self.model.to(device)
        self.model.eval()
        print("Model initialized successfully.")

        # Load tokenizer
        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        self.device = device
        print("Tokenizer loaded successfully.")

    def __call__(self, inputs):
        print("Processing inputs...")
        # Extract inputs
        prompt = inputs.get("inputs", "")
        parameters = inputs.get("parameters", {})
        max_length = parameters.get("max_length", 32)
        num_return_sequences = parameters.get("num_return_sequences", 4)
        temperature = parameters.get("temperature", 1.0)
        top_k = parameters.get("top_k", 50)

        if not prompt:
            print("Error: Input prompt is missing.")
            return [{"error": "Input prompt is missing"}]

        print(f"Prompt: {prompt}")
        print(f"Parameters: {parameters}")

        # Encode input prompt
        tokens = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
        tokens = tokens.repeat(num_return_sequences, 1)

        # Prepare RNG for reproducibility
        sample_rng = torch.Generator(device=self.device)
        sample_rng.manual_seed(42)

        # Initialize generation
        generated_tokens = tokens
        while generated_tokens.size(1) < max_length:
            with torch.no_grad():
                # Forward pass to get logits
                output = self.model(input_ids=generated_tokens)
                logits = output["logits"][:, -1, :]  # Get the last token logits

                # Apply softmax to get probabilities
                probs = F.softmax(logits / temperature, dim=-1)

                # Top-k sampling
                topk_probs, topk_indices = torch.topk(probs, top_k, dim=-1)
                next_token = torch.multinomial(topk_probs, 1, generator=sample_rng)
                selected_token = torch.gather(topk_indices, -1, next_token)

                # Append the generated token
                generated_tokens = torch.cat((generated_tokens, selected_token), dim=1)

                # Debug log for generation progress
                print(f"Generated tokens so far: {generated_tokens.size(1)}/{max_length}")

        # Decode and return generated text
        results = []
        for i in range(num_return_sequences):
            tokens_list = generated_tokens[i, :max_length].tolist()
            decoded_text = self.tokenizer.decode(tokens_list, skip_special_tokens=True)
            results.append({"generated_text": decoded_text})

        print("Generation completed.")
        return results


if __name__ == "__main__":
    # Example usage
    model_directory = "./"
    handler = EndpointHandler(model_directory)

    prompt_text = "Hello, I'm a language model,"
    inputs = {"inputs": prompt_text, "parameters": {"max_length": 32, "num_return_sequences": 4, "temperature": 0.7, "top_k": 50}}

    print("Starting inference...")
    outputs = handler(inputs)
    for idx, result in enumerate(outputs):
        print(f"Sample {idx}: {result['generated_text']}")
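When this handler is deployed behind a custom Inference Endpoint, the payload shape accepted by EndpointHandler.__call__ maps directly onto the HTTP request body. A sketch of a client call, with the endpoint URL and token left as placeholders:

# Sketch: invoking a deployed endpoint (URL and token are hypothetical placeholders).
import requests

response = requests.post(
    "https://<your-endpoint>.endpoints.huggingface.cloud",
    headers={"Authorization": "Bearer <HF_TOKEN>"},
    json={
        "inputs": "Hello, I'm a language model,",
        "parameters": {"max_length": 32, "num_return_sequences": 4,
                       "temperature": 0.7, "top_k": 50},
    },
)
print(response.json())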
merges.txt
ADDED
(Diff too large to render; see the raw file.)
nova_model.py
ADDED
import torch
import torch.nn as nn
from torch.nn import functional as F
from qiskit.circuit.library import RealAmplitudes, ZZFeatureMap, ZFeatureMap
from qiskit import QuantumCircuit
from qiskit_machine_learning.neural_networks import SamplerQNN
from qiskit_machine_learning.connectors import TorchConnector
from dataclasses import dataclass

# Quantum Neural Network setup
num_qubits = 8

def create_qnn():
    """Creates a Quantum Neural Network."""
    feature_map = ZFeatureMap(num_qubits, reps=32)
    ansatz = RealAmplitudes(num_qubits, reps=32)
    qc = QuantumCircuit(num_qubits)
    qc.compose(feature_map, inplace=True)
    qc.compose(ansatz, inplace=True)

    qnn = SamplerQNN(
        circuit=qc,
        input_params=feature_map.parameters,
        weight_params=ansatz.parameters,
    )

    return qnn

# Model Components
class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.n_head = config.n_head
        self.n_embd = config.n_embd

    def forward(self, x):
        B, T, C = x.size()  # Batch size, sequence length, embedding size
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Note: c_fc and c_proj are defined but unused below; forward routes
        # through the quantum path (quantum_embedding -> qnn_layer -> output_layer).
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
        self.quantum_embedding = nn.Linear(config.n_embd, num_qubits)
        self.qnn_layer = TorchConnector(create_qnn())
        # The QNN emits 2 ** num_qubits outcome probabilities; the hardcoded 1024
        # output width assumes n_embd == 1024.
        self.output_layer = nn.Linear(2 ** num_qubits, 1024)

    def forward(self, x):
        x = self.quantum_embedding(x)
        x = self.qnn_layer(x)
        x = self.gelu(x)
        x = self.output_layer(x)
        return x

class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

@dataclass
class GPTConfig:
    block_size: int = 1024
    vocab_size: int = 50257
    n_layer: int = 24
    n_head: int = 16
    n_embd: int = 1024

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            wpe=nn.Embedding(config.block_size, config.n_embd),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.transformer.wte.weight = self.lm_head.weight  # tie token embedding and LM head weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.size()
        assert T <= self.config.block_size, "Sequence length exceeds block size"
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)
        x = tok_emb + pos_emb
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

# Export the architecture for inference
if __name__ == "__main__":
    config = GPTConfig()
    model = GPT(config)
    print(f"Model architecture:\n{model}")
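The quantum MLP replaces the usual 4x feed-forward expansion: activations are projected down to 8 qubit inputs, run through the sampled circuit, and the resulting probability vector over 2^8 = 256 basis states is projected back up to the embedding width. A quick shape check (slow, since it simulates an 8-qubit circuit, and it assumes qiskit-machine-learning is installed):

# Sketch: confirm the QNN's output width matches MLP.output_layer's input (2**8 = 256).
import torch
from qiskit_machine_learning.connectors import TorchConnector
from nova_model import create_qnn, num_qubits

layer = TorchConnector(create_qnn())
out = layer(torch.rand(2, num_qubits))  # batch of 2 samples, 8 features each
print(out.shape)  # expected: torch.Size([2, 256])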
requirements.txt
ADDED
qiskit
qiskit-machine-learning
qiskit-aer-gpu
transformers
tiktoken
datasets
torch  # imported by handler.py and nova_model.py; missing from the original list
special_tokens_map.json
ADDED
{
  "bos_token": "<|endoftext|>",
  "eos_token": "<|endoftext|>",
  "unk_token": "<|endoftext|>"
}
tokenizer.json
ADDED
(Diff too large to render; see the raw file.)