import torch
import torch.nn.functional as F
from transformers import GPT2Config, GPT2Tokenizer, PreTrainedModel
from transformers.models.auto.configuration_auto import CONFIG_MAPPING


# Custom configuration
class CustomGPTConfig(GPT2Config):
    model_type = "custom_gpt"

    def __init__(self, vocab_size=50304, n_layer=24, n_head=16, hidden_size=1024, block_size=1024, **kwargs):
        super().__init__(
            vocab_size=vocab_size,
            n_positions=block_size,
            n_ctx=block_size,
            n_embd=hidden_size,
            n_layer=n_layer,
            n_head=n_head,
            **kwargs,
        )
        self.block_size = block_size  # Ensure block_size is properly set


# Register the custom configuration so "custom_gpt" resolves through the auto classes
CONFIG_MAPPING.register("custom_gpt", CustomGPTConfig)


# Wrapper for GPT to make it compatible with HF中国镜像站
class HuggingFaceGPT(PreTrainedModel):
    config_class = CustomGPTConfig

    def __init__(self, config):
        super().__init__(config)
        from nova_model import GPT  # Replace with your actual model import

        self.transformer = GPT(config)

    def forward(self, input_ids, **kwargs):
        targets = kwargs.get("labels", None)
        logits, loss = self.transformer(input_ids, targets=targets)
        return {"logits": logits, "loss": loss}


class EndpointHandler:
    def __init__(self, model_dir, device=None):
        print(f"Initializing model from directory: {model_dir}")
        # Fall back to CPU when CUDA is not available
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        # Load custom configuration and model weights
        self.config = CustomGPTConfig.from_pretrained(model_dir)
        self.model = HuggingFaceGPT(self.config)
        state_dict = torch.load(f"{model_dir}/pytorch_model.bin", map_location=torch.device(self.device))
        self.model.load_state_dict(state_dict)
        self.model.to(self.device)
        self.model.eval()
        print("Model initialized successfully.")

        # Load tokenizer
        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        print("Tokenizer loaded successfully.")

    def __call__(self, inputs):
        print("Processing inputs...")

        # Extract prompt and generation parameters
        prompt = inputs.get("inputs", "")
        parameters = inputs.get("parameters", {})
        max_length = parameters.get("max_length", 32)
        num_return_sequences = parameters.get("num_return_sequences", 4)
        temperature = parameters.get("temperature", 1.0)
        top_k = parameters.get("top_k", 50)

        if not prompt:
            print("Error: Input prompt is missing.")
            return [{"error": "Input prompt is missing"}]

        print(f"Prompt: {prompt}")
        print(f"Parameters: {parameters}")

        # Encode the prompt and duplicate it once per requested sequence
        tokens = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
        tokens = tokens.repeat(num_return_sequences, 1)

        # Prepare RNG for reproducibility
        sample_rng = torch.Generator(device=self.device)
        sample_rng.manual_seed(42)

        # Autoregressive generation with temperature and top-k sampling
        generated_tokens = tokens
        while generated_tokens.size(1) < max_length:
            with torch.no_grad():
                # Forward pass; keep only the logits for the last position
                output = self.model(input_ids=generated_tokens)
                logits = output["logits"][:, -1, :]

                # Apply temperature scaling and softmax to get probabilities
                probs = F.softmax(logits / temperature, dim=-1)

                # Top-k sampling: sample among the k most likely tokens
                topk_probs, topk_indices = torch.topk(probs, top_k, dim=-1)
                next_token = torch.multinomial(topk_probs, 1, generator=sample_rng)
                selected_token = torch.gather(topk_indices, -1, next_token)

                # Append the generated token
                generated_tokens = torch.cat((generated_tokens, selected_token), dim=1)

            # Debug log for generation progress
            print(f"Generated tokens so far: {generated_tokens.size(1)}/{max_length}")

        # Decode and return generated text
        results = []
        for i in range(num_return_sequences):
            tokens_list = generated_tokens[i, :max_length].tolist()
            decoded_text = self.tokenizer.decode(tokens_list, skip_special_tokens=True)
            results.append({"generated_text": decoded_text})

        print("Generation completed.")
        return results


if __name__ == "__main__":
    # Example usage
    model_directory = "./"
    handler = EndpointHandler(model_directory)
    prompt_text = "Hello, I'm a language model,"
    inputs = {
        "inputs": prompt_text,
        "parameters": {"max_length": 32, "num_return_sequences": 4, "temperature": 0.7, "top_k": 50},
    }
    print("Starting inference...")
    outputs = handler(inputs)
    for idx, result in enumerate(outputs):
        print(f"Sample {idx}: {result['generated_text']}")