Gauri-tr committed on
Commit dbf8811 · verified · 1 Parent(s): 60bf674

Update app.py

Files changed (1)
  1. app.py +161 -392
app.py CHANGED
@@ -1,392 +1,161 @@
- # app.py - FastAPI implementation for Hugging Face Spaces
- import os
- import gc
- import time
- import torch
- from fastapi import FastAPI, Request, Form
- from fastapi.responses import HTMLResponse
- from fastapi.staticfiles import StaticFiles
- from fastapi.templating import Jinja2Templates
- from pydantic import BaseModel
- from typing import Optional
- import logging
- from threading import Thread
- from queue import Queue
-
- # Import optimized model loading utilities
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
- from peft import PeftModel, PeftConfig
-
- # Set up logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
- logger = logging.getLogger(__name__)
-
- # Initialize FastAPI app
- app = FastAPI()
-
- # Set up templates and static files
- templates = Jinja2Templates(directory="templates")
- os.makedirs("templates", exist_ok=True)
- os.makedirs("static", exist_ok=True)
- app.mount("/static", StaticFiles(directory="static"), name="static")
-
- # Create chat template
- with open("templates/index.html", "w") as f:
-     f.write("""
- <!DOCTYPE html>
- <html>
- <head>
-     <title>Sarcastic Assistant Chat</title>
-     <meta name="viewport" content="width=device-width, initial-scale=1">
-     <style>
-         body {
-             font-family: Arial, sans-serif;
-             max-width: 800px;
-             margin: 0 auto;
-             padding: 20px;
-             background-color: #f5f5f5;
-         }
-         .chat-container {
-             border-radius: 10px;
-             background: white;
-             box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
-             padding: 20px;
-             height: 70vh;
-             overflow-y: auto;
-             margin-bottom: 20px;
-         }
-         .message {
-             padding: 10px 15px;
-             border-radius: 18px;
-             margin-bottom: 10px;
-             max-width: 80%;
-             word-wrap: break-word;
-         }
-         .user {
-             background-color: #e1ffc7;
-             margin-left: auto;
-             text-align: right;
-         }
-         .assistant {
-             background-color: #f0f0f0;
-             margin-right: auto;
-         }
-         .input-area {
-             display: flex;
-             gap: 10px;
-         }
-         #user-input {
-             flex: 1;
-             padding: 10px 15px;
-             border: 1px solid #ddd;
-             border-radius: 25px;
-             outline: none;
-         }
-         button {
-             padding: 10px 20px;
-             background-color: #4CAF50;
-             color: white;
-             border: none;
-             border-radius: 25px;
-             cursor: pointer;
-         }
-         button:hover {
-             background-color: #45a049;
-         }
-         #thinking {
-             color: #666;
-             font-style: italic;
-             display: none;
-         }
-         .status-area {
-             margin-top: 10px;
-             color: #666;
-             font-size: 0.9em;
-         }
-     </style>
- </head>
- <body>
-     <h1>Sarcastic Assistant</h1>
-     <div class="chat-container" id="chat-container">
-         <div class="message assistant">
-             Hi there! I'm your sarcastic assistant. What's on your mind today?
-         </div>
-     </div>
-     <div class="input-area">
-         <input type="text" id="user-input" placeholder="Type your message..." autocomplete="off">
-         <button onclick="sendMessage()">Send</button>
-     </div>
-     <div class="status-area">
-         <div id="thinking">Thinking...</div>
-         <div id="model-info">Llama 3.1-8B with AdaLoRA fine-tuning</div>
-     </div>
-
-     <script>
-         const chatContainer = document.getElementById('chat-container');
-         const userInput = document.getElementById('user-input');
-         const thinkingIndicator = document.getElementById('thinking');
-
-         // Enable Enter key to send messages
-         userInput.addEventListener("keyup", function(event) {
-             if (event.key === "Enter") {
-                 sendMessage();
-             }
-         });
-
-         async function sendMessage() {
-             const message = userInput.value.trim();
-             if (!message) return;
-
-             // Add user message to chat
-             addMessage(message, 'user');
-             userInput.value = '';
-
-             // Show thinking indicator
-             thinkingIndicator.style.display = 'block';
-
-             try {
-                 // Send message to API
-                 const response = await fetch('/generate', {
-                     method: 'POST',
-                     headers: {
-                         'Content-Type': 'application/json'
-                     },
-                     body: JSON.stringify({ message: message })
-                 });
-
-                 if (!response.ok) {
-                     throw new Error('Network response was not ok');
-                 }
-
-                 const data = await response.json();
-
-                 // Add AI response to chat
-                 addMessage(data.response, 'assistant');
-             } catch (error) {
-                 console.error('Error:', error);
-                 addMessage('Sorry, I had trouble processing that. Please try again.', 'assistant');
-             } finally {
-                 // Hide thinking indicator
-                 thinkingIndicator.style.display = 'none';
-
-                 // Scroll to bottom
-                 chatContainer.scrollTop = chatContainer.scrollHeight;
-             }
-         }
-
-         function addMessage(text, sender) {
-             const messageDiv = document.createElement('div');
-             messageDiv.classList.add('message', sender);
-             messageDiv.textContent = text;
-             chatContainer.appendChild(messageDiv);
-             chatContainer.scrollTop = chatContainer.scrollHeight;
-         }
-     </script>
- </body>
- </html>
- """)
-
- # Create response queue for background processing
- response_queue = Queue()
-
- # Model loading - optimized for CPU
- class ModelManager:
-     def __init__(self):
-         self.model = None
-         self.tokenizer = None
-         self.pipeline = None
-         self.is_loaded = False
-         self.loading_thread = None
-
-     def load_model_in_background(self):
-         """Load model in a background thread to avoid blocking the server startup"""
-         if self.loading_thread is None or not self.loading_thread.is_alive():
-             self.loading_thread = Thread(target=self._load_model)
-             self.loading_thread.daemon = True
-             self.loading_thread.start()
-
-     def _load_model(self):
-         """Internal method to load the model with optimizations for CPU"""
-         try:
-             logger.info("Loading tokenizer...")
-             # Loading base model tokenizer
-             self.tokenizer = AutoTokenizer.from_pretrained(
-                 "meta-llama/Llama-3.1-8B-Instruct",
-                 use_fast=True
-             )
-             self.tokenizer.pad_token = self.tokenizer.eos_token
-             self.tokenizer.padding_side = "right"
-
-             logger.info("Loading model with CPU optimizations...")
-             # Load the base model with CPU optimizations
-             model_kwargs = {
-                 # Load in 8-bit for reduced memory usage
-                 "load_in_8bit": True,
-                 "device_map": "auto",
-                 # CPU optimizations
-                 "low_cpu_mem_usage": True,
-             }
-
-             # Load the base model
-             base_model = AutoModelForCausalLM.from_pretrained(
-                 "meta-llama/Llama-3.1-8B-Instruct",
-                 **model_kwargs
-             )
-
-             logger.info("Loading adapter weights...")
-             # Load the PEFT adapter - assuming the adapter is in the lora_model directory
-             try:
-                 # First try with directory in current folder
-                 adapter_path = "Gauri-tr/lora_model"
-                 if not os.path.exists(adapter_path):
-                     # Check in parent directories
-                     adapter_path = "../lora_model"
-
-                 self.model = PeftModel.from_pretrained(
-                     base_model,
-                     adapter_path,
-                     device_map="auto"
-                 )
-             except Exception as e:
-                 logger.error(f"Failed to load PEFT adapter: {e}")
-                 # Fallback to using base model
-                 self.model = base_model
-                 logger.warning("Using base model without adapters")
-
-             logger.info("Setting up inference pipeline...")
-             # Create pipeline with optimized settings
-             self.pipeline = pipeline(
-                 "text-generation",
-                 model=self.model,
-                 tokenizer=self.tokenizer,
-                 max_new_tokens=64,
-                 temperature=0.8,
-                 top_p=0.9,
-                 top_k=40,
-                 repetition_penalty=1.15,
-                 pad_token_id=self.tokenizer.eos_token_id,
-                 do_sample=True
-             )
-
-             self.is_loaded = True
-             logger.info("Model loading complete!")
-
-         except Exception as e:
-             logger.error(f"Error loading model: {e}")
-             self.is_loaded = False
-
-     def generate_response(self, user_message):
-         """Generate a response using the loaded model"""
-         if not self.is_loaded:
-             return "Model is still loading, please try again in a moment."
-
-         try:
-             # Format prompt for sarcastic responses
-             instruction = "Respond to this message as if you were in a conversation. Determine the tone and style of the conversation and reply accordingly. Be funny, sarcastic and smart as well."
-
-             prompt = f"""Below is an instruction that describes a task, and an input that provides further context. Write a response that appropriately completes the request.
-
- ### Instruction:
- {instruction}
-
- ### Input:
- {user_message}
-
- ### Response:
- """
-
-             # Generate response
-             start_time = time.time()
-             outputs = self.pipeline(
-                 prompt,
-                 return_full_text=False
-             )
-             generation_time = time.time() - start_time
-             logger.info(f"Generation took {generation_time:.2f} seconds")
-
-             # Extract response
-             full_response = outputs[0]['generated_text']
-
-             # Extract just the response part
-             response_parts = full_response.split("### Response:")
-             if len(response_parts) > 1:
-                 response = response_parts[1].strip()
-                 # Clean up any trailing text
-                 response = response.split("[Your Name]")[0].strip()
-                 response = response.split("---")[0].strip()
-                 return response
-             else:
-                 return full_response.strip()
-
-         except Exception as e:
-             logger.error(f"Error generating response: {e}")
-             return "I'm having trouble thinking right now. Can you try again?"
-
- # Create model manager
- model_manager = ModelManager()
-
- # Background response generation
- def generate_response_in_background(user_message):
-     response = model_manager.generate_response(user_message)
-     response_queue.put(response)
-
- # API model
- class MessageRequest(BaseModel):
-     message: str
-
- # Routes
- @app.get("/", response_class=HTMLResponse)
- async def read_root(request: Request):
-     return templates.TemplateResponse("index.html", {"request": request})
-
- @app.post("/generate")
- async def generate(message_request: MessageRequest):
-     user_message = message_request.message
-
-     # If model isn't loaded yet, start loading it
-     if not model_manager.is_loaded:
-         model_manager.load_model_in_background()
-         return {"response": "I'm just starting up. Please try again in a moment!"}
-
-     # Handle message generation
-     thread = Thread(target=generate_response_in_background, args=(user_message,))
-     thread.daemon = True
-     thread.start()
-
-     # Wait for response with timeout
-     try:
-         thread.join(timeout=30)  # 30 second timeout
-         if thread.is_alive():
-             # If still running after timeout, return a message
-             return {"response": "I'm thinking hard about this one! Try sending a simpler message or try again later."}
-
-         # Get response from queue if available
-         if not response_queue.empty():
-             response = response_queue.get()
-             return {"response": response}
-         else:
-             return {"response": "Sorry, I couldn't generate a response. Please try again."}
-     except Exception as e:
-         logger.error(f"Error in response generation: {e}")
-         return {"response": "Something went wrong. Please try again."}
-
- # Startup event
- @app.on_event("startup")
- async def startup_event():
-     # Start loading model in background at startup
-     model_manager.load_model_in_background()
-     logger.info("Starting model loading in background")
-
- # Shutdown event
- @app.on_event("shutdown")
- async def shutdown_event():
-     # Clean up resources
-     logger.info("Shutting down and cleaning up resources")
-     gc.collect()
-     if torch.cuda.is_available():
-         torch.cuda.empty_cache()
-
- if __name__ == "__main__":
-     import uvicorn
-     # Run the FastAPI app
-     uvicorn.run(app, host="0.0.0.0", port=7860)
 
+ from fastapi import FastAPI, Request, Form, BackgroundTasks
+ from fastapi.responses import HTMLResponse
+ from fastapi.templating import Jinja2Templates
+ from fastapi.staticfiles import StaticFiles
+ import torch
+ import os
+ import gc
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from typing import Optional
+ import time
+ import logging
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Set optimization variables
+ os.environ["OMP_NUM_THREADS"] = "8"
+ os.environ["MKL_NUM_THREADS"] = "8"
+ torch.set_num_threads(8)
+
+ # Initialize FastAPI
+ app = FastAPI()
+
+ # Load templates and static files
+ templates = Jinja2Templates(directory="")
+ app.mount("/static", StaticFiles(directory="static"), name="static")
+
+ # Disable gradient computation
+ torch.set_grad_enabled(False)
+
+ # Cache for responses
+ response_cache = {}
+
+ # Model initialization
+ model_id = "Gauri-tr/llama-3.1-8b-sarcasm"
+ tokenizer = None
+ model = None
+
+ # Load model in a lazy fashion
+ def load_model():
+     global model, tokenizer
+     if model is None:
+         logger.info("Loading model and tokenizer...")
+         start_time = time.time()
+
+         # Load tokenizer
+         tokenizer = AutoTokenizer.from_pretrained(model_id)
+         tokenizer.pad_token = tokenizer.eos_token
+
+         # Load model with optimizations
+         model = AutoModelForCausalLM.from_pretrained(
+             model_id,
+             torch_dtype=torch.float32,
+             device_map="cpu",
+             low_cpu_mem_usage=True,
+         )
+
+         # Set to evaluation mode
+         model.eval()
+
+         # Try to optimize with torch.compile if available
+         try:
+             import torch._dynamo
+             model = torch.compile(model, backend="inductor", fullgraph=True)
+             logger.info("Using torch.compile optimization")
+         except Exception as e:
+             logger.warning(f"Could not use torch.compile: {e}")
+
+         logger.info(f"Model loaded in {time.time() - start_time:.2f} seconds")
+
+         # Run a warmup inference
+         _ = generate_response("Hello", max_length=10)
+
+     return model, tokenizer
+
+ def generate_response(input_text: str, max_length: int = 30) -> str:
+     # Check cache first
+     cache_key = f"{input_text}_{max_length}"
+     if cache_key in response_cache:
+         logger.info("Using cached response")
+         return response_cache[cache_key]
+
+     # Format prompt
+     prompt = f"""Below is an instruction that describes a task, and an input that provides further context. Write a response that appropriately completes the request.
+
+ ### Instruction:
+ Respond to this message as if you were in a conversation. Be funny, sarcastic and smart.
+
+ ### Input:
+ {input_text}
+
+ ### Response:
+ """
+
+     # Ensure model is loaded
+     model, tokenizer = load_model()
+
+     # Tokenize
+     inputs = tokenizer(prompt, return_tensors="pt")
+
+     # Generate with optimization
+     start_time = time.time()
+     with torch.inference_mode():
+         outputs = model.generate(
+             inputs["input_ids"],
+             max_new_tokens=max_length,
+             do_sample=True,
+             temperature=0.8,
+             top_p=0.9,
+             repetition_penalty=1.2,
+             num_beams=1,  # single beam (no beam search) for speed; sampling is enabled above
+             pad_token_id=tokenizer.eos_token_id,
+             use_cache=True,  # Use KV cache
+         )
+
+     generation_time = time.time() - start_time
+     logger.info(f"Generated response in {generation_time:.2f} seconds")
+
+     # Decode
+     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+     # Extract response part
+     if "### Response:" in response:
+         response = response.split("### Response:")[1].strip()
+
+     # Cache the result
+     response_cache[cache_key] = response
+
+     # Make sure to clean up memory
+     gc.collect()
+
+     return response
+
+ # Define routes
+ @app.get("/", response_class=HTMLResponse)
+ async def index(request: Request):
+     # Load the model on the first request if it isn't loaded yet
+     if model is None:
+         load_model()
+     return templates.TemplateResponse("index.html", {"request": request})
+
+ @app.post("/chat/")
+ async def chat(message: str = Form(...), max_length: Optional[int] = Form(30)):
+     response = generate_response(message, max_length)
+     return {"response": response, "message": message}
+
+ # Health check endpoint
+ @app.get("/health")
+ async def health():
+     return {"status": "ok"}
+
+ # Preload model at startup
+ @app.on_event("startup")
+ async def startup_event():
+     # Just initialize the tokenizer at startup - model will load on first request
+     global tokenizer
+     if tokenizer is None:
+         logger.info("Pre-loading tokenizer...")
+         tokenizer = AutoTokenizer.from_pretrained(model_id)
+         tokenizer.pad_token = tokenizer.eos_token
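
Client-side note (not part of the commit): the updated app.py replaces the JSON /generate route with a form-encoded /chat/ route, adds a /health probe, and no longer includes a uvicorn.run(...) entry point. The sketch below shows one way a client could exercise the new endpoints; the base URL and port are assumptions (7860 is the usual Spaces port), so adjust them to wherever the Space is actually served.

# Hypothetical client sketch for the new endpoints; requires the `requests` package.
import requests

BASE_URL = "http://localhost:7860"  # assumed host/port; replace with the deployed Space URL

# /health returns {"status": "ok"} once the server is up.
print(requests.get(f"{BASE_URL}/health", timeout=10).json())

# /chat/ expects form fields (message, max_length), unlike the removed /generate
# route, which took a JSON body.
resp = requests.post(
    f"{BASE_URL}/chat/",
    data={"message": "How was your day?", "max_length": 30},
    timeout=300,  # the first call triggers the lazy model load, which is slow on CPU
)
print(resp.json()["response"])

Because load_model() runs synchronously inside the request handlers, the first /chat/ or / request after startup blocks until the 8B model has loaded, so a generous client timeout is advisable.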