Merge pull request #26 from mcdaqc/main
- .dockerignore +57 -0
- .env.local.template +54 -0
- .gitignore +4 -1
- README.md +23 -0
- app.py +2 -1
- docker-compose.yml +17 -0
- docker/.env.docker.template +43 -0
- docker/Dockerfile +45 -0
- docker/README.md +76 -0
- docker/argilla/compose.yml +118 -0
- docker/ollama/compose.yml +48 -0
- docker/ollama/entrypoint.sh +35 -0
- src/synthetic_dataset_generator/__init__.py +1 -4
- src/synthetic_dataset_generator/app.py +3 -0
- src/synthetic_dataset_generator/apps/base.py +3 -0
.dockerignore
ADDED
@@ -0,0 +1,57 @@
+# Version control
+.git
+.gitignore
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual environments
+.env*
+!.env.example
+.venv
+env/
+venv/
+ENV/
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# Testing
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Project specific
+nltk_data/
+.pdm-python
+.pdm.toml
+__pypackages__/
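Since `.env*` is ignored (with only `.env.example` allowed back in), local secrets never enter the build context. One way to sanity-check what the rules above actually exclude is to copy the context into a throwaway image and list it. This is a sketch, not part of the PR; the temporary Dockerfile path is arbitrary:

```bash
# Hypothetical helper: show what survives .dockerignore filtering
cat > /tmp/context-check.Dockerfile <<'EOF'
FROM busybox
COPY . /ctx
RUN find /ctx -maxdepth 2 | sort
EOF
docker build -f /tmp/context-check.Dockerfile --no-cache --progress=plain .
```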
.env.local.template
ADDED
@@ -0,0 +1,54 @@
+# =============================================================================
+# LOCAL/API CONFIGURATION
+# =============================================================================
+
+# -----------------------------------------------------------------------------
+# REQUIRED CONFIGURATION
+# -----------------------------------------------------------------------------
+# Hugging Face token (required for all setups)
+HF_TOKEN=hf_...
+
+# Generation Settings
+MAX_NUM_TOKENS=2048
+MAX_NUM_ROWS=1000
+DEFAULT_BATCH_SIZE=5
+
+# Required for chat data generation with Llama or Qwen models
+# Options: "llama3", "qwen2", or custom template string
+MAGPIE_PRE_QUERY_TEMPLATE=llama3
+
+# -----------------------------------------------------------------------------
+# A. CLOUD API SERVICES
+# -----------------------------------------------------------------------------
+
+# 1. HUGGING FACE INFERENCE API (Default, Recommended)
+MODEL=meta-llama/Llama-3.1-8B-Instruct
+# MODEL=Qwen/Qwen2.5-1.5B-Instruct
+
+# 2. OPENAI API
+# OPENAI_BASE_URL=https://api.openai.com/v1/
+# MODEL=gpt-4
+# API_KEY=sk-...
+
+# 3. HUGGING FACE SPACE FOR ARGILLA (optional)
+# ARGILLA_API_URL=https://your-space.hf.space/
+# ARGILLA_API_KEY=your_key
+
+# -----------------------------------------------------------------------------
+# B. LOCAL SERVICES (Requires Installation)
+# -----------------------------------------------------------------------------
+
+# 1. LOCAL OLLAMA
+# OLLAMA_BASE_URL=http://127.0.0.1:11434/
+# MODEL=llama3.2:1b
+# TOKENIZER_ID=meta-llama/Llama-3.2-1B-Instruct
+
+# 2. LOCAL VLLM
+# VLLM_BASE_URL=http://127.0.0.1:8000/
+# MODEL=Qwen/Qwen2.5-1.5B-Instruct
+# TOKENIZER_ID=Qwen/Qwen2.5-1.5B-Instruct
+
+# 3. LOCAL TGI
+# HUGGINGFACE_BASE_URL=http://127.0.0.1:3000/
+# MODEL=meta-llama/Llama-3.1-8B-Instruct
+# TOKENIZER_ID=meta-llama/Llama-3.1-8B-Instruct
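Whichever local backend (section B) you uncomment, it is worth confirming that the endpoint answers before launching the app. A sketch using the ports from the template above; both routes are part of the respective public APIs:

```bash
# Ollama: list locally pulled models
curl -s http://127.0.0.1:11434/api/tags
# vLLM (OpenAI-compatible): list served models
curl -s http://127.0.0.1:8000/v1/models
```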
.gitignore
CHANGED
@@ -167,4 +167,7 @@ cython_debug/
 nltk_data/
 
 # examples
-models/
+models/
+
+# Elasticsearch data
+elasticsearch_data/
README.md
CHANGED
@@ -108,6 +108,12 @@ To save the generated datasets to a local directory instead of pushing them to t
 
 - `SAVE_LOCAL_DIR`: The local directory to save the generated datasets to.
 
+You can use our environment template as a starting point:
+
+```bash
+cp .env.local.template .env
+```
+
 ### Argilla integration
 
 Argilla is an open source tool for data curation. It allows you to annotate and review datasets, and push curated datasets to the Hugging Face Hub. You can easily get started with Argilla by following the [quickstart guide](https://docs.argilla.io/latest/getting_started/quickstart/).
@@ -138,3 +144,20 @@ Run the app:
 ```bash
 python app.py
 ```
+
+## 🐳 Docker Setup
+
+Quick setup with all services (App + Ollama + Argilla):
+
+```bash
+# Copy environment template
+cp docker/.env.docker.template .env  # Add your HF_TOKEN in .env
+
+# Build all services (this may take a few minutes)
+docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml build
+
+# Start all services
+docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d
+```
+
+> For more detailed Docker configurations and setups, check [docker/README.md](docker/README.md)
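After `up -d` the stack needs a moment to become healthy, and on first start Ollama also has to pull the model. A sketch for watching progress, reusing the same compose file list:

```bash
COMPOSE_FILES="-f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml"
docker compose $COMPOSE_FILES ps             # STATUS column turns (healthy) when ready
docker compose $COMPOSE_FILES logs -f ollama # follow the model download
```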
app.py
CHANGED
@@ -1,3 +1,4 @@
 from synthetic_dataset_generator import launch
 
-
+if __name__ == "__main__":
+    launch()
docker-compose.yml
ADDED
@@ -0,0 +1,17 @@
+services:
+  app:
+    build:
+      context: .
+      dockerfile: docker/Dockerfile
+    image: synthetic-data-generator:app
+    ports:
+      - "7860:7860"
+    env_file:
+      - .env
+    networks:
+      - app-network
+
+networks:
+  app-network:
+    name: synthetic-data-network
+    driver: bridge
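Compose merges this base file with any overlay files passed via `-f`, with later files winning on conflicts. To inspect and validate the effective merged configuration, a sketch:

```bash
docker compose -f docker-compose.yml -f docker/ollama/compose.yml config
```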
docker/.env.docker.template
ADDED
@@ -0,0 +1,43 @@
+# =============================================================================
+# DOCKER CONFIGURATION ONLY - FULL SETUP (APP + OLLAMA + ARGILLA)
+# =============================================================================
+
+# Note: Before building:
+# 1. Copy this template to the root directory: cp docker/.env.docker.template .env
+# 2. Comment/uncomment the sections you want to use (OLLAMA and/or ARGILLA)
+# 3. Then build and run with the appropriate docker compose command
+
+# Hugging Face token with read/write permissions
+HF_TOKEN=your_token_here
+
+# -----------------------------------------------------------------------------
+# GENERATION SETTINGS
+# -----------------------------------------------------------------------------
+MAX_NUM_TOKENS=2048
+MAX_NUM_ROWS=1000
+DEFAULT_BATCH_SIZE=5
+
+# -----------------------------------------------------------------------------
+# OLLAMA DOCKER CONFIGURATION
+# -----------------------------------------------------------------------------
+OLLAMA_BASE_URL=http://ollama:11434
+OLLAMA_HARDWARE=latest  # latest (for CPU/NVIDIA), rocm (for AMD)
+
+# LLAMA 3.2
+MODEL=llama3.2:1b
+TOKENIZER_ID=meta-llama/Llama-3.2-1B-Instruct
+MAGPIE_PRE_QUERY_TEMPLATE=llama3
+
+# DEEPSEEK R1
+#MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+#TOKENIZER_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+#MAGPIE_PRE_QUERY_TEMPLATE= "<|begin▁of▁sentence|>User: "
+
+# -----------------------------------------------------------------------------
+# ARGILLA DOCKER CONFIGURATION (persistent data)
+# -----------------------------------------------------------------------------
+ARGILLA_API_URL=http://argilla:6900
+ARGILLA_USERNAME=admin
+ARGILLA_PASSWORD=admin1234
+ARGILLA_API_KEY=admin.1234
+ARGILLA_REINDEX_DATASET=1
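Since both the app and the Ollama entrypoint read `MODEL` and `TOKENIZER_ID` from this file, a quick way to confirm which values are active after commenting/uncommenting sections is to filter out the comment lines. A sketch:

```bash
# Show the active (uncommented) model settings the containers will receive
grep -v '^#' .env | grep -E 'MODEL|TOKENIZER_ID|MAGPIE'
```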
docker/Dockerfile
ADDED
@@ -0,0 +1,45 @@
+# Use Python slim image as base
+FROM python:3.10-slim
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PIP_NO_CACHE_DIR=1
+
+# Create and set working directory
+WORKDIR /app
+
+# Create non-root user first
+RUN useradd -m -u 1000 appuser
+
+# Install system dependencies including build tools
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    build-essential \
+    cmake \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install pdm
+RUN pip install --no-cache-dir pdm
+
+# Copy project files and set permissions
+COPY . .
+RUN chown -R appuser:appuser /app && \
+    chmod -R 755 /app
+
+# Switch to non-root user
+USER appuser
+
+# Install dependencies in a virtual environment
+RUN pdm install --prod --frozen-lockfile
+
+# Expose Gradio port
+EXPOSE 7860
+
+# Start command using pdm run to use the virtual environment
+CMD ["pdm", "run", "python", "-m", "synthetic_dataset_generator"]
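The image can also be built and run without Compose, e.g. for debugging. A sketch, run from the repository root so the build context matches the compose setup:

```bash
docker build -f docker/Dockerfile -t synthetic-data-generator:app .
docker run --rm -p 7860:7860 --env-file .env synthetic-data-generator:app
```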
docker/README.md
ADDED
@@ -0,0 +1,76 @@
+# Docker Configuration Guide
+
+The application can be run with different configurations using Docker Compose:
+
+- `docker-compose.yml`: Core application
+- `docker/ollama/compose.yml`: Ollama service for local LLM inference
+- `docker/argilla/compose.yml`: Argilla service for data curation
+
+## Ollama Integration
+
+The `MODEL` variable in your `.env` file determines which model Ollama will download and use. For example:
+```env
+MODEL=llama3.2:1b
+```
+
+## Setup Options
+
+### Full Setup (App + Ollama + Argilla)
+```bash
+# Keep all sections uncommented in .env
+docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml build
+docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d
+```
+
+### App + Ollama
+```bash
+# Comment out ARGILLA section in .env
+docker compose -f docker-compose.yml -f docker/ollama/compose.yml build
+docker compose -f docker-compose.yml -f docker/ollama/compose.yml up -d
+```
+
+### App + Argilla
+```bash
+# Comment out OLLAMA section in .env
+docker compose -f docker-compose.yml -f docker/argilla/compose.yml build
+docker compose -f docker-compose.yml -f docker/argilla/compose.yml up -d
+```
+
+### App Only
+```bash
+# Comment out both OLLAMA and ARGILLA sections in .env
+docker compose -f docker-compose.yml build
+docker compose -f docker-compose.yml up -d
+```
+
+## Managing Services
+
+Services are built separately but are linked together. If you already have some services built and want to add another:
+
+1. You don't need to rebuild existing services
+2. Just build the new service
+3. Stop everything with `down` and start again with `up`
+
+For example, if you have App + Ollama and want to add Argilla:
+```bash
+docker compose -f docker/argilla/compose.yml build  # only build Argilla
+docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml down
+docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d
+```
+
+Similarly, if you have built all services but want to run only some of them:
+> **Important**: When running specific services, remember to comment out unused services in `.env` first
+
+```bash
+# No need to build again, just start the services you need
+docker compose -f docker-compose.yml -f docker/ollama/compose.yml up -d  # start only App + Ollama
+```
+
+## Service URLs
+
+Once running, access the services at:
+- App: http://localhost:7860
+- Argilla: http://localhost:6900 (if enabled)
+- Ollama: http://localhost:11434 (if enabled)
+
+> Note: Services will be available after a few seconds while they initialize. Ollama models and Argilla datasets are persisted and available after restarts
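One operation the guide does not spell out is a full teardown. Because the volumes are declared in the compose files, adding `-v` to `down` removes them along with the containers; note this deletes pulled Ollama models and Argilla data. A sketch:

```bash
docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml down -v
```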
docker/argilla/compose.yml
ADDED
@@ -0,0 +1,118 @@
+services:
+  app:
+    extends:
+      file: docker-compose.yml
+      service: app
+    depends_on:
+      argilla:
+        condition: service_healthy
+        required: false
+    environment:
+      - ARGILLA_API_URL=http://argilla:6900
+
+  elasticsearch:
+    image: docker.elastic.co/elasticsearch/elasticsearch:8.17.0
+    environment:
+      - ES_JAVA_OPTS=-Xms512m -Xmx512m
+      - node.name=elasticsearch
+      - cluster.name=es-argilla-local
+      - discovery.type=single-node
+      - cluster.routing.allocation.disk.threshold_enabled=false
+      - xpack.security.enabled=false
+    volumes:
+      - es_data:/usr/share/elasticsearch/data
+    networks:
+      - app-network
+    ports:
+      - "9200:9200"
+      - "9300:9300"
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+      nofile:
+        soft: 65536
+        hard: 65536
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:9200"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+
+  postgres:
+    image: postgres:14
+    environment:
+      POSTGRES_USER: postgres
+      POSTGRES_PASSWORD: postgres
+      POSTGRES_DB: argilla
+    networks:
+      - app-network
+    volumes:
+      - postgres_data:/var/lib/postgresql/data
+
+  redis:
+    image: redis
+    networks:
+      - app-network
+
+  argilla:
+    image: argilla/argilla-server:latest
+    ports:
+      - "6900:6900"
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:6900/api/ready"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+    env_file:
+      - .env
+    environment:
+      - ARGILLA_HOME_PATH=/var/lib/argilla
+      - ARGILLA_ELASTICSEARCH=http://elasticsearch:9200
+      - ARGILLA_DATABASE_URL=postgresql+asyncpg://postgres:postgres@postgres:5432/argilla
+      - ARGILLA_REDIS_URL=redis://redis:6379/0
+      - USERNAME=${ARGILLA_USERNAME}
+      - PASSWORD=${ARGILLA_PASSWORD}
+      - API_KEY=${ARGILLA_API_KEY}
+      - WORKSPACE=default
+    volumes:
+      - argilla_data:/argilla
+    networks:
+      - app-network
+    depends_on:
+      elasticsearch:
+        condition: service_healthy
+      postgres:
+        condition: service_started
+      redis:
+        condition: service_started
+
+  worker:
+    image: argilla/argilla-server:latest
+    env_file:
+      - .env
+    environment:
+      - ARGILLA_HOME_PATH=/var/lib/argilla
+      - ARGILLA_ELASTICSEARCH=http://elasticsearch:9200
+      - ARGILLA_DATABASE_URL=postgresql+asyncpg://postgres:postgres@postgres:5432/argilla
+      - ARGILLA_REDIS_URL=redis://redis:6379/0
+      - BACKGROUND_NUM_WORKERS=2
+      - USERNAME=${ARGILLA_USERNAME}
+      - PASSWORD=${ARGILLA_PASSWORD}
+      - API_KEY=${ARGILLA_API_KEY}
+      - WORKSPACE=default
+    networks:
+      - app-network
+    depends_on:
+      - postgres
+      - elasticsearch
+      - redis
+    command: sh -c 'python -m argilla_server worker --num-workers $${BACKGROUND_NUM_WORKERS}'
+
+volumes:
+  es_data:
+    name: synthetic-data-es
+  argilla_data:
+    name: synthetic-data-argilla
+  postgres_data:
+    name: synthetic-data-postgres
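The readiness probes that these healthchecks run inside the containers can also be issued from the host through the published ports; a sketch:

```bash
curl -f http://localhost:9200            # Elasticsearch node is up
curl -f http://localhost:6900/api/ready  # Argilla readiness endpoint
```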
docker/ollama/compose.yml
ADDED
@@ -0,0 +1,48 @@
+services:
+  app:
+    extends:
+      file: docker-compose.yml
+      service: app
+    depends_on:
+      ollama:
+        condition: service_healthy
+        required: true
+    environment:
+      - OLLAMA_BASE_URL=http://ollama:11434
+
+  ollama:
+    image: ollama/ollama:${OLLAMA_HARDWARE:-latest}
+    ports:
+      - "11434:11434"
+    env_file:
+      - .env
+    environment:
+      - OLLAMA_BASE_URL=${OLLAMA_BASE_URL:-}
+    volumes:
+      - ollama_data:/root/.ollama
+      - ./docker/ollama/entrypoint.sh:/entrypoint.sh
+    networks:
+      - app-network
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    tty: true
+    entrypoint: ["/usr/bin/bash", "/entrypoint.sh"]
+    healthcheck:
+      test:
+        - "CMD-SHELL"
+        - |
+          test -f /tmp/ollama_ready && \
+          bash -c '</dev/tcp/localhost/11434'
+      interval: 10s
+      timeout: 10s
+      retries: 100
+      start_period: 10s
+
+volumes:
+  ollama_data:
+    name: synthetic-data-ollama
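The two-part healthcheck gates the app on both the entrypoint's ready marker (model pulled) and the API port accepting connections. The same checks run by hand, as a sketch:

```bash
docker compose -f docker-compose.yml -f docker/ollama/compose.yml \
  exec ollama test -f /tmp/ollama_ready && echo "model ready"
curl -s http://localhost:11434/api/version  # port check; /api/version is a standard Ollama route
```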
docker/ollama/entrypoint.sh
ADDED
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+# Start Ollama in the background
+/bin/ollama serve &
+# Record Process ID
+pid=$!
+
+# Pause for Ollama to start
+sleep 5
+
+# Extract model name from MODEL variable (removing quotes if present)
+MODEL_NAME=$(echo $MODEL | tr -d '"')
+
+# Verify that MODEL_NAME has a value
+if [ -z "$MODEL_NAME" ]; then
+    echo "❌ No model specified in MODEL environment variable"
+else
+    # Check if model exists
+    if ollama list | grep -q "$MODEL_NAME"; then
+        echo "🟢 Model ($MODEL_NAME) already installed"
+        touch /tmp/ollama_ready
+    else
+        echo "🔴 Retrieving model ($MODEL_NAME)..."
+        # Try to pull the model; only create the ready marker once the download is confirmed
+        if ollama pull "$MODEL_NAME" 2>/dev/null && ollama list | grep -q "$MODEL_NAME"; then
+            echo "🟢 Model download complete!"
+            touch /tmp/ollama_ready
+        else
+            echo "❌ Error downloading model ($MODEL_NAME)"
+        fi
+    fi
+fi
+
+# Wait for Ollama process to finish
+wait $pid
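If the ollama service never turns healthy, the 🔴/🟢 messages from this script point at the failure. A sketch for inspecting a running container:

```bash
docker compose -f docker-compose.yml -f docker/ollama/compose.yml logs ollama               # entrypoint output
docker compose -f docker-compose.yml -f docker/ollama/compose.yml exec ollama ollama list  # pulled models
```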
src/synthetic_dataset_generator/__init__.py
CHANGED
@@ -1,5 +1,4 @@
 import inspect
-
 from gradio import TabbedInterface
 
 from synthetic_dataset_generator import ( # noqa
@@ -7,15 +6,13 @@ from synthetic_dataset_generator import ( # noqa
     _inference_endpoints,
 )
 
-
 def launch(*args, **kwargs):
     """Launch the synthetic dataset generator.
     Based on the `TabbedInterface` from Gradio.
     Parameters: https://www.gradio.app/docs/gradio/tabbedinterface
     """
     from synthetic_dataset_generator.app import demo
-
-    return demo.launch(*args, **kwargs)
+    return demo.launch(*args, server_name="0.0.0.0", **kwargs)
 
 
 launch.__doc__ = TabbedInterface.launch.__doc__
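Binding to `0.0.0.0` is what makes the Gradio server reachable through the container's published port; bound to the default `127.0.0.1`, it would only answer inside the container. A quick check from the host, as a sketch:

```bash
curl -s -o /dev/null -w '%{http_code}\n' http://localhost:7860  # expect 200 once the app is up
```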
src/synthetic_dataset_generator/app.py
CHANGED
@@ -17,6 +17,9 @@ button[role="tab"][aria-selected="true"]:hover {border-color: var(--button-prima
 .table-wrap .tbody td {vertical-align: top}
 #system_prompt_examples {color: var(--body-text-color) !important; background-color: var(--block-background-fill) !important;}
 .container {padding-inline: 0 !important}
+.gradio-container { width: 100% !important; }
+.gradio-row { display: flex !important; flex-direction: row !important; }
+.gradio-column { flex: 1 !important; min-width: 0 !important; }
 #sign_in_button {flex-grow: 0; width: auto !important; display: flex; align-items: center; justify-content: center; margin: 0 auto;}
 .datasets {height: 70px;}
 """
src/synthetic_dataset_generator/apps/base.py
CHANGED
@@ -131,6 +131,9 @@ def show_success_message(org_name: str, repo_name: str) -> gr.Markdown:
         max_height=None,
     )
     argilla_api_url = client.api_url
+    # Transform Docker internal URL to localhost if needed
+    if "argilla:" in argilla_api_url:
+        argilla_api_url = argilla_api_url.replace("argilla:", "127.0.0.1:")
     return gr.Markdown(
         value=f"""
         <div style="padding: 1em; background-color: var(--block-background-fill); border-color: var(--border-color-primary); border-width: 1px; border-radius: 5px;">