Merge pull request #26 from mcdaqc/main
- .dockerignore +57 -0
- .env.local.template +54 -0
- .gitignore +4 -1
- README.md +23 -0
- app.py +2 -1
- docker-compose.yml +17 -0
- docker/.env.docker.template +43 -0
- docker/Dockerfile +45 -0
- docker/README.md +76 -0
- docker/argilla/compose.yml +118 -0
- docker/ollama/compose.yml +48 -0
- docker/ollama/entrypoint.sh +35 -0
- src/synthetic_dataset_generator/__init__.py +1 -4
- src/synthetic_dataset_generator/app.py +3 -0
- src/synthetic_dataset_generator/apps/base.py +3 -0
.dockerignore
ADDED
@@ -0,0 +1,57 @@
+# Version control
+.git
+.gitignore
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual environments
+.env*
+!.env.example
+.venv
+env/
+venv/
+ENV/
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# Testing
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Project specific
+nltk_data/
+.pdm-python
+.pdm.toml
+__pypackages__/
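Since `.env*` is ignored (with only `.env.example` allowed back in), local secrets never enter the build context. One way to sanity-check what the rules above actually exclude is to copy the context into a throwaway image and list it. This is a sketch, not part of the PR; the temporary Dockerfile path is arbitrary:

```bash
# Hypothetical helper: show what survives .dockerignore filtering
cat > /tmp/context-check.Dockerfile <<'EOF'
FROM busybox
COPY . /ctx
RUN find /ctx -maxdepth 2 | sort
EOF
docker build -f /tmp/context-check.Dockerfile --no-cache --progress=plain .
```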
.env.local.template
ADDED
@@ -0,0 +1,54 @@
+# =============================================================================
+# LOCAL/API CONFIGURATION
+# =============================================================================
+
+# -----------------------------------------------------------------------------
+# REQUIRED CONFIGURATION
+# -----------------------------------------------------------------------------
+# Hugging Face token (required for all setups)
+HF_TOKEN=hf_...
+
+# Generation Settings
+MAX_NUM_TOKENS=2048
+MAX_NUM_ROWS=1000
+DEFAULT_BATCH_SIZE=5
+
+# Required for chat data generation with Llama or Qwen models
+# Options: "llama3", "qwen2", or custom template string
+MAGPIE_PRE_QUERY_TEMPLATE=llama3
+
+# -----------------------------------------------------------------------------
+# A. CLOUD API SERVICES
+# -----------------------------------------------------------------------------
+
+# 1. HUGGING FACE INFERENCE API (Default, Recommended)
+MODEL=meta-llama/Llama-3.1-8B-Instruct
+# MODEL=Qwen/Qwen2.5-1.5B-Instruct
+
+# 2. OPENAI API
+# OPENAI_BASE_URL=https://api.openai.com/v1/
+# MODEL=gpt-4
+# API_KEY=sk-...
+
+# 3. HUGGING FACE SPACE FOR ARGILLA (optional)
+# ARGILLA_API_URL=https://your-space.hf.space/
+# ARGILLA_API_KEY=your_key
+
+# -----------------------------------------------------------------------------
+# B. LOCAL SERVICES (Requires Installation)
+# -----------------------------------------------------------------------------
+
+# 1. LOCAL OLLAMA
+# OLLAMA_BASE_URL=http://127.0.0.1:11434/
+# MODEL=llama3.2:1b
+# TOKENIZER_ID=meta-llama/Llama-3.2-1B-Instruct
+
+# 2. LOCAL VLLM
+# VLLM_BASE_URL=http://127.0.0.1:8000/
+# MODEL=Qwen/Qwen2.5-1.5B-Instruct
+# TOKENIZER_ID=Qwen/Qwen2.5-1.5B-Instruct
+
+# 3. LOCAL TGI
+# HUGGINGFACE_BASE_URL=http://127.0.0.1:3000/
+# MODEL=meta-llama/Llama-3.1-8B-Instruct
+# TOKENIZER_ID=meta-llama/Llama-3.1-8B-Instruct
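Whichever local backend (section B) you uncomment, it is worth confirming that the endpoint answers before launching the app. A sketch using the ports from the template above; both routes are part of the respective public APIs:

```bash
# Ollama: list locally pulled models
curl -s http://127.0.0.1:11434/api/tags
# vLLM (OpenAI-compatible): list served models
curl -s http://127.0.0.1:8000/v1/models
```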
.gitignore
CHANGED
@@ -167,4 +167,7 @@ cython_debug/
 nltk_data/
 
 # examples
-models/
+models/
+
+# Elasticsearch data
+elasticsearch_data/
README.md
CHANGED
@@ -108,6 +108,12 @@ To save the generated datasets to a local directory instead of pushing them to t
 
 - `SAVE_LOCAL_DIR`: The local directory to save the generated datasets to.
 
+You can use our environment template as a starting point:
+
+```bash
+cp .env.local.template .env
+```
+
 ### Argilla integration
 
 Argilla is an open source tool for data curation. It allows you to annotate and review datasets, and push curated datasets to the Hugging Face Hub. You can easily get started with Argilla by following the [quickstart guide](https://docs.argilla.io/latest/getting_started/quickstart/).
@@ -138,3 +144,20 @@ Run the app:
 ```bash
 python app.py
 ```
+
+## 🐳 Docker Setup
+
+Quick setup with all services (App + Ollama + Argilla):
+
+```bash
+# Copy environment template
+cp docker/.env.docker.template .env  # Add your HF_TOKEN in .env
+
+# Build all services (this may take a few minutes)
+docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml build
+
+# Start all services
+docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d
+```
+
+> For more detailed Docker configurations and setups, check [docker/README.md](docker/README.md)
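After `up -d` the stack needs a moment to become healthy, and on first start Ollama also has to pull the model. A sketch for watching progress, reusing the same compose file list:

```bash
COMPOSE_FILES="-f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml"
docker compose $COMPOSE_FILES ps             # STATUS column turns (healthy) when ready
docker compose $COMPOSE_FILES logs -f ollama # follow the model download
```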
app.py
CHANGED
@@ -1,3 +1,4 @@
 from synthetic_dataset_generator import launch
 
-
+if __name__ == "__main__":
+    launch()
docker-compose.yml
ADDED
@@ -0,0 +1,17 @@
+services:
+  app:
+    build:
+      context: .
+      dockerfile: docker/Dockerfile
+    image: synthetic-data-generator:app
+    ports:
+      - "7860:7860"
+    env_file:
+      - .env
+    networks:
+      - app-network
+
+networks:
+  app-network:
+    name: synthetic-data-network
+    driver: bridge
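Compose merges this base file with any overlay files passed via `-f`, with later files winning on conflicts. To inspect and validate the effective merged configuration, a sketch:

```bash
docker compose -f docker-compose.yml -f docker/ollama/compose.yml config
```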
docker/.env.docker.template
ADDED
@@ -0,0 +1,43 @@
+# =============================================================================
+# DOCKER CONFIGURATION ONLY - FULL SETUP (APP + OLLAMA + ARGILLA)
+# =============================================================================
+
+# Note: Before building:
+# 1. Copy this template to the root directory: cp docker/.env.docker.template .env
+# 2. Comment/uncomment the sections you want to use (OLLAMA and/or ARGILLA)
+# 3. Then build and run with the appropriate docker compose command
+
+# Hugging Face token with read/write permissions
+HF_TOKEN=your_token_here
+
+# -----------------------------------------------------------------------------
+# GENERATION SETTINGS
+# -----------------------------------------------------------------------------
+MAX_NUM_TOKENS=2048
+MAX_NUM_ROWS=1000
+DEFAULT_BATCH_SIZE=5
+
+# -----------------------------------------------------------------------------
+# OLLAMA DOCKER CONFIGURATION
+# -----------------------------------------------------------------------------
+OLLAMA_BASE_URL=http://ollama:11434
+OLLAMA_HARDWARE=latest  # latest (for CPU/NVIDIA), rocm (for AMD)
+
+# LLAMA 3.2
+MODEL=llama3.2:1b
+TOKENIZER_ID=meta-llama/Llama-3.2-1B-Instruct
+MAGPIE_PRE_QUERY_TEMPLATE=llama3
+
+# DEEPSEEK R1
+#MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+#TOKENIZER_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+#MAGPIE_PRE_QUERY_TEMPLATE= "<|begin▁of▁sentence|>User: "
+
+# -----------------------------------------------------------------------------
+# ARGILLA DOCKER CONFIGURATION (persistent data)
+# -----------------------------------------------------------------------------
+ARGILLA_API_URL=http://argilla:6900
+ARGILLA_USERNAME=admin
+ARGILLA_PASSWORD=admin1234
+ARGILLA_API_KEY=admin.1234
+ARGILLA_REINDEX_DATASET=1
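Since both the app and the Ollama entrypoint read `MODEL` and `TOKENIZER_ID` from this file, a quick way to confirm which values are active after commenting/uncommenting sections is to filter out the comment lines. A sketch:

```bash
# Show the active (uncommented) model settings the containers will receive
grep -v '^#' .env | grep -E 'MODEL|TOKENIZER_ID|MAGPIE'
```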
docker/Dockerfile
ADDED
@@ -0,0 +1,45 @@
+# Use Python slim image as base
+FROM python:3.10-slim
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PIP_NO_CACHE_DIR=1
+
+# Create and set working directory
+WORKDIR /app
+
+# Create non-root user first
+RUN useradd -m -u 1000 appuser
+
+# Install system dependencies including build tools
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    build-essential \
+    cmake \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install pdm
+RUN pip install --no-cache-dir pdm
+
+# Copy project files and set permissions
+COPY . .
+RUN chown -R appuser:appuser /app && \
+    chmod -R 755 /app
+
+# Switch to non-root user
+USER appuser
+
+# Install dependencies in a virtual environment
+RUN pdm install --prod --frozen-lockfile
+
+# Expose Gradio port
+EXPOSE 7860
+
+# Start command using pdm run to use the virtual environment
+CMD ["pdm", "run", "python", "-m", "synthetic_dataset_generator"]
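The image can also be built and run without Compose, e.g. for debugging. A sketch, run from the repository root so the build context matches the compose setup:

```bash
docker build -f docker/Dockerfile -t synthetic-data-generator:app .
docker run --rm -p 7860:7860 --env-file .env synthetic-data-generator:app
```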
docker/README.md
ADDED
@@ -0,0 +1,76 @@
+# Docker Configuration Guide
+
+The application can be run with different configurations using Docker Compose:
+
+- `docker-compose.yml`: Core application
+- `docker/ollama/compose.yml`: Ollama service for local LLM inference
+- `docker/argilla/compose.yml`: Argilla service for data curation
+
+## Ollama Integration
+
+The `MODEL` variable in your `.env` file determines which model Ollama will download and use. For example:
+```env
+MODEL=llama3.2:1b
+```
+
+## Setup Options
+
+### Full Setup (App + Ollama + Argilla)
+```bash
+# Keep all sections uncommented in .env
+docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml build
+docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d
+```
+
+### App + Ollama
+```bash
+# Comment out ARGILLA section in .env
+docker compose -f docker-compose.yml -f docker/ollama/compose.yml build
+docker compose -f docker-compose.yml -f docker/ollama/compose.yml up -d
+```
+
+### App + Argilla
+```bash
+# Comment out OLLAMA section in .env
+docker compose -f docker-compose.yml -f docker/argilla/compose.yml build
+docker compose -f docker-compose.yml -f docker/argilla/compose.yml up -d
+```
+
+### App Only
+```bash
+# Comment out both OLLAMA and ARGILLA sections in .env
+docker compose -f docker-compose.yml build
+docker compose -f docker-compose.yml up -d
+```
+
+## Managing Services
+
+Services are built separately but are linked together. If you already have some services built and want to add another:
+
+1. You don't need to rebuild existing services
+2. Just build the new service
+3. Stop everything with `down` and start again with `up`
+
+For example, if you have App + Ollama and want to add Argilla:
+```bash
+docker compose -f docker/argilla/compose.yml build  # only build Argilla
+docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml down
+docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml up -d
+```
+
+Similarly, if you have built all services but want to run only some of them:
+> **Important**: When running specific services, remember to comment out unused services in `.env` first
+
+```bash
+# No need to build again, just start the services you need
+docker compose -f docker-compose.yml -f docker/ollama/compose.yml up -d  # start only App + Ollama
+```
+
+## Service URLs
+
+Once running, access the services at:
+- App: http://localhost:7860
+- Argilla: http://localhost:6900 (if enabled)
+- Ollama: http://localhost:11434 (if enabled)
+
+> Note: Services will be available after a few seconds while they initialize. Ollama models and Argilla datasets are persisted and available after restarts
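One operation the guide does not spell out is a full teardown. Because the volumes are declared in the compose files, adding `-v` to `down` removes them along with the containers; note this deletes pulled Ollama models and Argilla data. A sketch:

```bash
docker compose -f docker-compose.yml -f docker/ollama/compose.yml -f docker/argilla/compose.yml down -v
```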
docker/argilla/compose.yml
ADDED
@@ -0,0 +1,118 @@
+services:
+  app:
+    extends:
+      file: docker-compose.yml
+      service: app
+    depends_on:
+      argilla:
+        condition: service_healthy
+        required: false
+    environment:
+      - ARGILLA_API_URL=http://argilla:6900
+
+  elasticsearch:
+    image: docker.elastic.co/elasticsearch/elasticsearch:8.17.0
+    environment:
+      - ES_JAVA_OPTS=-Xms512m -Xmx512m
+      - node.name=elasticsearch
+      - cluster.name=es-argilla-local
+      - discovery.type=single-node
+      - cluster.routing.allocation.disk.threshold_enabled=false
+      - xpack.security.enabled=false
+    volumes:
+      - es_data:/usr/share/elasticsearch/data
+    networks:
+      - app-network
+    ports:
+      - "9200:9200"
+      - "9300:9300"
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+      nofile:
+        soft: 65536
+        hard: 65536
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:9200"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+
+  postgres:
+    image: postgres:14
+    environment:
+      POSTGRES_USER: postgres
+      POSTGRES_PASSWORD: postgres
+      POSTGRES_DB: argilla
+    networks:
+      - app-network
+    volumes:
+      - postgres_data:/var/lib/postgresql/data
+
+  redis:
+    image: redis
+    networks:
+      - app-network
+
+  argilla:
+    image: argilla/argilla-server:latest
+    ports:
+      - "6900:6900"
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:6900/api/ready"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+    env_file:
+      - .env
+    environment:
+      - ARGILLA_HOME_PATH=/var/lib/argilla
+      - ARGILLA_ELASTICSEARCH=http://elasticsearch:9200
+      - ARGILLA_DATABASE_URL=postgresql+asyncpg://postgres:postgres@postgres:5432/argilla
+      - ARGILLA_REDIS_URL=redis://redis:6379/0
+      - USERNAME=${ARGILLA_USERNAME}
+      - PASSWORD=${ARGILLA_PASSWORD}
+      - API_KEY=${ARGILLA_API_KEY}
+      - WORKSPACE=default
+    volumes:
+      - argilla_data:/argilla
+    networks:
+      - app-network
+    depends_on:
+      elasticsearch:
+        condition: service_healthy
+      postgres:
+        condition: service_started
+      redis:
+        condition: service_started
+
+  worker:
+    image: argilla/argilla-server:latest
+    env_file:
+      - .env
+    environment:
+      - ARGILLA_HOME_PATH=/var/lib/argilla
+      - ARGILLA_ELASTICSEARCH=http://elasticsearch:9200
+      - ARGILLA_DATABASE_URL=postgresql+asyncpg://postgres:postgres@postgres:5432/argilla
+      - ARGILLA_REDIS_URL=redis://redis:6379/0
+      - BACKGROUND_NUM_WORKERS=2
+      - USERNAME=${ARGILLA_USERNAME}
+      - PASSWORD=${ARGILLA_PASSWORD}
+      - API_KEY=${ARGILLA_API_KEY}
+      - WORKSPACE=default
+    networks:
+      - app-network
+    depends_on:
+      - postgres
+      - elasticsearch
+      - redis
+    command: sh -c 'python -m argilla_server worker --num-workers $${BACKGROUND_NUM_WORKERS}'
+
+volumes:
+  es_data:
+    name: synthetic-data-es
+  argilla_data:
+    name: synthetic-data-argilla
+  postgres_data:
+    name: synthetic-data-postgres
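The readiness probes that these healthchecks run inside the containers can also be issued from the host through the published ports; a sketch:

```bash
curl -f http://localhost:9200            # Elasticsearch node is up
curl -f http://localhost:6900/api/ready  # Argilla readiness endpoint
```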
docker/ollama/compose.yml
ADDED
@@ -0,0 +1,48 @@
+services:
+  app:
+    extends:
+      file: docker-compose.yml
+      service: app
+    depends_on:
+      ollama:
+        condition: service_healthy
+        required: true
+    environment:
+      - OLLAMA_BASE_URL=http://ollama:11434
+
+  ollama:
+    image: ollama/ollama:${OLLAMA_HARDWARE:-latest}
+    ports:
+      - "11434:11434"
+    env_file:
+      - .env
+    environment:
+      - OLLAMA_BASE_URL=${OLLAMA_BASE_URL:-}
+    volumes:
+      - ollama_data:/root/.ollama
+      - ./docker/ollama/entrypoint.sh:/entrypoint.sh
+    networks:
+      - app-network
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    tty: true
+    entrypoint: ["/usr/bin/bash", "/entrypoint.sh"]
+    healthcheck:
+      test:
+        - "CMD-SHELL"
+        - |
+          test -f /tmp/ollama_ready && \
+          bash -c '</dev/tcp/localhost/11434'
+      interval: 10s
+      timeout: 10s
+      retries: 100
+      start_period: 10s
+
+volumes:
+  ollama_data:
+    name: synthetic-data-ollama
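The two-part healthcheck gates the app on both the entrypoint's ready marker (model pulled) and the API port accepting connections. The same checks run by hand, as a sketch:

```bash
docker compose -f docker-compose.yml -f docker/ollama/compose.yml \
  exec ollama test -f /tmp/ollama_ready && echo "model ready"
curl -s http://localhost:11434/api/version  # port check; /api/version is a standard Ollama route
```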
docker/ollama/entrypoint.sh
ADDED
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+# Start Ollama in the background
+/bin/ollama serve &
+# Record Process ID
+pid=$!
+
+# Pause for Ollama to start
+sleep 5
+
+# Extract model name from MODEL variable (removing quotes if present)
+MODEL_NAME=$(echo $MODEL | tr -d '"')
+
+# Verify that MODEL_NAME has a value
+if [ -z "$MODEL_NAME" ]; then
+    echo "❌ No model specified in MODEL environment variable"
+else
+    # Check if model exists
+    if ollama list | grep -q "$MODEL_NAME"; then
+        echo "🟢 Model ($MODEL_NAME) already installed"
+        touch /tmp/ollama_ready
+    else
+        echo "🔴 Retrieving model ($MODEL_NAME)..."
+        # Try to pull the model; only create the ready marker once the download is confirmed
+        if ollama pull "$MODEL_NAME" 2>/dev/null && ollama list | grep -q "$MODEL_NAME"; then
+            echo "🟢 Model download complete!"
+            touch /tmp/ollama_ready
+        else
+            echo "❌ Error downloading model ($MODEL_NAME)"
+        fi
+    fi
+fi
+
+# Wait for Ollama process to finish
+wait $pid
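If the ollama service never turns healthy, the 🔴/🟢 messages from this script point at the failure. A sketch for inspecting a running container:

```bash
docker compose -f docker-compose.yml -f docker/ollama/compose.yml logs ollama               # entrypoint output
docker compose -f docker-compose.yml -f docker/ollama/compose.yml exec ollama ollama list  # pulled models
```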
src/synthetic_dataset_generator/__init__.py
CHANGED
@@ -1,5 +1,4 @@
 import inspect
-
 from gradio import TabbedInterface
 
 from synthetic_dataset_generator import ( # noqa
@@ -7,15 +6,13 @@ from synthetic_dataset_generator import ( # noqa
     _inference_endpoints,
 )
 
-
 def launch(*args, **kwargs):
     """Launch the synthetic dataset generator.
     Based on the `TabbedInterface` from Gradio.
     Parameters: https://www.gradio.app/docs/gradio/tabbedinterface
     """
     from synthetic_dataset_generator.app import demo
-
-    return demo.launch(*args, **kwargs)
+    return demo.launch(*args, server_name="0.0.0.0", **kwargs)
 
 
 launch.__doc__ = TabbedInterface.launch.__doc__
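Binding to `0.0.0.0` is what makes the Gradio server reachable through the container's published port; bound to the default `127.0.0.1`, it would only answer inside the container. A quick check from the host, as a sketch:

```bash
curl -s -o /dev/null -w '%{http_code}\n' http://localhost:7860  # expect 200 once the app is up
```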
src/synthetic_dataset_generator/app.py
CHANGED
@@ -17,6 +17,9 @@ button[role="tab"][aria-selected="true"]:hover {border-color: var(--button-prima
 .table-wrap .tbody td {vertical-align: top}
 #system_prompt_examples {color: var(--body-text-color) !important; background-color: var(--block-background-fill) !important;}
 .container {padding-inline: 0 !important}
+.gradio-container { width: 100% !important; }
+.gradio-row { display: flex !important; flex-direction: row !important; }
+.gradio-column { flex: 1 !important; min-width: 0 !important; }
 #sign_in_button {flex-grow: 0; width: auto !important; display: flex; align-items: center; justify-content: center; margin: 0 auto;}
 .datasets {height: 70px;}
 """
src/synthetic_dataset_generator/apps/base.py
CHANGED
@@ -131,6 +131,9 @@ def show_success_message(org_name: str, repo_name: str) -> gr.Markdown:
         max_height=None,
     )
     argilla_api_url = client.api_url
+    # Transform Docker internal URL to localhost if needed
+    if "argilla:" in argilla_api_url:
+        argilla_api_url = argilla_api_url.replace("argilla:", "127.0.0.1:")
     return gr.Markdown(
         value=f"""
         <div style="padding: 1em; background-color: var(--block-background-fill); border-color: var(--border-color-primary); border-width: 1px; border-radius: 5px;">