abiswal commited on
Commit
cacf673
·
1 Parent(s): e82a4be
Files changed (3) hide show
  1. app.py +53 -198
  2. old_app.py +205 -0
  3. src/about.py +3 -27
app.py CHANGED
@@ -1,204 +1,59 @@
1
  import gradio as gr
2
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
4
- from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
6
 
7
- from src.about import (
8
- CITATION_BUTTON_LABEL,
9
- CITATION_BUTTON_TEXT,
10
- EVALUATION_QUEUE_TEXT,
11
- INTRODUCTION_TEXT,
12
- LLM_BENCHMARKS_TEXT,
13
- TITLE,
14
- )
15
- from src.display.css_html_js import custom_css
16
- from src.display.utils import (
17
- BENCHMARK_COLS,
18
- COLS,
19
- EVAL_COLS,
20
- EVAL_TYPES,
21
- AutoEvalColumn,
22
- ModelType,
23
- fields,
24
- WeightType,
25
- Precision
26
- )
27
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
- from src.submission.submit import add_new_eval
30
-
31
-
32
- def restart_space():
33
- API.restart_space(repo_id=REPO_ID)
34
-
35
- ### Space initialisation
36
- try:
37
- print(EVAL_REQUESTS_PATH)
38
- snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
- )
41
- except Exception:
42
- restart_space()
43
- try:
44
- print(EVAL_RESULTS_PATH)
45
- snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
- )
48
- except Exception:
49
- restart_space()
50
-
51
-
52
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
-
54
- (
55
- finished_eval_queue_df,
56
- running_eval_queue_df,
57
- pending_eval_queue_df,
58
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
-
60
- def init_leaderboard(dataframe):
61
- if dataframe is None or dataframe.empty:
62
- raise ValueError("Leaderboard DataFrame is empty or None.")
63
- return Leaderboard(
64
- value=dataframe,
65
- datatype=[c.type for c in fields(AutoEvalColumn)],
66
- select_columns=SelectColumns(
67
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
- label="Select Columns to Display:",
70
- ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
- filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
- ],
87
- bool_checkboxgroup_label="Hide models",
88
- interactive=False,
89
  )
90
 
91
-
92
- demo = gr.Blocks(css=custom_css)
93
- with demo:
94
- gr.HTML(TITLE)
95
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
-
97
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
100
-
101
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
-
104
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
- with gr.Column():
106
- with gr.Row():
107
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
-
109
- with gr.Column():
110
- with gr.Accordion(
111
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
- open=False,
113
- ):
114
- with gr.Row():
115
- finished_eval_table = gr.components.Dataframe(
116
- value=finished_eval_queue_df,
117
- headers=EVAL_COLS,
118
- datatype=EVAL_TYPES,
119
- row_count=5,
120
- )
121
- with gr.Accordion(
122
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
- open=False,
124
- ):
125
- with gr.Row():
126
- running_eval_table = gr.components.Dataframe(
127
- value=running_eval_queue_df,
128
- headers=EVAL_COLS,
129
- datatype=EVAL_TYPES,
130
- row_count=5,
131
- )
132
-
133
- with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
- open=False,
136
- ):
137
- with gr.Row():
138
- pending_eval_table = gr.components.Dataframe(
139
- value=pending_eval_queue_df,
140
- headers=EVAL_COLS,
141
- datatype=EVAL_TYPES,
142
- row_count=5,
143
- )
144
- with gr.Row():
145
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
-
147
- with gr.Row():
148
- with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
- )
158
-
159
- with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
- )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
-
176
- submit_button = gr.Button("Submit Eval")
177
- submission_result = gr.Markdown()
178
- submit_button.click(
179
- add_new_eval,
180
- [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
- ],
188
- submission_result,
189
- )
190
-
191
  with gr.Row():
192
- with gr.Accordion("📙 Citation", open=False):
193
- citation_button = gr.Textbox(
194
- value=CITATION_BUTTON_TEXT,
195
- label=CITATION_BUTTON_LABEL,
196
- lines=20,
197
- elem_id="citation-button",
198
- show_copy_button=True,
199
- )
200
-
201
- scheduler = BackgroundScheduler()
202
- scheduler.add_job(restart_space, "interval", seconds=1800)
203
- scheduler.start()
204
- demo.queue(default_concurrency_limit=40).launch()
 
1
  import gradio as gr
 
2
  import pandas as pd
 
 
3
 
4
+ # Simplified DataFrame for the leaderboard
5
+ data = {
6
+ "Model": [
7
+ "Handwritten TAG",
8
+ "Zero-shot Text2SQL",
9
+ "Zero-shot Text2SQL + LM Generation",
10
+ "RAG (E5)",
11
+ "RAG (E5) + LM Rerank"
12
+ ],
13
+ "Code": [
14
+ "", # Handwritten TAG doesn't have a code link
15
+ "", # Zero-shot Text2SQL doesn't have a code link
16
+ "", # Zero-shot Text2SQL + LM Generation doesn't have a code link
17
+ "", # RAG (E5) doesn't have a code link
18
+ "" # RAG (E5) + LM Rerank doesn't have a code link
19
+ ],
20
+ "Execution Accuracy": [
21
+ "55%", # Handwritten TAG
22
+ "17%", # Zero-shot Text2SQL
23
+ "13%", # Zero-shot Text2SQL + LM Generation
24
+ "0%", # RAG (E5)
25
+ "2%" # RAG (E5) + LM Rerank
26
+ ]
27
+ }
28
+
29
+ leaderboard_df = pd.DataFrame(data)
30
+
31
+ # Simplified Gradio app
32
+ with gr.Blocks() as demo:
33
+ # Header for the leaderboard
34
+ gr.HTML(
35
+ """
36
+ <h1>Leaderboard - Execution Accuracy (EX)</h1>
37
+ <style>
38
+ #highlight-green {
39
+ background-color: #d4edda;
40
+ color: #155724;
41
+ font-weight: bold;
42
+ }
43
+ </style>
44
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  )
46
 
47
+ # Highlight the top row in green for "Handwritten TAG"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  with gr.Row():
49
+ gr.Dataframe(
50
+ value=leaderboard_df,
51
+ headers=["Model", "Code", "Execution Accuracy"],
52
+ datatype=["str", "str", "str"],
53
+ row_count=(5, "dynamic"),
54
+ wrap=True,
55
+ elem_id="leaderboard",
56
+ type="pandas"
57
+ )
58
+
59
+ demo.launch()
 
 
old_app.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
+ import pandas as pd
4
+ from apscheduler.schedulers.background import BackgroundScheduler
5
+ from huggingface_hub import snapshot_download
6
+
7
+ from src.about import (
8
+ CITATION_BUTTON_LABEL,
9
+ CITATION_BUTTON_TEXT,
10
+ EVALUATION_QUEUE_TEXT,
11
+ INTRODUCTION_TEXT,
12
+ LLM_BENCHMARKS_TEXT,
13
+ TITLE,
14
+ )
15
+ from src.display.css_html_js import custom_css
16
+ from src.display.utils import (
17
+ BENCHMARK_COLS,
18
+ COLS,
19
+ EVAL_COLS,
20
+ EVAL_TYPES,
21
+ AutoEvalColumn,
22
+ ModelType,
23
+ fields,
24
+ WeightType,
25
+ Precision
26
+ )
27
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
+ from src.submission.submit import add_new_eval
30
+
31
+
32
+ def restart_space():
33
+ API.restart_space(repo_id=REPO_ID)
34
+
35
+ ### Space initialisation
36
+ try:
37
+ print(EVAL_REQUESTS_PATH)
38
+ snapshot_download(
39
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
+ )
41
+ except Exception:
42
+ restart_space()
43
+ try:
44
+ print(EVAL_RESULTS_PATH)
45
+ snapshot_download(
46
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
+ )
48
+ except Exception:
49
+ restart_space()
50
+
51
+
52
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
+
54
+ (
55
+ finished_eval_queue_df,
56
+ running_eval_queue_df,
57
+ pending_eval_queue_df,
58
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
+
60
+ def init_leaderboard(dataframe):
61
+ if dataframe is None or dataframe.empty:
62
+ raise ValueError("Leaderboard DataFrame is empty or None.")
63
+ return Leaderboard(
64
+ value=dataframe,
65
+ datatype=[c.type for c in fields(AutoEvalColumn)],
66
+ select_columns=SelectColumns(
67
+ default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
+ label="Select Columns to Display:",
70
+ ),
71
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
+ hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
+ filter_columns=[
74
+ ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
+ ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
+ ColumnFilter(
77
+ AutoEvalColumn.params.name,
78
+ type="slider",
79
+ min=0.01,
80
+ max=150,
81
+ label="Select the number of parameters (B)",
82
+ ),
83
+ ColumnFilter(
84
+ AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
+ ),
86
+ ],
87
+ bool_checkboxgroup_label="Hide models",
88
+ interactive=False,
89
+ )
90
+
91
+
92
+ demo = gr.Blocks(css=custom_css)
93
+ with demo:
94
+ gr.HTML(TITLE)
95
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
+
97
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
+ with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
+ leaderboard = init_leaderboard(LEADERBOARD_DF)
100
+
101
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
+
104
+ with gr.TabItem("🚀 Submission Instructions ", elem_id="llm-benchmark-tab-table", id=3):
105
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
106
+ # with gr.Column():
107
+ # with gr.Row():
108
+ # gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
109
+
110
+ # with gr.Column():
111
+ # with gr.Accordion(
112
+ # f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
113
+ # open=False,
114
+ # ):
115
+ # with gr.Row():
116
+ # finished_eval_table = gr.components.Dataframe(
117
+ # value=finished_eval_queue_df,
118
+ # headers=EVAL_COLS,
119
+ # datatype=EVAL_TYPES,
120
+ # row_count=5,
121
+ # )
122
+ # with gr.Accordion(
123
+ # f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
124
+ # open=False,
125
+ # ):
126
+ # with gr.Row():
127
+ # running_eval_table = gr.components.Dataframe(
128
+ # value=running_eval_queue_df,
129
+ # headers=EVAL_COLS,
130
+ # datatype=EVAL_TYPES,
131
+ # row_count=5,
132
+ # )
133
+
134
+ # with gr.Accordion(
135
+ # f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
136
+ # open=False,
137
+ # ):
138
+ # with gr.Row():
139
+ # pending_eval_table = gr.components.Dataframe(
140
+ # value=pending_eval_queue_df,
141
+ # headers=EVAL_COLS,
142
+ # datatype=EVAL_TYPES,
143
+ # row_count=5,
144
+ # )
145
+ # with gr.Row():
146
+ # gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
147
+
148
+ # with gr.Row():
149
+ # with gr.Column():
150
+ # model_name_textbox = gr.Textbox(label="Model name")
151
+ # revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
152
+ # model_type = gr.Dropdown(
153
+ # choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
154
+ # label="Model type",
155
+ # multiselect=False,
156
+ # value=None,
157
+ # interactive=True,
158
+ # )
159
+
160
+ # with gr.Column():
161
+ # precision = gr.Dropdown(
162
+ # choices=[i.value.name for i in Precision if i != Precision.Unknown],
163
+ # label="Precision",
164
+ # multiselect=False,
165
+ # value="float16",
166
+ # interactive=True,
167
+ # )
168
+ # weight_type = gr.Dropdown(
169
+ # choices=[i.value.name for i in WeightType],
170
+ # label="Weights type",
171
+ # multiselect=False,
172
+ # value="Original",
173
+ # interactive=True,
174
+ # )
175
+ # base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
176
+
177
+ # submit_button = gr.Button("Submit Eval")
178
+ # submission_result = gr.Markdown()
179
+ # submit_button.click(
180
+ # add_new_eval,
181
+ # [
182
+ # model_name_textbox,
183
+ # base_model_name_textbox,
184
+ # revision_name_textbox,
185
+ # precision,
186
+ # weight_type,
187
+ # model_type,
188
+ # ],
189
+ # submission_result,
190
+ # )
191
+
192
+ with gr.Row():
193
+ with gr.Accordion("📙 Citation", open=False):
194
+ citation_button = gr.Textbox(
195
+ value=CITATION_BUTTON_TEXT,
196
+ label=CITATION_BUTTON_LABEL,
197
+ lines=20,
198
+ elem_id="citation-button",
199
+ show_copy_button=True,
200
+ )
201
+
202
+ scheduler = BackgroundScheduler()
203
+ scheduler.add_job(restart_space, "interval", seconds=1800)
204
+ scheduler.start()
205
+ demo.queue(default_concurrency_limit=40).launch()
src/about.py CHANGED
@@ -21,7 +21,7 @@ NUM_FEWSHOT = 0 # Change with your few shot
21
 
22
 
23
  # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
@@ -38,33 +38,9 @@ To reproduce our results, here is the commands you can run:
38
  """
39
 
40
  EVALUATION_QUEUE_TEXT = """
41
- ## Some good practices before submitting a model
42
 
43
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
44
- ```python
45
- from transformers import AutoConfig, AutoModel, AutoTokenizer
46
- config = AutoConfig.from_pretrained("your model name", revision=revision)
47
- model = AutoModel.from_pretrained("your model name", revision=revision)
48
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
49
- ```
50
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
51
-
52
- Note: make sure your model is public!
53
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
54
-
55
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
56
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
57
-
58
- ### 3) Make sure your model has an open license!
59
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
60
-
61
- ### 4) Fill up your model card
62
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
63
-
64
- ## In case of model failure
65
- If your model is displayed in the `FAILED` category, its execution stopped.
66
- Make sure you have followed the above steps first.
67
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
68
  """
69
 
70
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 
21
 
22
 
23
  # Your leaderboard name
24
+ TITLE = """<h1 align="center" id="space-title">TAG leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
 
38
  """
39
 
40
  EVALUATION_QUEUE_TEXT = """
41
+ ## Steps before submission
42
 
43
+ ### 1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  """
45
 
46
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"