Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -109,8 +109,9 @@ def evaluate(
|
|
109 |
calibrated: bool = True,
|
110 |
check_gt_only: bool = False,
|
111 |
no_gt: bool = False,
|
|
|
112 |
):
|
113 |
-
|
114 |
if parallel < 1:
|
115 |
n_workers = max(1, multiprocessing.cpu_count() // 2)
|
116 |
else:
|
@@ -122,6 +123,14 @@ def evaluate(
|
|
122 |
extra = subset + "_" if subset != "full" else ""
|
123 |
|
124 |
problems = get_bigcodebench(subset=subset)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
dataset_hash = get_bigcodebench_hash(subset=subset)
|
126 |
|
127 |
if not no_gt:
|
@@ -214,7 +223,7 @@ def evaluate(
|
|
214 |
|
215 |
pass_at_k.update({
|
216 |
f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean()
|
217 |
-
for k in
|
218 |
if total.min() >= k
|
219 |
})
|
220 |
|
@@ -247,6 +256,7 @@ interface = gr.Interface(
|
|
247 |
gr.Checkbox(label="Calibrated", value=True),
|
248 |
gr.Checkbox(label="Check GT Only"),
|
249 |
gr.Checkbox(label="No GT"),
|
|
|
250 |
],
|
251 |
outputs=[
|
252 |
gr.JSON(label="Results"),
|
|
|
109 |
calibrated: bool = True,
|
110 |
check_gt_only: bool = False,
|
111 |
no_gt: bool = False,
|
112 |
+
selective_evaluate: str = "",
|
113 |
):
|
114 |
+
passk = [int(k.strip()) for k in pass_k.split(',') if k.strip().isdigit()]
|
115 |
if parallel < 1:
|
116 |
n_workers = max(1, multiprocessing.cpu_count() // 2)
|
117 |
else:
|
|
|
123 |
extra = subset + "_" if subset != "full" else ""
|
124 |
|
125 |
problems = get_bigcodebench(subset=subset)
|
126 |
+
|
127 |
+
# If task IDs were supplied, restrict `problems` to just those tasks (error if none match)
|
128 |
+
if selective_evaluate:
|
129 |
+
selected_ids = ["BigCodeBench/" + id for id in sorted(set(selective_evaluate.split(",")))]
|
130 |
+
problems = {k: v for k, v in problems.items() if k in selected_ids}
|
131 |
+
if not problems:
|
132 |
+
raise ValueError(f"None of the provided task IDs {selected_ids} were found in the dataset")
|
133 |
+
|
134 |
dataset_hash = get_bigcodebench_hash(subset=subset)
|
135 |
|
136 |
if not no_gt:
|
|
|
223 |
|
224 |
pass_at_k.update({
|
225 |
f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean()
|
226 |
+
for k in passk
|
227 |
if total.min() >= k
|
228 |
})
|
229 |
|
|
|
256 |
gr.Checkbox(label="Calibrated", value=True),
|
257 |
gr.Checkbox(label="Check GT Only"),
|
258 |
gr.Checkbox(label="No GT"),
|
259 |
+
gr.Textbox(label="Selective Evaluated Task IDs (comma-separated, e.g. '0,1,2')", value=""),
|
260 |
],
|
261 |
outputs=[
|
262 |
gr.JSON(label="Results"),
|