terryyz committed on
Commit
ee7febe
·
verified ·
1 Parent(s): 6c9cbae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -2
app.py CHANGED
@@ -109,8 +109,9 @@ def evaluate(
109
  calibrated: bool = True,
110
  check_gt_only: bool = False,
111
  no_gt: bool = False,
 
112
  ):
113
- pass_k = [int(k.strip()) for k in pass_k.split(',') if k.strip().isdigit()]
114
  if parallel < 1:
115
  n_workers = max(1, multiprocessing.cpu_count() // 2)
116
  else:
@@ -122,6 +123,14 @@ def evaluate(
122
  extra = subset + "_" if subset != "full" else ""
123
 
124
  problems = get_bigcodebench(subset=subset)
 
 
 
 
 
 
 
 
125
  dataset_hash = get_bigcodebench_hash(subset=subset)
126
 
127
  if not no_gt:
@@ -214,7 +223,7 @@ def evaluate(
214
 
215
  pass_at_k.update({
216
  f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean()
217
- for k in pass_k
218
  if total.min() >= k
219
  })
220
 
@@ -247,6 +256,7 @@ interface = gr.Interface(
247
  gr.Checkbox(label="Calibrated", value=True),
248
  gr.Checkbox(label="Check GT Only"),
249
  gr.Checkbox(label="No GT"),
 
250
  ],
251
  outputs=[
252
  gr.JSON(label="Results"),
 
109
  calibrated: bool = True,
110
  check_gt_only: bool = False,
111
  no_gt: bool = False,
112
+ selective_evaluate: str = "",
113
  ):
114
+ passk = [int(k.strip()) for k in pass_k.split(',') if k.strip().isdigit()]
115
  if parallel < 1:
116
  n_workers = max(1, multiprocessing.cpu_count() // 2)
117
  else:
 
123
  extra = subset + "_" if subset != "full" else ""
124
 
125
  problems = get_bigcodebench(subset=subset)
126
+
127
+ # Add selective evaluation logic
128
+ if selective_evaluate:
129
+ selected_ids = ["BigCodeBench/" + id for id in sorted(set(selective_evaluate.split(",")))]
130
+ problems = {k: v for k, v in problems.items() if k in selected_ids}
131
+ if not problems:
132
+ raise ValueError(f"None of the provided task IDs {selected_ids} were found in the dataset")
133
+
134
  dataset_hash = get_bigcodebench_hash(subset=subset)
135
 
136
  if not no_gt:
 
223
 
224
  pass_at_k.update({
225
  f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean()
226
+ for k in passk
227
  if total.min() >= k
228
  })
229
 
 
256
  gr.Checkbox(label="Calibrated", value=True),
257
  gr.Checkbox(label="Check GT Only"),
258
  gr.Checkbox(label="No GT"),
259
+ gr.Textbox(label="Selective Evaluated Task IDs (comma-separated, e.g. '0,1,2')", value=""),
260
  ],
261
  outputs=[
262
  gr.JSON(label="Results"),