benchmarks.py
"""Code to run HumanEval and MBPP benchmarks."""
import os
from human_eval.data import stream_jsonl, read_problems, write_jsonl
from human_eval.evaluation import evaluate_functional_correctness
from omegaconf import DictConfig
from transformers import PreTrainedModel, PreTrainedTokenizer
from src.routers.generator import batch_generate_completions, GenerateData
BENCHMARK_OUTPUT_DIR = '.benchmark_outputs/'
N_WORKERS = 4
def run_human_eval_benchmark(config: DictConfig, model: PreTrainedModel, tokenizer: PreTrainedTokenizer):
"""Run the HumanEval benchmark.
This follows the example given in the README here: https://github.com/openai/human-eval
"""
problems = read_problems()
n_samples_per_task = 1
problem_tuples = [(k, v['prompt']) for k, v in problems.items()]
task_ids, prompts = zip(*problem_tuples)
# Create lists of the input task ids and corresponding GenerateData objects as inputs
input_tasks = [
task_id
for task_id in task_ids
for _ in range(n_samples_per_task)
]
inputs = [
GenerateData(
prior_context=prompt,
max_decode_length=config.train.max_gen_length
)
for prompt in prompts
for _ in range(n_samples_per_task)
]
# Generate completions and format them for HumanEval
completions = batch_generate_completions(
inputs, config, model, tokenizer, batch_size=config.train.generation_batch_size,
progress_bar=True,
)['output_text']
    samples = [
        dict(task_id=task_id, completion=completion)
        for task_id, completion in zip(input_tasks, completions)
    ]
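    # Each entry in `samples` is a plain dict in the format the human-eval
    # harness expects, roughly (hypothetical completion text):
    #     {"task_id": "HumanEval/0", "completion": "    return ...\n"}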
    # Write the samples to a file so the human-eval harness can evaluate them
    os.makedirs(BENCHMARK_OUTPUT_DIR, exist_ok=True)
    filepath = os.path.join(BENCHMARK_OUTPUT_DIR, 'human_eval_samples.jsonl')
    write_jsonl(filepath, samples)

    # Evaluate functional correctness; per-problem results are written to
    # '<filepath>_results.jsonl'
    evaluate_functional_correctness(filepath, k=[1], n_workers=N_WORKERS, timeout=20)

    # Read the per-problem results and return the fraction of problems passed
    results = list(stream_jsonl(filepath + '_results.jsonl'))
    passed = [r['passed'] for r in results]
    passed_frac = sum(passed) / len(passed)
    return passed_frac
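

# Minimal usage sketch (illustrative assumptions): the model name and config values
# below are hypothetical, the DictConfig only includes the fields this module reads
# (train.max_gen_length, train.generation_batch_size), and batch_generate_completions
# may require additional config fields in practice.
if __name__ == '__main__':
    from omegaconf import OmegaConf
    from transformers import AutoModelForCausalLM, AutoTokenizer

    example_config = OmegaConf.create(
        {'train': {'max_gen_length': 256, 'generation_batch_size': 8}}
    )
    example_tokenizer = AutoTokenizer.from_pretrained('gpt2')  # hypothetical model choice
    example_model = AutoModelForCausalLM.from_pretrained('gpt2')
    pass_at_1 = run_human_eval_benchmark(example_config, example_model, example_tokenizer)
    print(f'HumanEval pass@1: {pass_at_1:.3f}')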