| import os |
|
|
| import pytest |
|
|
| import lm_eval.api as api |
| import lm_eval.evaluator as evaluator |
| from lm_eval import tasks |
|
|
|
|
@pytest.mark.parametrize(
    "limit,model,model_args",
    [
        (
            10,
            "hf",
            "pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
        ),
    ],
)
def test_include_correctness(limit: int, model: str, model_args: str):
    """Compare ``arc_easy`` results from two evaluation paths.

    The task is evaluated twice:

    1. through ``evaluator.simple_evaluate`` with no explicit task manager;
    2. through ``evaluator.evaluate`` with a task dict built by a
       ``TaskManager`` whose ``include_path`` is the ``testconfigs`` directory
       next to this file and whose ``include_defaults`` is ``False``.

    The values stored under ``results["arc_easy"]`` must be identical, in
    order and in number, for both runs.

    Args:
        limit: Forwarded as ``limit`` to both evaluation calls.
        model: Model name passed to ``simple_evaluate`` and looked up with
            ``api.registry.get_model``.
        model_args: Argument string passed to ``simple_evaluate`` and to the
            model's ``create_from_arg_string``.
    """
    task_name = ["arc_easy"]

    # Run 1: simple_evaluate resolves the task from its name on its own.
    e1 = evaluator.simple_evaluate(
        model=model,
        tasks=task_name,
        limit=limit,
        model_args=model_args,
    )
    assert e1 is not None

    # Run 2: build the model by hand and resolve the task through a manager
    # that only looks at the local ``testconfigs`` directory.
    lm = api.registry.get_model(model).create_from_arg_string(
        model_args,
        {
            "batch_size": None,
            "max_batch_size": None,
            "device": None,
        },
    )

    task_manager = tasks.TaskManager(
        include_path=os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "testconfigs"
        ),
        include_defaults=False,
    )
    task_dict = tasks.get_task_dict(task_name, task_manager)

    e2 = evaluator.evaluate(
        lm=lm,
        task_dict=task_dict,
        limit=limit,
    )
    assert e2 is not None

    # Compare as lists rather than through zip(): zip() stops at the shorter
    # input, so a missing (or entirely empty) set of results on one side
    # would otherwise go unnoticed.
    default_values = list(e1["results"]["arc_easy"].values())
    included_values = list(e2["results"]["arc_easy"].values())
    assert default_values == included_values
|
|
|
|
| |
def test_no_include_defaults():
    """Check task lookup on a manager built with ``include_defaults=False``.

    The manager is pointed at the ``testconfigs`` directory next to this
    file. Looking up ``arc_easy`` must succeed, while looking up
    ``arc_challenge`` must raise ``KeyError``.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    manager = tasks.TaskManager(
        include_path=here + "/testconfigs",
        include_defaults=False,
    )

    # Must resolve without raising; the returned dict itself is not inspected.
    tasks.get_task_dict(["arc_easy"], manager)

    # Expected to be unknown to this manager.
    with pytest.raises(KeyError):
        tasks.get_task_dict(["arc_challenge"], manager)
|
|
|
|
| |
| |
|
|
| |
|
|
| |
| |
|
|