diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py
index 976b21c86..56bc1b2a6 100644
--- a/src/lighteval/logging/evaluation_tracker.py
+++ b/src/lighteval/logging/evaluation_tracker.py
@@ -92,6 +92,20 @@ def default(self, o):  # noqa : C901
         return type(o).__name__
 
 
+MODEL_CONFIG_CREDENTIAL_FIELDS = {
+    "api_key",
+    "inference_server_auth",
+}
+
+
+def _redact_model_config_credentials(model_config: dict) -> dict:
+    model_config_dict = dict(model_config)
+    for field in MODEL_CONFIG_CREDENTIAL_FIELDS:
+        if field in model_config_dict and model_config_dict[field] is not None:
+            model_config_dict[field] = "REDACTED"
+    return model_config_dict
+
+
 class EvaluationTracker:
     """Tracks and manages evaluation results, metrics, and logging for model evaluations.
 
@@ -211,7 +225,7 @@ def __init__(
     @property
     def results(self):
         config_general = asdict(self.general_config_logger)
-        config_general["model_config"] = config_general["model_config"].model_dump()
+        config_general["model_config"] = _redact_model_config_credentials(config_general["model_config"].model_dump())
         results = {
             "config_general": config_general,
             "results": self.metrics_logger.metric_aggregated,
@@ -376,6 +390,9 @@ def generate_final_dict(self) -> dict:
             "summary_tasks": self.details_logger.compiled_details,
             "summary_general": asdict(self.details_logger.compiled_details_over_all_tasks),
         }
+        to_dump["config_general"]["model_config"] = _redact_model_config_credentials(
+            to_dump["config_general"]["model_config"].model_dump()
+        )
 
         final_dict = {
             k: {eval_name.replace("|", ":"): eval_score for eval_name, eval_score in v.items()}
diff --git a/tests/unit/logging/test_evaluation_tracker.py b/tests/unit/logging/test_evaluation_tracker.py
index 45c5790d0..40f464627 100644
--- a/tests/unit/logging/test_evaluation_tracker.py
+++ b/tests/unit/logging/test_evaluation_tracker.py
@@ -32,6 +32,9 @@
 from lighteval.logging.evaluation_tracker import EvaluationTracker
 from lighteval.logging.info_loggers import DetailsLogger
+from lighteval.models.endpoints.litellm_model import LiteLLMModelConfig
+from lighteval.models.endpoints.tgi_model import TGIModelConfig
+from lighteval.pipeline import Pipeline
 
 # ruff: noqa
 from tests.fixtures import TESTING_EMPTY_HF_ORG_ID
@@ -128,6 +131,77 @@ def test_results_logging_template(self, mock_evaluation_tracker: EvaluationTrack
         assert saved_results["results"] == task_metrics
         assert saved_results["config_general"]["model_name"] == "test_model"
 
+    def test_results_redacts_litellm_api_key(self, mock_evaluation_tracker: EvaluationTracker):
+        mock_evaluation_tracker.general_config_logger.log_model_info(
+            LiteLLMModelConfig(model_name="test_model", api_key="super-secret-key")
+        )
+
+        results = mock_evaluation_tracker.results
+
+        assert results["config_general"]["model_config"]["api_key"] == "REDACTED"
+
+        mock_evaluation_tracker.save()
+
+        results_dir = Path(mock_evaluation_tracker.output_dir) / "results" / "test_model"
+        result_files = list(results_dir.glob("results_*.json"))
+        assert len(result_files) == 1
+
+        with open(result_files[0], "r") as f:
+            saved_results = json.load(f)
+
+        assert saved_results["config_general"]["model_config"]["api_key"] == "REDACTED"
+        assert saved_results["config_general"]["model_config"]["model_name"] == "test_model"
+
+    def test_results_redacts_tgi_auth(self, mock_evaluation_tracker: EvaluationTracker):
+        mock_evaluation_tracker.general_config_logger.log_model_info(
+            TGIModelConfig(
+                model_name="test_model",
+                inference_server_address="http://localhost:8080",
+                inference_server_auth="super-secret-token",
+            )
+        )
+
+        results = mock_evaluation_tracker.results
+
+        assert results["config_general"]["model_config"]["inference_server_auth"] == "REDACTED"
+        assert results["config_general"]["model_config"]["model_name"] == "test_model"
+
+    def test_pipeline_get_results_redacts_litellm_api_key(self, mock_evaluation_tracker: EvaluationTracker):
+        mock_evaluation_tracker.general_config_logger.log_model_info(
+            LiteLLMModelConfig(model_name="test_model", api_key="super-secret-key")
+        )
+
+        pipeline = Pipeline.__new__(Pipeline)
+        pipeline.accelerator = None
+        pipeline.parallel_context = None
+        pipeline.final_dict = None
+        pipeline.evaluation_tracker = mock_evaluation_tracker
+
+        results = pipeline.get_results()
+
+        assert results["config_general"]["model_config"]["api_key"] == "REDACTED"
+        assert results["config_general"]["model_config"]["model_name"] == "test_model"
+
+    def test_pipeline_get_results_redacts_tgi_auth(self, mock_evaluation_tracker: EvaluationTracker):
+        mock_evaluation_tracker.general_config_logger.log_model_info(
+            TGIModelConfig(
+                model_name="test_model",
+                inference_server_address="http://localhost:8080",
+                inference_server_auth="super-secret-token",
+            )
+        )
+
+        pipeline = Pipeline.__new__(Pipeline)
+        pipeline.accelerator = None
+        pipeline.parallel_context = None
+        pipeline.final_dict = None
+        pipeline.evaluation_tracker = mock_evaluation_tracker
+
+        results = pipeline.get_results()
+
+        assert results["config_general"]["model_config"]["inference_server_auth"] == "REDACTED"
+        assert results["config_general"]["model_config"]["model_name"] == "test_model"
+
     @pytest.mark.evaluation_tracker(save_details=True)
     def test_details_logging(self, mock_evaluation_tracker, mock_datetime):
         task_details = {
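
For reviewers, a minimal sketch (not part of the patch) of the behavior the new _redact_model_config_credentials helper is expected to have once this diff is applied; the import path follows from the patched module above, and the example dict is illustrative:

# Sketch only: exercises the helper added in this patch, assuming the patch is applied.
from lighteval.logging.evaluation_tracker import _redact_model_config_credentials

dumped = {
    "model_name": "test_model",
    "api_key": "super-secret-key",   # credential field -> replaced with "REDACTED"
    "inference_server_auth": None,   # None credentials are left untouched
}

redacted = _redact_model_config_credentials(dumped)
assert redacted["api_key"] == "REDACTED"
assert redacted["inference_server_auth"] is None
assert redacted["model_name"] == "test_model"
assert dumped["api_key"] == "super-secret-key"  # input is shallow-copied, not mutated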