diff --git a/ai_metrics.py b/ai_metrics.py
new file mode 100644
index 000000000..37865430e
--- /dev/null
+++ b/ai_metrics.py
@@ -0,0 +1,253 @@
+import os
+import unittest
+
+import mmf.modules.metrics as metrics
+import torch
+from mmf.common.registry import registry
+from mmf.common.sample import Sample
+from mmf.datasets.processors import CaptionProcessor
+from mmf.utils.configuration import load_yaml
+
+# New AI-driven modules for prediction and validation
+from ai_modules.prediction import AIPrediction
+from ai_modules.validation import AIValidation
+
+
+class TestModuleMetrics(unittest.TestCase):
+    def setUp(self):
+        # Initialize AI modules
+        self.ai_predictor = AIPrediction()
+        self.ai_validator = AIValidation()
+
+    def test_caption_bleu4(self):
+        path = os.path.join(
+            os.path.abspath(__file__),
+            "../../../mmf/configs/datasets/coco/defaults.yaml",
+        )
+        config = load_yaml(os.path.abspath(path))
+        captioning_config = config.dataset_config.coco
+        caption_processor_config = captioning_config.processors.caption_processor
+        vocab_path = os.path.join(
+            os.path.abspath(__file__), "..", "..", "data", "vocab.txt"
+        )
+        caption_processor_config.params.vocab.type = "random"
+        caption_processor_config.params.vocab.vocab_file = os.path.abspath(vocab_path)
+        caption_processor = CaptionProcessor(caption_processor_config.params)
+        registry.register("coco_caption_processor", caption_processor)
+
+        caption_bleu4 = metrics.CaptionBleu4Metric()
+        expected = Sample()
+        predicted = dict()
+
+        # Test complete match
+        expected.answers = torch.empty((5, 5, 10))
+        expected.answers.fill_(4)
+        predicted["scores"] = torch.zeros((5, 10, 19))
+        predicted["scores"][:, :, 4] = 1.0
+
+        # AI-driven input validation and prediction adjustment
+        # (run after the inputs are populated, matching the helpers below)
+        self.ai_validator.validate_inputs(expected, predicted)
+        predicted = self.ai_predictor.adjust_predictions(predicted)
+
+        self.assertEqual(caption_bleu4.calculate(expected, predicted).item(), 1.0)
+
+        # Test partial match
+        expected.answers = torch.empty((5, 5, 10))
+        expected.answers.fill_(4)
+        predicted["scores"] = torch.zeros((5, 10, 19))
+        predicted["scores"][:, 0:5, 4] = 1.0
+        predicted["scores"][:, 5:, 18] = 1.0
+
+        self.assertAlmostEqual(
+            caption_bleu4.calculate(expected, predicted).item(), 0.3928, 4
+        )
+
+    def _test_binary_metric(self, metric, value):
+        sample = Sample()
+        predicted = dict()
+
+        sample.targets = torch.tensor(
+            [[0, 1], [1, 0], [1, 0], [0, 1]], dtype=torch.float
+        )
+        predicted["scores"] = torch.tensor(
+            [
+                [-0.9332, 0.8149],
+                [-0.8391, 0.6797],
+                [-0.7235, 0.7220],
+                [-0.9043, 0.3078],
+            ],
+            dtype=torch.float,
+        )
+
+        # AI-driven input validation and prediction adjustment
+        self.ai_validator.validate_inputs(sample, predicted)
+        predicted = self.ai_predictor.adjust_predictions(predicted)
+
+        self.assertAlmostEqual(metric.calculate(sample, predicted).item(), value, 4)
+
+        sample.targets = torch.tensor([1, 0, 0, 1], dtype=torch.long)
+        self.assertAlmostEqual(metric.calculate(sample, predicted).item(), value, 4)
+
+    def _test_multiclass_metric(self, metric, value):
+        sample = Sample()
+        predicted = dict()
+
+        sample.targets = torch.tensor(
+            [[0, 1, 0], [0, 0, 1], [1, 0, 0], [0, 0, 1]], dtype=torch.float
+        )
+        predicted["scores"] = torch.tensor(
+            [
+                [-0.9332, 0.8149, 0.3491],
+                [-0.8391, 0.6797, -0.3410],
+                [-0.7235, 0.7220, 0.9104],
+                [0.9043, 0.3078, -0.4210],
+            ],
+            dtype=torch.float,
+        )
+
+        # AI-driven input validation and prediction adjustment
+        self.ai_validator.validate_inputs(sample, predicted)
+        predicted = self.ai_predictor.adjust_predictions(predicted)
+
+        self.assertAlmostEqual(metric.calculate(sample, predicted).item(), value, 4)
+
+        sample.targets = torch.tensor([1, 2, 0, 2], dtype=torch.long)
+        self.assertAlmostEqual(metric.calculate(sample, predicted).item(), value, 4)
+
+    def _test_multilabel_metric(self, metric, value):
+        sample = Sample()
+        predicted = dict()
+
+        sample.targets = torch.tensor(
+            [[0, 1, 1], [1, 0, 1], [1, 0, 1], [0, 0, 1]], dtype=torch.float
+        )
+        predicted["scores"] = torch.tensor(
+            [
+                [-0.9332, 0.8149, 0.3491],
+                [-0.8391, 0.6797, -0.3410],
+                [-0.7235, 0.7220, 0.9104],
+                [0.9043, 0.3078, -0.4210],
+            ],
+            dtype=torch.float,
+        )
+
+        # AI-driven input validation and prediction adjustment
+        self.ai_validator.validate_inputs(sample, predicted)
+        predicted = self.ai_predictor.adjust_predictions(predicted)
+
+        self.assertAlmostEqual(metric.calculate(sample, predicted).item(), value, 4)
+
+    def _test_recall_at_k_metric(self, metric, value):
+        sample = Sample()
+        predicted = dict()
+
+        first_dimension = 10
+        second_dimension = 100  # second dim MUST be 100
+        sample.targets = torch.ones(first_dimension, second_dimension)
+        predicted["scores"] = torch.ones(first_dimension, second_dimension)
+
+        for i in range(first_dimension):
+            for j in range(second_dimension):
+                sample.targets[i][j] = j
+                if j == second_dimension - 1 and i != 0:
+                    predicted["scores"][i][j] = j * 2 - 1 - (i + 2) * 2
+                else:
+                    predicted["scores"][i][j] = j * 2
+
+        # AI-driven input validation and prediction adjustment
+        self.ai_validator.validate_inputs(sample, predicted)
+        predicted = self.ai_predictor.adjust_predictions(predicted)
+
+        self.assertAlmostEqual(metric.calculate(sample, predicted), value)
+
+    def _test_retrieval_recall_at_k_metric(self, metric, value):
+        sample = Sample()
+        predicted = dict()
+
+        torch.manual_seed(1234)
+        predicted["targets"] = torch.rand((10, 4))
+        predicted["scores"] = torch.rand((10, 4))
+
+        # AI-driven input validation and prediction adjustment
+        self.ai_validator.validate_inputs(sample, predicted)
+        predicted = self.ai_predictor.adjust_predictions(predicted)
+
+        self.assertAlmostEqual(float(metric.calculate(sample, predicted)), value)
+
+    def _test_binary_dict_metric(self, metric, value_dict):
+        sample = Sample()
+        predicted = dict()
+
+        sample.targets = torch.tensor(
+            [[0, 1], [1, 0], [1, 0], [0, 1]], dtype=torch.float
+        )
+        predicted["scores"] = torch.tensor(
+            [
+                [-0.9332, 0.8149],
+                [-0.8391, 0.6797],
+                [-0.7235, 0.7220],
+                [-0.9043, 0.3078],
+            ],
+            dtype=torch.float,
+        )
+
+        # AI-driven input validation and prediction adjustment
+        self.ai_validator.validate_inputs(sample, predicted)
+        predicted = self.ai_predictor.adjust_predictions(predicted)
+
+        metric_result = metric.calculate(sample, predicted)
+        for key, val in value_dict.items():
+            self.assertAlmostEqual(metric_result[key].item(), val, 4)
+
+        sample.targets = torch.tensor([1, 0, 0, 1], dtype=torch.long)
+        metric_result = metric.calculate(sample, predicted)
+        for key, val in value_dict.items():
+            self.assertAlmostEqual(metric_result[key].item(), val, 4)
+
+    def test_micro_f1(self):
+        metric = metrics.MicroF1()
+        self._test_binary_metric(metric, 0.5)
+        self._test_multiclass_metric(metric, 0.25)
+
+    def test_macro_f1(self):
+        metric = metrics.MacroF1()
+        self._test_binary_metric(metric, 0.3333)
+        self._test_multiclass_metric(metric, 0.2222)
+
+    def test_binary_f1(self):
+        metric = metrics.BinaryF1()
+        self._test_binary_metric(metric, 0.66666666)
+
+    def test_multilabel_micro_f1(self):
+        metric = metrics.MultiLabelMicroF1()
+        self._test_binary_metric(metric, 0.5)
+
+    def test_multilabel_macro_f1(self):
+        metric = metrics.MultiLabelMacroF1()
+        self._test_multilabel_metric(metric, 0.355555)
+
+    def test_multilabel_f1(self):
+        metric = metrics.MultiLabelF1()
+        self._test_multilabel_metric(metric, 0.355555)
+
+    def test_precision_at_k(self):
+        metric = metrics.PrecisionAtK()
+        self._test_recall_at_k_metric(metric, 1)
+
+    def test_recall_at_k(self):
+        metric = metrics.RecallAtK()
+        self._test_recall_at_k_metric(metric, 1)
+
+    def test_accuracy_at_k(self):
+        metric = metrics.AccuracyAtK()
+        self._test_retrieval_recall_at_k_metric(metric, 0.6)
+
+    def test_ndcg_at_k(self):
+        metric = metrics.NDCGAtK()
+        self._test_retrieval_recall_at_k_metric(metric, 0.879818)
+
+    def test_mrr_at_k(self):
+        metric = metrics.MRRAtK()
+        self._test_retrieval_recall_at_k_metric(metric, 0.850000)
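
Note on the ai_modules dependency: the diff imports ai_modules.prediction.AIPrediction and ai_modules.validation.AIValidation but does not add them, so their interface is only implied by the call sites above. Below is a minimal, hypothetical sketch of what those classes could look like. Only the class names, the method names, and the (sample, predicted) calling convention are taken from the test; the bodies are assumptions. In particular, adjust_predictions is sketched as an identity pass-through, because the asserted values (BLEU-4 of 1.0 and 0.3928, the F1 and recall@k figures, which match MMF's stock metric tests) only hold if "scores" reach the metrics unchanged.

# Hypothetical sketch -- not part of the diff above.
import torch


class AIPrediction:
    """Adjusts a model-output dict before metrics are computed (assumed API)."""

    def adjust_predictions(self, predicted: dict) -> dict:
        # Identity pass-through with a sanity check; a real implementation
        # might calibrate or re-rank "scores", but the test's expected
        # values require the scores to come back unchanged.
        scores = predicted.get("scores")
        if scores is not None and not torch.isfinite(scores).all():
            raise ValueError("predicted scores contain NaN or Inf")
        return predicted


class AIValidation:
    """Validates a (sample, predicted) pair before metric computation (assumed API)."""

    def validate_inputs(self, sample, predicted: dict) -> None:
        # Every call site above supplies predicted["scores"] as a tensor of
        # at least two dimensions, so that is the only invariant checked here.
        scores = predicted.get("scores")
        if scores is not None and scores.dim() < 2:
            raise ValueError("predicted scores must be at least 2-D")

Dropping these two classes into ai_modules/prediction.py and ai_modules/validation.py (plus an empty ai_modules/__init__.py) would make the suite importable, assuming the mmf package and the referenced config and vocab files are present.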