added shorter helm versions

OpenMined · Sep 28, 2023 · 454418c · 454418c
1 parent 43190a0
commit 454418c
Show file tree

Hide file tree

Showing 2 changed files with 742 additions and 0 deletions.
diff --git a/notebooks/helm/helm_audit.ipynb b/notebooks/helm/helm_audit.ipynb
@@ -0,0 +1,273 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import glob\n",
+    "import json\n",
+    "import re\n",
+    "from string import punctuation\n",
+    "import tqdm\n",
+    "from collections import defaultdict\n",
+    "from nltk import ngrams\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import subprocess\n",
+    "# helm_process = subprocess.run([\n",
+    "#     'python', \n",
+    "#     '/home/teo/helm/scripts/data_overlap/compute_data_overlap_metrics.py',\n",
+    "#     '--scenario-data',\n",
+    "#     '/home/teo/helm/scripts/data_overlap/scenario_data',\n",
+    "#     '--input-data',\n",
+    "#     '/home/teo/helm/scripts/data_overlap/input.jsonl',\n",
+    "#     '--output-stats',\n",
+    "#     '/home/teo/helm/scripts/data_overlap/output_stats.jsonl',\n",
+    "#     '--input-format',\n",
+    "#     'the_pile'\n",
+    "#     ])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Labels for the part of a scenario instance an n-gram came from; the same\n",
+    "# strings are the field names read from each instance of the scenario data.\n",
+    "PART_INPUT: str = \"input\"\n",
+    "PART_REF: str = \"references\"\n",
+    "\n",
+    "# Tokeniser: split on any run of whitespace and/or string.punctuation characters.\n",
+    "r = re.compile(r\"[\\s\" + re.escape(punctuation) + r\"]+\")\n",
+    "class hashabledict(dict):\n",
+    "    \"\"\"A dict that can be used as a dict key or stored in a set.\n",
+    "\n",
+    "    The hash is derived from the sorted (key, value) pairs, so the keys must be\n",
+    "    mutually orderable and every value must itself be hashable whenever the\n",
+    "    hash is actually taken.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __hash__(self):\n",
+    "        ordered_items = sorted(self.items())\n",
+    "        return hash(tuple(ordered_items))\n",
+    "\n",
+    "def create_ngram_index(light_scenarios, n_values, stats_key_counts):\n",
+    "    \"\"\"Index every n-gram of every scenario instance, for each n in ``n_values``.\n",
+    "\n",
+    "    Args:\n",
+    "        light_scenarios: iterable of mappings with a 'scenario_key' and a list of\n",
+    "            'instances'; each instance provides 'id', 'input' (str) and\n",
+    "            'references' (iterable of str).\n",
+    "        n_values: the n-gram sizes to index.\n",
+    "        stats_key_counts: mapping updated in place; for each (scenario key, n)\n",
+    "            stats key it receives the number of instances of that scenario.\n",
+    "\n",
+    "    Returns:\n",
+    "        A mapping n -> {n-gram -> set of entries}. Each entry is a hashabledict\n",
+    "        with 'stats_key', 'instance_id' and 'part' (PART_INPUT or PART_REF).\n",
+    "    \"\"\"\n",
+    "    ngram_index = hashabledict({n: hashabledict({}) for n in n_values})\n",
+    "\n",
+    "    def add_text(text, n, entry):\n",
+    "        # Lowercase the text, tokenise it with the module-level regex `r`, and\n",
+    "        # record `entry` under every n-gram of the resulting token list.\n",
+    "        bucket = ngram_index[n]\n",
+    "        for ngram in ngrams(r.split(text.lower()), n):\n",
+    "            bucket.setdefault(ngram, set()).add(entry)\n",
+    "\n",
+    "    for scenario in light_scenarios:\n",
+    "        for n in n_values:\n",
+    "            stats_key = hashabledict({\n",
+    "                'light_scenario_key': scenario['scenario_key'], 'overlap_protocol_spec': n\n",
+    "            })\n",
+    "            instances = scenario['instances']\n",
+    "            stats_key_counts[stats_key] = len(instances)\n",
+    "            for instance in instances:\n",
+    "                instance_id = instance['id']\n",
+    "                assert instance_id\n",
+    "\n",
+    "                # n-grams of the instance input\n",
+    "                add_text(\n",
+    "                    instance['input'],\n",
+    "                    n,\n",
+    "                    hashabledict({'stats_key': stats_key, 'instance_id': instance_id, 'part': PART_INPUT}),\n",
+    "                )\n",
+    "\n",
+    "                # n-grams of each reference\n",
+    "                for reference in instance['references']:\n",
+    "                    add_text(\n",
+    "                        reference,\n",
+    "                        n,\n",
+    "                        hashabledict({'stats_key': stats_key, 'instance_id': instance_id, 'part': PART_REF}),\n",
+    "                    )\n",
+    "    return ngram_index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def compute_document_data_overlap(document, ngram_index):\n",
+    "    \"\"\"Look up every n-gram of ``document`` in ``ngram_index``.\n",
+    "\n",
+    "    Args:\n",
+    "        document: text to check; it is lowercased and tokenised with the\n",
+    "            module-level regex `r`.\n",
+    "        ngram_index: mapping n -> {n-gram -> iterable of entries}, where each\n",
+    "            entry has 'stats_key', 'instance_id' and 'part'.\n",
+    "\n",
+    "    Returns:\n",
+    "        A pair of defaultdict(set): stats key -> ids of instances whose input\n",
+    "        shares an n-gram with the document, and the same for references.\n",
+    "    \"\"\"\n",
+    "    input_hits = defaultdict(set)\n",
+    "    reference_hits = defaultdict(set)\n",
+    "    tokens = r.split(document.lower())\n",
+    "    for n, entries_by_ngram in ngram_index.items():\n",
+    "        for gram in ngrams(tokens, n):\n",
+    "            # An n-gram missing from the index simply contributes nothing.\n",
+    "            for entry in entries_by_ngram.get(gram, ()):\n",
+    "                instance_id = entry['instance_id']\n",
+    "                part = entry['part']\n",
+    "                if part == PART_INPUT:\n",
+    "                    input_hits[entry['stats_key']].add(instance_id)\n",
+    "                elif part == PART_REF:\n",
+    "                    reference_hits[entry['stats_key']].add(instance_id)\n",
+    "    return input_hits, reference_hits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The input data will be loaded from ['short_input.jsonl']\n",
+      "Loading scenario data from /home/teo/helm/scripts/data_overlap/scenario_data\n",
+      "[{'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}, {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}, {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}, {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'anatomy'}}, {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'anatomy'}}, {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'anatomy'}}]\n",
+      "NEW KEYS: [{'light_scenario_key': {'scenario_spec': {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}, 'split': 'test'}, 'overlap_protocol_spec': 5}]\n",
+      "NEW INPUT IDS: set()\n",
+      "Written 18 results to output2.jsonl\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "input_data_path = \"short_input.jsonl\"\n",
+    "scenario_data_path = \"/home/teo/helm/scripts/data_overlap/scenario_data\"\n",
+    "output_path = \"output2.jsonl\"\n",
+    "normalization = \"default\"\n",
+    "N = [5, 9, 13]  # n-gram sizes handed to create_ngram_index\n",
+    "\n",
+    "# SETUP\n",
+    "# A directory is expanded recursively into the regular files beneath it;\n",
+    "# any other path is used as the single input file.\n",
+    "if not os.path.isdir(input_data_path):\n",
+    "    input_file_paths = [input_data_path]\n",
+    "else:\n",
+    "    input_file_paths = [\n",
+    "        candidate\n",
+    "        for candidate in glob.iglob(os.path.join(input_data_path, \"**/*\"), recursive=True)\n",
+    "        if os.path.isfile(candidate)\n",
+    "    ]\n",
+    "print(f\"The input data will be loaded from {input_file_paths}\")\n",
+    "print(f\"Loading scenario data from {scenario_data_path}\")\n",
+    "\n",
+    "# Parse the scenario file (one JSON object per line) into hashable light scenarios.\n",
+    "light_scenarios = []\n",
+    "# Read inside a context manager so the handle is closed as soon as the lines\n",
+    "# are in memory; the bare open(...).readlines() never closed it explicitly.\n",
+    "with open(scenario_data_path, \"r\") as scenario_file:\n",
+    "    light_scenario_jsons = scenario_file.readlines()\n",
+    "for light_scenario_json in light_scenario_jsons:\n",
+    "    if not light_scenario_json.strip():\n",
+    "        # A blank line (e.g. a trailing newline) would make json.loads raise.\n",
+    "        continue\n",
+    "    light_scenario_dict: dict = hashabledict(json.loads(light_scenario_json))\n",
+    "\n",
+    "    light_scenario_key_dict: dict = hashabledict(light_scenario_dict[\"scenario_key\"])\n",
+    "    # if the light_scenarios are exported from helm, they will have a scenario_spec field\n",
+    "    scenario_spec = light_scenario_key_dict[\"scenario_spec\"]\n",
+    "    # Rebuild the nested dicts as hashabledict so the scenario key can later be\n",
+    "    # hashed as part of a stats key.\n",
+    "    scenario_spec_dict = hashabledict({\n",
+    "        'class_name': scenario_spec['class_name'],\n",
+    "        \"args\": hashabledict(scenario_spec['args'])\n",
+    "        })\n",
+    "    light_scenario_key = hashabledict({\n",
+    "        \"scenario_spec\": scenario_spec_dict, \n",
+    "        \"split\": light_scenario_key_dict[\"split\"]\n",
+    "        })\n",
+    "    # Keep only the fields the overlap computation reads from each instance.\n",
+    "    light_instances = [\n",
+    "        hashabledict({\n",
+    "            'input': instance_dict[PART_INPUT], \n",
+    "            'references': instance_dict[PART_REF], \n",
+    "            'id': instance_dict[\"id\"]\n",
+    "        })\n",
+    "        for instance_dict in light_scenario_dict[\"instances\"]\n",
+    "    ]\n",
+    "    light_scenarios.append(hashabledict({'scenario_key': light_scenario_key, 'instances': light_instances}))\n",
+    "\n",
+    "print([x['scenario_key']['scenario_spec'] for x in light_scenarios])\n",
+    "\n",
+    "# Build the n-gram index once; stats_key_counts is filled in as a side effect.\n",
+    "stats_key_counts = hashabledict(defaultdict(int))\n",
+    "ngram_index = create_ngram_index(light_scenarios, N, stats_key_counts)\n",
+    "\n",
+    "# One element per input document, in reading order.\n",
+    "stats_key_to_input_ids = []\n",
+    "stats_key_to_reference_ids = []\n",
+    "\n",
+    "# BATCH PROCESSING\n",
+    "for input_file_path in tqdm.tqdm(\n",
+    "    input_file_paths, desc=\"Computing overlap stats for input files\", disable=None\n",
+    "):\n",
+    "    with open(input_file_path, \"r\") as f:\n",
+    "        # Each line is a JSON object whose \"text\" field holds the document.\n",
+    "        for line in f:\n",
+    "            document = json.loads(line)[\"text\"]\n",
+    "            doc_input_ids, doc_ref_ids = compute_document_data_overlap(document, ngram_index)\n",
+    "            stats_key_to_input_ids.append(doc_input_ids)\n",
+    "            stats_key_to_reference_ids.append(doc_ref_ids)\n",
+    "            \n",
+    "# AGGREGATION\n",
+    "# Merge the per-document results into one set of instance ids per stats key.\n",
+    "total_input_ids = defaultdict(set)\n",
+    "total_reference_ids = defaultdict(set)\n",
+    "\n",
+    "for doc_input_ids in stats_key_to_input_ids:\n",
+    "    for key, instance_ids in doc_input_ids.items():\n",
+    "        # set.update mutates in place. The previous set.union call returned a\n",
+    "        # new set that was discarded, so the totals always stayed empty.\n",
+    "        total_input_ids[key].update(instance_ids)\n",
+    "\n",
+    "print(\"NEW KEYS:\", list(total_input_ids.keys()))\n",
+    "for d in total_input_ids:\n",
+    "    print(\"NEW INPUT IDS:\", total_input_ids[d])\n",
+    "\n",
+    "for doc_reference_ids in stats_key_to_reference_ids:\n",
+    "    for key, instance_ids in doc_reference_ids.items():\n",
+    "        total_reference_ids[key].update(instance_ids)\n",
+    "    \n",
+    "# One record per (scenario key, n) stats key, with sorted ids for stable output.\n",
+    "all_data_overlap_stats = [\n",
+    "    {\n",
+    "        'data_overlap_stats_key': stats_key,\n",
+    "        'instance_ids_with_overlapping_input': sorted(total_input_ids[stats_key]),\n",
+    "        'instance_ids_with_overlapping_reference': sorted(total_reference_ids[stats_key]),\n",
+    "        'num_instances': count,\n",
+    "    }\n",
+    "    for stats_key, count in stats_key_counts.items()\n",
+    "]\n",
+    "\n",
+    "# Write the records as JSON Lines: one JSON object per line.\n",
+    "with open(output_path, \"w\") as f:\n",
+    "    for data_overlap_stats in all_data_overlap_stats:\n",
+    "        f.write(json.dumps(data_overlap_stats) + \"\\n\")\n",
+    "print(f\"Written {len(all_data_overlap_stats)} results to {output_path}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "syft_3.11",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}