Commit

added shorter helm versions
teo-milea committed Sep 28, 2023
1 parent 43190a0 commit 454418c
Showing 2 changed files with 742 additions and 0 deletions.
273 changes: 273 additions & 0 deletions notebooks/helm/helm_audit.ipynb
@@ -0,0 +1,273 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import glob\n",
"import json\n",
"import re\n",
"from string import punctuation\n",
"import tqdm\n",
"from collections import defaultdict\n",
"from nltk import ngrams\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# import subprocess\n",
"# helm_process = subprocess.run([\n",
"# 'python', \n",
"# '/home/teo/helm/scripts/data_overlap/compute_data_overlap_metrics.py',\n",
"# '--scenario-data',\n",
"# '/home/teo/helm/scripts/data_overlap/scenario_data',\n",
"# '--input-data',\n",
"# '/home/teo/helm/scripts/data_overlap/input.jsonl',\n",
"# '--output-stats',\n",
"# '/home/teo/helm/scripts/data_overlap/output_stats.jsonl',\n",
"# '--input-format',\n",
"# 'the_pile'\n",
"# ])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"PART_INPUT: str = \"input\"\n",
"PART_REF: str = \"references\"\n",
"\n",
"r = re.compile(r\"[\\s{}]+\".format(re.escape(punctuation)))\n",
"class hashabledict(dict):\n",
" def __hash__(self):\n",
" return hash(tuple(sorted(self.items())))\n",
"\n",
"def create_ngram_index(light_scenarios, n_values, stats_key_counts):\n",
" ngram_index = hashabledict({n: hashabledict({}) for n in n_values})\n",
" for scenario in light_scenarios:\n",
" # print(f\"Building ngram indexes for {scenario['scenario_key']}\")\n",
" for n in n_values:\n",
" stats_key = hashabledict({\n",
" 'light_scenario_key': scenario['scenario_key'], 'overlap_protocol_spec': n\n",
" })\n",
" stats_key_counts[stats_key] = len(scenario['instances'])\n",
" for instance in scenario['instances']:\n",
" id = instance['id']\n",
" assert id\n",
" \n",
" input_tokens = r.split(instance['input'].lower())\n",
" for input_ngram in ngrams(input_tokens, n):\n",
" if input_ngram not in ngram_index[n]:\n",
" ngram_index[n][input_ngram] = set()\n",
" ngram_index[n][input_ngram].add(\n",
" hashabledict({'stats_key': stats_key, 'instance_id': id, 'part': PART_INPUT})\n",
" )\n",
"\n",
" # compute reference ngrams\n",
" for reference in instance['references']:\n",
" reference_unigrams = r.split(reference.lower())\n",
" for reference_ngram in ngrams(reference_unigrams, n):\n",
" if reference_ngram not in ngram_index[n]:\n",
" ngram_index[n][reference_ngram] = set()\n",
" ngram_index[n][reference_ngram].add(\n",
" hashabledict({'stats_key': stats_key, 'instance_id': id, 'part': PART_REF})\n",
" )\n",
" return ngram_index"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def compute_document_data_overlap(document, ngram_index):\n",
" stats_key_to_input_ids = defaultdict(set)\n",
" stats_key_to_reference_ids = defaultdict(set)\n",
" document_tokens = r.split(document.lower())\n",
" for n in ngram_index.keys():\n",
" for document_ngram in ngrams(document_tokens, n):\n",
" if document_ngram in ngram_index[n]:\n",
" for entry_overlap_key in ngram_index[n][document_ngram]:\n",
" id = entry_overlap_key['instance_id']\n",
" part = entry_overlap_key['part']\n",
" if part == PART_INPUT:\n",
" stats_key_to_input_ids[entry_overlap_key['stats_key']].add(id)\n",
" elif part == PART_REF:\n",
" stats_key_to_reference_ids[entry_overlap_key['stats_key']].add(id)\n",
" return stats_key_to_input_ids, stats_key_to_reference_ids"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The input data will be loaded from ['short_input.jsonl']\n",
"Loading scenario data from /home/teo/helm/scripts/data_overlap/scenario_data\n",
"[{'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}, {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}, {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}, {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'anatomy'}}, {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'anatomy'}}, {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'anatomy'}}]\n",
"NEW KEYS: [{'light_scenario_key': {'scenario_spec': {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}, 'split': 'test'}, 'overlap_protocol_spec': 5}]\n",
"NEW INPUT IDS: set()\n",
"Written 18 results to output2.jsonl\n"
]
}
],
"source": [
"\n",
"input_data_path = \"short_input.jsonl\"\n",
"scenario_data_path = \"/home/teo/helm/scripts/data_overlap/scenario_data\"\n",
"output_path = \"output2.jsonl\"\n",
"normalization = \"default\"\n",
"N = [5, 9, 13]\n",
"\n",
"# SETUP\n",
"if os.path.isdir(input_data_path):\n",
" input_file_paths = []\n",
" for file_path in glob.iglob(os.path.join(input_data_path, \"**/*\"), recursive=True):\n",
" if os.path.isfile(file_path):\n",
" input_file_paths.append(file_path)\n",
"else:\n",
" input_file_paths = [input_data_path]\n",
"print(f\"The input data will be loaded from {input_file_paths}\")\n",
"print(f\"Loading scenario data from {scenario_data_path}\")\n",
"\n",
"light_scenarios = []\n",
"light_scenario_jsons = open(scenario_data_path, \"r\").readlines()\n",
"for light_scenario_json in light_scenario_jsons:\n",
" light_scenario_dict: dict = hashabledict(json.loads(light_scenario_json))\n",
"\n",
" light_scenario_key_dict: dict = hashabledict(light_scenario_dict[\"scenario_key\"])\n",
" # if the light_scenarios are exported from helm, they will have a scenario_spec field\n",
" scenario_spec = light_scenario_key_dict[\"scenario_spec\"]\n",
" scenario_spec_dict = hashabledict({\n",
" 'class_name': scenario_spec['class_name'],\n",
" \"args\": hashabledict(scenario_spec['args'])\n",
" })\n",
" light_scenario_key = hashabledict({\n",
" \"scenario_spec\": scenario_spec_dict, \n",
" \"split\": light_scenario_key_dict[\"split\"]\n",
" })\n",
" light_instances = [\n",
" hashabledict({\n",
" 'input': instance_dict[PART_INPUT], \n",
" 'references': instance_dict[PART_REF], \n",
" 'id': instance_dict[\"id\"]\n",
" })\n",
" for instance_dict in light_scenario_dict[\"instances\"]\n",
" ]\n",
" light_scenarios.append(hashabledict({'scenario_key': light_scenario_key, 'instances': light_instances}))\n",
"\n",
"print([x['scenario_key']['scenario_spec'] for x in light_scenarios])\n",
"\n",
"stats_key_counts = hashabledict(defaultdict(int))\n",
"ngram_index = create_ngram_index(\n",
" light_scenarios=light_scenarios, n_values=N, stats_key_counts=stats_key_counts\n",
")\n",
"\n",
"stats_key_to_input_ids = []\n",
"stats_key_to_reference_ids = []\n",
"\n",
"# BATCH PROCESSING\n",
"for input_file_index in tqdm.tqdm(\n",
" range(len(input_file_paths)), desc=\"Computing overlap stats for input files\", disable=None\n",
"):\n",
" input_file_path: str = input_file_paths[input_file_index]\n",
" with open(input_file_path, \"r\") as f:\n",
" for line in f:\n",
" document = json.loads(line)[\"text\"]\n",
" doc_input_ids, doc_ref_ids = compute_document_data_overlap(\n",
" document=document,\n",
" ngram_index=ngram_index,\n",
" )\n",
" stats_key_to_input_ids.append(doc_input_ids)\n",
" stats_key_to_reference_ids.append(doc_ref_ids)\n",
" \n",
"# AGGREGATION\n",
"total_input_ids = defaultdict(set)\n",
"total_reference_ids = defaultdict(set)\n",
"\n",
"# for d in stats_key_to_input_ids:\n",
"# if len(d) > 0:\n",
"# print(\"OLD INPUT IDS:\", d)\n",
"\n",
"for d in stats_key_to_input_ids:\n",
" for key in d:\n",
" new_set = set()\n",
" if key in total_input_ids:\n",
" new_set = total_input_ids[key]\n",
" new_set.union(d[key])\n",
" total_input_ids[key] = new_set\n",
"\n",
"print(\"NEW KEYS:\", list(total_input_ids.keys()))\n",
"for d in total_input_ids:\n",
" print(\"NEW INPUT IDS:\", total_input_ids[d])\n",
" \n",
"for d in stats_key_to_reference_ids:\n",
" for key in d:\n",
" # new_set = set()\n",
" # if key in total_reference_ids:\n",
" # new_set = total_reference_ids[key]\n",
" total_reference_ids[key].union(d[key])\n",
" # total_reference_ids[key] = new_set\n",
" \n",
"all_data_overlap_stats = []\n",
"for stats_key, count in stats_key_counts.items():\n",
" data_overlap_stats = {\n",
" 'data_overlap_stats_key': stats_key,\n",
" 'instance_ids_with_overlapping_input': sorted(total_input_ids[stats_key]),\n",
" 'instance_ids_with_overlapping_reference': sorted(total_reference_ids[stats_key]),\n",
" 'num_instances': count,\n",
" }\n",
" all_data_overlap_stats.append(data_overlap_stats)\n",
"\n",
"with open(output_path, \"w\") as f:\n",
" f.writelines(\n",
" f\"{json.dumps(data_overlap_stats)}\\n\" for data_overlap_stats in all_data_overlap_stats\n",
" )\n",
"print(f\"Written {len(all_data_overlap_stats)} results to {output_path}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "syft_3.11",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}