Skip to content

Commit

Permalink
found bug with serialization for hashabledict
Browse files Browse the repository at this point in the history
  • Loading branch information
teo-milea committed Sep 29, 2023
1 parent 3acff4d commit 9a58e2c
Show file tree
Hide file tree
Showing 3 changed files with 248 additions and 170 deletions.
146 changes: 91 additions & 55 deletions notebooks/helm/helm_audit.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,56 +20,42 @@
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The input data will be loaded from ['short_input.jsonl']\n",
"Loading scenario data from /home/teo/helm/scripts/data_overlap/scenario_data\n",
"Initializing the stats, ngram_index, and ngram_counter {\n",
" Building ngram indexes for LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', args={'subject': 'philosophy'}), split='train')\n",
" Building ngram indexes for LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', args={'subject': 'philosophy'}), split='valid')\n",
" Building ngram indexes for LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', args={'subject': 'philosophy'}), split='test')\n",
" Building ngram indexes for LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', args={'subject': 'anatomy'}), split='train')\n",
" Building ngram indexes for LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', args={'subject': 'anatomy'}), split='valid')\n",
" Building ngram indexes for LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', args={'subject': 'anatomy'}), split='test')\n",
"} [0.076s]\n",
"Computing overlap stats {\n",
"} [0.001s]\n",
"Written 18 results to /home/teo/helm/scripts/data_overlap/output_stats.jsonl\n"
]
}
],
"outputs": [],
"source": [
"import subprocess\n",
"helm_process = subprocess.run([\n",
" 'python', \n",
" '/home/teo/helm/scripts/data_overlap/compute_data_overlap_metrics.py',\n",
" '--scenario-data',\n",
" '/home/teo/helm/scripts/data_overlap/scenario_data',\n",
" '--input-data',\n",
" 'short_input.jsonl',\n",
" '--output-stats',\n",
" '/home/teo/helm/scripts/data_overlap/output_stats.jsonl',\n",
" '--input-format',\n",
" 'the_pile'\n",
" ])"
"# import subprocess\n",
"# helm_process = subprocess.run([\n",
"# 'python', \n",
"# '/home/teo/helm/scripts/data_overlap/compute_data_overlap_metrics.py',\n",
"# '--scenario-data',\n",
"# '/home/teo/helm/scripts/data_overlap/scenario_data',\n",
"# '--input-data',\n",
"# 'short_input.jsonl',\n",
"# '--output-stats',\n",
"# '/home/teo/helm/scripts/data_overlap/output_stats.jsonl',\n",
"# '--input-format',\n",
"# 'the_pile'\n",
"# ])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"kj/filesystem-disk-unix.c++:1703: warning: PWD environment variable doesn't match current directory; pwd = /home/teo/OpenMined/PySyft\n"
]
}
],
"source": [
"PART_INPUT: str = \"input\"\n",
"PART_REF: str = \"references\"\n",
"\n",
"r = re.compile(r\"[\\s{}]+\".format(re.escape(punctuation)))\n",
"class hashabledict(dict):\n",
" def __hash__(self):\n",
" return hash(tuple(sorted(self.items())))\n",
"from syft.service.code.user_code import hashabledict\n",
"\n",
"def create_ngram_index(light_scenarios, n_values, stats_key_counts):\n",
" ngram_index = hashabledict({n: hashabledict({}) for n in n_values})\n",
Expand Down Expand Up @@ -129,30 +115,24 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The input data will be loaded from ['short_input.jsonl']\n",
"Loading scenario data from /home/teo/helm/scripts/data_overlap/scenario_data\n",
"Loading scenario data from /home/teo/helm/scripts/data_overlap/scenario_data.jsonl\n",
"[{'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}, {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}, {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}, {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'anatomy'}}, {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'anatomy'}}, {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'anatomy'}}]\n",
"dict_keys([])\n",
"dict_keys([])\n",
"dict_keys([{'light_scenario_key': {'scenario_spec': {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}, 'split': 'test'}, 'overlap_protocol_spec': 5}])\n",
"dict_keys([])\n",
"dict_keys([])\n",
"{'id328'} {'light_scenario_key': {'scenario_spec': {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}, 'split': 'test'}, 'overlap_protocol_spec': 5}\n",
"Written 18 results to output2.jsonl\n"
]
}
],
"source": [
"\n",
"input_data_path = \"short_input.jsonl\"\n",
"scenario_data_path = \"/home/teo/helm/scripts/data_overlap/scenario_data\"\n",
"scenario_data_path = \"/home/teo/helm/scripts/data_overlap/scenario_data.jsonl\"\n",
"output_path = \"output2.jsonl\"\n",
"normalization = \"default\"\n",
"N = [5, 9, 13]\n",
Expand Down Expand Up @@ -224,18 +204,12 @@
"total_reference_ids = defaultdict(set)\n",
"\n",
"for d in stats_key_to_input_ids:\n",
" print(d.keys())\n",
"\n",
"for d in stats_key_to_input_ids:\n",
" for key in d:\n",
" new_set = set()\n",
" if key in total_input_ids:\n",
" new_set = total_input_ids[key]\n",
" new_set = new_set.union(d[key])\n",
" total_input_ids[key] = new_set\n",
" \n",
"for d in total_input_ids:\n",
" print(total_input_ids[d], d)\n",
"\n",
"for d in stats_key_to_reference_ids:\n",
" for key in d:\n",
Expand Down Expand Up @@ -266,10 +240,72 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": []
"source": [
"import syft as sy "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{{'stats_key': {'light_scenario_key': {'scenario_spec': {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario',\n",
" 'args': {'subject': 'philosophy'}},\n",
" 'split': 'train'},\n",
" 'overlap_protocol_spec': {'n': 5}},\n",
" 'instance_id': 'id0',\n",
" 'part': 'references'}}"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(list(ngram_index.values())[0].values())[0]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"dummy = {'key': \"value\"}\n",
"dummy = hashabledict({'key': \"value\"})"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"{}"
],
"text/plain": [
"{}"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ser = sy.serialize(dummy, to_bytes=True)\n",
"msg = sy.deserialize(ser, from_bytes=True)\n",
"msg"
]
}
],
"metadata": {
Expand Down
Loading

0 comments on commit 9a58e2c

Please sign in to comment.