Found a serialization bug: hashabledict loses its contents (deserializes to an empty dict) after a serialize/deserialize round trip

OpenMined · Sep 29, 2023 · 9a58e2c · 9a58e2c
1 parent 3acff4d
commit 9a58e2c
Show file tree

Hide file tree

Showing 3 changed files with 248 additions and 170 deletions.
diff --git a/notebooks/helm/helm_audit.ipynb b/notebooks/helm/helm_audit.ipynb
@@ -20,56 +20,42 @@
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The input data will be loaded from ['short_input.jsonl']\n",
-      "Loading scenario data from /home/teo/helm/scripts/data_overlap/scenario_data\n",
-      "Initializing the stats, ngram_index, and ngram_counter {\n",
-      "  Building ngram indexes for LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', args={'subject': 'philosophy'}), split='train')\n",
-      "  Building ngram indexes for LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', args={'subject': 'philosophy'}), split='valid')\n",
-      "  Building ngram indexes for LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', args={'subject': 'philosophy'}), split='test')\n",
-      "  Building ngram indexes for LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', args={'subject': 'anatomy'}), split='train')\n",
-      "  Building ngram indexes for LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', args={'subject': 'anatomy'}), split='valid')\n",
-      "  Building ngram indexes for LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', args={'subject': 'anatomy'}), split='test')\n",
-      "} [0.076s]\n",
-      "Computing overlap stats {\n",
-      "} [0.001s]\n",
-      "Written 18 results to /home/teo/helm/scripts/data_overlap/output_stats.jsonl\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "import subprocess\n",
-    "helm_process = subprocess.run([\n",
-    "    'python', \n",
-    "    '/home/teo/helm/scripts/data_overlap/compute_data_overlap_metrics.py',\n",
-    "    '--scenario-data',\n",
-    "    '/home/teo/helm/scripts/data_overlap/scenario_data',\n",
-    "    '--input-data',\n",
-    "    'short_input.jsonl',\n",
-    "    '--output-stats',\n",
-    "    '/home/teo/helm/scripts/data_overlap/output_stats.jsonl',\n",
-    "    '--input-format',\n",
-    "    'the_pile'\n",
-    "    ])"
+    "# import subprocess\n",
+    "# helm_process = subprocess.run([\n",
+    "#     'python', \n",
+    "#     '/home/teo/helm/scripts/data_overlap/compute_data_overlap_metrics.py',\n",
+    "#     '--scenario-data',\n",
+    "#     '/home/teo/helm/scripts/data_overlap/scenario_data',\n",
+    "#     '--input-data',\n",
+    "#     'short_input.jsonl',\n",
+    "#     '--output-stats',\n",
+    "#     '/home/teo/helm/scripts/data_overlap/output_stats.jsonl',\n",
+    "#     '--input-format',\n",
+    "#     'the_pile'\n",
+    "#     ])"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "kj/filesystem-disk-unix.c++:1703: warning: PWD environment variable doesn't match current directory; pwd = /home/teo/OpenMined/PySyft\n"
+     ]
+    }
+   ],
    "source": [
     "PART_INPUT: str = \"input\"\n",
     "PART_REF: str = \"references\"\n",
     "\n",
     "r = re.compile(r\"[\\s{}]+\".format(re.escape(punctuation)))\n",
-    "class hashabledict(dict):\n",
-    "    def __hash__(self):\n",
-    "        return hash(tuple(sorted(self.items())))\n",
+    "from syft.service.code.user_code import hashabledict\n",
     "\n",
     "def create_ngram_index(light_scenarios, n_values, stats_key_counts):\n",
     "    ngram_index = hashabledict({n: hashabledict({}) for n in n_values})\n",
@@ -129,30 +115,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "The input data will be loaded from ['short_input.jsonl']\n",
-      "Loading scenario data from /home/teo/helm/scripts/data_overlap/scenario_data\n",
+      "Loading scenario data from /home/teo/helm/scripts/data_overlap/scenario_data.jsonl\n",
       "[{'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}, {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}, {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}, {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'anatomy'}}, {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'anatomy'}}, {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'anatomy'}}]\n",
-      "dict_keys([])\n",
-      "dict_keys([])\n",
-      "dict_keys([{'light_scenario_key': {'scenario_spec': {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}, 'split': 'test'}, 'overlap_protocol_spec': 5}])\n",
-      "dict_keys([])\n",
-      "dict_keys([])\n",
-      "{'id328'} {'light_scenario_key': {'scenario_spec': {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}, 'split': 'test'}, 'overlap_protocol_spec': 5}\n",
       "Written 18 results to output2.jsonl\n"
      ]
     }
    ],
    "source": [
     "\n",
     "input_data_path = \"short_input.jsonl\"\n",
-    "scenario_data_path = \"/home/teo/helm/scripts/data_overlap/scenario_data\"\n",
+    "scenario_data_path = \"/home/teo/helm/scripts/data_overlap/scenario_data.jsonl\"\n",
     "output_path = \"output2.jsonl\"\n",
     "normalization = \"default\"\n",
     "N = [5, 9, 13]\n",
@@ -224,18 +204,12 @@
     "total_reference_ids = defaultdict(set)\n",
     "\n",
     "for d in stats_key_to_input_ids:\n",
-    "    print(d.keys())\n",
-    "\n",
-    "for d in stats_key_to_input_ids:\n",
     "    for key in d:\n",
     "        new_set = set()\n",
     "        if key in total_input_ids:\n",
     "           new_set = total_input_ids[key]\n",
     "        new_set = new_set.union(d[key])\n",
     "        total_input_ids[key] = new_set\n",
-    "        \n",
-    "for d in total_input_ids:\n",
-    "    print(total_input_ids[d], d)\n",
     "\n",
     "for d in stats_key_to_reference_ids:\n",
     "    for key in d:\n",
@@ -266,10 +240,72 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "import syft as sy "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{{'stats_key': {'light_scenario_key': {'scenario_spec': {'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario',\n",
+       "     'args': {'subject': 'philosophy'}},\n",
+       "    'split': 'train'},\n",
+       "   'overlap_protocol_spec': {'n': 5}},\n",
+       "  'instance_id': 'id0',\n",
+       "  'part': 'references'}}"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "list(list(ngram_index.values())[0].values())[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dummy = {'key': \"value\"}\n",
+    "dummy = hashabledict({'key': \"value\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "{}"
+      ],
+      "text/plain": [
+       "{}"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ser = sy.serialize(dummy, to_bytes=True)\n",
+    "msg = sy.deserialize(ser, from_bytes=True)\n",
+    "msg"
+   ]
   }
  ],
  "metadata": {