run predictions on fixed datasets

waleko · Sep 16, 2023 · 052946f · 052946f
1 parent 36af64c
commit 052946f
Show file tree

Hide file tree

Showing 10 changed files with 38,057 additions and 212,022 deletions.
diff --git a/data/JetBrains_kotlin_1000.finetuned_pred.csv b/data/JetBrains_kotlin_1000.finetuned_pred.csv
diff --git a/data/JetBrains_kotlin_1000.hf_pred.csv b/data/JetBrains_kotlin_1000.hf_pred.csv
diff --git a/data/microsoft_vscode_1000.finetuned_pred.csv b/data/microsoft_vscode_1000.finetuned_pred.csv
diff --git a/data/microsoft_vscode_1000.hf_pred.csv b/data/microsoft_vscode_1000.hf_pred.csv
diff --git a/data/msg-test.finetuned_pred.csv b/data/msg-test.finetuned_pred.csv
diff --git a/data/msg-test.hf_pred.csv b/data/msg-test.hf_pred.csv
diff --git a/data/transloadit_uppy_1000.finetuned_pred.csv b/data/transloadit_uppy_1000.finetuned_pred.csv
diff --git a/data/transloadit_uppy_1000.hf_pred.csv b/data/transloadit_uppy_1000.hf_pred.csv
diff --git a/notebooks/2_inference.ipynb b/notebooks/2_inference.ipynb
@@ -15,12 +15,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "initial_id",
    "metadata": {
     "collapsed": true
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/conda/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
    "source": [
     "from pathlib import Path\n",
     "\n",
@@ -49,12 +58,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "ad4d16d13804be69",
    "metadata": {
     "collapsed": false
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Downloading (…)okenizer_config.json: 100%|██████████| 1.29k/1.29k [00:00<00:00, 169kB/s]\n",
+      "Downloading (…)olve/main/vocab.json: 100%|██████████| 575k/575k [00:00<00:00, 5.89MB/s]\n",
+      "Downloading (…)olve/main/merges.txt: 100%|██████████| 294k/294k [00:00<00:00, 1.09MB/s]\n",
+      "Downloading (…)in/added_tokens.json: 100%|██████████| 1.87k/1.87k [00:00<00:00, 1.13MB/s]\n",
+      "Downloading (…)cial_tokens_map.json: 100%|██████████| 913/913 [00:00<00:00, 515kB/s]\n"
+     ]
+    }
+   ],
    "source": [
     "# download tokenizer from huggingface\n",
     "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/codereviewer\")\n",
@@ -65,7 +86,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "cb003d6d8f578da1",
    "metadata": {
     "collapsed": false
@@ -99,7 +120,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "id": "d06f51b2150c61c4",
    "metadata": {
     "collapsed": false
@@ -146,7 +167,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "id": "7a5b97449733bbc6",
    "metadata": {
     "collapsed": false
@@ -194,12 +215,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "id": "c508661efcdcad40",
    "metadata": {
     "collapsed": false
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Downloading (…)lve/main/config.json: 100%|██████████| 2.13k/2.13k [00:00<00:00, 478kB/s]\n",
+      "Downloading pytorch_model.bin: 100%|██████████| 892M/892M [00:02<00:00, 306MB/s] \n",
+      "Downloading (…)neration_config.json: 100%|██████████| 168/168 [00:00<00:00, 36.7kB/s]\n",
+      "100%|██████████| 636/636 [11:08<00:00,  1.05s/it]\n",
+      "100%|██████████| 63/63 [03:31<00:00,  3.35s/it]\n",
+      "100%|██████████| 63/63 [01:57<00:00,  1.87s/it]\n",
+      "100%|██████████| 63/63 [02:39<00:00,  2.53s/it]\n"
+     ]
+    }
+   ],
    "source": [
     "# download the pretrained model from huggingface\n",
     "hf_model = AutoModelForSeq2SeqLM.from_pretrained(\"microsoft/codereviewer\")\n",
@@ -235,12 +270,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "id": "851255e54c49484a",
    "metadata": {
     "collapsed": false
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Downloading (…)lve/main/config.json: 100%|██████████| 2.13k/2.13k [00:00<00:00, 290kB/s]\n",
+      "Downloading pytorch_model.bin: 100%|██████████| 892M/892M [00:22<00:00, 39.4MB/s] \n",
+      "Some weights of the model checkpoint at waleko/codereviewer-finetuned-msg were not used when initializing T5ForConditionalGeneration: ['cls_head.bias', 'cls_head.weight']\n",
+      "- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+      "Downloading (…)neration_config.json: 100%|██████████| 168/168 [00:00<00:00, 49.2kB/s]\n",
+      "100%|██████████| 636/636 [15:58<00:00,  1.51s/it]\n",
+      "100%|██████████| 63/63 [01:41<00:00,  1.61s/it]\n",
+      "100%|██████████| 63/63 [01:32<00:00,  1.47s/it]\n",
+      "100%|██████████| 63/63 [01:27<00:00,  1.39s/it]\n"
+     ]
+    }
+   ],
    "source": [
     "# download the fine-tuned model\n",
     "ft_model = AutoModelForSeq2SeqLM.from_pretrained(\"waleko/codereviewer-finetuned-msg\")\n",