revert to using Languages to get stop words

FluxML · Jun 11, 2024 · d2c09fc · d2c09fc
1 parent 16d8c17
commit d2c09fc
Show file tree

Hide file tree

Showing 4 changed files with 14 additions and 38 deletions.
diff --git a/docs/src/full tutorials/Spam Detection with RNNs/notebook.ipynb b/docs/src/full tutorials/Spam Detection with RNNs/notebook.ipynb
@@ -43,15 +43,7 @@
    "metadata": {}
   },
   {
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[ Info: Precompiling WordTokenizers [796a5d58-b03d-544a-977e-18100b691f6e]\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "cell_type": "code",
    "source": [
     "using MLJ\n",
@@ -106,7 +98,7 @@
      "data": {
       "text/plain": "\u001b[1m5×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m Category \u001b[0m\u001b[1m Message                           \u001b[0m\n     │\u001b[90m String7  \u001b[0m\u001b[90m String                            \u001b[0m\n─────┼─────────────────────────────────────────────\n   1 │ ham       Go until jurong point, crazy.. A…\n   2 │ ham       Ok lar... Joking wif u oni...\n   3 │ spam      Free entry in 2 a wkly comp to w…\n   4 │ ham       U dun say so early hor... U c al…\n   5 │ ham       Nah I don't think he goes to usf…",
       "text/html": [
-       "<div><div style = \"float: left;\"><span>5×2 DataFrame</span></div><div style = \"clear: both;\"></div></div><div class = \"data-frame\" style = \"overflow-x: scroll;\"><table class = \"data-frame\" style = \"margin-bottom: 6px;\"><thead><tr class = \"header\"><th class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">Row</th><th style = \"text-align: left;\">Category</th><th style = \"text-align: left;\">Message</th></tr><tr class = \"subheader headerLastRow\"><th class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\"></th><th title = \"InlineStrings.String7\" style = \"text-align: left;\">String7</th><th title = \"String\" style = \"text-align: left;\">String</th></tr></thead><tbody><tr><td class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">1</td><td style = \"text-align: left;\">ham</td><td style = \"text-align: left;\">Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...</td></tr><tr><td class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">2</td><td style = \"text-align: left;\">ham</td><td style = \"text-align: left;\">Ok lar... Joking wif u oni...</td></tr><tr><td class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">3</td><td style = \"text-align: left;\">spam</td><td style = \"text-align: left;\">Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&amp;C&apos;s apply 08452810075over18&apos;s</td></tr><tr><td class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">4</td><td style = \"text-align: left;\">ham</td><td style = \"text-align: left;\">U dun say so early hor... U c already then say...</td></tr><tr><td class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">5</td><td style = \"text-align: left;\">ham</td><td style = \"text-align: left;\">Nah I don&apos;t think he goes to usf, he lives around here though</td></tr></tbody></table></div>"
+       "<div><div style = \"float: left;\"><span>5×2 DataFrame</span></div><div style = \"clear: both;\"></div></div><div class = \"data-frame\" style = \"overflow-x: scroll;\"><table class = \"data-frame\" style = \"margin-bottom: 6px;\"><thead><tr class = \"header\"><th class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">Row</th><th style = \"text-align: left;\">Category</th><th style = \"text-align: left;\">Message</th></tr><tr class = \"subheader headerLastRow\"><th class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\"></th><th title = \"String7\" style = \"text-align: left;\">String7</th><th title = \"String\" style = \"text-align: left;\">String</th></tr></thead><tbody><tr><td class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">1</td><td style = \"text-align: left;\">ham</td><td style = \"text-align: left;\">Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...</td></tr><tr><td class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">2</td><td style = \"text-align: left;\">ham</td><td style = \"text-align: left;\">Ok lar... Joking wif u oni...</td></tr><tr><td class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">3</td><td style = \"text-align: left;\">spam</td><td style = \"text-align: left;\">Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&amp;C&apos;s apply 08452810075over18&apos;s</td></tr><tr><td class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">4</td><td style = \"text-align: left;\">ham</td><td style = \"text-align: left;\">U dun say so early hor... U c already then say...</td></tr><tr><td class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">5</td><td style = \"text-align: left;\">ham</td><td style = \"text-align: left;\">Nah I don&apos;t think he goes to usf, he lives around here though</td></tr></tbody></table></div>"
       ]
      },
      "metadata": {},
@@ -157,17 +149,13 @@
    ],
    "cell_type": "code",
    "source": [
-    "const STOP_WORDS = [\"a\", \"an\", \"and\", \"are\", \"as\", \"at\", \"be\", \"but\", \"by\", \"for\",\n",
-    "                    \"if\", \"in\", \"into\", \"is\", \"it\", \"no\", \"not\", \"of\", \"on\", \"or\",\n",
-    "                    \"such\", \"that\", \"the\", \"their\", \"then\", \"there\", \"these\", \"they\",\n",
-    "                    \"this\", \"to\", \"was\", \"will\", \"with\"]\n",
+    "const STOP_WORDS = Languages.stopwords(Languages.English())\n",
     "\n",
     "function preprocess_text(text)\n",
     "    # (1) Splitting texts into words (so later it can be a sequence of vectors)\n",
     "    tokens = WordTokenizers.tokenize(text)\n",
     "\n",
     "    # (2) Stop word removal\n",
-    "    top_words = Languages.stopwords(Languages.English())\n",
     "    filtered_tokens = filter(token -> !(token in STOP_WORDS), tokens)\n",
     "\n",
     "    return filtered_tokens\n",
@@ -308,7 +296,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[\"Que\", \"pases\", \"un\", \"buen\", \"tiempo\", \"something\", \"like\"] is ham\n"
+      "[\"Que\", \"pases\", \"un\", \"buen\", \"tiempo\"] is ham\n"
      ]
     }
    ],
@@ -360,7 +348,7 @@
     {
      "output_type": "execute_result",
      "data": {
-      "text/plain": "5-element Vector{Vector{Int64}}:\n [1, 2, 3, 4, 5, 6, 7, 10734, 10734, 10734, 10734, 10734]\n [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 15, 18]\n [40, 41, 42, 43, 44, 40, 45, 46, 47, 10734, 10734, 10734]\n [48, 28, 40, 49, 50, 51, 52, 10734, 10734, 10734, 10734, 10734]\n [48, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]"
+      "text/plain": "5-element Vector{Vector{Int64}}:\n [1, 2, 3, 4, 5, 10404, 10404, 10404, 10404, 10404, 10404, 10404]\n [6, 7, 8, 9, 10, 11, 12, 13, 11, 14, 15, 16]\n [36, 37, 38, 39, 36, 40, 41, 42, 10404, 10404, 10404, 10404]\n [43, 24, 36, 44, 45, 46, 10404, 10404, 10404, 10404, 10404, 10404]\n [43, 47, 48, 49, 50, 51, 52, 53, 54, 55, 44, 45]"
      },
      "metadata": {},
      "execution_count": 11
@@ -524,7 +512,7 @@
     {
      "output_type": "execute_result",
      "data": {
-      "text/plain": "NeuralNetworkClassifier(\n  builder = GenericBuilder(\n        apply = Main.var\"##265\".var\"#15#16\"()), \n  finaliser = NNlib.softmax, \n  optimiser = Adam(0.1, (0.9, 0.999), 1.0e-8), \n  loss = Flux.Losses.crossentropy, \n  epochs = 10, \n  batch_size = 128, \n  lambda = 0.0, \n  alpha = 0.0, \n  rng = Random.TaskLocalRNG(), \n  optimiser_changes_trigger_retraining = false, \n  acceleration = CPU1{Nothing}(nothing))"
+      "text/plain": "NeuralNetworkClassifier(\n  builder = GenericBuilder(\n        apply = Main.var\"##280\".var\"#15#16\"()), \n  finaliser = NNlib.softmax, \n  optimiser = Adam(0.1, (0.9, 0.999), 1.0e-8), \n  loss = Flux.Losses.crossentropy, \n  epochs = 10, \n  batch_size = 128, \n  lambda = 0.0, \n  alpha = 0.0, \n  rng = Random.TaskLocalRNG(), \n  optimiser_changes_trigger_retraining = false, \n  acceleration = CPU1{Nothing}(nothing))"
      },
      "metadata": {},
      "execution_count": 16
@@ -555,7 +543,7 @@
     {
      "output_type": "execute_result",
      "data": {
-      "text/plain": "untrained Machine; caches model-specific representations of data\n  model: NeuralNetworkClassifier(builder = GenericBuilder(apply = #15), …)\n  args: \n    1:\tSource @338 ⏎ AbstractMatrix{ScientificTypesBase.Continuous}\n    2:\tSource @119 ⏎ AbstractVector{ScientificTypesBase.Multiclass{2}}\n"
+      "text/plain": "untrained Machine; caches model-specific representations of data\n  model: NeuralNetworkClassifier(builder = GenericBuilder(apply = #15), …)\n  args: \n    1:\tSource @330 ⏎ AbstractMatrix{Continuous}\n    2:\tSource @964 ⏎ AbstractVector{Multiclass{2}}\n"
      },
      "metadata": {},
      "execution_count": 17
@@ -583,13 +571,13 @@
      "output_type": "stream",
      "text": [
       "[ Info: Training machine(NeuralNetworkClassifier(builder = GenericBuilder(apply = #15), …), …).\n",
-      "\rOptimising neural net:  18%[====>                    ]  ETA: 0:00:43\u001b[K\rOptimising neural net:  27%[======>                  ]  ETA: 0:00:30\u001b[K\rOptimising neural net:  36%[=========>               ]  ETA: 0:00:23\u001b[K\rOptimising neural net:  45%[===========>             ]  ETA: 0:00:18\u001b[K\rOptimising neural net:  55%[=============>           ]  ETA: 0:00:14\u001b[K\rOptimising neural net:  64%[===============>         ]  ETA: 0:00:10\u001b[K\rOptimising neural net:  73%[==================>      ]  ETA: 0:00:07\u001b[K\rOptimising neural net:  82%[====================>    ]  ETA: 0:00:05\u001b[K\rOptimising neural net:  91%[======================>  ]  ETA: 0:00:02\u001b[K\rOptimising neural net: 100%[=========================] Time: 0:00:25\u001b[K\n"
+      "\rOptimising neural net:  18%[====>                    ]  ETA: 0:00:13\u001b[K\rOptimising neural net:  27%[======>                  ]  ETA: 0:00:12\u001b[K\rOptimising neural net:  36%[=========>               ]  ETA: 0:00:11\u001b[K\rOptimising neural net:  45%[===========>             ]  ETA: 0:00:10\u001b[K\rOptimising neural net:  55%[=============>           ]  ETA: 0:00:08\u001b[K\rOptimising neural net:  64%[===============>         ]  ETA: 0:00:07\u001b[K\rOptimising neural net:  73%[==================>      ]  ETA: 0:00:05\u001b[K\rOptimising neural net:  82%[====================>    ]  ETA: 0:00:03\u001b[K\rOptimising neural net:  91%[======================>  ]  ETA: 0:00:02\u001b[K\rOptimising neural net: 100%[=========================] Time: 0:00:18\u001b[K\n"
      ]
     },
     {
      "output_type": "execute_result",
      "data": {
-      "text/plain": "trained Machine; caches model-specific representations of data\n  model: NeuralNetworkClassifier(builder = GenericBuilder(apply = #15), …)\n  args: \n    1:\tSource @338 ⏎ AbstractMatrix{ScientificTypesBase.Continuous}\n    2:\tSource @119 ⏎ AbstractVector{ScientificTypesBase.Multiclass{2}}\n"
+      "text/plain": "trained Machine; caches model-specific representations of data\n  model: NeuralNetworkClassifier(builder = GenericBuilder(apply = #15), …)\n  args: \n    1:\tSource @330 ⏎ AbstractMatrix{Continuous}\n    2:\tSource @964 ⏎ AbstractVector{Multiclass{2}}\n"
      },
      "metadata": {},
      "execution_count": 18
@@ -614,7 +602,7 @@
     {
      "output_type": "execute_result",
      "data": {
-      "text/plain": "0.8672080465558729"
+      "text/plain": "0.9468762240501374"
      },
      "metadata": {},
      "execution_count": 19
@@ -641,7 +629,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "SMS: `Hi elaine, is today's meeting confirmed?` and the prediction is `CategoricalArrays.CategoricalValue{InlineStrings.String7, UInt32}[InlineStrings.String7(\"ham\")]`"
+      "SMS: `Hi elaine, is today's meeting confirmed?` and the prediction is `CategoricalArrays.CategoricalValue{String7, UInt32}[String7(\"ham\")]`"
      ]
     }
    ],

diff --git a/docs/src/full tutorials/Spam Detection with RNNs/notebook.jl b/docs/src/full tutorials/Spam Detection with RNNs/notebook.jl
@@ -40,17 +40,13 @@ first(df, 5)
 
 # - Return the filtered vector of words
 
-const STOP_WORDS = ["a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
-                    "if", "in", "into", "is", "it", "no", "not", "of", "on", "or",
-                    "such", "that", "the", "their", "then", "there", "these", "they",
-                    "this", "to", "was", "will", "with"]
+const STOP_WORDS = Languages.stopwords(Languages.English())
 
 function preprocess_text(text)
     ## (1) Splitting texts into words (so later it can be a sequence of vectors)
     tokens = WordTokenizers.tokenize(text)
 
     ## (2) Stop word removal
-    top_words = Languages.stopwords(Languages.English())
     filtered_tokens = filter(token -> !(token in STOP_WORDS), tokens)
 
     return filtered_tokens

diff --git a/docs/src/full tutorials/Spam Detection with RNNs/notebook.md b/docs/src/full tutorials/Spam Detection with RNNs/notebook.md
@@ -49,17 +49,13 @@ Let's define a function that given an SMS message would:
 - Return the filtered vector of words
 
 ````@julia
-const STOP_WORDS = ["a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
-                    "if", "in", "into", "is", "it", "no", "not", "of", "on", "or",
-                    "such", "that", "the", "their", "then", "there", "these", "they",
-                    "this", "to", "was", "will", "with"]
+const STOP_WORDS = Languages.stopwords(Languages.English())
 
 function preprocess_text(text)
     # (1) Splitting texts into words (so later it can be a sequence of vectors)
     tokens = WordTokenizers.tokenize(text)
 
     # (2) Stop word removal
-    top_words = Languages.stopwords(Languages.English())
     filtered_tokens = filter(token -> !(token in STOP_WORDS), tokens)
 
     return filtered_tokens

diff --git a/docs/src/full tutorials/Spam Detection with RNNs/notebook.unexecuted.ipynb b/docs/src/full tutorials/Spam Detection with RNNs/notebook.unexecuted.ipynb
@@ -120,17 +120,13 @@
    "outputs": [],
    "cell_type": "code",
    "source": [
-    "const STOP_WORDS = [\"a\", \"an\", \"and\", \"are\", \"as\", \"at\", \"be\", \"but\", \"by\", \"for\",\n",
-    "                    \"if\", \"in\", \"into\", \"is\", \"it\", \"no\", \"not\", \"of\", \"on\", \"or\",\n",
-    "                    \"such\", \"that\", \"the\", \"their\", \"then\", \"there\", \"these\", \"they\",\n",
-    "                    \"this\", \"to\", \"was\", \"will\", \"with\"]\n",
+    "const STOP_WORDS = Languages.stopwords(Languages.English())\n",
     "\n",
     "function preprocess_text(text)\n",
     "    # (1) Splitting texts into words (so later it can be a sequence of vectors)\n",
     "    tokens = WordTokenizers.tokenize(text)\n",
     "\n",
     "    # (2) Stop word removal\n",
-    "    top_words = Languages.stopwords(Languages.English())\n",
     "    filtered_tokens = filter(token -> !(token in STOP_WORDS), tokens)\n",
     "\n",
     "    return filtered_tokens\n",