diff --git a/docs/src/full tutorials/Spam Detection with RNNs/notebook.ipynb b/docs/src/full tutorials/Spam Detection with RNNs/notebook.ipynb index 2d3ed3ba..80de90d4 100644 --- a/docs/src/full tutorials/Spam Detection with RNNs/notebook.ipynb +++ b/docs/src/full tutorials/Spam Detection with RNNs/notebook.ipynb @@ -43,15 +43,7 @@ "metadata": {} }, { - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ Info: Precompiling WordTokenizers [796a5d58-b03d-544a-977e-18100b691f6e]\n" - ] - } - ], + "outputs": [], "cell_type": "code", "source": [ "using MLJ\n", @@ -106,7 +98,7 @@ "data": { "text/plain": "\u001b[1m5×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m Category \u001b[0m\u001b[1m Message \u001b[0m\n │\u001b[90m String7 \u001b[0m\u001b[90m String \u001b[0m\n─────┼─────────────────────────────────────────────\n 1 │ ham Go until jurong point, crazy.. A…\n 2 │ ham Ok lar... Joking wif u oni...\n 3 │ spam Free entry in 2 a wkly comp to w…\n 4 │ ham U dun say so early hor... U c al…\n 5 │ ham Nah I don't think he goes to usf…", "text/html": [ - "
5×2 DataFrame
RowCategoryMessage
String7String
1hamGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
2hamOk lar... Joking wif u oni...
3spamFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
4hamU dun say so early hor... U c already then say...
5hamNah I don't think he goes to usf, he lives around here though
" + "
5×2 DataFrame
RowCategoryMessage
String7String
1hamGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
2hamOk lar... Joking wif u oni...
3spamFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
4hamU dun say so early hor... U c already then say...
5hamNah I don't think he goes to usf, he lives around here though
" ] }, "metadata": {}, @@ -157,17 +149,13 @@ ], "cell_type": "code", "source": [ - "const STOP_WORDS = [\"a\", \"an\", \"and\", \"are\", \"as\", \"at\", \"be\", \"but\", \"by\", \"for\",\n", - " \"if\", \"in\", \"into\", \"is\", \"it\", \"no\", \"not\", \"of\", \"on\", \"or\",\n", - " \"such\", \"that\", \"the\", \"their\", \"then\", \"there\", \"these\", \"they\",\n", - " \"this\", \"to\", \"was\", \"will\", \"with\"]\n", + "const STOP_WORDS = Languages.stopwords(Languages.English())\n", "\n", "function preprocess_text(text)\n", " # (1) Splitting texts into words (so later it can be a sequence of vectors)\n", " tokens = WordTokenizers.tokenize(text)\n", "\n", " # (2) Stop word removal\n", - " top_words = Languages.stopwords(Languages.English())\n", " filtered_tokens = filter(token -> !(token in STOP_WORDS), tokens)\n", "\n", " return filtered_tokens\n", @@ -308,7 +296,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[\"Que\", \"pases\", \"un\", \"buen\", \"tiempo\", \"something\", \"like\"] is ham\n" + "[\"Que\", \"pases\", \"un\", \"buen\", \"tiempo\"] is ham\n" ] } ], @@ -360,7 +348,7 @@ { "output_type": "execute_result", "data": { - "text/plain": "5-element Vector{Vector{Int64}}:\n [1, 2, 3, 4, 5, 6, 7, 10734, 10734, 10734, 10734, 10734]\n [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 15, 18]\n [40, 41, 42, 43, 44, 40, 45, 46, 47, 10734, 10734, 10734]\n [48, 28, 40, 49, 50, 51, 52, 10734, 10734, 10734, 10734, 10734]\n [48, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]" + "text/plain": "5-element Vector{Vector{Int64}}:\n [1, 2, 3, 4, 5, 10404, 10404, 10404, 10404, 10404, 10404, 10404]\n [6, 7, 8, 9, 10, 11, 12, 13, 11, 14, 15, 16]\n [36, 37, 38, 39, 36, 40, 41, 42, 10404, 10404, 10404, 10404]\n [43, 24, 36, 44, 45, 46, 10404, 10404, 10404, 10404, 10404, 10404]\n [43, 47, 48, 49, 50, 51, 52, 53, 54, 55, 44, 45]" }, "metadata": {}, "execution_count": 11 @@ -524,7 +512,7 @@ { "output_type": "execute_result", "data": { - "text/plain": "NeuralNetworkClassifier(\n builder = GenericBuilder(\n apply = Main.var\"##265\".var\"#15#16\"()), \n finaliser = NNlib.softmax, \n optimiser = Adam(0.1, (0.9, 0.999), 1.0e-8), \n loss = Flux.Losses.crossentropy, \n epochs = 10, \n batch_size = 128, \n lambda = 0.0, \n alpha = 0.0, \n rng = Random.TaskLocalRNG(), \n optimiser_changes_trigger_retraining = false, \n acceleration = CPU1{Nothing}(nothing))" + "text/plain": "NeuralNetworkClassifier(\n builder = GenericBuilder(\n apply = Main.var\"##280\".var\"#15#16\"()), \n finaliser = NNlib.softmax, \n optimiser = Adam(0.1, (0.9, 0.999), 1.0e-8), \n loss = Flux.Losses.crossentropy, \n epochs = 10, \n batch_size = 128, \n lambda = 0.0, \n alpha = 0.0, \n rng = Random.TaskLocalRNG(), \n optimiser_changes_trigger_retraining = false, \n acceleration = CPU1{Nothing}(nothing))" }, "metadata": {}, "execution_count": 16 @@ -555,7 +543,7 @@ { "output_type": "execute_result", "data": { - "text/plain": "untrained Machine; caches model-specific representations of data\n model: NeuralNetworkClassifier(builder = GenericBuilder(apply = #15), …)\n args: \n 1:\tSource @338 ⏎ AbstractMatrix{ScientificTypesBase.Continuous}\n 2:\tSource @119 ⏎ AbstractVector{ScientificTypesBase.Multiclass{2}}\n" + "text/plain": "untrained Machine; caches model-specific representations of data\n model: NeuralNetworkClassifier(builder = GenericBuilder(apply = #15), …)\n args: \n 1:\tSource @330 ⏎ AbstractMatrix{Continuous}\n 2:\tSource @964 ⏎ AbstractVector{Multiclass{2}}\n" }, "metadata": {}, "execution_count": 17 @@ -583,13 +571,13 @@ "output_type": "stream", "text": [ "[ Info: Training machine(NeuralNetworkClassifier(builder = GenericBuilder(apply = #15), …), …).\n", - "\rOptimising neural net: 18%[====> ] ETA: 0:00:43\u001b[K\rOptimising neural net: 27%[======> ] ETA: 0:00:30\u001b[K\rOptimising neural net: 36%[=========> ] ETA: 0:00:23\u001b[K\rOptimising neural net: 45%[===========> ] ETA: 0:00:18\u001b[K\rOptimising neural net: 55%[=============> ] ETA: 0:00:14\u001b[K\rOptimising neural net: 64%[===============> ] ETA: 0:00:10\u001b[K\rOptimising neural net: 73%[==================> ] ETA: 0:00:07\u001b[K\rOptimising neural net: 82%[====================> ] ETA: 0:00:05\u001b[K\rOptimising neural net: 91%[======================> ] ETA: 0:00:02\u001b[K\rOptimising neural net: 100%[=========================] Time: 0:00:25\u001b[K\n" + "\rOptimising neural net: 18%[====> ] ETA: 0:00:13\u001b[K\rOptimising neural net: 27%[======> ] ETA: 0:00:12\u001b[K\rOptimising neural net: 36%[=========> ] ETA: 0:00:11\u001b[K\rOptimising neural net: 45%[===========> ] ETA: 0:00:10\u001b[K\rOptimising neural net: 55%[=============> ] ETA: 0:00:08\u001b[K\rOptimising neural net: 64%[===============> ] ETA: 0:00:07\u001b[K\rOptimising neural net: 73%[==================> ] ETA: 0:00:05\u001b[K\rOptimising neural net: 82%[====================> ] ETA: 0:00:03\u001b[K\rOptimising neural net: 91%[======================> ] ETA: 0:00:02\u001b[K\rOptimising neural net: 100%[=========================] Time: 0:00:18\u001b[K\n" ] }, { "output_type": "execute_result", "data": { - "text/plain": "trained Machine; caches model-specific representations of data\n model: NeuralNetworkClassifier(builder = GenericBuilder(apply = #15), …)\n args: \n 1:\tSource @338 ⏎ AbstractMatrix{ScientificTypesBase.Continuous}\n 2:\tSource @119 ⏎ AbstractVector{ScientificTypesBase.Multiclass{2}}\n" + "text/plain": "trained Machine; caches model-specific representations of data\n model: NeuralNetworkClassifier(builder = GenericBuilder(apply = #15), …)\n args: \n 1:\tSource @330 ⏎ AbstractMatrix{Continuous}\n 2:\tSource @964 ⏎ AbstractVector{Multiclass{2}}\n" }, "metadata": {}, "execution_count": 18 @@ -614,7 +602,7 @@ { "output_type": "execute_result", "data": { - "text/plain": "0.8672080465558729" + "text/plain": "0.9468762240501374" }, "metadata": {}, "execution_count": 19 @@ -641,7 +629,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "SMS: `Hi elaine, is today's meeting confirmed?` and the prediction is `CategoricalArrays.CategoricalValue{InlineStrings.String7, UInt32}[InlineStrings.String7(\"ham\")]`" + "SMS: `Hi elaine, is today's meeting confirmed?` and the prediction is `CategoricalArrays.CategoricalValue{String7, UInt32}[String7(\"ham\")]`" ] } ], diff --git a/docs/src/full tutorials/Spam Detection with RNNs/notebook.jl b/docs/src/full tutorials/Spam Detection with RNNs/notebook.jl index 3cee22e7..14f5f49f 100644 --- a/docs/src/full tutorials/Spam Detection with RNNs/notebook.jl +++ b/docs/src/full tutorials/Spam Detection with RNNs/notebook.jl @@ -40,17 +40,13 @@ first(df, 5) # - Return the filtered vector of words -const STOP_WORDS = ["a", "an", "and", "are", "as", "at", "be", "but", "by", "for", - "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", - "such", "that", "the", "their", "then", "there", "these", "they", - "this", "to", "was", "will", "with"] +const STOP_WORDS = Languages.stopwords(Languages.English()) function preprocess_text(text) ## (1) Splitting texts into words (so later it can be a sequence of vectors) tokens = WordTokenizers.tokenize(text) ## (2) Stop word removal - top_words = Languages.stopwords(Languages.English()) filtered_tokens = filter(token -> !(token in STOP_WORDS), tokens) return filtered_tokens diff --git a/docs/src/full tutorials/Spam Detection with RNNs/notebook.md b/docs/src/full tutorials/Spam Detection with RNNs/notebook.md index bba74a06..8bcc0482 100644 --- a/docs/src/full tutorials/Spam Detection with RNNs/notebook.md +++ b/docs/src/full tutorials/Spam Detection with RNNs/notebook.md @@ -49,17 +49,13 @@ Let's define a function that given an SMS message would: - Return the filtered vector of words ````@julia -const STOP_WORDS = ["a", "an", "and", "are", "as", "at", "be", "but", "by", "for", - "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", - "such", "that", "the", "their", "then", "there", "these", "they", - "this", "to", "was", "will", "with"] +const STOP_WORDS = Languages.stopwords(Languages.English()) function preprocess_text(text) # (1) Splitting texts into words (so later it can be a sequence of vectors) tokens = WordTokenizers.tokenize(text) # (2) Stop word removal - top_words = Languages.stopwords(Languages.English()) filtered_tokens = filter(token -> !(token in STOP_WORDS), tokens) return filtered_tokens diff --git a/docs/src/full tutorials/Spam Detection with RNNs/notebook.unexecuted.ipynb b/docs/src/full tutorials/Spam Detection with RNNs/notebook.unexecuted.ipynb index 122f4948..ff5322f2 100644 --- a/docs/src/full tutorials/Spam Detection with RNNs/notebook.unexecuted.ipynb +++ b/docs/src/full tutorials/Spam Detection with RNNs/notebook.unexecuted.ipynb @@ -120,17 +120,13 @@ "outputs": [], "cell_type": "code", "source": [ - "const STOP_WORDS = [\"a\", \"an\", \"and\", \"are\", \"as\", \"at\", \"be\", \"but\", \"by\", \"for\",\n", - " \"if\", \"in\", \"into\", \"is\", \"it\", \"no\", \"not\", \"of\", \"on\", \"or\",\n", - " \"such\", \"that\", \"the\", \"their\", \"then\", \"there\", \"these\", \"they\",\n", - " \"this\", \"to\", \"was\", \"will\", \"with\"]\n", + "const STOP_WORDS = Languages.stopwords(Languages.English())\n", "\n", "function preprocess_text(text)\n", " # (1) Splitting texts into words (so later it can be a sequence of vectors)\n", " tokens = WordTokenizers.tokenize(text)\n", "\n", " # (2) Stop word removal\n", - " top_words = Languages.stopwords(Languages.English())\n", " filtered_tokens = filter(token -> !(token in STOP_WORDS), tokens)\n", "\n", " return filtered_tokens\n",