diff --git a/docs/src/full tutorials/Spam Detection with RNNs/notebook.ipynb b/docs/src/full tutorials/Spam Detection with RNNs/notebook.ipynb
index 2d3ed3ba..80de90d4 100644
--- a/docs/src/full tutorials/Spam Detection with RNNs/notebook.ipynb
+++ b/docs/src/full tutorials/Spam Detection with RNNs/notebook.ipynb
@@ -43,15 +43,7 @@
"metadata": {}
},
{
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[ Info: Precompiling WordTokenizers [796a5d58-b03d-544a-977e-18100b691f6e]\n"
- ]
- }
- ],
+ "outputs": [],
"cell_type": "code",
"source": [
"using MLJ\n",
@@ -106,7 +98,7 @@
"data": {
"text/plain": "\u001b[1m5×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m Category \u001b[0m\u001b[1m Message \u001b[0m\n │\u001b[90m String7 \u001b[0m\u001b[90m String \u001b[0m\n─────┼─────────────────────────────────────────────\n 1 │ ham Go until jurong point, crazy.. A…\n 2 │ ham Ok lar... Joking wif u oni...\n 3 │ spam Free entry in 2 a wkly comp to w…\n 4 │ ham U dun say so early hor... U c al…\n 5 │ ham Nah I don't think he goes to usf…",
"text/html": [
-          " … HTML rendering of the 5-row DataFrame preview (Category, Message), same content as the text/plain output above … ",
+          " … HTML rendering of the 5-row DataFrame preview (Category, Message), same content as the text/plain output above … "
]
},
"metadata": {},
@@ -157,17 +149,13 @@
],
"cell_type": "code",
"source": [
- "const STOP_WORDS = [\"a\", \"an\", \"and\", \"are\", \"as\", \"at\", \"be\", \"but\", \"by\", \"for\",\n",
- " \"if\", \"in\", \"into\", \"is\", \"it\", \"no\", \"not\", \"of\", \"on\", \"or\",\n",
- " \"such\", \"that\", \"the\", \"their\", \"then\", \"there\", \"these\", \"they\",\n",
- " \"this\", \"to\", \"was\", \"will\", \"with\"]\n",
+ "const STOP_WORDS = Languages.stopwords(Languages.English())\n",
"\n",
"function preprocess_text(text)\n",
" # (1) Splitting texts into words (so later it can be a sequence of vectors)\n",
" tokens = WordTokenizers.tokenize(text)\n",
"\n",
" # (2) Stop word removal\n",
- " top_words = Languages.stopwords(Languages.English())\n",
" filtered_tokens = filter(token -> !(token in STOP_WORDS), tokens)\n",
"\n",
" return filtered_tokens\n",
@@ -308,7 +296,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "[\"Que\", \"pases\", \"un\", \"buen\", \"tiempo\", \"something\", \"like\"] is ham\n"
+ "[\"Que\", \"pases\", \"un\", \"buen\", \"tiempo\"] is ham\n"
]
}
],
@@ -360,7 +348,7 @@
{
"output_type": "execute_result",
"data": {
- "text/plain": "5-element Vector{Vector{Int64}}:\n [1, 2, 3, 4, 5, 6, 7, 10734, 10734, 10734, 10734, 10734]\n [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 15, 18]\n [40, 41, 42, 43, 44, 40, 45, 46, 47, 10734, 10734, 10734]\n [48, 28, 40, 49, 50, 51, 52, 10734, 10734, 10734, 10734, 10734]\n [48, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]"
+ "text/plain": "5-element Vector{Vector{Int64}}:\n [1, 2, 3, 4, 5, 10404, 10404, 10404, 10404, 10404, 10404, 10404]\n [6, 7, 8, 9, 10, 11, 12, 13, 11, 14, 15, 16]\n [36, 37, 38, 39, 36, 40, 41, 42, 10404, 10404, 10404, 10404]\n [43, 24, 36, 44, 45, 46, 10404, 10404, 10404, 10404, 10404, 10404]\n [43, 47, 48, 49, 50, 51, 52, 53, 54, 55, 44, 45]"
},
"metadata": {},
"execution_count": 11
@@ -524,7 +512,7 @@
{
"output_type": "execute_result",
"data": {
- "text/plain": "NeuralNetworkClassifier(\n builder = GenericBuilder(\n apply = Main.var\"##265\".var\"#15#16\"()), \n finaliser = NNlib.softmax, \n optimiser = Adam(0.1, (0.9, 0.999), 1.0e-8), \n loss = Flux.Losses.crossentropy, \n epochs = 10, \n batch_size = 128, \n lambda = 0.0, \n alpha = 0.0, \n rng = Random.TaskLocalRNG(), \n optimiser_changes_trigger_retraining = false, \n acceleration = CPU1{Nothing}(nothing))"
+ "text/plain": "NeuralNetworkClassifier(\n builder = GenericBuilder(\n apply = Main.var\"##280\".var\"#15#16\"()), \n finaliser = NNlib.softmax, \n optimiser = Adam(0.1, (0.9, 0.999), 1.0e-8), \n loss = Flux.Losses.crossentropy, \n epochs = 10, \n batch_size = 128, \n lambda = 0.0, \n alpha = 0.0, \n rng = Random.TaskLocalRNG(), \n optimiser_changes_trigger_retraining = false, \n acceleration = CPU1{Nothing}(nothing))"
},
"metadata": {},
"execution_count": 16
@@ -555,7 +543,7 @@
{
"output_type": "execute_result",
"data": {
- "text/plain": "untrained Machine; caches model-specific representations of data\n model: NeuralNetworkClassifier(builder = GenericBuilder(apply = #15), …)\n args: \n 1:\tSource @338 ⏎ AbstractMatrix{ScientificTypesBase.Continuous}\n 2:\tSource @119 ⏎ AbstractVector{ScientificTypesBase.Multiclass{2}}\n"
+ "text/plain": "untrained Machine; caches model-specific representations of data\n model: NeuralNetworkClassifier(builder = GenericBuilder(apply = #15), …)\n args: \n 1:\tSource @330 ⏎ AbstractMatrix{Continuous}\n 2:\tSource @964 ⏎ AbstractVector{Multiclass{2}}\n"
},
"metadata": {},
"execution_count": 17
@@ -583,13 +571,13 @@
"output_type": "stream",
"text": [
"[ Info: Training machine(NeuralNetworkClassifier(builder = GenericBuilder(apply = #15), …), …).\n",
- "\rOptimising neural net: 18%[====> ] ETA: 0:00:43\u001b[K\rOptimising neural net: 27%[======> ] ETA: 0:00:30\u001b[K\rOptimising neural net: 36%[=========> ] ETA: 0:00:23\u001b[K\rOptimising neural net: 45%[===========> ] ETA: 0:00:18\u001b[K\rOptimising neural net: 55%[=============> ] ETA: 0:00:14\u001b[K\rOptimising neural net: 64%[===============> ] ETA: 0:00:10\u001b[K\rOptimising neural net: 73%[==================> ] ETA: 0:00:07\u001b[K\rOptimising neural net: 82%[====================> ] ETA: 0:00:05\u001b[K\rOptimising neural net: 91%[======================> ] ETA: 0:00:02\u001b[K\rOptimising neural net: 100%[=========================] Time: 0:00:25\u001b[K\n"
+ "\rOptimising neural net: 18%[====> ] ETA: 0:00:13\u001b[K\rOptimising neural net: 27%[======> ] ETA: 0:00:12\u001b[K\rOptimising neural net: 36%[=========> ] ETA: 0:00:11\u001b[K\rOptimising neural net: 45%[===========> ] ETA: 0:00:10\u001b[K\rOptimising neural net: 55%[=============> ] ETA: 0:00:08\u001b[K\rOptimising neural net: 64%[===============> ] ETA: 0:00:07\u001b[K\rOptimising neural net: 73%[==================> ] ETA: 0:00:05\u001b[K\rOptimising neural net: 82%[====================> ] ETA: 0:00:03\u001b[K\rOptimising neural net: 91%[======================> ] ETA: 0:00:02\u001b[K\rOptimising neural net: 100%[=========================] Time: 0:00:18\u001b[K\n"
]
},
{
"output_type": "execute_result",
"data": {
- "text/plain": "trained Machine; caches model-specific representations of data\n model: NeuralNetworkClassifier(builder = GenericBuilder(apply = #15), …)\n args: \n 1:\tSource @338 ⏎ AbstractMatrix{ScientificTypesBase.Continuous}\n 2:\tSource @119 ⏎ AbstractVector{ScientificTypesBase.Multiclass{2}}\n"
+ "text/plain": "trained Machine; caches model-specific representations of data\n model: NeuralNetworkClassifier(builder = GenericBuilder(apply = #15), …)\n args: \n 1:\tSource @330 ⏎ AbstractMatrix{Continuous}\n 2:\tSource @964 ⏎ AbstractVector{Multiclass{2}}\n"
},
"metadata": {},
"execution_count": 18
@@ -614,7 +602,7 @@
{
"output_type": "execute_result",
"data": {
- "text/plain": "0.8672080465558729"
+ "text/plain": "0.9468762240501374"
},
"metadata": {},
"execution_count": 19
@@ -641,7 +629,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "SMS: `Hi elaine, is today's meeting confirmed?` and the prediction is `CategoricalArrays.CategoricalValue{InlineStrings.String7, UInt32}[InlineStrings.String7(\"ham\")]`"
+ "SMS: `Hi elaine, is today's meeting confirmed?` and the prediction is `CategoricalArrays.CategoricalValue{String7, UInt32}[String7(\"ham\")]`"
]
}
],
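
Note on the remaining output changes in this notebook: with the larger curated stop-word list, more tokens are filtered out, so the vocabulary shrinks and the integer encodings shift, including the reserved padding index (10734 → 10404); the reported accuracy and the machine/source ids update accordingly. As a rough, hypothetical illustration only (the tutorial's own padding code is not part of these hunks), padded sequences like the ones shown above can be produced by:

```julia
# Hypothetical helper, for illustration only: right-pad an integer-encoded
# token sequence to a fixed length with a reserved padding index.
function pad_sequence(seq::Vector{Int}, maxlen::Int, pad_index::Int)
    length(seq) >= maxlen && return seq[1:maxlen]   # truncate overly long sequences
    return vcat(seq, fill(pad_index, maxlen - length(seq)))
end

pad_sequence([1, 2, 3, 4, 5], 12, 10404)
# => [1, 2, 3, 4, 5, 10404, 10404, 10404, 10404, 10404, 10404, 10404]
```
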
diff --git a/docs/src/full tutorials/Spam Detection with RNNs/notebook.jl b/docs/src/full tutorials/Spam Detection with RNNs/notebook.jl
index 3cee22e7..14f5f49f 100644
--- a/docs/src/full tutorials/Spam Detection with RNNs/notebook.jl
+++ b/docs/src/full tutorials/Spam Detection with RNNs/notebook.jl
@@ -40,17 +40,13 @@ first(df, 5)
# - Return the filtered vector of words
-const STOP_WORDS = ["a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
- "if", "in", "into", "is", "it", "no", "not", "of", "on", "or",
- "such", "that", "the", "their", "then", "there", "these", "they",
- "this", "to", "was", "will", "with"]
+const STOP_WORDS = Languages.stopwords(Languages.English())
function preprocess_text(text)
## (1) Splitting texts into words (so later it can be a sequence of vectors)
tokens = WordTokenizers.tokenize(text)
## (2) Stop word removal
- top_words = Languages.stopwords(Languages.English())
filtered_tokens = filter(token -> !(token in STOP_WORDS), tokens)
return filtered_tokens
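
For reference, a minimal sketch of the preprocessing path after this change (same calls as in the diff above; the exact filtered output depends on the stop-word list shipped with the installed version of Languages.jl):

```julia
using WordTokenizers, Languages

# curated English stop-word list from Languages.jl (replaces the hand-written list)
const STOP_WORDS = Languages.stopwords(Languages.English())

function preprocess_text(text)
    # (1) split the message into word tokens
    tokens = WordTokenizers.tokenize(text)
    # (2) drop tokens that appear in the stop-word list
    return filter(token -> !(token in STOP_WORDS), tokens)
end

preprocess_text("Free entry in 2 a wkly comp to win FA Cup final tkts")
# expected to drop short function words such as "in", "a", "to"
```
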
diff --git a/docs/src/full tutorials/Spam Detection with RNNs/notebook.md b/docs/src/full tutorials/Spam Detection with RNNs/notebook.md
index bba74a06..8bcc0482 100644
--- a/docs/src/full tutorials/Spam Detection with RNNs/notebook.md
+++ b/docs/src/full tutorials/Spam Detection with RNNs/notebook.md
@@ -49,17 +49,13 @@ Let's define a function that given an SMS message would:
- Return the filtered vector of words
````@julia
-const STOP_WORDS = ["a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
- "if", "in", "into", "is", "it", "no", "not", "of", "on", "or",
- "such", "that", "the", "their", "then", "there", "these", "they",
- "this", "to", "was", "will", "with"]
+const STOP_WORDS = Languages.stopwords(Languages.English())
function preprocess_text(text)
# (1) Splitting texts into words (so later it can be a sequence of vectors)
tokens = WordTokenizers.tokenize(text)
# (2) Stop word removal
- top_words = Languages.stopwords(Languages.English())
filtered_tokens = filter(token -> !(token in STOP_WORDS), tokens)
return filtered_tokens
diff --git a/docs/src/full tutorials/Spam Detection with RNNs/notebook.unexecuted.ipynb b/docs/src/full tutorials/Spam Detection with RNNs/notebook.unexecuted.ipynb
index 122f4948..ff5322f2 100644
--- a/docs/src/full tutorials/Spam Detection with RNNs/notebook.unexecuted.ipynb
+++ b/docs/src/full tutorials/Spam Detection with RNNs/notebook.unexecuted.ipynb
@@ -120,17 +120,13 @@
"outputs": [],
"cell_type": "code",
"source": [
- "const STOP_WORDS = [\"a\", \"an\", \"and\", \"are\", \"as\", \"at\", \"be\", \"but\", \"by\", \"for\",\n",
- " \"if\", \"in\", \"into\", \"is\", \"it\", \"no\", \"not\", \"of\", \"on\", \"or\",\n",
- " \"such\", \"that\", \"the\", \"their\", \"then\", \"there\", \"these\", \"they\",\n",
- " \"this\", \"to\", \"was\", \"will\", \"with\"]\n",
+ "const STOP_WORDS = Languages.stopwords(Languages.English())\n",
"\n",
"function preprocess_text(text)\n",
" # (1) Splitting texts into words (so later it can be a sequence of vectors)\n",
" tokens = WordTokenizers.tokenize(text)\n",
"\n",
" # (2) Stop word removal\n",
- " top_words = Languages.stopwords(Languages.English())\n",
" filtered_tokens = filter(token -> !(token in STOP_WORDS), tokens)\n",
"\n",
" return filtered_tokens\n",