Skip to content

Commit

Permalink
revert to using Languages to get stop words
Browse files (browse the repository at this point in the history)
  • Loading branch information
ablaom committed Jun 11, 2024
1 parent 16d8c17 commit d2c09fc
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 38 deletions.
34 changes: 11 additions & 23 deletions docs/src/full tutorials/Spam Detection with RNNs/notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,7 @@
"metadata": {}
},
{
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ Info: Precompiling WordTokenizers [796a5d58-b03d-544a-977e-18100b691f6e]\n"
]
}
],
"outputs": [],
"cell_type": "code",
"source": [
"using MLJ\n",
Expand Down Expand Up @@ -106,7 +98,7 @@
"data": {
"text/plain": "\u001b[1m5×2 DataFrame\u001b[0m\n\u001b[1m Row \u001b[0m│\u001b[1m Category \u001b[0m\u001b[1m Message \u001b[0m\n\u001b[90m String7 \u001b[0m\u001b[90m String \u001b[0m\n─────┼─────────────────────────────────────────────\n 1 │ ham Go until jurong point, crazy.. A…\n 2 │ ham Ok lar... Joking wif u oni...\n 3 │ spam Free entry in 2 a wkly comp to w…\n 4 │ ham U dun say so early hor... U c al…\n 5 │ ham Nah I don't think he goes to usf…",
"text/html": [
"<div><div style = \"float: left;\"><span>5×2 DataFrame</span></div><div style = \"clear: both;\"></div></div><div class = \"data-frame\" style = \"overflow-x: scroll;\"><table class = \"data-frame\" style = \"margin-bottom: 6px;\"><thead><tr class = \"header\"><th class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">Row</th><th style = \"text-align: left;\">Category</th><th style = \"text-align: left;\">Message</th></tr><tr class = \"subheader headerLastRow\"><th class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\"></th><th title = \"InlineStrings.String7\" style = \"text-align: left;\">String7</th><th title = \"String\" style = \"text-align: left;\">String</th></tr></thead><tbody><tr><td class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">1</td><td style = \"text-align: left;\">ham</td><td style = \"text-align: left;\">Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...</td></tr><tr><td class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">2</td><td style = \"text-align: left;\">ham</td><td style = \"text-align: left;\">Ok lar... Joking wif u oni...</td></tr><tr><td class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">3</td><td style = \"text-align: left;\">spam</td><td style = \"text-align: left;\">Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&amp;C&apos;s apply 08452810075over18&apos;s</td></tr><tr><td class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">4</td><td style = \"text-align: left;\">ham</td><td style = \"text-align: left;\">U dun say so early hor... 
U c already then say...</td></tr><tr><td class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">5</td><td style = \"text-align: left;\">ham</td><td style = \"text-align: left;\">Nah I don&apos;t think he goes to usf, he lives around here though</td></tr></tbody></table></div>"
"<div><div style = \"float: left;\"><span>5×2 DataFrame</span></div><div style = \"clear: both;\"></div></div><div class = \"data-frame\" style = \"overflow-x: scroll;\"><table class = \"data-frame\" style = \"margin-bottom: 6px;\"><thead><tr class = \"header\"><th class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">Row</th><th style = \"text-align: left;\">Category</th><th style = \"text-align: left;\">Message</th></tr><tr class = \"subheader headerLastRow\"><th class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\"></th><th title = \"String7\" style = \"text-align: left;\">String7</th><th title = \"String\" style = \"text-align: left;\">String</th></tr></thead><tbody><tr><td class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">1</td><td style = \"text-align: left;\">ham</td><td style = \"text-align: left;\">Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...</td></tr><tr><td class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">2</td><td style = \"text-align: left;\">ham</td><td style = \"text-align: left;\">Ok lar... Joking wif u oni...</td></tr><tr><td class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">3</td><td style = \"text-align: left;\">spam</td><td style = \"text-align: left;\">Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&amp;C&apos;s apply 08452810075over18&apos;s</td></tr><tr><td class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">4</td><td style = \"text-align: left;\">ham</td><td style = \"text-align: left;\">U dun say so early hor... 
U c already then say...</td></tr><tr><td class = \"rowNumber\" style = \"font-weight: bold; text-align: right;\">5</td><td style = \"text-align: left;\">ham</td><td style = \"text-align: left;\">Nah I don&apos;t think he goes to usf, he lives around here though</td></tr></tbody></table></div>"
]
},
"metadata": {},
Expand Down Expand Up @@ -157,17 +149,13 @@
],
"cell_type": "code",
"source": [
"const STOP_WORDS = [\"a\", \"an\", \"and\", \"are\", \"as\", \"at\", \"be\", \"but\", \"by\", \"for\",\n",
" \"if\", \"in\", \"into\", \"is\", \"it\", \"no\", \"not\", \"of\", \"on\", \"or\",\n",
" \"such\", \"that\", \"the\", \"their\", \"then\", \"there\", \"these\", \"they\",\n",
" \"this\", \"to\", \"was\", \"will\", \"with\"]\n",
"const STOP_WORDS = Languages.stopwords(Languages.English())\n",
"\n",
"function preprocess_text(text)\n",
" # (1) Splitting texts into words (so later it can be a sequence of vectors)\n",
" tokens = WordTokenizers.tokenize(text)\n",
"\n",
" # (2) Stop word removal\n",
" top_words = Languages.stopwords(Languages.English())\n",
" filtered_tokens = filter(token -> !(token in STOP_WORDS), tokens)\n",
"\n",
" return filtered_tokens\n",
Expand Down Expand Up @@ -308,7 +296,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"[\"Que\", \"pases\", \"un\", \"buen\", \"tiempo\", \"something\", \"like\"] is ham\n"
"[\"Que\", \"pases\", \"un\", \"buen\", \"tiempo\"] is ham\n"
]
}
],
Expand Down Expand Up @@ -360,7 +348,7 @@
{
"output_type": "execute_result",
"data": {
"text/plain": "5-element Vector{Vector{Int64}}:\n [1, 2, 3, 4, 5, 6, 7, 10734, 10734, 10734, 10734, 10734]\n [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 15, 18]\n [40, 41, 42, 43, 44, 40, 45, 46, 47, 10734, 10734, 10734]\n [48, 28, 40, 49, 50, 51, 52, 10734, 10734, 10734, 10734, 10734]\n [48, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]"
"text/plain": "5-element Vector{Vector{Int64}}:\n [1, 2, 3, 4, 5, 10404, 10404, 10404, 10404, 10404, 10404, 10404]\n [6, 7, 8, 9, 10, 11, 12, 13, 11, 14, 15, 16]\n [36, 37, 38, 39, 36, 40, 41, 42, 10404, 10404, 10404, 10404]\n [43, 24, 36, 44, 45, 46, 10404, 10404, 10404, 10404, 10404, 10404]\n [43, 47, 48, 49, 50, 51, 52, 53, 54, 55, 44, 45]"
},
"metadata": {},
"execution_count": 11
Expand Down Expand Up @@ -524,7 +512,7 @@
{
"output_type": "execute_result",
"data": {
"text/plain": "NeuralNetworkClassifier(\n builder = GenericBuilder(\n apply = Main.var\"##265\".var\"#15#16\"()), \n finaliser = NNlib.softmax, \n optimiser = Adam(0.1, (0.9, 0.999), 1.0e-8), \n loss = Flux.Losses.crossentropy, \n epochs = 10, \n batch_size = 128, \n lambda = 0.0, \n alpha = 0.0, \n rng = Random.TaskLocalRNG(), \n optimiser_changes_trigger_retraining = false, \n acceleration = CPU1{Nothing}(nothing))"
"text/plain": "NeuralNetworkClassifier(\n builder = GenericBuilder(\n apply = Main.var\"##280\".var\"#15#16\"()), \n finaliser = NNlib.softmax, \n optimiser = Adam(0.1, (0.9, 0.999), 1.0e-8), \n loss = Flux.Losses.crossentropy, \n epochs = 10, \n batch_size = 128, \n lambda = 0.0, \n alpha = 0.0, \n rng = Random.TaskLocalRNG(), \n optimiser_changes_trigger_retraining = false, \n acceleration = CPU1{Nothing}(nothing))"
},
"metadata": {},
"execution_count": 16
Expand Down Expand Up @@ -555,7 +543,7 @@
{
"output_type": "execute_result",
"data": {
"text/plain": "untrained Machine; caches model-specific representations of data\n model: NeuralNetworkClassifier(builder = GenericBuilder(apply = #15), …)\n args: \n 1:\tSource @338 ⏎ AbstractMatrix{ScientificTypesBase.Continuous}\n 2:\tSource @119 ⏎ AbstractVector{ScientificTypesBase.Multiclass{2}}\n"
"text/plain": "untrained Machine; caches model-specific representations of data\n model: NeuralNetworkClassifier(builder = GenericBuilder(apply = #15), …)\n args: \n 1:\tSource @330 ⏎ AbstractMatrix{Continuous}\n 2:\tSource @964 ⏎ AbstractVector{Multiclass{2}}\n"
},
"metadata": {},
"execution_count": 17
Expand Down Expand Up @@ -583,13 +571,13 @@
"output_type": "stream",
"text": [
"[ Info: Training machine(NeuralNetworkClassifier(builder = GenericBuilder(apply = #15), …), …).\n",
"\rOptimising neural net: 18%[====> ] ETA: 0:00:43\u001b[K\rOptimising neural net: 27%[======> ] ETA: 0:00:30\u001b[K\rOptimising neural net: 36%[=========> ] ETA: 0:00:23\u001b[K\rOptimising neural net: 45%[===========> ] ETA: 0:00:18\u001b[K\rOptimising neural net: 55%[=============> ] ETA: 0:00:14\u001b[K\rOptimising neural net: 64%[===============> ] ETA: 0:00:10\u001b[K\rOptimising neural net: 73%[==================> ] ETA: 0:00:07\u001b[K\rOptimising neural net: 82%[====================> ] ETA: 0:00:05\u001b[K\rOptimising neural net: 91%[======================> ] ETA: 0:00:02\u001b[K\rOptimising neural net: 100%[=========================] Time: 0:00:25\u001b[K\n"
"\rOptimising neural net: 18%[====> ] ETA: 0:00:13\u001b[K\rOptimising neural net: 27%[======> ] ETA: 0:00:12\u001b[K\rOptimising neural net: 36%[=========> ] ETA: 0:00:11\u001b[K\rOptimising neural net: 45%[===========> ] ETA: 0:00:10\u001b[K\rOptimising neural net: 55%[=============> ] ETA: 0:00:08\u001b[K\rOptimising neural net: 64%[===============> ] ETA: 0:00:07\u001b[K\rOptimising neural net: 73%[==================> ] ETA: 0:00:05\u001b[K\rOptimising neural net: 82%[====================> ] ETA: 0:00:03\u001b[K\rOptimising neural net: 91%[======================> ] ETA: 0:00:02\u001b[K\rOptimising neural net: 100%[=========================] Time: 0:00:18\u001b[K\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": "trained Machine; caches model-specific representations of data\n model: NeuralNetworkClassifier(builder = GenericBuilder(apply = #15), …)\n args: \n 1:\tSource @338 ⏎ AbstractMatrix{ScientificTypesBase.Continuous}\n 2:\tSource @119 ⏎ AbstractVector{ScientificTypesBase.Multiclass{2}}\n"
"text/plain": "trained Machine; caches model-specific representations of data\n model: NeuralNetworkClassifier(builder = GenericBuilder(apply = #15), …)\n args: \n 1:\tSource @330 ⏎ AbstractMatrix{Continuous}\n 2:\tSource @964 ⏎ AbstractVector{Multiclass{2}}\n"
},
"metadata": {},
"execution_count": 18
Expand All @@ -614,7 +602,7 @@
{
"output_type": "execute_result",
"data": {
"text/plain": "0.8672080465558729"
"text/plain": "0.9468762240501374"
},
"metadata": {},
"execution_count": 19
Expand All @@ -641,7 +629,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"SMS: `Hi elaine, is today's meeting confirmed?` and the prediction is `CategoricalArrays.CategoricalValue{InlineStrings.String7, UInt32}[InlineStrings.String7(\"ham\")]`"
"SMS: `Hi elaine, is today's meeting confirmed?` and the prediction is `CategoricalArrays.CategoricalValue{String7, UInt32}[String7(\"ham\")]`"
]
}
],
Expand Down
6 changes: 1 addition & 5 deletions docs/src/full tutorials/Spam Detection with RNNs/notebook.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,17 +40,13 @@ first(df, 5)

# - Return the filtered vector of words

const STOP_WORDS = ["a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
"if", "in", "into", "is", "it", "no", "not", "of", "on", "or",
"such", "that", "the", "their", "then", "there", "these", "they",
"this", "to", "was", "will", "with"]
const STOP_WORDS = Languages.stopwords(Languages.English())

function preprocess_text(text)
## (1) Splitting texts into words (so later it can be a sequence of vectors)
tokens = WordTokenizers.tokenize(text)

## (2) Stop word removal
top_words = Languages.stopwords(Languages.English())
filtered_tokens = filter(token -> !(token in STOP_WORDS), tokens)

return filtered_tokens
Expand Down
6 changes: 1 addition & 5 deletions docs/src/full tutorials/Spam Detection with RNNs/notebook.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,17 +49,13 @@ Let's define a function that given an SMS message would:
- Return the filtered vector of words

````@julia
const STOP_WORDS = ["a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
"if", "in", "into", "is", "it", "no", "not", "of", "on", "or",
"such", "that", "the", "their", "then", "there", "these", "they",
"this", "to", "was", "will", "with"]
const STOP_WORDS = Languages.stopwords(Languages.English())
function preprocess_text(text)
# (1) Splitting texts into words (so later it can be a sequence of vectors)
tokens = WordTokenizers.tokenize(text)
# (2) Stop word removal
top_words = Languages.stopwords(Languages.English())
filtered_tokens = filter(token -> !(token in STOP_WORDS), tokens)
return filtered_tokens
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,17 +120,13 @@
"outputs": [],
"cell_type": "code",
"source": [
"const STOP_WORDS = [\"a\", \"an\", \"and\", \"are\", \"as\", \"at\", \"be\", \"but\", \"by\", \"for\",\n",
" \"if\", \"in\", \"into\", \"is\", \"it\", \"no\", \"not\", \"of\", \"on\", \"or\",\n",
" \"such\", \"that\", \"the\", \"their\", \"then\", \"there\", \"these\", \"they\",\n",
" \"this\", \"to\", \"was\", \"will\", \"with\"]\n",
"const STOP_WORDS = Languages.stopwords(Languages.English())\n",
"\n",
"function preprocess_text(text)\n",
" # (1) Splitting texts into words (so later it can be a sequence of vectors)\n",
" tokens = WordTokenizers.tokenize(text)\n",
"\n",
" # (2) Stop word removal\n",
" top_words = Languages.stopwords(Languages.English())\n",
" filtered_tokens = filter(token -> !(token in STOP_WORDS), tokens)\n",
"\n",
" return filtered_tokens\n",
Expand Down

0 comments on commit d2c09fc

Please sign in to comment.