From fb08c4f8af792e9a80ad2651d12156a382501274 Mon Sep 17 00:00:00 2001
From: sadatRJ
Date: Fri, 21 Mar 2025 00:58:18 +0600
Subject: [PATCH] Add data preprocessing to chapter11_part02_sequence-models before feeding text to the network

---
 chapter11_part01_introduction.ipynb    | 12 +++--
 chapter11_part02_sequence-models.ipynb | 63 ++++++++++++++++++++++----
 2 files changed, 63 insertions(+), 12 deletions(-)

diff --git a/chapter11_part01_introduction.ipynb b/chapter11_part01_introduction.ipynb
index 3ef20b7618..88824bae86 100644
--- a/chapter11_part01_introduction.ipynb
+++ b/chapter11_part01_introduction.ipynb
@@ -6,7 +6,11 @@
     "colab_type": "text"
    },
    "source": [
-    "This is a companion notebook for the book [Deep Learning with Python, Second Edition](https://www.manning.com/books/deep-learning-with-python-second-edition?a_aid=keras&a_bid=76564dff). For readability, it only contains runnable code blocks and section titles, and omits everything else in the book: text paragraphs, figures, and pseudocode.\n\n**If you want to be able to follow what's going on, I recommend reading the notebook side by side with your copy of the book.**\n\nThis notebook was generated for TensorFlow 2.6."
+    "This is a companion notebook for the book [Deep Learning with Python, Second Edition](https://www.manning.com/books/deep-learning-with-python-second-edition?a_aid=keras&a_bid=76564dff). For readability, it only contains runnable code blocks and section titles, and omits everything else in the book: text paragraphs, figures, and pseudocode.\n",
+    "\n",
+    "**If you want to be able to follow what's going on, I recommend reading the notebook side by side with your copy of the book.**\n",
+    "\n",
+    "This notebook was generated for TensorFlow 2.6."
    ]
   },
   {
@@ -746,9 +750,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.0"
+   "version": "3.8.8"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 0
-}
\ No newline at end of file
+ "nbformat_minor": 1
+}
diff --git a/chapter11_part02_sequence-models.ipynb b/chapter11_part02_sequence-models.ipynb
index bfcf6237a2..d2903f0e1c 100644
--- a/chapter11_part02_sequence-models.ipynb
+++ b/chapter11_part02_sequence-models.ipynb
@@ -6,7 +6,11 @@
     "colab_type": "text"
    },
    "source": [
-    "This is a companion notebook for the book [Deep Learning with Python, Second Edition](https://www.manning.com/books/deep-learning-with-python-second-edition?a_aid=keras&a_bid=76564dff). For readability, it only contains runnable code blocks and section titles, and omits everything else in the book: text paragraphs, figures, and pseudocode.\n\n**If you want to be able to follow what's going on, I recommend reading the notebook side by side with your copy of the book.**\n\nThis notebook was generated for TensorFlow 2.6."
+    "This is a companion notebook for the book [Deep Learning with Python, Second Edition](https://www.manning.com/books/deep-learning-with-python-second-edition?a_aid=keras&a_bid=76564dff). For readability, it only contains runnable code blocks and section titles, and omits everything else in the book: text paragraphs, figures, and pseudocode.\n",
+    "\n",
+    "**If you want to be able to follow what's going on, I recommend reading the notebook side by side with your copy of the book.**\n",
+    "\n",
+    "This notebook was generated for TensorFlow 2.6."
    ]
   },
   {
@@ -90,8 +94,51 @@
     ")\n",
     "test_ds = keras.utils.text_dataset_from_directory(\n",
     "    \"aclImdb/test\", batch_size=batch_size\n",
-    ")\n",
-    "text_only_train_ds = train_ds.map(lambda x, y: x)"
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Preprocessing Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def clean_text(text):\n",
+    "    # Convert to lowercase\n",
+    "    text = tf.strings.lower(text)\n",
+    "    \n",
+    "    # Remove non-ASCII characters\n",
+    "    text = tf.strings.regex_replace(text, r'[^\\x00-\\x7F]+', '')\n",
+    "    \n",
+    "    # Keep only letters, numbers, spaces, and some punctuation\n",
+    "    text = tf.strings.regex_replace(text, r\"[^a-zA-Z0-9\\s.,!?']\", \"\")\n",
+    "    \n",
+    "    return text\n",
+    "\n",
+    "def preprocess_text(text, label):\n",
+    "    \n",
+    "    text = clean_text(text) # Apply cleaning function\n",
+    "    \n",
+    "    return text, label"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train = train_ds.map(preprocess_text)\n",
+    "val = val_ds.map(preprocess_text)\n",
+    "test = test_ds.map(preprocess_text)\n",
+    "text_only_train_ds = train.map(lambda x, y: x)"
    ]
   },
   {
@@ -122,13 +169,13 @@
     ")\n",
     "text_vectorization.adapt(text_only_train_ds)\n",
     "\n",
-    "int_train_ds = train_ds.map(\n",
+    "int_train_ds = train.map(\n",
     "    lambda x, y: (text_vectorization(x), y),\n",
     "    num_parallel_calls=4)\n",
-    "int_val_ds = val_ds.map(\n",
+    "int_val_ds = val.map(\n",
     "    lambda x, y: (text_vectorization(x), y),\n",
     "    num_parallel_calls=4)\n",
-    "int_test_ds = test_ds.map(\n",
+    "int_test_ds = test.map(\n",
    "    lambda x, y: (text_vectorization(x), y),\n",
     "    num_parallel_calls=4)"
    ]
@@ -470,9 +517,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.0"
+   "version": "3.8.8"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 1
 }
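
A minimal standalone sketch of the cleaning step introduced by this patch, useful for sanity checking it outside the notebook. It is not part of the diff itself: the sample string and the printed result are illustrative assumptions, and clean_text simply repeats the two regex passes added above (it assumes TensorFlow is installed, as the notebook already requires).

import tensorflow as tf

def clean_text(text):
    # Lowercase, drop non-ASCII code points, then keep only letters, digits,
    # whitespace, and basic punctuation -- the same regexes as the new notebook cell.
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, r"[^\x00-\x7F]+", "")
    text = tf.strings.regex_replace(text, r"[^a-zA-Z0-9\s.,!?']", "")
    return text

sample = tf.constant("A GREAT movie! <br /> Café scenes, 10/10")
print(clean_text(sample).numpy())
# Expected output (approximately): b'a great movie! br  caf scenes, 1010'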