Skip to content

Commit

Permalink
Fixed duplicate BOS token in training input.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 657230030
Change-Id: I0a4ddb7928887524276f287bf97c13051ca3ffb5
  • Loading branch information
Gemma Team authored and texasmichelle committed Jul 30, 2024
1 parent 4dbac72 commit 20cb3da
Showing 1 changed file with 15 additions and 3 deletions.
18 changes: 15 additions & 3 deletions colabs/fine_tuning_tutorial.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,7 @@
" example: str | bytes,\n",
" prefix: str = '',\n",
" suffix: str = '',\n",
" add_bos: bool = True,\n",
" add_eos: bool = True) -\u003e jax.Array:\n",
" \"\"\"\n",
" Tokenization function.\n",
Expand All @@ -266,12 +267,16 @@
" example: input string to tokenize.\n",
" prefix: prefix to add to the input string.\n",
" suffix: suffix to add to the input string.\n",
" add_eos: if True, add an end of sentence token at the end of the output\n",
" add_bos: if True, add a beginning of sequence token at the start of the\n",
" tokenized sequence.\n",
" add_eos: if True, add an end of sequence token at the end of the tokenized\n",
" sequence.\n",
" Returns:\n",
" Tokens corresponding to the input string.\n",
" \"\"\"\n",
" int_list = [self._spm_processor.bos_id()]\n",
" int_list = []\n",
" if add_bos:\n",
" int_list.append(self._spm_processor.bos_id())\n",
" int_list.extend(self._spm_processor.EncodeAsIds(prefix + example + suffix))\n",
" if add_eos:\n",
" int_list.append(self._spm_processor.eos_id())\n",
Expand All @@ -282,11 +287,12 @@
" str_tensor: tf.Tensor,\n",
" prefix: str = '',\n",
" suffix: str = '',\n",
" add_bos: bool = True,\n",
" add_eos: bool = True) -\u003e tf.Tensor:\n",
"    \"\"\"Tensorflow operator for the tokenize function.\"\"\"\n",
" encoded = tf.numpy_function(\n",
" self.tokenize,\n",
" [str_tensor, prefix, suffix, add_eos],\n",
" [str_tensor, prefix, suffix, add_bos, add_eos],\n",
" tf.int32)\n",
" encoded.set_shape([None])\n",
" return encoded\n",
Expand Down Expand Up @@ -320,9 +326,11 @@
" return tokenizer.tokenize_tf_op(example,\n",
" prefix='Translate this into French:\\n',\n",
" suffix='\\n',\n",
" add_bos=True,\n",
" add_eos=False)\n",
"def tokenize_destination(tokenizer, example: tf.Tensor):\n",
" return tokenizer.tokenize_tf_op(example,\n",
" add_bos=False,\n",
" add_eos=True)\n",
"\n",
"ds = tfds.load(\"mtnt/en-fr\",split=\"train\")\n",
Expand Down Expand Up @@ -399,14 +407,18 @@
"\n",
" def _tokenize_source(self, example: tf.Tensor):\n",
" \"\"\"Tokenization function for the source.\"\"\"\n",
" # We add \u003cBOS\u003e as these tokens are the start of our sequence.\n",
" return self._tokenizer.tokenize_tf_op(example,\n",
" prefix=self.TRANSLATION_PREFIX,\n",
" suffix=self.TRANSLATION_SUFFIX,\n",
" add_bos=True,\n",
" add_eos=False)\n",
"\n",
" def _tokenize_destination(self, example: tf.Tensor):\n",
" \"\"\"Tokenization function for the French translation.\"\"\"\n",
" # We do not add \u003cBOS\u003e as these tokens get appended to the source tokens.\n",
" return self._tokenizer.tokenize_tf_op(example,\n",
" add_bos=False,\n",
" add_eos=True)\n",
"\n",
" def _pad_up_to_max_len(self,\n",
Expand Down

0 comments on commit 20cb3da

Please sign in to comment.