All modules for which code is available
- mammoth.decoders.decoder
- mammoth.decoders.transformer_decoder
- mammoth.encoders.encoder
- mammoth.encoders.mean_encoder
- mammoth.encoders.transformer_encoder
- mammoth.models.model
- mammoth.modules.average_attn
- mammoth.modules.conv_multi_step_attention
- mammoth.modules.copy_generator
Source code for mammoth.decoders.decoder
import torch.nn as nn


class DecoderBase(nn.Module):
    """Abstract class for decoders.

    Args:
        attentional (bool): The decoder returns non-empty attention.
    """

    def __init__(self, attentional=True):
        super(DecoderBase, self).__init__()
        self.attentional = attentional

    @classmethod
    def from_opt(cls, opt, embeddings):
        """Alternate constructor.

        Subclasses should override this method.
        """
        raise NotImplementedError

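A minimal sketch of a concrete subclass, assuming a hypothetical ToyDecoder that is not part of mammoth, showing how `from_opt` can build the decoder from the option namespace:

class ToyDecoder(DecoderBase):
    """Illustrative only: shows how a subclass might override from_opt."""

    def __init__(self, d_model, embeddings, attentional=True):
        super(ToyDecoder, self).__init__(attentional=attentional)
        self.embeddings = embeddings
        self.proj = nn.Linear(d_model, d_model)

    @classmethod
    def from_opt(cls, opt, embeddings):
        # Treating opt.rnn_size as the model dimension is an assumption of
        # this sketch; adapt it to the options your subclass actually needs.
        return cls(opt.rnn_size, embeddings)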
Source code for mammoth.decoders.transformer_decoder
-"""
-Implementation of "Attention is All You Need" and of
-subsequent transformer based architectures
-"""
-
-import torch
-import torch.nn as nn
-
-from mammoth.decoders.decoder import DecoderBase
-from mammoth.modules import MultiHeadedAttention, AverageAttention
-from mammoth.modules.position_ffn import PositionwiseFeedForward
-from mammoth.modules.position_ffn import ActivationFunction
-from mammoth.utils.misc import sequence_mask
-
-
class TransformerDecoderLayerBase(nn.Module):
    def __init__(
        self,
        d_model,
        heads,
        d_ff,
        dropout,
        attention_dropout,
        self_attn_type="scaled-dot",
        max_relative_positions=0,
        aan_useffn=False,
        full_context_alignment=False,
        alignment_heads=0,
        pos_ffn_activation_fn=ActivationFunction.relu,
    ):
        """
        Args:
            d_model (int): the dimension of keys/values/queries in
                :class:`MultiHeadedAttention`, also the input size of
                the first layer of the :class:`PositionwiseFeedForward`.
            heads (int): the number of heads for MultiHeadedAttention.
            d_ff (int): the hidden size of the second layer of the
                :class:`PositionwiseFeedForward`.
            dropout (float): dropout in residual, self-attn(dot) and
                feed-forward.
            attention_dropout (float): dropout in context_attn (and
                self-attn(avg)).
            self_attn_type (str): type of self-attention, "scaled-dot" or
                "average".
            max_relative_positions (int):
                max distance between inputs in relative positions
                representations.
            aan_useffn (bool): turn on the FFN layer in the AAN decoder.
            full_context_alignment (bool):
                whether to enable an extra full-context decoder forward for
                alignment.
            alignment_heads (int):
                number of cross-attention heads to use for alignment guiding.
            pos_ffn_activation_fn (ActivationFunction):
                activation function choice for the PositionwiseFeedForward
                layer.
        """
        super(TransformerDecoderLayerBase, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=attention_dropout,
                max_relative_positions=max_relative_positions,
            )
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model, dropout=attention_dropout, aan_useffn=aan_useffn)

        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout, pos_ffn_activation_fn)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        self.full_context_alignment = full_context_alignment
        self.alignment_heads = alignment_heads

    def forward(self, *args, **kwargs):
        """Extend `_forward` for (possibly) multiple decoder passes:
        always a default (future-masked) decoder forward pass, and
        possibly a second future-aware decoder pass for jointly learning
        full-context alignment, :cite:`garg2019jointly`.

        Args:
            * All arguments of _forward.
            with_align (bool): whether to return alignment attention.

        Returns:
            (FloatTensor, FloatTensor, FloatTensor or None):

            * output ``(batch_size, T, model_dim)``
            * top_attn ``(batch_size, T, src_len)``
            * attn_align ``(batch_size, T, src_len)`` or None
        """
        with_align = kwargs.pop("with_align", False)
        output, attns = self._forward(*args, **kwargs)
        top_attn = attns[:, 0, :, :].contiguous()
        attn_align = None
        if with_align:
            if self.full_context_alignment:
                # return _, (B, Q_len, K_len)
                _, attns = self._forward(*args, **kwargs, future=True)

            if self.alignment_heads > 0:
                attns = attns[:, : self.alignment_heads, :, :].contiguous()
            # layer average attention across heads, get ``(B, Q, K)``
            # Case 1: no full_context, no align heads -> layer avg baseline
            # Case 2: no full_context, 1 align heads -> guided align
            # Case 3: full_context, 1 align heads -> full context guided align
            attn_align = attns.mean(dim=1)
        return output, top_attn, attn_align

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.drop.p = dropout

    def _forward(self, *args, **kwargs):
        raise NotImplementedError

    def _compute_dec_mask(self, tgt_pad_mask, future):
        tgt_len = tgt_pad_mask.size(-1)
        if not future:  # apply future_mask, result mask in (B, T, T)
            future_mask = torch.ones(
                [tgt_len, tgt_len],
                device=tgt_pad_mask.device,
                dtype=torch.uint8,
            )
            future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
            # BoolTensor was introduced in pytorch 1.2
            try:
                future_mask = future_mask.bool()
            except AttributeError:
                pass
            dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)
        else:  # only mask padding, result mask in (B, 1, T)
            dec_mask = tgt_pad_mask
        return dec_mask

    def _forward_self_attn(self, inputs_norm, dec_mask, layer_cache, step):
        if isinstance(self.self_attn, MultiHeadedAttention):
            return self.self_attn(
                inputs_norm,
                inputs_norm,
                inputs_norm,
                mask=dec_mask,
                layer_cache=layer_cache,
                attn_type="self",
            )
        elif isinstance(self.self_attn, AverageAttention):
            return self.self_attn(inputs_norm, mask=dec_mask, layer_cache=layer_cache, step=step)
        else:
            raise ValueError(f"self attention {type(self.self_attn)} not supported")


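# --- Illustrative sketch (not part of the original module): how
# _compute_dec_mask combines the target padding mask with a causal
# (future) mask. Toy shapes are assumed; a position is masked (True)
# when it is padding or lies in the future.
_example_tgt_pad_mask = torch.tensor([[[False, False, True]]])  # (B=1, 1, T=3)
_example_future_mask = torch.ones(3, 3, dtype=torch.uint8).triu_(1).bool().view(1, 3, 3)
_example_dec_mask = torch.gt(_example_tgt_pad_mask + _example_future_mask, 0)  # (1, 3, 3)
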
class TransformerDecoderLayer(TransformerDecoderLayerBase):
    """Transformer decoder layer block in Pre-Norm style.

    Pre-Norm style is an improvement over the original paper's Post-Norm
    style, providing better convergence speed and performance. This is also
    the actual implementation in tensor2tensor and is also available in
    fairseq. See https://tunz.kr/post/4 and :cite:`DeeperTransformer`.

    .. mermaid::

        graph LR
            %% "*SubLayer" can be self-attn, src-attn or feed forward block
            A(input) --> B[Norm]
            B --> C["*SubLayer"]
            C --> D[Drop]
            D --> E((+))
            A --> E
            E --> F(out)

    """

    def __init__(
        self,
        d_model,
        heads,
        d_ff,
        dropout,
        attention_dropout,
        self_attn_type="scaled-dot",
        max_relative_positions=0,
        aan_useffn=False,
        full_context_alignment=False,
        alignment_heads=0,
        pos_ffn_activation_fn=ActivationFunction.relu,
    ):
        """
        Args:
            See TransformerDecoderLayerBase
        """
        super(TransformerDecoderLayer, self).__init__(
            d_model,
            heads,
            d_ff,
            dropout,
            attention_dropout,
            self_attn_type,
            max_relative_positions,
            aan_useffn,
            full_context_alignment,
            alignment_heads,
            pos_ffn_activation_fn=pos_ffn_activation_fn,
        )
        self.context_attn = MultiHeadedAttention(heads, d_model, dropout=attention_dropout)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)

    def update_dropout(self, dropout, attention_dropout):
        super(TransformerDecoderLayer, self).update_dropout(dropout, attention_dropout)
        self.context_attn.update_dropout(attention_dropout)

    def _forward(
        self,
        inputs,
        memory_bank,
        src_pad_mask,
        tgt_pad_mask,
        layer_cache=None,
        step=None,
        future=False,
    ):
        """A naive forward pass for transformer decoder.

        T may be 1 (stepwise decoding) or tgt_len.

        Args:
            inputs (FloatTensor): ``(batch_size, T, model_dim)``
            memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
            src_pad_mask (bool): ``(batch_size, 1, src_len)``
            tgt_pad_mask (bool): ``(batch_size, 1, T)``
            layer_cache (dict or None): cached layer info when stepwise decode
            step (int or None): stepwise decoding counter
            future (bool): If set True, do not apply future_mask.

        Returns:
            (FloatTensor, FloatTensor):

            * output ``(batch_size, T, model_dim)``
            * attns ``(batch_size, head, T, src_len)``
        """
        dec_mask = None

        if inputs.size(1) > 1:
            # masking is necessary when sequence length is greater than one
            dec_mask = self._compute_dec_mask(tgt_pad_mask, future)

        inputs_norm = self.layer_norm_1(inputs)

        query, _ = self._forward_self_attn(inputs_norm, dec_mask, layer_cache, step)

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        mid, attns = self.context_attn(
            memory_bank,
            memory_bank,
            query_norm,
            mask=src_pad_mask,
            layer_cache=layer_cache,
            attn_type="context",
        )
        output = self.feed_forward(self.drop(mid) + query)

        return output, attns


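# --- Illustrative sketch (not part of the original module): the Pre-Norm
# residual pattern that _forward applies around self-attention, context
# attention and the feed-forward block. The helper below is hypothetical.
def _pre_norm_residual_example(x, norm, sublayer, drop):
    # out = x + Drop(SubLayer(Norm(x))), cf. the mermaid diagram above
    return drop(sublayer(norm(x))) + x
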
class TransformerDecoderBase(DecoderBase):
    def __init__(self, d_model, copy_attn, embeddings, alignment_layer, layer_norm_module):
        super(TransformerDecoderBase, self).__init__()

        self.embeddings = embeddings

        # Decoder State
        self.state = {}

        # previously, there was a GlobalAttention module here for copy
        # attention. But it was never actually used -- the "copy" attention
        # just reuses the context attention.
        self._copy = copy_attn
        self.layer_norm = layer_norm_module

        self.alignment_layer = alignment_layer

    @classmethod
    def from_opt(cls, opt, embeddings, is_on_top=False):
        """Alternate constructor."""
        return cls(
            opt.dec_layers,
            opt.rnn_size,
            opt.heads,
            opt.transformer_ff,
            opt.copy_attn,
            opt.self_attn_type,
            opt.dropout[0] if type(opt.dropout) is list else opt.dropout,
            opt.attention_dropout[0] if type(opt.attention_dropout) is list else opt.attention_dropout,
            embeddings,
            opt.max_relative_positions,
            opt.aan_useffn,
            opt.full_context_alignment,
            opt.alignment_layer,
            alignment_heads=opt.alignment_heads,
            pos_ffn_activation_fn=opt.pos_ffn_activation_fn,
            layer_norm_module=(
                nn.LayerNorm(opt.rnn_size, eps=1e-6) if is_on_top
                else nn.Identity()
            ),
        )

    def init_state(self, src, memory_bank, enc_hidden):
        """Initialize decoder state."""
        self.state["src"] = src
        self.state["cache"] = None

    def map_state(self, fn):
        def _recursive_map(struct, batch_dim=0):
            for k, v in struct.items():
                if v is not None:
                    if isinstance(v, dict):
                        _recursive_map(v)
                    else:
                        struct[k] = fn(v, batch_dim)

        if self.state["src"] is not None:
            self.state["src"] = fn(self.state["src"], 1)
        if self.state["cache"] is not None:
            _recursive_map(self.state["cache"])

    def detach_state(self):
        raise NotImplementedError

    def forward(self, *args, **kwargs):
        raise NotImplementedError

    def update_dropout(self, dropout, attention_dropout):
        if self.embeddings:
            self.embeddings.update_dropout(dropout)
        for layer in self.transformer_layers:
            layer.update_dropout(dropout, attention_dropout)


class TransformerDecoder(TransformerDecoderBase):
    """The Transformer decoder from "Attention is All You Need".
    :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`

    .. mermaid::

        graph BT
            A[input]
            B[multi-head self-attn]
            BB[multi-head src-attn]
            C[feed forward]
            O[output]
            A --> B
            B --> BB
            BB --> C
            C --> O

    Args:
        num_layers (int): number of decoder layers.
        d_model (int): size of the model
        heads (int): number of heads
        d_ff (int): size of the inner FF layer
        copy_attn (bool): whether a separate copy attention is used
        self_attn_type (str): type of self-attention, "scaled-dot" or "average"
        dropout (float): dropout in residual, self-attn(dot) and feed-forward
        attention_dropout (float): dropout in context_attn (and self-attn(avg))
        embeddings (mammoth.modules.Embeddings):
            embeddings to use, should have positional encodings
        max_relative_positions (int):
            max distance between inputs in relative positions representations
        aan_useffn (bool): turn on the FFN layer in the AAN decoder
        full_context_alignment (bool):
            whether to enable an extra full-context decoder forward for alignment
        alignment_layer (int): index of the layer to supervise for alignment guiding
        alignment_heads (int):
            number of cross-attention heads to use for alignment guiding
    """

    def __init__(
        self,
        num_layers,
        d_model,
        heads,
        d_ff,
        copy_attn,
        self_attn_type,
        dropout,
        attention_dropout,
        embeddings,
        max_relative_positions,
        aan_useffn,
        full_context_alignment,
        alignment_layer,
        alignment_heads,
        pos_ffn_activation_fn=ActivationFunction.relu,
        layer_norm_module=None,
    ):
        super(TransformerDecoder, self).__init__(d_model, copy_attn, embeddings, alignment_layer, layer_norm_module)

        self.transformer_layers = nn.ModuleList(
            [
                TransformerDecoderLayer(
                    d_model,
                    heads,
                    d_ff,
                    dropout,
                    attention_dropout,
                    self_attn_type=self_attn_type,
                    max_relative_positions=max_relative_positions,
                    aan_useffn=aan_useffn,
                    full_context_alignment=full_context_alignment,
                    alignment_heads=alignment_heads,
                    pos_ffn_activation_fn=pos_ffn_activation_fn,
                )
                for i in range(num_layers)
            ]
        )

    def detach_state(self):
        self.state["src"] = self.state["src"].detach()

    def _get_layers(self):
        """Allow subclasses to modify the layer stack on the fly."""
        return self.transformer_layers

    def forward(
        self,
        tgt,
        memory_bank=None,
        step=None,
        memory_lengths=None,
        tgt_pad_mask=None,
        skip_embedding=False,
        **kwargs
    ):
        """Decode, possibly stepwise."""
        if memory_bank is None:
            memory_bank = self.embeddings(tgt)
        src_memory_bank = memory_bank.transpose(0, 1).contiguous()
        if step == 0:
            self._init_cache(memory_bank)

        if skip_embedding:
            # tgt and memory_bank are already in batch-first order
            output = tgt
            src_memory_bank = memory_bank
        else:
            tgt_words = tgt[:, :, 0].transpose(0, 1)

            pad_idx = self.embeddings.word_padding_idx
            tgt_pad_mask = tgt_words.data.eq(pad_idx).unsqueeze(1)  # [B, 1, T_tgt]

            emb = self.embeddings(tgt, step=step)
            assert emb.dim() == 3  # len x batch x embedding_dim

            output = emb.transpose(0, 1).contiguous()

        src_pad_mask = None
        if memory_lengths is not None:
            # either the attention bridge contains no fixed-length component,
            # or lengths were provided for a DecodeStrategy in translation
            src_max_len = memory_bank.size(1)
            src_pad_mask = ~sequence_mask(memory_lengths, src_max_len).unsqueeze(1)

        with_align = kwargs.pop("with_align", False)
        attn_aligns = []

        for i, layer in enumerate(self._get_layers()):
            layer_cache = (
                self.state["cache"]["layer_{}".format(i)]
                if step is not None
                else None
            )
            output, attn, attn_align = layer(
                output,
                src_memory_bank,
                src_pad_mask,
                tgt_pad_mask,
                layer_cache=layer_cache,
                step=step,
                with_align=with_align,
            )
            if attn_align is not None:
                attn_aligns.append(attn_align)

        output = self.layer_norm(output)
        # caller should call transpose and contiguous if they need it
        dec_outs = output

        attns = {"std": attn}
        if self._copy:
            attns["copy"] = attn
        if with_align:
            attns["align"] = attn_aligns[self.alignment_layer]  # (B, Q, K)
            # attns["align"] = torch.stack(attn_aligns, 0).mean(0)  # All avg

        # TODO change the way attns is returned dict => list or tuple (onnx)
        return dec_outs, attns

    def _init_cache(self, memory_bank):
        self.state["cache"] = {}
        # memory_bank is now batch-first
        batch_size = memory_bank.size(0)
        depth = memory_bank.size(-1)

        for i, layer in enumerate(self._get_layers()):
            try:
                if layer._does_not_need_cache:
                    self.state["cache"]["layer_{}".format(i)] = None
                    continue
            except AttributeError:
                # needs the cache
                pass
            layer_cache = {"memory_keys": None, "memory_values": None}
            if isinstance(layer.self_attn, AverageAttention):
                layer_cache["prev_g"] = torch.zeros((batch_size, 1, depth), device=memory_bank.device)
            else:
                layer_cache["self_keys"] = None
                layer_cache["self_values"] = None
            self.state["cache"]["layer_{}".format(i)] = layer_cache

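A rough usage sketch of stepwise decoding with the layer cache; `decoder`, `memory_bank`, `lengths`, `tgt` and `max_steps` are assumed to be set up elsewhere and are not defined in this module (see the forward() docstrings above for the expected layouts):

decoder.init_state(src=None, memory_bank=memory_bank, enc_hidden=None)
for step in range(max_steps):
    # tgt holds only the most recently generated token, shape (1, batch, 1);
    # at step 0 the decoder initializes its per-layer cache internally.
    dec_out, attns = decoder(tgt, memory_bank=memory_bank, step=step, memory_lengths=lengths)
    # ... pick the next token from dec_out and feed it back as the next tgt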
Source code for mammoth.encoders.encoder
-"""Base class for encoders and generic multi encoders."""
-
-import torch.nn as nn
-
-from mammoth.utils.misc import aeq
-
-
-[docs]class EncoderBase(nn.Module):
- """
- Base encoder class. Specifies the interface used by different encoder types
- and required by :class:`mammoth.Models.NMTModel`.
-
- .. mermaid::
-
- graph BT
- A[Input]
- subgraph RNN
- C[Pos 1]
- D[Pos 2]
- E[Pos N]
- end
- F[Memory_Bank]
- G[Final]
- A-->C
- A-->D
- A-->E
- C-->F
- D-->F
- E-->F
- E-->G
- """
-
- @classmethod
- def from_opt(cls, opt, embeddings=None):
- raise NotImplementedError
-
- def _check_args(self, src, lengths=None, hidden=None):
- n_batch = src.size(1)
- if lengths is not None:
- (n_batch_,) = lengths.size()
- aeq(n_batch, n_batch_)
-
-[docs] def forward(self, src, lengths=None):
- """
- Args:
- src (LongTensor):
- padded sequences of sparse indices ``(src_len, batch, nfeat)``
- lengths (LongTensor): length of each sequence ``(batch,)``
-
-
- Returns:
- (FloatTensor, FloatTensor, FloatTensor):
-
- * final encoder state, used to initialize decoder
- * memory bank for attention, ``(src_len, batch, hidden)``
- * lengths
- """
-
- raise NotImplementedError
-
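For intuition, a small standalone sketch (toy tensors, not from the codebase) of the shape convention that `_check_args` enforces:

import torch

src = torch.zeros(7, 4, 1, dtype=torch.long)  # (src_len=7, batch=4, nfeat=1)
lengths = torch.tensor([7, 5, 3, 2])          # one length per batch element
# _check_args verifies that src.size(1) == lengths.size(0) via aeq()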
Source code for mammoth.encoders.mean_encoder
-"""Define a minimal encoder."""
-from mammoth.encoders.encoder import EncoderBase
-from mammoth.utils.misc import sequence_mask
-import torch
-
-
-[docs]class MeanEncoder(EncoderBase):
- """A trivial non-recurrent encoder. Simply applies mean pooling.
-
- Args:
- num_layers (int): number of replicated layers
- embeddings (mammoth.modules.Embeddings): embedding module to use
- """
-
- def __init__(self, num_layers, embeddings):
- super(MeanEncoder, self).__init__()
- self.num_layers = num_layers
- self.embeddings = embeddings
-
-[docs] @classmethod
- def from_opt(cls, opt, embeddings):
- """Alternate constructor."""
- return cls(opt.enc_layers, embeddings)
-
-[docs] def forward(self, src, lengths=None):
- """See :func:`EncoderBase.forward()`"""
- self._check_args(src, lengths)
-
- emb = self.embeddings(src)
- _, batch, emb_dim = emb.size()
-
- if lengths is not None:
- # we avoid padding while mean pooling
- mask = sequence_mask(lengths).float()
- mask = mask / lengths.unsqueeze(1).float()
- mean = torch.bmm(mask.unsqueeze(1), emb.transpose(0, 1)).squeeze(1)
- else:
- mean = emb.mean(0)
-
- mean = mean.expand(self.num_layers, batch, emb_dim)
- memory_bank = emb
- encoder_final = (mean, mean)
- return encoder_final, memory_bank, lengths
-
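The masked mean pooling above amounts to a length-normalized matrix product; a standalone numeric sketch with toy tensors (the mask is built inline rather than via sequence_mask):

emb = torch.randn(5, 2, 8)                    # (src_len=5, batch=2, emb_dim=8)
lengths = torch.tensor([5, 3])
mask = (torch.arange(5).unsqueeze(0) < lengths.unsqueeze(1)).float()  # (batch, src_len)
mask = mask / lengths.unsqueeze(1).float()    # rows sum to 1 over non-padding positions
mean = torch.bmm(mask.unsqueeze(1), emb.transpose(0, 1)).squeeze(1)   # (batch, emb_dim)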
Source code for mammoth.encoders.transformer_encoder
-"""
-Implementation of "Attention is All You Need"
-"""
-
-import torch.nn as nn
-
-from mammoth.encoders.encoder import EncoderBase
-from mammoth.modules import MultiHeadedAttention
-from mammoth.modules.position_ffn import PositionwiseFeedForward
-from mammoth.modules.position_ffn import ActivationFunction
-from mammoth.utils.misc import sequence_mask
-
-
-class TransformerEncoderLayer(nn.Module):
- """
- A single layer of the transformer encoder.
-
- Args:
- d_model (int): the dimension of keys/values/queries in
- MultiHeadedAttention, also the input size of
- the first-layer of the PositionwiseFeedForward.
- heads (int): the number of head for MultiHeadedAttention.
- d_ff (int): the second-layer of the PositionwiseFeedForward.
- dropout (float): dropout probability(0-1.0).
- pos_ffn_activation_fn (ActivationFunction):
- activation function choice for PositionwiseFeedForward layer
- """
-
- def __init__(
- self,
- d_model,
- heads,
- d_ff,
- dropout,
- attention_dropout,
- max_relative_positions=0,
- pos_ffn_activation_fn=ActivationFunction.relu,
- ):
- super(TransformerEncoderLayer, self).__init__()
-
- self.self_attn = MultiHeadedAttention(
- heads, d_model, dropout=attention_dropout, max_relative_positions=max_relative_positions
- )
- self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout, pos_ffn_activation_fn)
- self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
- self.dropout = nn.Dropout(dropout)
-
- def forward(self, inputs, mask):
- """
- Args:
- inputs (FloatTensor): ``(batch_size, src_len, model_dim)``
- mask (LongTensor): ``(batch_size, 1, src_len)``
-
- Returns:
- (FloatTensor):
-
- * outputs ``(batch_size, src_len, model_dim)``
- """
- input_norm = self.layer_norm(inputs)
- context, _ = self.self_attn(input_norm, input_norm, input_norm, mask=mask, attn_type="self")
- out = self.dropout(context) + inputs
- return self.feed_forward(out)
-
- def update_dropout(self, dropout, attention_dropout):
- self.self_attn.update_dropout(attention_dropout)
- self.feed_forward.update_dropout(dropout)
- self.dropout.p = dropout
-
-
class TransformerEncoder(EncoderBase):
    """The Transformer encoder from "Attention is All You Need"
    :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`

    .. mermaid::

        graph BT
            A[input]
            B[multi-head self-attn]
            C[feed forward]
            O[output]
            A --> B
            B --> C
            C --> O

    Args:
        num_layers (int): number of encoder layers
        d_model (int): size of the model
        heads (int): number of heads
        d_ff (int): size of the inner FF layer
        dropout (float): dropout parameters
        embeddings (mammoth.modules.Embeddings):
            embeddings to use, should have positional encodings
        pos_ffn_activation_fn (ActivationFunction):
            activation function choice for PositionwiseFeedForward layer

    Returns:
        (torch.FloatTensor, torch.FloatTensor):

        * embeddings ``(src_len, batch_size, model_dim)``
        * memory_bank ``(src_len, batch_size, model_dim)``
    """

    def __init__(
        self,
        num_layers,
        d_model,
        heads,
        d_ff,
        dropout,
        attention_dropout,
        embeddings,
        max_relative_positions,
        pos_ffn_activation_fn=ActivationFunction.relu,
        layer_norm_module=None,
    ):
        super(TransformerEncoder, self).__init__()

        self.embeddings = embeddings
        self.transformer = nn.ModuleList(
            [
                TransformerEncoderLayer(
                    d_model,
                    heads,
                    d_ff,
                    dropout,
                    attention_dropout,
                    max_relative_positions=max_relative_positions,
                    pos_ffn_activation_fn=pos_ffn_activation_fn,
                )
                for i in range(num_layers)
            ]
        )
        self.layer_norm = layer_norm_module

    @classmethod
    def from_opt(cls, opt, embeddings, is_on_top=False):
        """Alternate constructor."""
        return cls(
            opt.enc_layers,
            opt.rnn_size,
            opt.heads,
            opt.transformer_ff,
            opt.dropout[0] if type(opt.dropout) is list else opt.dropout,
            opt.attention_dropout[0] if type(opt.attention_dropout) is list else opt.attention_dropout,
            embeddings,
            opt.max_relative_positions,
            pos_ffn_activation_fn=opt.pos_ffn_activation_fn,
            layer_norm_module=(
                nn.LayerNorm(opt.enc_rnn_size, eps=1e-6) if is_on_top
                else nn.Identity()
            ),
        )

    def forward(self, src, lengths=None, skip_embedding=False, mask=None):
        """See :func:`EncoderBase.forward()`"""
        if skip_embedding:
            out = src
            emb = None
        else:
            self._check_args(src, lengths)
            emb = self.embeddings(src)
            out = emb.transpose(0, 1).contiguous()
            if mask is None:
                mask = ~sequence_mask(lengths).unsqueeze(1)

        # Run the forward pass of every layer of the transformer.
        out = self._forward_loop(out, mask)
        out = self.layer_norm(out)

        # caller should call transpose and contiguous if they need it
        return emb, out, lengths, mask

    def _forward_loop(self, out, mask):
        """Run the forward pass of every layer of the transformer."""
        for layer in self.transformer:
            out = layer(out, mask)
        return out

    def update_dropout(self, dropout, attention_dropout):
        self.embeddings.update_dropout(dropout)
        for layer in self.transformer:
            layer.update_dropout(dropout, attention_dropout)

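A rough sketch of the forward contract with skip_embedding=True, assuming `encoder` is a TransformerEncoder with d_model=512 built elsewhere; the tensors below are illustrative, not from the codebase:

import torch

feats = torch.randn(4, 9, 512)                # batch-first features (batch, src_len, d_model)
lengths = torch.tensor([9, 7, 7, 2])
pad_mask = ~(torch.arange(9).unsqueeze(0) < lengths.unsqueeze(1)).unsqueeze(1)  # (batch, 1, src_len), True = pad
emb, out, lengths, mask = encoder(feats, lengths=lengths, skip_embedding=True, mask=pad_mask)
# emb is None here; out has shape (batch, src_len, d_model)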
Source code for mammoth.translate.translator
import torch
import mammoth.model_builder
-import mammoth.decoders.ensemble
+import mammoth.modules.decoder_ensemble
# from mammoth.inputters.text_dataset import InferenceDataIterator
from mammoth.translate.beam_search import BeamSearch, BeamSearchLM
from mammoth.translate.greedy_search import GreedySearch, GreedySearchLM
@@ -208,7 +208,7 @@ Source code for mammoth.translate.translator
out_file = codecs.open(opt.output, "w+", "utf-8")
load_test_model = (
- mammoth.decoders.ensemble.load_test_model if len(opt.models) > 3
+ mammoth.modules.decoder_ensemble.load_test_model if len(opt.models) > 3
else mammoth.model_builder.load_test_multitask_model
)
if logger:
diff --git a/_sources/attention_bridges.md.txt b/_sources/attention_bridges.md.txt
index 0080a85f..3b014dbd 100644
--- a/_sources/attention_bridges.md.txt
+++ b/_sources/attention_bridges.md.txt
@@ -1,7 +1,7 @@
# Attention Bridge
-The embeddings are generated through the self-attention mechanism ([Attention Bridge](./onmt/attention_bridge.py)) of the encoder and establish a connection with language-specific decoders that focus their attention on these embeddings. This is why they are referred to as 'bridges'. This architectural element serves to link the encoded information with the decoding process, enhancing the flow of information between different stages of language processing.
+The embeddings are generated through the self-attention mechanism ([Attention Bridge](./mammoth/modules/attention_bridge.py)) of the encoder and establish a connection with language-specific decoders that focus their attention on these embeddings. This is why they are referred to as 'bridges'. This architectural element serves to link the encoded information with the decoding process, enhancing the flow of information between different stages of language processing.
There are five types of attention mechanism implemented:
@@ -61,7 +61,7 @@ The `PerceiverAttentionBridgeLayer` involves a multi-headed dot product self-att
3. **Linear Layer**: After normalization, the data is fed into a linear layer. This linear transformation can be seen as a learned projection of the attention-weighted data into a new space.
-4. **ReLU Activation**: The output of the linear layer undergoes the Rectified Linear Unit (ReLU) activation function.
+4. **ReLU Activation**: The output of the linear layer undergoes the Rectified Linear Unit (ReLU) activation function.
5. **Linear Layer (Second)**: Another linear layer is applied to the ReLU-activated output.
@@ -72,11 +72,11 @@ The `PerceiverAttentionBridgeLayer` involves a multi-headed dot product self-att
The process described involves dot product self-attention. The steps are as follows:
1. **Input Transformation**: Given an input matrix $\mathbf{H} \in \mathbb{R}^{d_h \times n}$, two sets of learned weight matrices are used to transform the input. These weight matrices are $\mathbf{W}_1 \in \mathbb{R}^{d_h \times d_a}$ and $\mathbf{W}_2 \in \mathbb{R}^{d_h \times d_a}$. The multiplication of $\mathbf{H}$ with $\mathbf{W}_1$ and $\mathbf{W}_2$ produces matrices $\mathbf{V}$ and $\mathbf{K}$, respectively:
-
+
- $\mathbf{V} = \mathbf{H} \mathbf{W}_1$
- $\mathbf{K} = \mathbf{H} \mathbf{W}_2$
-2. **Attention Calculation**: The core attention calculation involves three matrices: $\mathbf{Q} \in \mathbb{R}^{d_h \times n}$, $\mathbf{K}$ (calculated previously), and $\mathbf{V}$ (calculated previously). The dot product of $\mathbf{Q}$ and $\mathbf{K}^\top$ is divided by the square root of the dimensionality of the input features ($\sqrt{d_h}$).
+2. **Attention Calculation**: The core attention calculation involves three matrices: $\mathbf{Q} \in \mathbb{R}^{d_h \times n}$, $\mathbf{K}$ (calculated previously), and $\mathbf{V}$ (calculated previously). The dot product of $\mathbf{Q}$ and $\mathbf{K}^\top$ is divided by the square root of the dimensionality of the input features ($\sqrt{d_h}$).
The final attended output is calculated by multiplying the attention weights with the $\mathbf{V}$ matrix: $\mathbf{H}^\prime = \operatorname{Softmax}(\frac{\mathbf{Q}\mathbf{K}^\top}{\sqrt{d_h}})\mathbf{V}$
@@ -86,5 +86,4 @@ The TransformerEncoderLayer employs multi-headed dot product self-attention (by
## FeedForwardAttentionBridgeLayer
-The `FeedForwardAttentionBridgeLayer` module applies a sequence of linear transformations and `ReLU` activations to the input data, followed by an attention bridge normalization, enhancing the connectivity between different parts of the model.
-
+The `FeedForwardAttentionBridgeLayer` module applies a sequence of linear transformations and `ReLU` activations to the input data, followed by an attention bridge normalization, enhancing the connectivity between different parts of the model.
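For reference, the scaled dot-product attention formula discussed above, $\mathbf{H}^\prime = \operatorname{Softmax}(\frac{\mathbf{Q}\mathbf{K}^\top}{\sqrt{d_h}})\mathbf{V}$, corresponds to the following minimal sketch (toy tensors in the usual rows-as-tokens convention; not the library's implementation):

import torch

d_h, n = 16, 10
Q = torch.randn(n, d_h)
K = torch.randn(n, d_h)
V = torch.randn(n, d_h)
scores = Q @ K.transpose(0, 1) / d_h ** 0.5   # (n, n) attention logits
H_prime = torch.softmax(scores, dim=-1) @ V   # (n, d_h) attended output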