Module pysimt.layers.decoders.tf_decoder
Expand source code
import torch.nn.functional as F
from torch import nn
from ...utils.nn import generate_combined_mask, generate_lookahead_mask
from .. import TFEmbedding
from ..positionwise_ff import PositionwiseSublayer
from ..transformers import CrossAttentionSublayer
from ..transformers import SelfAttentionSublayer
from ..transformers import SerialMMCrossAttentionSublayer
from ..transformers import ParallelMMCrossAttentionSublayer
from ..transformers import HierarchicalMMCrossAttentionSublayer
class TFDecoderBlock(nn.Module):

    def __init__(self, model_dim, n_heads, ff_dim, ff_activ='gelu',
                 dropout=0.1, attn_dropout=0.0, pre_norm=True,
                 img_attn=None, n_mm_hier_heads=8):
        """
        Creates a decoder block consisting of self-attention, cross-attention
        and a position-wise feed-forward network.

        :param model_dim: The model dimension.
        :param n_heads: The number of attention heads.
        :param ff_dim: The feed-forward layer dimension.
        :param ff_activ: The feed-forward activation function. Default: 'gelu'.
        :param dropout: The dropout rate. Default: 0.1.
        :param attn_dropout: The attention dropout rate. Default: 0.0.
        :param pre_norm: Whether to use pre-norm (True) or post-norm (False) sublayers. Default: True.
        :param img_attn: The type of image attention; can be 'parallel', 'serial',
            'hierarchical', or None (default).
        :param n_mm_hier_heads: The number of heads for hierarchical multimodal attention. Default: 8.
        """
        super().__init__()
        self.img_attn = img_attn

        self.self_attn = SelfAttentionSublayer(
            model_dim, n_heads, dropout, attn_dropout, pre_norm)
        self.feed_forward = PositionwiseSublayer(
            model_dim, ff_dim, ff_activ, dropout, pre_norm)

        if img_attn == 'parallel':
            self.cross_attn = ParallelMMCrossAttentionSublayer(
                model_dim, n_heads, dropout, attn_dropout, pre_norm)
        elif img_attn == 'serial':
            self.cross_attn = SerialMMCrossAttentionSublayer(
                model_dim, n_heads, dropout, attn_dropout, pre_norm)
        elif img_attn == 'hierarchical':
            self.cross_attn = HierarchicalMMCrossAttentionSublayer(
                model_dim, n_heads, dropout, attn_dropout, pre_norm, n_mm_hier_heads)
        else:
            self.cross_attn = CrossAttentionSublayer(
                model_dim, n_heads, dropout, attn_dropout, pre_norm)
    def forward(self, encoder_x, decoder_x, encoder_mask=None,
                decoder_mask=None, image_x=None):
        all_weights = {}
        decoder_x, all_weights['self'] = self.self_attn(decoder_x, decoder_mask)
        decoder_x_attn, all_weights['cross'] = self.cross_attn(
            decoder_x, encoder_x, encoder_x, encoder_mask,
            key_img=image_x, value_img=image_x)
        return self.feed_forward(decoder_x_attn, decoder_mask), all_weights
class TFDecoder(nn.Module):
    """Decoder block for the Transformer."""

    def __init__(self, model_dim, ff_dim, n_heads, n_layers, num_embeddings,
                 tied_emb=False, ff_activ='gelu', dropout=0.1,
                 attn_dropout=0.0, pre_norm=True, img_attn=None,
                 n_mm_hier_heads=8, store_attn_weights=True):
        """
        Creates a TFDecoder.

        :param model_dim: The model dimension.
        :param ff_dim: The feed-forward layer dimension.
        :param n_heads: The number of attention heads.
        :param n_layers: The number of decoder layers.
        :param num_embeddings: The number of embeddings, i.e. the target vocabulary size.
        :param tied_emb: Whether to tie the input and output embeddings. Default: False.
        :param ff_activ: The feed-forward activation function. Default: 'gelu'.
        :param dropout: The dropout rate. Default: 0.1.
        :param attn_dropout: The attention dropout rate. Default: 0.0.
        :param pre_norm: Whether to use pre-norm (True) or post-norm (False) sublayers. Default: True.
        :param img_attn: The type of image attention; can be 'parallel', 'serial',
            'hierarchical', or None (default).
        :param n_mm_hier_heads: The number of heads for hierarchical multimodal attention. Default: 8.
        :param store_attn_weights: Whether to store each block's attention weights during
            the forward pass. Default: True.
        """
        super().__init__()
        self.model_dim = model_dim
        self.ff_dim = ff_dim
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.ff_activ = ff_activ
        self.dropout = dropout
        self.pre_norm = pre_norm
        self.store_attn_weights = store_attn_weights
        self.blocks = []
        self._all_attention_weights = []
        self.final_layer_norm = None

        self.trg_embedding = TFEmbedding(
            num_embeddings=num_embeddings,
            embedding_dim=self.model_dim, dropout=dropout)

        for _ in range(self.n_layers):
            layers = TFDecoderBlock(
                model_dim=self.model_dim, n_heads=self.n_heads,
                ff_dim=self.ff_dim, ff_activ=self.ff_activ, dropout=self.dropout,
                attn_dropout=attn_dropout, pre_norm=self.pre_norm,
                img_attn=img_attn, n_mm_hier_heads=n_mm_hier_heads)
            self.blocks.append(layers)
        self.blocks = nn.ModuleList(self.blocks)

        if self.pre_norm:
            self.final_layer_norm = nn.LayerNorm(self.model_dim, eps=1e-6)

        self.output_layer = nn.Linear(self.model_dim, num_embeddings)
        if tied_emb:
            self.output_layer.weight = self.trg_embedding.weight
    def f_init(self, encoder_data):
        """
        Returns the initial hidden state of the decoder; not applicable to the
        Transformer, so this always returns None.
        """
        return None

    def forward(self, encoder_data, target, **kwargs):
        """Forward-pass of the decoder.

        :param encoder_data: a dict mapping 'src' to a tuple of the encoder's hidden
            states, shape (s_len, bsize, model_dim), and the corresponding mask;
            it may also map 'image' to a tuple of visual features and an associated mask.
        :param target: target token indices, shape (t_len, bsize).
        :param kwargs: Extra arguments for the decoder. In wait-k training, 'k' should be passed.
        :return: For backward compatibility with other decoders, the method returns a tuple of
            the log-probabilities from the final output layer and the decoder's hidden states.
        """
        encoder_states, encoder_mask = encoder_data['src']
        encoder_image = self._get_image_data(encoder_data)
        encoder_mask = self._create_waitk_encoder_mask_if_needed(
            encoder_mask, encoder_states, kwargs, target)

        decoder_mask = generate_combined_mask(target)
        decoder_x = self.trg_embedding(target)

        self._all_attention_weights = []
        for block in self.blocks:
            decoder_x, attn_weights = block(
                encoder_states, decoder_x, encoder_mask, decoder_mask, encoder_image)
            if self.store_attn_weights:
                self._all_attention_weights.append(attn_weights)

        if self.pre_norm:
            decoder_x = self.final_layer_norm(decoder_x)

        return F.log_softmax(self.output_layer(decoder_x), dim=-1), decoder_x
    @staticmethod
    def _create_waitk_encoder_mask_if_needed(encoder_mask, encoder_states, kwargs, target):
        if 'k' in kwargs:
            simultaneous_k = kwargs['k']
            encoder_lookahead_mask = generate_lookahead_mask(
                encoder_states, simultaneous_k, target.shape[0])
            encoder_mask = encoder_mask | encoder_lookahead_mask
        return encoder_mask

    @staticmethod
    def _get_image_data(encoder_data):
        encoder_image = None
        if 'image' in encoder_data:
            encoder_image, _ = encoder_data['image']
        return encoder_image

    def f_next(self, encoder_data, next_word_emb, hidden_states, hypothesis):
        probs, decoder_x = self.forward(encoder_data, hypothesis)
        next_word_probs = probs[-1, :, :]
        return next_word_probs, decoder_x

    def get_emb(self, data):
        # FIXME:
        if len(data.shape) == 1:
            data = data.unsqueeze(0)
        return self.trg_embedding(data)

    def get_attention_weights(self):
        return self._all_attention_weights
Classes
class TFDecoder (model_dim, ff_dim, n_heads, n_layers, num_embeddings, tied_emb=False, ff_activ='gelu', dropout=0.1, attn_dropout=0.0, pre_norm=True, img_attn=None, n_mm_hier_heads=8, store_attn_weights=True)
-
Decoder block for the Transformer.

Creates a TFDecoder.

:param model_dim: The model dimension.
:param ff_dim: The feed-forward layer dimension.
:param n_heads: The number of attention heads.
:param n_layers: The number of decoder layers.
:param num_embeddings: The number of embeddings, i.e. the target vocabulary size.
:param tied_emb: Whether to tie the input and output embeddings. Default: False.
:param ff_activ: The feed-forward activation function. Default: 'gelu'.
:param dropout: The dropout rate. Default: 0.1.
:param attn_dropout: The attention dropout rate. Default: 0.0.
:param pre_norm: Whether to use pre-norm (True) or post-norm (False) sublayers. Default: True.
:param img_attn: The type of image attention; can be 'parallel', 'serial', 'hierarchical', or None (default).
:param n_mm_hier_heads: The number of heads for hierarchical multimodal attention. Default: 8.
:param store_attn_weights: Whether to store each block's attention weights during the forward pass. Default: True.
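Below is a minimal usage sketch, not taken from the pysimt docs: the sizes, the dummy tensors, and the absence of a source padding mask are illustrative assumptions.

import torch
from pysimt.layers.decoders.tf_decoder import TFDecoder

# Illustrative sizes -- assumptions, not values taken from the library.
model_dim, ff_dim, n_heads, n_layers, vocab = 512, 2048, 8, 6, 32000
s_len, t_len, bsize = 20, 15, 4

dec = TFDecoder(model_dim=model_dim, ff_dim=ff_dim, n_heads=n_heads,
                n_layers=n_layers, num_embeddings=vocab)

# Dummy encoder output: (s_len, bsize, model_dim) states; the padding mask is
# left as None in this toy example, whereas a real encoder supplies one.
encoder_states = torch.randn(s_len, bsize, model_dim)
encoder_data = {'src': (encoder_states, None)}

# Dummy target token indices: (t_len, bsize).
target = torch.randint(0, vocab, (t_len, bsize))

log_probs, hidden = dec(encoder_data, target)
print(log_probs.shape)  # torch.Size([15, 4, 32000])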
Ancestors
- torch.nn.modules.module.Module
Class variables
var dump_patches : bool
var training : bool
Methods
def f_init(self, encoder_data)
-
Returns the initial hidden state of the decoder; not applicable to the Transformer, so this always returns None.
def f_next(self, encoder_data, next_word_emb, hidden_states, hypothesis)
-
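f_next re-runs the full forward pass over the current hypothesis and returns the log-probabilities of the last position; next_word_emb and hidden_states are unused by this decoder. A hedged greedy-decoding sketch, continuing the TFDecoder example above (it reuses dec, encoder_data and bsize); the <bos>/<eos> indices and the length limit are assumptions.

import torch

bos_idx, eos_idx, max_len = 1, 2, 50  # assumed special-token indices and length limit

hypothesis = torch.full((1, bsize), bos_idx, dtype=torch.long)  # (t_len=1, bsize)
for _ in range(max_len):
    # next_word_emb and hidden_states are ignored, so None is passed here.
    next_word_probs, _ = dec.f_next(encoder_data, None, None, hypothesis)
    next_word = next_word_probs.argmax(dim=-1)                        # (bsize,)
    hypothesis = torch.cat([hypothesis, next_word.unsqueeze(0)], dim=0)
    if bool((next_word == eos_idx).all()):
        break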
def forward(self, encoder_data, target, **kwargs)
-
Forward-pass of the decoder.

:param encoder_data: a dict mapping 'src' to a tuple of the encoder's hidden states, shape (s_len, bsize, model_dim), and the corresponding mask; it may also map 'image' to a tuple of visual features and an associated mask.
:param target: target token indices, shape (t_len, bsize).
:param kwargs: Extra arguments for the decoder. In wait-k training, 'k' should be passed.
:return: For backward compatibility with other decoders, the method returns a tuple of the log-probabilities from the final output layer and the decoder's hidden states.
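A hedged sketch of the two calling modes, reusing dec, encoder_data and target from the TFDecoder example above; k=3 is an illustrative wait-k value.

# Full-source pass: cross-attention sees the whole source sequence.
log_probs, hidden = dec(encoder_data, target)

# Wait-k pass: passing k builds a look-ahead mask internally so cross-attention
# only sees an incrementally growing source prefix. The mask is OR-ed with the
# source padding mask, so encoder_data['src'] must carry a real (non-None) mask here.
log_probs_waitk, _ = dec(encoder_data, target, k=3)

# Per-block attention weights from the most recent pass, one dict with
# 'self' and 'cross' entries per layer (only if store_attn_weights=True).
weights = dec.get_attention_weights()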
def get_attention_weights(self)
-
def get_emb(self, data)
-
class TFDecoderBlock (model_dim, n_heads, ff_dim, ff_activ='gelu', dropout=0.1, attn_dropout=0.0, pre_norm=True, img_attn=None, n_mm_hier_heads=8)
-
Creates a decoder block consisting of self-attention, cross-attention and a position-wise feed-forward network.

:param model_dim: The model dimension.
:param n_heads: The number of attention heads.
:param ff_dim: The feed-forward layer dimension.
:param ff_activ: The feed-forward activation function. Default: 'gelu'.
:param dropout: The dropout rate. Default: 0.1.
:param attn_dropout: The attention dropout rate. Default: 0.0.
:param pre_norm: Whether to use pre-norm (True) or post-norm (False) sublayers. Default: True.
:param img_attn: The type of image attention; can be 'parallel', 'serial', 'hierarchical', or None (default).
:param n_mm_hier_heads: The number of heads for hierarchical multimodal attention. Default: 8.
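A minimal, self-contained sketch of driving a single block with dummy tensors, not taken from the pysimt docs; the sizes are illustrative and the mask arguments are left as None for brevity.

import torch
from pysimt.layers.decoders.tf_decoder import TFDecoderBlock

model_dim, n_heads, ff_dim = 512, 8, 2048   # illustrative sizes
s_len, t_len, bsize = 20, 15, 4

block = TFDecoderBlock(model_dim=model_dim, n_heads=n_heads, ff_dim=ff_dim)

encoder_x = torch.randn(s_len, bsize, model_dim)   # encoder hidden states
decoder_x = torch.randn(t_len, bsize, model_dim)   # embedded target states

# Masks omitted (None) for brevity; a real decoder passes a combined
# causal/padding mask for the target and a padding mask for the source.
out, weights = block(encoder_x, decoder_x)
print(out.shape)               # torch.Size([15, 4, 512])
print(sorted(weights.keys()))  # ['cross', 'self']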
Ancestors
- torch.nn.modules.module.Module
Class variables
var dump_patches : bool
var training : bool
Methods
def forward(self, encoder_x, decoder_x, encoder_mask=None, decoder_mask=None, image_x=None)
-
Runs self-attention over the decoder states, cross-attention over the encoder states (and the image features, if provided), and finally the position-wise feed-forward sublayer. Returns the block output together with a dict holding the 'self' and 'cross' attention weights.