Module pysimt.models.snmt_rnn
Expand source code
# -*- coding: utf-8 -*-
import logging
import torch
from torch import nn
from ..layers import RecurrentEncoder, VisualFeaturesEncoder
from ..layers.decoders import ConditionalGRUDecoder
from ..vocabulary import Vocabulary
from ..utils.nn import get_n_params
from ..utils.topology import Topology
from ..utils.ml_metrics import Loss
from ..utils.device import DEVICE
from ..utils.io import progress_bar
from ..datasets import MultimodalDataset
from ..metrics import Metric
logger = logging.getLogger('pysimt')
"""You can use this model to pre-train a unidirectional NMT model. Once trained,
you can decode translations from this model either using plain greedy-search
or state-of-the-art simultaneous decoding algorithms.
# Pure greedy-search (by default batched)
$ pysimt translate -s test_2016_flickr,test_2017_flickr \
-f gs -o <output_prefix> <best model's .ckpt>
"""
class SimultaneousNMT(nn.Module):
def set_defaults(self):
self.defaults = {
'emb_dim': 128, # Source and target embedding sizes
'enc_dim': 256, # Encoder hidden size
'enc_proj_dim': None, # Encoder final projection
'enc_proj_activ': 'linear', # Encoder final projection activation
'enc_type': 'gru', # Encoder type (gru|lstm)
'enc_lnorm': False, # Add layer-normalization to encoder output
'enc_bidirectional': True, # Whether the RNN encoder should be bidirectional
'n_encoders': 1, # Number of stacked encoders
'dec_dim': 256, # Decoder hidden size
'dec_type': 'gru', # Decoder type (gru|lstm)
'dec_variant': 'cond', # The only option is `cond`
'dec_inp_activ': None,
'att_type': 'mlp', # Attention type (mlp|dot)
'att_temp': 1., # Attention temperature
'att_activ': 'tanh', # Attention non-linearity (all torch nonlins)
'att_bottleneck': 'ctx', # Bottleneck dimensionality (ctx|hid)
'dropout_emb': 0, # Simple dropout to source embeddings
'dropout_ctx': 0, # Simple dropout to source encodings
'dropout_out': 0, # Simple dropout to decoder output
'dropout_enc': 0, # Intra-encoder dropout if n_encoders > 1
'tied_emb': False, # Share embeddings: (False|2way|3way)
'direction': None, # Network directionality, i.e. en->de
'max_len': 80, # Reject sentences where 'bucket_by' length > 80
'bucket_by': None, # A key like 'en' to define w.r.t which dataset
# the batches will be sorted
'bucket_order': None, # Curriculum: ascending/descending/None
'sampler_type': 'bucket', # bucket or approximate
'short_list': 0, # Short list vocabularies (0: disabled)
'out_logic': 'simple', # 'simple' or 'deep' output
# Visual features (optional)
'aux_dim': None, # Auxiliary features dim (# channels for conv features)
'aux_dropout': 0.0, # Auxiliary features dropout
'aux_lnorm': False, # layer-norm
'aux_l2norm': False, # L2-normalize
'aux_proj_dim': None, # Projection layer for features
'aux_proj_activ': None, # Projection layer non-linearity
'num_regions': 36, # The number of regions to use. Valid only for OD features. Default: 36.
'feat_mode': None, # OD feature type. None defaults to `roi_feats`
'mm_fusion_op': 'concat', # fusion type
'mm_fusion_dropout': 0.0, # fusion dropout
# Decoding/training simultaneous NMT args
'translator_type': 'gs', # This model implements plain unidirectional MT
# so the decoding is normal greedy-search
'translator_args': {}, # No extra arguments to translator
}
def __init__(self, opts):
super().__init__()
# opts -> config file sections {.model, .data, .vocabulary, .train}
self.opts = opts
# Vocabulary objects
self.vocabs = {}
# Each auxiliary loss should be stored inside this dictionary
# in order to be taken into account by the mainloop for multi-tasking
self.aux_loss = {}
# Setup options
self.opts.model = self.set_model_options(opts.model)
# Parse topology & languages
self.topology = Topology(self.opts.model['direction'])
# Load vocabularies here
for name, fname in self.opts.vocabulary.items():
self.vocabs[name] = Vocabulary(fname, short_list=self.opts.model['short_list'])
# Inherently non multi-lingual aware
slangs = self.topology.get_src_langs()
tlangs = self.topology.get_trg_langs()
if slangs:
self.sl = slangs[0]
self.src_vocab = self.vocabs[self.sl]
self.n_src_vocab = len(self.src_vocab)
if tlangs:
self.tl = tlangs[0]
self.trg_vocab = self.vocabs[self.tl]
self.n_trg_vocab = len(self.trg_vocab)
self.val_refs = self.opts.data['val_set'][self.tl]
# Check vocabulary sizes for 3way tying
if self.opts.model.get('tied_emb', False) not in [False, '2way', '3way']:
raise RuntimeError(
"'{}' not recognized for tied_emb.".format(self.opts.model['tied_emb']))
if self.opts.model.get('tied_emb', False) == '3way':
assert self.n_src_vocab == self.n_trg_vocab, \
"The vocabulary sizes do not match for 3way tied embeddings."
def __repr__(self):
s = super().__repr__() + '\n'
for vocab in self.vocabs.values():
s += "{}\n".format(vocab)
s += "{}\n".format(get_n_params(self))
return s
def set_model_options(self, model_opts):
self.set_defaults()
for opt, value in model_opts.items():
if opt in self.defaults:
# Override defaults from config
self.defaults[opt] = value
else:
logger.info('Warning: unused model option: {}'.format(opt))
return self.defaults
def reset_parameters(self):
for name, param in self.named_parameters():
# Skip 1-d biases and scalars
if param.requires_grad and param.dim() > 1:
nn.init.kaiming_normal_(param.data)
# Reset padding embedding to 0
for layer in list(self.encoders.values()) + [self.dec]:
if hasattr(layer, 'emb'):
with torch.no_grad():
layer.emb.weight.data[0].fill_(0)
def create_src_encoder(self):
"""Creates and returns an RNN encoder for textual input."""
return RecurrentEncoder(
input_size=self.opts.model['emb_dim'],
hidden_size=self.opts.model['enc_dim'],
n_vocab=self.n_src_vocab,
bidirectional=self.opts.model['enc_bidirectional'],
rnn_type=self.opts.model['enc_type'],
proj_dim=self.opts.model['enc_proj_dim'],
proj_activ=self.opts.model['enc_proj_activ'],
dropout_emb=self.opts.model['dropout_emb'],
dropout_ctx=self.opts.model['dropout_ctx'],
dropout_rnn=self.opts.model['dropout_enc'],
num_layers=self.opts.model['n_encoders'],
layer_norm=self.opts.model['enc_lnorm'],
)
def create_image_encoder(self):
"""Creates and returns an MLP encoder for visual features."""
return VisualFeaturesEncoder(
input_size=self.opts.model['aux_dim'],
proj_dim=self.opts.model['aux_proj_dim'],
proj_activ=self.opts.model['aux_proj_activ'],
layer_norm=self.opts.model['aux_lnorm'],
l2_norm=self.opts.model['aux_l2norm'],
dropout=self.opts.model['aux_dropout'],
)
def create_decoder(self, encoders):
"""Creates and returns the RNN decoder. No hidden state initialization
for sake of simplicity. Encoders are passed to allow multi-modal
attention out-of-the-box."""
return ConditionalGRUDecoder(
input_size=self.opts.model['emb_dim'],
hidden_size=self.opts.model['dec_dim'],
n_vocab=self.n_trg_vocab,
encoders=encoders,
rnn_type=self.opts.model['dec_type'],
tied_emb=self.opts.model['tied_emb'],
att_type=self.opts.model['att_type'],
att_temp=self.opts.model['att_temp'],
att_activ=self.opts.model['att_activ'],
att_bottleneck=self.opts.model['att_bottleneck'],
dropout_out=self.opts.model['dropout_out'],
out_logic=self.opts.model['out_logic'],
dec_inp_activ=self.opts.model['dec_inp_activ'],
mm_fusion_op=self.opts.model['mm_fusion_op'],
mm_fusion_dropout=self.opts.model['mm_fusion_dropout'],
)
def setup(self, is_train=True):
"""Sets up NN topology by creating the layers."""
encoders = {}
for key in self.topology.srcs.keys():
encoders[key] = getattr(self, f'create_{key}_encoder')()
self.encoders = nn.ModuleDict(encoders)
self.dec = self.create_decoder(encoders=self.encoders)
# Share encoder and decoder weights
if self.opts.model['tied_emb'] == '3way':
self.encoders[str(self.sl)].emb.weight = self.dec.emb.weight
def load_data(self, split, batch_size, mode='train'):
"""Loads the requested dataset split."""
# For wait_if_diff, wait_if_worse and test-time waitk decodings
if mode == 'beam' and self.opts.model['translator_type'] != 'gs':
batch_size = 1
self.dataset = MultimodalDataset(
data=self.opts.data['{}_set'.format(split)],
mode=mode, batch_size=batch_size,
vocabs=self.vocabs, topology=self.topology,
max_len=self.opts.model['max_len'],
sampler_type=self.opts.model['sampler_type'],
bucket_by=self.opts.model['bucket_by'],
bucket_order=self.opts.model['bucket_order'],
# order_file is for multimodal adv. evaluation
order_file=self.opts.data[split + '_set'].get('ord', None),
feat_mode=self.opts.model['feat_mode'],
num_regions=self.opts.model['num_regions'])
logger.info(self.dataset)
return self.dataset
def get_bos(self, batch_size):
"""Returns a representation for <bos> embeddings for decoding."""
return torch.LongTensor(batch_size).fill_(self.trg_vocab['<bos>'])
def cache_enc_states(self, batch):
"""Caches encoder states internally by forward-pass'ing each encoder."""
for key, enc in self.encoders.items():
_ = enc(batch[key])
def get_enc_state_dict(self, up_to=int(1e6)):
"""Encodes the batch optionally by partial encoding up to `up_to`
words for derived simultaneous NMT classes. By default, the value
is large enough to leave it as vanilla NMT."""
return {str(k): e.get_states(up_to=up_to) for k, e in self.encoders.items()}
def forward(self, batch, **kwargs):
"""Training forward-pass with explicit timestep-based loop."""
loss = 0.0
# Cache encoder states first
self.cache_enc_states(batch)
# Encode modalities and get the dict back
state_dict = self.get_enc_state_dict()
# Initial state is None i.e. 0. `state_dict` is not used
h = self.dec.f_init(state_dict)
# Convert target token indices to embeddings -> T*B*E
y = batch[self.tl]
y_emb = self.dec.emb(y)
# -1: So that we skip the timestep where input is <eos>
for t in range(y_emb.size(0) - 1):
log_p, h = self.dec.f_next(state_dict, y_emb[t], h)
loss += self.dec.nll_loss(log_p, y[t + 1])
return {
'loss': loss,
'n_items': y[1:].nonzero(as_tuple=False).size(0),
}
def test_performance(self, data_loader, dump_file=None):
"""Computes test set loss over the given DataLoader instance."""
loss = Loss()
for batch in progress_bar(data_loader, unit='batch'):
batch.device(DEVICE)
out = self.forward(batch)
loss.update(out['loss'], out['n_items'])
return [
Metric('LOSS', loss.get(), higher_better=False),
]
def register_tensorboard(self, handle):
"""Stores tensorboard hook for custom logging."""
self.tboard = handle
Global variables
var logger
-
You can use this model to pre-train a unidirectional NMT model. Once trained, you can decode translations from this model either using plain greedy-search or state-of-the-art simultaneous decoding algorithms.
Pure greedy-search (by default batched)
$ pysimt translate -s test_2016_flickr,test_2017_flickr -f gs -o <output_prefix> <best model's .ckpt>
Classes
class SimultaneousNMT (opts)
-
Base class for all neural network modules.
Your models should also subclass this class.
Modules can also contain other Modules, allowing them to be nested in a tree structure. You can assign the submodules as regular attributes:

import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will have their parameters converted too when you call .to(), etc.
training (bool): whether this module is in training or evaluation mode.
Initializes internal Module state, shared by both nn.Module and ScriptModule.
Ancestors
- torch.nn.modules.module.Module
Subclasses
Class variables
var dump_patches : bool
var training : bool
Methods
def cache_enc_states(self, batch)
-
Caches encoder states internally by forward-pass'ing each encoder.
Expand source code
def cache_enc_states(self, batch):
    """Caches encoder states internally by forward-pass'ing each encoder."""
    for key, enc in self.encoders.items():
        _ = enc(batch[key])
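The intended call pattern, mirrored by forward() further below, is to run the encoders once per batch and then query the cached states. A minimal, hedged usage sketch: `model` and `batch` are assumed to exist, with the batch keys matching the encoder keys.

model.cache_enc_states(batch)             # forward-pass every encoder once
state_dict = model.get_enc_state_dict()   # {modality: cached states}, consumed by the decoder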
def create_decoder(self, encoders)
-
Creates and returns the RNN decoder. No hidden state initialization for sake of simplicity. Encoders are passed to allow multi-modal attention out-of-the-box.
Expand source code
def create_decoder(self, encoders):
    """Creates and returns the RNN decoder. No hidden state initialization
    for sake of simplicity. Encoders are passed to allow multi-modal
    attention out-of-the-box."""
    return ConditionalGRUDecoder(
        input_size=self.opts.model['emb_dim'],
        hidden_size=self.opts.model['dec_dim'],
        n_vocab=self.n_trg_vocab,
        encoders=encoders,
        rnn_type=self.opts.model['dec_type'],
        tied_emb=self.opts.model['tied_emb'],
        att_type=self.opts.model['att_type'],
        att_temp=self.opts.model['att_temp'],
        att_activ=self.opts.model['att_activ'],
        att_bottleneck=self.opts.model['att_bottleneck'],
        dropout_out=self.opts.model['dropout_out'],
        out_logic=self.opts.model['out_logic'],
        dec_inp_activ=self.opts.model['dec_inp_activ'],
        mm_fusion_op=self.opts.model['mm_fusion_op'],
        mm_fusion_dropout=self.opts.model['mm_fusion_dropout'],
    )
def create_image_encoder(self)
-
Creates and returns an MLP encoder for visual features.
Expand source code
def create_image_encoder(self):
    """Creates and returns an MLP encoder for visual features."""
    return VisualFeaturesEncoder(
        input_size=self.opts.model['aux_dim'],
        proj_dim=self.opts.model['aux_proj_dim'],
        proj_activ=self.opts.model['aux_proj_activ'],
        layer_norm=self.opts.model['aux_lnorm'],
        l2_norm=self.opts.model['aux_l2norm'],
        dropout=self.opts.model['aux_dropout'],
    )
def create_src_encoder(self)
-
Creates and returns an RNN encoder for textual input.
Expand source code
def create_src_encoder(self):
    """Creates and returns an RNN encoder for textual input."""
    return RecurrentEncoder(
        input_size=self.opts.model['emb_dim'],
        hidden_size=self.opts.model['enc_dim'],
        n_vocab=self.n_src_vocab,
        bidirectional=self.opts.model['enc_bidirectional'],
        rnn_type=self.opts.model['enc_type'],
        proj_dim=self.opts.model['enc_proj_dim'],
        proj_activ=self.opts.model['enc_proj_activ'],
        dropout_emb=self.opts.model['dropout_emb'],
        dropout_ctx=self.opts.model['dropout_ctx'],
        dropout_rnn=self.opts.model['dropout_enc'],
        num_layers=self.opts.model['n_encoders'],
        layer_norm=self.opts.model['enc_lnorm'],
    )
def forward(self, batch, **kwargs)
-
Training forward-pass with explicit timestep-based loop.
Expand source code
def forward(self, batch, **kwargs):
    """Training forward-pass with explicit timestep-based loop."""
    loss = 0.0
    # Cache encoder states first
    self.cache_enc_states(batch)
    # Encode modalities and get the dict back
    state_dict = self.get_enc_state_dict()
    # Initial state is None i.e. 0. `state_dict` is not used
    h = self.dec.f_init(state_dict)
    # Convert target token indices to embeddings -> T*B*E
    y = batch[self.tl]
    y_emb = self.dec.emb(y)
    # -1: So that we skip the timestep where input is <eos>
    for t in range(y_emb.size(0) - 1):
        log_p, h = self.dec.f_next(state_dict, y_emb[t], h)
        loss += self.dec.nll_loss(log_p, y[t + 1])
    return {
        'loss': loss,
        'n_items': y[1:].nonzero(as_tuple=False).size(0),
    }
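To make the loop above concrete, here is a self-contained, simplified analogue of teacher forcing using a plain GRUCell instead of the ConditionalGRUDecoder. It reproduces the time-major (T x B) layout, the zero initial state, the target shift (input at step t, gold label at t + 1) and the non-pad token count; the dimensions and the random batch are purely illustrative and not part of pysimt.

# Minimal sketch of the teacher-forced training loop (toy tensors, no attention).
import torch
from torch import nn
import torch.nn.functional as F

T, B, E, H, V = 6, 4, 8, 16, 20           # time, batch, emb, hidden, vocab sizes
emb = nn.Embedding(V, E, padding_idx=0)
cell = nn.GRUCell(E, H)
proj = nn.Linear(H, V)

y = torch.randint(1, V, (T, B))           # time-major target batch (T x B)
y_emb = emb(y)                            # T x B x E
h = torch.zeros(B, H)                     # zero initial state, as in f_init()

loss = 0.0
for t in range(T - 1):                    # skip the final step whose input is <eos>
    h = cell(y_emb[t], h)
    log_p = F.log_softmax(proj(h), dim=-1)
    loss = loss + F.nll_loss(log_p, y[t + 1], ignore_index=0, reduction='sum')

n_items = y[1:].nonzero(as_tuple=False).size(0)   # number of non-pad target tokens
print(loss.item() / n_items)              # per-token negative log-likelihood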
def get_bos(self, batch_size)
-
Returns a representation for <bos> embeddings for decoding.
Expand source code
def get_bos(self, batch_size):
    """Returns a representation for <bos> embeddings for decoding."""
    return torch.LongTensor(batch_size).fill_(self.trg_vocab['<bos>'])
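A tiny, hedged usage example; `model` is assumed to be an already constructed and set-up instance, and moving the tensor to the device afterwards is up to the caller.

bos = model.get_bos(batch_size=4)   # LongTensor of shape (4,), every entry is the <bos> index
bos = bos.to(DEVICE)                # illustrative: callers usually need it on DEVICE before embedding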
def get_enc_state_dict(self, up_to=1000000)
-
Encodes the batch optionally by partial encoding up to `up_to` words for derived simultaneous NMT classes. By default, the value is large enough to leave it as vanilla NMT.
Expand source code
def get_enc_state_dict(self, up_to=int(1e6)):
    """Encodes the batch optionally by partial encoding up to `up_to`
    words for derived simultaneous NMT classes. By default, the value
    is large enough to leave it as vanilla NMT."""
    return {str(k): e.get_states(up_to=up_to) for k, e in self.encoders.items()}
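The `up_to` argument is what derived simultaneous models use to expose only a source prefix to the decoder. The toy below illustrates the intended effect on a cached T x B x H state tensor; the slicing is an assumption about what the encoders' get_states(up_to=...) helper does, shown only to make the contract concrete.

import torch

cached = torch.randn(10, 2, 8)             # T x B x H cached source states
for up_to in (1, 3, int(1e6)):
    prefix = cached[:min(up_to, cached.size(0))]
    print(up_to, tuple(prefix.shape))       # growing prefixes; a huge `up_to` yields the full source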
def load_data(self, split, batch_size, mode='train')
-
Loads the requested dataset split.
Expand source code
def load_data(self, split, batch_size, mode='train'):
    """Loads the requested dataset split."""
    # For wait_if_diff, wait_if_worse and test-time waitk decodings
    if mode == 'beam' and self.opts.model['translator_type'] != 'gs':
        batch_size = 1
    self.dataset = MultimodalDataset(
        data=self.opts.data['{}_set'.format(split)],
        mode=mode, batch_size=batch_size,
        vocabs=self.vocabs, topology=self.topology,
        max_len=self.opts.model['max_len'],
        sampler_type=self.opts.model['sampler_type'],
        bucket_by=self.opts.model['bucket_by'],
        bucket_order=self.opts.model['bucket_order'],
        # order_file is for multimodal adv. evaluation
        order_file=self.opts.data[split + '_set'].get('ord', None),
        feat_mode=self.opts.model['feat_mode'],
        num_regions=self.opts.model['num_regions'])
    logger.info(self.dataset)
    return self.dataset
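A hedged usage sketch: the split name is resolved to the '<split>_set' entry of the data configuration, and, as the code shows, mode='beam' combined with a non-'gs' translator_type forces batch_size=1 for the sequential simultaneous decoders. The split names and batch sizes below are illustrative; `model` is assumed constructed.

train_set = model.load_data('train', batch_size=64)             # bucketed training batches
val_set = model.load_data('val', batch_size=32, mode='beam')    # becomes batch_size=1 unless translator_type == 'gs'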
def register_tensorboard(self, handle)
-
Stores tensorboard hook for custom logging.
Expand source code
def register_tensorboard(self, handle):
    """Stores tensorboard hook for custom logging."""
    self.tboard = handle
def reset_parameters(self)
-
Expand source code
def reset_parameters(self):
    for name, param in self.named_parameters():
        # Skip 1-d biases and scalars
        if param.requires_grad and param.dim() > 1:
            nn.init.kaiming_normal_(param.data)
    # Reset padding embedding to 0
    for layer in list(self.encoders.values()) + [self.dec]:
        if hasattr(layer, 'emb'):
            with torch.no_grad():
                layer.emb.weight.data[0].fill_(0)
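A quick, hedged sanity check of the two effects above (Kaiming-initialised weight matrices and a zeroed padding row); `model` is assumed constructed, with setup() already called.

model.reset_parameters()
assert bool((model.dec.emb.weight[0] == 0).all())   # padding embedding (row 0) is zero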
def set_defaults(self)
-
Expand source code
def set_defaults(self):
    self.defaults = {
        'emb_dim': 128,              # Source and target embedding sizes
        'enc_dim': 256,              # Encoder hidden size
        'enc_proj_dim': None,        # Encoder final projection
        'enc_proj_activ': 'linear',  # Encoder final projection activation
        'enc_type': 'gru',           # Encoder type (gru|lstm)
        'enc_lnorm': False,          # Add layer-normalization to encoder output
        'enc_bidirectional': True,   # Whether the RNN encoder should be bidirectional
        'n_encoders': 1,             # Number of stacked encoders
        'dec_dim': 256,              # Decoder hidden size
        'dec_type': 'gru',           # Decoder type (gru|lstm)
        'dec_variant': 'cond',       # The only option is `cond`
        'dec_inp_activ': None,
        'att_type': 'mlp',           # Attention type (mlp|dot)
        'att_temp': 1.,              # Attention temperature
        'att_activ': 'tanh',         # Attention non-linearity (all torch nonlins)
        'att_bottleneck': 'ctx',     # Bottleneck dimensionality (ctx|hid)
        'dropout_emb': 0,            # Simple dropout to source embeddings
        'dropout_ctx': 0,            # Simple dropout to source encodings
        'dropout_out': 0,            # Simple dropout to decoder output
        'dropout_enc': 0,            # Intra-encoder dropout if n_encoders > 1
        'tied_emb': False,           # Share embeddings: (False|2way|3way)
        'direction': None,           # Network directionality, i.e. en->de
        'max_len': 80,               # Reject sentences where 'bucket_by' length > 80
        'bucket_by': None,           # A key like 'en' to define w.r.t which dataset
                                     # the batches will be sorted
        'bucket_order': None,        # Curriculum: ascending/descending/None
        'sampler_type': 'bucket',    # bucket or approximate
        'short_list': 0,             # Short list vocabularies (0: disabled)
        'out_logic': 'simple',       # 'simple' or 'deep' output
        # Visual features (optional)
        'aux_dim': None,             # Auxiliary features dim (# channels for conv features)
        'aux_dropout': 0.0,          # Auxiliary features dropout
        'aux_lnorm': False,          # layer-norm
        'aux_l2norm': False,         # L2-normalize
        'aux_proj_dim': None,        # Projection layer for features
        'aux_proj_activ': None,      # Projection layer non-linearity
        'num_regions': 36,           # The number of regions to use. Valid only for OD features. Default: 36.
        'feat_mode': None,           # OD feature type. None defaults to `roi_feats`
        'mm_fusion_op': 'concat',    # fusion type
        'mm_fusion_dropout': 0.0,    # fusion dropout
        # Decoding/training simultaneous NMT args
        'translator_type': 'gs',     # This model implements plain unidirectional MT
                                     # so the decoding is normal greedy-search
        'translator_args': {},       # No extra arguments to translator
    }
def set_model_options(self, model_opts)
-
Expand source code
def set_model_options(self, model_opts):
    self.set_defaults()
    for opt, value in model_opts.items():
        if opt in self.defaults:
            # Override defaults from config
            self.defaults[opt] = value
        else:
            logger.info('Warning: unused model option: {}'.format(opt))
    return self.defaults
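A hedged example of the merge semantics: keys present in the defaults are overridden, while unknown keys are kept out of the returned dict and only reported through the logger. The option values below are illustrative; `model` is assumed constructed.

overrides = {'enc_dim': 320, 'enc_type': 'lstm', 'not_an_option': 1}
merged = model.set_model_options(overrides)
# merged['enc_dim'] == 320 and merged['enc_type'] == 'lstm'
# 'not_an_option' is absent from `merged`; an "unused model option" message is logged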
def setup(self, is_train=True)
-
Sets up NN topology by creating the layers.
Expand source code
def setup(self, is_train=True):
    """Sets up NN topology by creating the layers."""
    encoders = {}
    for key in self.topology.srcs.keys():
        encoders[key] = getattr(self, f'create_{key}_encoder')()
    self.encoders = nn.ModuleDict(encoders)
    self.dec = self.create_decoder(encoders=self.encoders)
    # Share encoder and decoder weights
    if self.opts.model['tied_emb'] == '3way':
        self.encoders[str(self.sl)].emb.weight = self.dec.emb.weight
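Note that the loop above dispatches on the source keys of the topology: every source named `key` in the `direction` option must have a matching `create_<key>_encoder` factory on the model. A hedged sketch of how a subclass could plug in an extra modality; the key name 'speech' and the encoder choice are purely illustrative.

class SpeechAwareSNMT(SimultaneousNMT):
    def create_speech_encoder(self):
        # Purely illustrative: reuse the visual-features MLP encoder for a
        # hypothetical 'speech' source; any nn.Module with the same interface would do.
        return VisualFeaturesEncoder(
            input_size=self.opts.model['aux_dim'],
            proj_dim=self.opts.model['aux_proj_dim'],
            proj_activ=self.opts.model['aux_proj_activ'],
            layer_norm=self.opts.model['aux_lnorm'],
            l2_norm=self.opts.model['aux_l2norm'],
            dropout=self.opts.model['aux_dropout'],
        )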
def test_performance(self, data_loader, dump_file=None)
-
Computes test set loss over the given DataLoader instance.
Expand source code
def test_performance(self, data_loader, dump_file=None):
    """Computes test set loss over the given DataLoader instance."""
    loss = Loss()
    for batch in progress_bar(data_loader, unit='batch'):
        batch.device(DEVICE)
        out = self.forward(batch)
        loss.update(out['loss'], out['n_items'])
    return [
        Metric('LOSS', loss.get(), higher_better=False),
    ]