Module pysimt.layers.encoders.recurrent
Source code
from torch import nn
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

from .. import FF


class RecurrentEncoder(nn.Module):
    """A recurrent encoder with embedding layer.

    Arguments:
        input_size (int): Embedding dimensionality.
        hidden_size (int): RNN hidden state dimensionality.
        n_vocab (int): Number of tokens for the embedding layer.
        rnn_type (str): RNN type, i.e. GRU or LSTM.
        num_layers (int, optional): Number of stacked RNNs (Default: 1).
        bidirectional (bool, optional): If `False`, the RNN is unidirectional.
        dropout_rnn (float, optional): Inter-layer dropout rate, only
            applicable if `num_layers > 1` (Default: 0.).
        dropout_emb (float, optional): Dropout rate for embeddings (Default: 0.).
        dropout_ctx (float, optional): Dropout rate for the
            encodings/annotations (Default: 0.).
        emb_maxnorm (float, optional): If given, renormalizes embeddings so
            that their norm is the given value.
        emb_gradscale (bool, optional): If `True`, scales the gradients
            per embedding w.r.t. its frequency in the batch.
        proj_dim (int, optional): If not `None`, adds a final projection
            layer. Can be used to adapt dimensionality for the decoder.
        proj_activ (str, optional): Non-linearity for the projection layer.
            `None` or `linear` does not apply any non-linearity.
        layer_norm (bool, optional): If `True`, applies layer normalization
            at the output of the encoder.

    Input:
        x (Tensor): A tensor of shape (n_timesteps, n_samples)
            containing the integer token indices for the given batch.

    Output:
        hs (Tensor): A tensor of shape (n_timesteps, n_samples, hidden)
            that contains encoder hidden states for all timesteps. If
            bidirectional, `hs` is doubled in size in the last dimension
            to contain both directional states.
        mask (Tensor): A binary mask of shape (n_timesteps, n_samples)
            that may further be used in attention and/or decoder. `None`
            is returned if the batch contains only sentences of equal length.
    """

    def __init__(self, input_size, hidden_size, n_vocab, rnn_type,
                 num_layers=1, bidirectional=True,
                 dropout_rnn=0, dropout_emb=0, dropout_ctx=0,
                 emb_maxnorm=None, emb_gradscale=False,
                 proj_dim=None, proj_activ=None, layer_norm=False):
        super().__init__()
        self.rnn_type = rnn_type.upper()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_vocab = n_vocab
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.emb_maxnorm = emb_maxnorm
        self.emb_gradscale = emb_gradscale
        self.proj_dim = proj_dim
        self.proj_activ = proj_activ
        self.layer_norm = layer_norm

        # Dropout between stacked RNN layers, only effective if num_layers > 1
        self.dropout_rnn = dropout_rnn

        # Our other custom dropouts, applied after embeddings and annotations
        self.dropout_emb = dropout_emb
        self.dropout_ctx = dropout_ctx

        self.tile_factor = self.num_layers
        self.ctx_size = self.hidden_size
        # Doubled because forward and backward states are concatenated
        if self.bidirectional:
            self.ctx_size *= 2
            self.tile_factor *= 2

        # Embedding dropout
        self.do_emb = nn.Dropout(self.dropout_emb)

        # Create embedding layer
        self.emb = nn.Embedding(self.n_vocab, self.input_size,
                                padding_idx=0, max_norm=self.emb_maxnorm,
                                scale_grad_by_freq=self.emb_gradscale)

        # Create fused/cuDNN encoder according to the requested type
        RNN = getattr(nn, self.rnn_type)
        self.enc = RNN(self.input_size, self.hidden_size,
                       self.num_layers, bias=True, batch_first=False,
                       dropout=self.dropout_rnn,
                       bidirectional=self.bidirectional)

        output_layers = []
        if self.proj_dim:
            output_layers.append(
                FF(self.ctx_size, self.proj_dim, activ=self.proj_activ))
            self.ctx_size = self.proj_dim

        if self.layer_norm:
            output_layers.append(nn.LayerNorm(self.ctx_size))

        if self.dropout_ctx > 0:
            output_layers.append(nn.Dropout(p=self.dropout_ctx))

        self.output = nn.Sequential(*output_layers)

        # Variables for caching
        self._states, self._mask = None, None

    def embed(self, x):
        """Embeds `x` and returns the representations and the mask."""
        self._mask = x.ne(0).long()
        # src_embs: (seq_len, batch_size, emb_dim)
        return self.emb(x), self._mask

    def forward(self, x, **kwargs):
        if not isinstance(x, tuple):
            # Received the usual integer token indices; embed them
            embs, _ = self.embed(x)
        else:
            # Received pre-computed embeddings
            embs = x[0]

        # Pack the tensor so that the RNN correctly computes the hidden
        # states by ignoring padded positions
        packed_inputs = pack_padded_sequence(
            self.do_emb(embs), lengths=self._mask.sum(0).long().cpu(),
            enforce_sorted=False)

        # Get the initial state, if given
        hx = kwargs.get('hx', None)
        if hx is not None:
            hx = hx.expand(self.tile_factor, -1, -1).contiguous()

        # Encode -> unpack to obtain an ordinary tensor of hidden states;
        # padded positions will now have explicit 0's in their hidden states
        # all_hids: (seq_len, batch_size, hidden_size * n_directions)
        # hx: (num_layers * num_directions, batch, hidden_size)
        all_hids = pad_packed_sequence(self.enc(packed_inputs, hx=hx)[0])[0]

        # Cache states and return
        self._states = self.output(all_hids)
        return self._states, self._mask

    def get_states(self, up_to=int(1e6)):
        """Reveals partial source information through the `up_to` argument.
        Useful for simultaneous NMT encodings."""
        assert self._states is not None, "Call encoder first to cache states!"
        return self._states[:up_to], self._mask[:up_to]
Classes
class RecurrentEncoder (input_size, hidden_size, n_vocab, rnn_type, num_layers=1, bidirectional=True, dropout_rnn=0, dropout_emb=0, dropout_ctx=0, emb_maxnorm=None, emb_gradscale=False, proj_dim=None, proj_activ=None, layer_norm=False)
A recurrent encoder with embedding layer.

Arguments
    input_size (int): Embedding dimensionality.
    hidden_size (int): RNN hidden state dimensionality.
    n_vocab (int): Number of tokens for the embedding layer.
    rnn_type (str): RNN type, i.e. GRU or LSTM.
    num_layers (int, optional): Number of stacked RNNs (Default: 1).
    bidirectional (bool, optional): If `False`, the RNN is unidirectional.
    dropout_rnn (float, optional): Inter-layer dropout rate, only applicable if `num_layers > 1` (Default: 0.).
    dropout_emb (float, optional): Dropout rate for embeddings (Default: 0.).
    dropout_ctx (float, optional): Dropout rate for the encodings/annotations (Default: 0.).
    emb_maxnorm (float, optional): If given, renormalizes embeddings so that their norm is the given value.
    emb_gradscale (bool, optional): If `True`, scales the gradients per embedding w.r.t. its frequency in the batch.
    proj_dim (int, optional): If not `None`, adds a final projection layer. Can be used to adapt dimensionality for the decoder.
    proj_activ (str, optional): Non-linearity for the projection layer. `None` or `linear` does not apply any non-linearity.
    layer_norm (bool, optional): If `True`, applies layer normalization at the output of the encoder.

Input
    x (Tensor): A tensor of shape (n_timesteps, n_samples) containing the integer token indices for the given batch.

Output
    hs (Tensor): A tensor of shape (n_timesteps, n_samples, hidden) that contains encoder hidden states for all timesteps. If bidirectional, `hs` is doubled in size in the last dimension to contain both directional states.
    mask (Tensor): A binary mask of shape (n_timesteps, n_samples) that may further be used in attention and/or decoder. `None` is returned if the batch contains only sentences of equal length.

Initializes internal Module state, shared by both nn.Module and ScriptModule.
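A minimal usage sketch (not part of the library source): the hyperparameters, vocabulary size, and sentence lengths below are illustrative assumptions; only the import path and the call signature follow the module shown above.

    import torch
    from pysimt.layers.encoders.recurrent import RecurrentEncoder

    # Hypothetical toy setup: vocabulary of 100 tokens, a batch of two
    # sentences of lengths 5 and 3, padded with index 0 (the padding_idx).
    enc = RecurrentEncoder(input_size=32, hidden_size=64, n_vocab=100,
                           rnn_type='gru', bidirectional=True)

    x = torch.zeros(5, 2, dtype=torch.long)   # (n_timesteps, n_samples)
    x[:, 0] = torch.randint(1, 100, (5,))     # full-length sentence
    x[:3, 1] = torch.randint(1, 100, (3,))    # shorter, 0-padded sentence

    hs, mask = enc(x)
    print(hs.shape)    # torch.Size([5, 2, 128]): 2 * hidden_size (bidirectional)
    print(mask.shape)  # torch.Size([5, 2]): 1 for real tokens, 0 for padding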
Ancestors
- torch.nn.modules.module.Module
Class variables
var dump_patches : bool
var training : bool
Methods
def embed(self, x)
Embeds `x` and returns the representations and the mask.
def forward(self, x, **kwargs) ‑> Callable[..., Any]
Defines the computation performed at every call.

Should be overridden by all subclasses.

Note
Although the recipe for the forward pass needs to be defined within this function, one should call the `Module` instance afterwards instead of this, since the former takes care of running the registered hooks while the latter silently ignores them.
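Per the source above, `forward` also accepts an optional initial state via the `hx` keyword, which is expanded to `num_layers * num_directions` slices before being handed to the RNN. A hedged sketch continuing the earlier toy example; this assumes the GRU case, where the state is a single tensor of shape (1, batch, hidden_size).

    # Hypothetical continuation of the earlier example (GRU, hidden_size=64):
    # forward() expands this (1, batch, hidden_size) state to
    # (num_layers * num_directions, batch, hidden_size) internally.
    hx = torch.zeros(1, 2, 64)
    hs, mask = enc(x, hx=hx)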
def get_states(self, up_to=1000000)
Reveals partial source information through the `up_to` argument. Useful for simultaneous NMT encodings.
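A brief sketch of how `get_states` can serve a simultaneous decoding loop; the loop and variable names below continue the earlier hypothetical example and are illustrative, not pysimt's actual decoding agents.

    # Encode the full source once; forward() caches the states and the mask.
    enc(x)

    # Reveal the source prefix by prefix, as a simultaneous decoder would.
    for t in range(1, x.size(0) + 1):
        states, mask = enc.get_states(up_to=t)
        # states: (t, n_samples, ctx_size), mask: (t, n_samples)
        print(t, states.shape, mask.shape)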