Module pysimt.layers.encoders.speech_lstm
Expand source code
from typing import Optional
from torch import nn
from torch.nn import functional as F
from ..ff import FF
class SpeechLSTM(nn.Module):
"""A bidirectional LSTM encoder with subsampling for speech features.
The number of LSTM layers is defined by the `layers` argument, i.e.
`1_1_2_2_1_1` denotes 6 LSTM layers where the middle two applies
a subsampling factor of 2 to their inputs. Subsampling in this context
means that every N'th state will be passed to the next layer as input.
Each LSTM layer is followed by a feed-forward projection layer whose
non-linearity is given by the `activ` argument.
Note:
The input tensor should contain samples of equal lengths i.e.
`bucket_by` in training configuration should be set to the acoustic
features modality.
Args:
input_size: Input feature dimensionality.
hidden_size: LSTM hidden state dimensionality.
proj_size: Projection layer size.
activ: Non-linearity to apply to intermediate projection
layers. (Default: 'tanh')
layers: A '_' separated list of integers that defines the subsampling
factor for each LSTM.
dropout: Use dropout (Default: 0.)
Input:
x: A `torch.Tensor` of shape `(n_timesteps, n_samples, input_size)`
Output:
hs: A `torch.Tensor` of shape `(n_timesteps, n_samples, hidden_size * 2)`
that contains encoder hidden states for all timesteps.
mask: `None` since this layer expects all equal frame inputs.
"""
    def __init__(self, input_size: int, hidden_size: int, proj_size: int,
                 layers: str, activ: Optional[str] = 'tanh',
                 dropout: float = 0.0):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.proj_size = proj_size
        self.activ = activ
        self.layers = [int(i) for i in layers.split('_')]
        self.dropout = dropout
        self.n_layers = len(self.layers)

        # Doubles its size because of concatenation of forw-backw encs
        self.ctx_size = self.hidden_size * 2

        # Fill 0-vector as <eos> to the end of the frames
        self.pad_tuple = (0, 0, 0, 0, 0, 1)

        # Projections and LSTMs
        self.ffs = nn.ModuleList()
        self.lstms = nn.ModuleList()

        if self.dropout > 0:
            self.do = nn.Dropout(self.dropout)

        for i, ss_factor in enumerate(self.layers):
            # Add LSTMs
            self.lstms.append(nn.LSTM(
                self.input_size if i == 0 else self.hidden_size,
                self.hidden_size, bidirectional=True))
            # Add non-linear bottlenecks
            self.ffs.append(FF(
                self.ctx_size, self.proj_size, activ=self.activ))

    def forward(self, x, **kwargs):
        # Generate a mask to detect padded sequences
        mask = x.ne(0).float().sum(2).ne(0).float()

        if mask.eq(0).nonzero().numel() > 0:
            raise RuntimeError("Non-homogeneous batch detected in SpeechLSTM layer.")

        # Pad with <eos> zero
        hs = F.pad(x, self.pad_tuple)

        for (ss_factor, f_lstm, f_ff) in zip(self.layers, self.lstms, self.ffs):
            if ss_factor > 1:
                # Skip states
                hs = f_ff(f_lstm(hs[::ss_factor])[0])
            else:
                hs = f_ff(f_lstm(hs)[0])

            if self.dropout > 0:
                hs = self.do(hs)

        # No mask is returned as batch should contain same-length sequences
        return hs, None
Classes
class SpeechLSTM (input_size: int, hidden_size: int, proj_size: int, layers: str, activ: Optional[str] = 'tanh', dropout: float = 0.0)
-
A bidirectional LSTM encoder with subsampling for speech features.
The number of LSTM layers is defined by the `layers` argument, e.g. `1_1_2_2_1_1` denotes 6 LSTM layers where the middle two apply a subsampling factor of 2 to their inputs. Subsampling in this context means that every N'th state is passed to the next layer as input.
Each LSTM layer is followed by a feed-forward projection layer whose non-linearity is given by the `activ` argument.
Note
The input tensor should contain samples of equal length, i.e. `bucket_by` in the training configuration should be set to the acoustic features modality.
Args
input_size
- Input feature dimensionality.
hidden_size
- LSTM hidden state dimensionality.
proj_size
- Projection layer size.
activ
- Non-linearity to apply to the intermediate projection layers. (Default: 'tanh')
layers
- A '_'-separated list of integers that defines the subsampling factor for each LSTM.
dropout
- Dropout rate. (Default: 0.)
Input
x: A `torch.Tensor` of shape `(n_timesteps, n_samples, input_size)`
Output
hs: A `torch.Tensor` of shape `(n_timesteps', n_samples, proj_size)` that contains the encoder hidden states for all (possibly subsampled) timesteps.
mask: `None`, since this layer expects all samples to have the same number of frames.
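Below is a minimal usage sketch (not taken from the pysimt documentation). The feature dimensionality, batch size and layer spec are illustrative assumptions; `proj_size` is set equal to `hidden_size` so that each projected output matches the input size expected by the next LSTM in the stack.

import torch
from pysimt.layers.encoders.speech_lstm import SpeechLSTM

# Hypothetical setup: 43-dim acoustic features, the 6-layer spec from the docstring.
# proj_size == hidden_size so that each projection feeds the next LSTM cleanly.
enc = SpeechLSTM(input_size=43, hidden_size=256, proj_size=256,
                 layers='1_1_2_2_1_1', activ='tanh', dropout=0.3)

# Dummy batch of 8 equal-length utterances with 50 frames each
x = torch.rand(50, 8, 43)

hs, mask = enc(x)
# One <eos> frame is appended (50 -> 51), then the two factor-2 layers
# halve the length twice: ceil(51 / 2) = 26, ceil(26 / 2) = 13.
print(hs.shape)   # torch.Size([13, 8, 256])
print(mask)       # None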
Ancestors
- torch.nn.modules.module.Module
Class variables
var dump_patches : bool
var training : bool
Methods
def forward(self, x, **kwargs)
-
Defines the computation performed at every call.
Should be overridden by all subclasses.
Note
Although the recipe for the forward pass needs to be defined within this function, one should call the `Module` instance afterwards instead of this, since the former takes care of running the registered hooks while the latter silently ignores them.
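As a sketch of the timestep bookkeeping in `forward`: one zero frame is appended by `F.pad`, and every subsampling layer keeps each `ss_factor`'th state via `hs[::ss_factor]`. The illustrative helper below (not part of pysimt) predicts the resulting output length under these assumptions.

import math

def output_timesteps(n_frames: int, layers: str = '1_1_2_2_1_1') -> int:
    # Mirrors SpeechLSTM.forward: one appended <eos> frame, then hs[::k]
    # per subsampling layer, where len(hs[::k]) == ceil(len(hs) / k).
    t = n_frames + 1
    for ss_factor in (int(i) for i in layers.split('_')):
        if ss_factor > 1:
            t = math.ceil(t / ss_factor)
    return t

print(output_timesteps(50))   # 13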