Module pysimt.utils.nn
Utility classes for neural network related operations.
"""Utility classes for neural network related operations."""
from typing import List, Optional
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
class LabelSmoothingLoss(nn.Module):

    def __init__(self, trg_vocab_size, label_smoothing=0.1, reduction='mean',
                 with_logits=True, ignore_index=0):
        """
        Creates a Label Smoothing Loss.
        Based on: https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/utils/loss.py#L194

        :param trg_vocab_size: The target vocabulary size.
        :param label_smoothing: The label smoothing value. Default: 0.1.
        :param reduction: The loss reduction. Default: 'mean'.
        :param with_logits: Whether the predictions are logits. Default: True.
        :param ignore_index: The value to be ignored by the loss. Can be used to ignore padding tokens. Default: 0.
        """
        super(LabelSmoothingLoss, self).__init__()
        self.with_logits = with_logits
        self.ignore_index = ignore_index
        self.kl_divergence = nn.KLDivLoss(reduction=reduction)
        self._create_one_hot(label_smoothing, trg_vocab_size)
        self.confidence = 1.0 - label_smoothing

    def forward(self, predictions, target):
        """
        Computes the loss.

        :param predictions: The predictions of shape (N, C) where C is the number of classes.
            If with_logits is True, a log_softmax will be applied to obtain valid probabilities.
        :param target: The target values of shape (N).
        :return: The computed loss.
        """
        if self.with_logits is True:
            predictions = F.log_softmax(predictions, dim=-1)
        model_prob = self.one_hot.repeat(target.size(0), 1)
        model_prob.scatter_(1, target.unsqueeze(1), self.confidence)
        self._apply_mask(model_prob, target)
        return self.kl_divergence(predictions, model_prob)

    def _create_one_hot(self, label_smoothing, trg_vocab_size):
        smoothing_value = label_smoothing / (trg_vocab_size - 2)
        one_hot = torch.full((trg_vocab_size,), smoothing_value)
        one_hot[self.ignore_index] = 0
        self.register_buffer('one_hot', one_hot.unsqueeze(0))

    def _apply_mask(self, model_prob, target):
        mask = (target == self.ignore_index).unsqueeze(1)
        model_prob.masked_fill_(mask, 0)
def get_activation_fn(name: Optional[str]):
    """Returns a callable activation function from `torch`."""
    if name in (None, 'linear'):
        return lambda x: x
    elif name in ('sigmoid', 'tanh'):
        return getattr(torch, name)
    else:
        return getattr(F, name)
def generate_default_mask(data, dim1=None):
    """
    Returns a default mask which allows the model to attend over all positions.

    :param data: The data of shape (sequence_len, batch_size).
    :param dim1: The first dimension of the mask. If None, it is equal to sequence_len.
    :return: The all-False mask of shape (batch_size, dim1, sequence_len).
    """
    batch_size = data.size(1)
    sequence_len = data.size(0)
    if dim1 is None:
        dim1 = sequence_len
    return torch.zeros(batch_size, dim1, sequence_len).bool().to(data.device)
def generate_visual_features_padding_masks(data, pad_value=0):
    """
    Returns a mask based on the data. Positions whose whole feature vector equals the padding value will contain 1,
    indicating the model cannot attend over these positions.

    :param data: The data of shape (sequence_len, batch_size, feature_dim).
    :param pad_value: The value of the padding. Default: 0.
    :return: The respective mask of shape (batch_size, 1, sequence_len).
    """
    with torch.no_grad():
        return (data == pad_value).all(dim=-1).t().to(data.device).unsqueeze(1)
def generate_padding_masks(data, pad_value=0):
    """
    Returns a mask based on the data. Positions equal to the padding value will contain 1, indicating the model
    cannot attend over these positions.

    :param data: The data of shape (sequence_len, batch_size).
    :param pad_value: The value of the padding. Default: 0.
    :return: The respective mask of shape (batch_size, 1, sequence_len).
    """
    with torch.no_grad():
        mask = (data == pad_value).to(data.device).t().unsqueeze(1)
        return mask
def generate_lookahead_mask(data, k=1, dim1=None):
    """
    Generates a lookahead mask, preventing the decoder from attending to future positions when computing the
    attention. The mask will contain 1 for positions which should not be attended to.

    :param data: The data of shape (sequence_len, batch_size).
    :param k: The offset for the lookahead mask. By default it's 1, so that in the decoder self-attention each
        position can attend only to itself and all previous positions.
    :param dim1: The first dimension of the mask. If None, it is equal to sequence_len.
    :return: The lookahead mask of shape (1, dim1, sequence_len).
    """
    sequence_len = data.size(0)
    if dim1 is None:
        dim1 = sequence_len
    lookahead_mask = torch.triu(torch.ones((1, dim1, sequence_len)), diagonal=k)
    return lookahead_mask.to(data.device).bool()
def generate_combined_mask(data, k=1):
    """
    Generates a combined padding and lookahead mask.
    The mask will contain 1 for positions which should not be attended to.

    :param data: The data of shape (sequence_len, batch_size).
    :param k: The offset for the lookahead mask. By default it's 1, allowing the decoder to observe the <bos> token.
    :return: Combined padding and lookahead mask.
    """
    padding_mask = generate_padding_masks(data)
    lookahead_mask = generate_lookahead_mask(data, k)
    combined_mask = padding_mask | lookahead_mask
    return combined_mask
def readable_size(n: int) -> str:
    """Return a readable size string."""
    sizes = ['K', 'M', 'G']
    fmt = ''
    size = n
    for i, s in enumerate(sizes):
        nn = n / (1000 ** (i + 1))
        if nn >= 1:
            size = nn
            fmt = s
        else:
            break
    return '%.2f%s' % (size, fmt)
def get_module_groups(layer_names: List[str]) -> List[str]:
    """Returns the sorted list of module-name prefixes collected from layer names ending in '.weight' or '.bias'."""
    groups = set()
    for name in layer_names:
        if '.weight' in name:
            groups.add(name.split('.weight')[0])
        elif '.bias' in name:
            groups.add(name.split('.bias')[0])
    return sorted(list(groups))
def get_n_params(module):
    """Returns a formatted string with the total and learnable parameter counts of a module."""
    n_param_learnable = 0
    n_param_frozen = 0
    for param in module.parameters():
        if param.requires_grad:
            n_param_learnable += np.cumprod(param.data.size())[-1]
        else:
            n_param_frozen += np.cumprod(param.data.size())[-1]
    n_param_all = n_param_learnable + n_param_frozen
    return "# parameters: {} ({} learnable)".format(
        readable_size(n_param_all), readable_size(n_param_learnable))
Functions
def generate_combined_mask(data, k=1)
-
Generates a combined padding and lookahead mask. The mask will contain 1 for positions which should not be attended to.

:param data: The data of shape (sequence_len, batch_size).
:param k: The offset for the lookahead mask. By default it's 1, allowing the decoder to observe the <bos> token.
:return: Combined padding and lookahead mask.
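A minimal usage sketch (the token ids below are illustrative, and 0 is assumed to be the padding index):

import torch
from pysimt.utils.nn import generate_combined_mask

# Token ids of shape (sequence_len=4, batch_size=2); 0 marks padding.
data = torch.tensor([[2, 2],
                     [5, 7],
                     [6, 0],
                     [0, 0]])

mask = generate_combined_mask(data)  # default k=1
print(mask.shape)  # torch.Size([2, 4, 4])
# True entries mark positions that must not be attended to:
# future positions (lookahead) or padded positions.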
def generate_default_mask(data, dim1=None)
-
Returns a default mask which allows the model to attend over all positions.

:param data: The data of shape (sequence_len, batch_size).
:param dim1: The first dimension of the mask. If None, it is equal to sequence_len.
:return: The all-False mask of shape (batch_size, dim1, sequence_len).
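A minimal sketch, using dummy data just to supply the shape and device:

import torch
from pysimt.utils.nn import generate_default_mask

data = torch.zeros(5, 3, dtype=torch.long)  # (sequence_len=5, batch_size=3)
mask = generate_default_mask(data)
print(mask.shape)         # torch.Size([3, 5, 5])
print(mask.any().item())  # False -> no position is masked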
def generate_lookahead_mask(data, k=1, dim1=None)
-
Generates a lookahead mask, preventing the decoder from attending to future positions when computing the attention. The mask will contain 1 for positions which should not be attended to.

:param data: The data of shape (sequence_len, batch_size).
:param k: The offset for the lookahead mask. By default it's 1, so that in the decoder self-attention each position can attend only to itself and all previous positions.
:param dim1: The first dimension of the mask. If None, it is equal to sequence_len.
:return: The lookahead mask of shape (1, dim1, sequence_len).
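A minimal sketch showing the upper-triangular structure of the mask; only the sequence length (first dimension) and device of the data are used:

import torch
from pysimt.utils.nn import generate_lookahead_mask

data = torch.zeros(4, 2, dtype=torch.long)  # (sequence_len=4, batch_size=2)
mask = generate_lookahead_mask(data)        # default k=1
print(mask[0].int())
# tensor([[0, 1, 1, 1],
#         [0, 0, 1, 1],
#         [0, 0, 0, 1],
#         [0, 0, 0, 0]], dtype=torch.int32)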
def generate_padding_masks(data, pad_value=0)
-
Returns a mask based on the data. Positions equal to the padding value will contain 1, indicating the model cannot attend over these positions.

:param data: The data of shape (sequence_len, batch_size).
:param pad_value: The value of the padding. Default: 0.
:return: The respective mask of shape (batch_size, 1, sequence_len).
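A minimal sketch, assuming 0 is the padding index:

import torch
from pysimt.utils.nn import generate_padding_masks

data = torch.tensor([[2, 2],
                     [5, 0],
                     [0, 0]])  # (sequence_len=3, batch_size=2)
mask = generate_padding_masks(data)
print(mask.shape)  # torch.Size([2, 1, 3])
print(mask[1, 0])  # tensor([False,  True,  True])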
def generate_visual_features_padding_masks(data, pad_value=0)
-
Returns a mask based on the data. Positions whose whole feature vector equals the padding value will contain 1, indicating the model cannot attend over these positions.

:param data: The data of shape (sequence_len, batch_size, feature_dim).
:param pad_value: The value of the padding. Default: 0.
:return: The respective mask of shape (batch_size, 1, sequence_len).
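A minimal sketch with random features; a position counts as padding only when its whole feature vector equals pad_value:

import torch
from pysimt.utils.nn import generate_visual_features_padding_masks

feats = torch.rand(6, 2, 8)  # (sequence_len=6, batch_size=2, feature_dim=8)
feats[4:, 1] = 0.0           # zero out the last two timesteps of the second sample
mask = generate_visual_features_padding_masks(feats)
print(mask.shape)  # torch.Size([2, 1, 6])
print(mask[1, 0])  # tensor([False, False, False, False,  True,  True])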
def get_activation_fn(name: Union[str, NoneType])
-
Returns a callable activation function from torch.
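A minimal sketch; any name other than None, 'linear', 'sigmoid' or 'tanh' is looked up in torch.nn.functional (e.g. 'relu'):

import torch
from pysimt.utils.nn import get_activation_fn

x = torch.tensor([-1.0, 0.0, 2.0])
print(get_activation_fn(None)(x))    # identity: tensor([-1.,  0.,  2.])
print(get_activation_fn('tanh')(x))  # torch.tanh
print(get_activation_fn('relu')(x))  # F.relu: tensor([0., 0., 2.])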
def get_module_groups(layer_names: List[str]) ‑> List[str]
-
Returns the sorted list of module-name prefixes collected from layer names ending in '.weight' or '.bias'.
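A minimal sketch; the layer names are hypothetical, e.g. taken from model.state_dict().keys():

from pysimt.utils.nn import get_module_groups

names = ['enc.0.weight', 'enc.0.bias', 'dec.ff.weight', 'dec.ff.bias', 'emb.weight']
print(get_module_groups(names))
# ['dec.ff', 'emb', 'enc.0']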
def get_n_params(module)
-
Returns a formatted string reporting the total number of parameters of a module and how many of them are learnable.
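A minimal sketch with a small torch module:

import torch.nn as nn
from pysimt.utils.nn import get_n_params

layer = nn.Linear(10, 5)    # 10*5 weights + 5 biases = 55 parameters
print(get_n_params(layer))  # "# parameters: 55.00 (55.00 learnable)"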
def readable_size(n: int) ‑> str
-
Return a readable size string.
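A minimal sketch:

from pysimt.utils.nn import readable_size

print(readable_size(512))         # 512.00
print(readable_size(48_500_000))  # 48.50M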
Classes
class LabelSmoothingLoss (trg_vocab_size, label_smoothing=0.1, reduction='mean', with_logits=True, ignore_index=0)
-
Creates a Label Smoothing Loss.
Based on: https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/utils/loss.py#L194

:param trg_vocab_size: The target vocabulary size.
:param label_smoothing: The label smoothing value. Default: 0.1.
:param reduction: The loss reduction. Default: 'mean'.
:param with_logits: Whether the predictions are logits. Default: True.
:param ignore_index: The value to be ignored by the loss. Can be used to ignore padding tokens. Default: 0.
Ancestors
- torch.nn.modules.module.Module
Class variables
var dump_patches : bool
var training : bool
Methods
def forward(self, predictions, target)
-
Computes the loss.

:param predictions: The predictions of shape (N, C) where C is the number of classes. If with_logits is True, a log_softmax will be applied to obtain valid probabilities.
:param target: The target values of shape (N).
:return: The computed loss.
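A minimal end-to-end usage sketch for LabelSmoothingLoss; the vocabulary size and tensors are illustrative:

import torch
from pysimt.utils.nn import LabelSmoothingLoss

criterion = LabelSmoothingLoss(trg_vocab_size=100, label_smoothing=0.1, ignore_index=0)

logits = torch.randn(8, 100)           # (N, C): unnormalized scores, since with_logits=True
targets = torch.randint(1, 100, (8,))  # (N,): gold indices; index 0 would be ignored as padding
loss = criterion(logits, targets)
print(loss.item())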