Source code for block_zoo.encoder_decoder.SLUDecoder

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

import copy
import numpy as np
from block_zoo.BaseLayer import BaseLayer, BaseConf
#from layers.EncoderDecoder import EncoderDecoderConf
from utils.DocInherit import DocInherit
from utils.corpus_utils import get_seq_mask

class SLUDecoderConf(BaseConf):
    """ Configuration of Spoken Language Understanding Decoder

    References:
        Liu, B., & Lane, I. (2016). Attention-based recurrent neural network models for joint intent detection
        and slot filling. Proceedings of the Annual Conference of the International Speech Communication
        Association, INTERSPEECH, (1), 685–689. https://doi.org/10.21437/Interspeech.2016-1352

    Args:
        hidden_dim (int): dimension of the decoder's hidden state
        dropout (float): dropout rate
        num_layers (int): number of decoder LSTM layers
        decoder_emb_dim (int): dimension of the decoder's embedding
        decoder_vocab_size (int): size of the decoder's output vocabulary, e.g. the number of slot tags
    """
    def __init__(self, **kwargs):
        super(SLUDecoderConf, self).__init__(**kwargs)
    @DocInherit
    def default(self):
        self.hidden_dim = 128
        self.dropout = 0.0
        self.num_layers = 1
        self.decoder_emb_dim = 100
        # decoder_vocab_size is the number of the decoder's outputs. E.g., for slot tagging it is the number of tags;
        # for machine translation it is the number of words in the target language.
        self.decoder_vocab_size = 10000
    # input_dims and input_context_dims should be inferred from the encoder
    @DocInherit
    def declare(self):
        self.num_of_inputs = 1
        self.input_ranks = [3]
    @DocInherit
    def inference(self):
        self.output_dim = copy.deepcopy(self.input_dims[0])
        self.output_dim[-1] = self.decoder_vocab_size

        super(SLUDecoderConf, self).inference()  # PUT THIS LINE AT THE END OF inference()
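    # Illustrative note (not part of the original source): inference() copies the shape of the first input
    # and replaces its last axis with decoder_vocab_size. For example, with the hypothetical values
    #     input_dims         = [[32, 50, 256]]   # [batch_size, seq_len, encoder_output_dim]
    #     decoder_vocab_size = 64
    # the resulting output_dim is [32, 50, 64], i.e. one score per slot tag at every sequence position.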
    @DocInherit
    def verify(self):
        super(SLUDecoderConf, self).verify()

        necessary_attrs_for_user = ['hidden_dim', 'dropout', 'num_layers', 'decoder_emb_dim', 'decoder_vocab_size']
        for attr in necessary_attrs_for_user:
            self.add_attr_exist_assertion_for_user(attr)

        necessary_attrs_for_dev = ['input_dims', 'input_context_dims']
        for attr in necessary_attrs_for_dev:
            self.add_attr_exist_assertion_for_dev(attr)
class SLUDecoder(BaseLayer):
    """ Spoken Language Understanding Decoder

    References:
        Liu, B., & Lane, I. (2016). Attention-based recurrent neural network models for joint intent detection
        and slot filling. Proceedings of the Annual Conference of the International Speech Communication
        Association, INTERSPEECH, (1), 685–689. https://doi.org/10.21437/Interspeech.2016-1352

    Args:
        layer_conf (SLUDecoderConf): configuration of a layer
    """
    def __init__(self, layer_conf):
        super(SLUDecoder, self).__init__(layer_conf)
        self.layer_conf = layer_conf

        self.embedding = nn.Embedding(layer_conf.decoder_vocab_size, layer_conf.decoder_emb_dim)
        self.embedding.weight.data.uniform_(-0.1, 0.1)    # init
        #nn.init.uniform(self.embedding.weight, -0.1, 0.1)
        #self.dropout = nn.Dropout(self.dropout_p)
        #self.lstm = nn.LSTM(layer_conf.decoder_emb_dim + layer_conf.hidden_dim * 2, layer_conf.hidden_dim, layer_conf.num_layers, batch_first=True)
        self.lstm = nn.LSTM(layer_conf.decoder_emb_dim + layer_conf.input_dims[0][-1] + layer_conf.input_context_dims[0][-1],
                            layer_conf.hidden_dim, layer_conf.num_layers, batch_first=True)    # CAUTION: single direction
        self.attn = nn.Linear(layer_conf.input_context_dims[0][-1], layer_conf.hidden_dim * layer_conf.num_layers)    # Attention
        self.slot_out = nn.Linear(layer_conf.input_context_dims[0][-1] + layer_conf.hidden_dim * 1 * layer_conf.num_layers,
                                  layer_conf.decoder_vocab_size)
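    # Illustrative note (not part of the original source): at every decoding step the LSTM consumes the
    # concatenation of three pieces, which is why its input size above is
    #     decoder_emb_dim + input_dims[0][-1] + input_context_dims[0][-1]
    # i.e. the embedding of the previously predicted tag, the encoder output aligned with the current
    # position, and the attention context vector.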
    def Attention(self, hidden, encoder_outputs, encoder_maskings):
        """ Compute the attention context over the encoder outputs

        Args:
            hidden: [num_layers, B, hidden_dim], the decoder's current hidden state (single direction)
            encoder_outputs: [B, T, D]
            encoder_maskings: [B, T] ByteTensor, 1 at padded positions

        Returns:
            context: [B, 1, D]
        """
        hidden = hidden.view(hidden.size()[1], -1).unsqueeze(2)

        batch_size = encoder_outputs.size(0)  # B
        max_len = encoder_outputs.size(1)  # T
        energies = self.attn(encoder_outputs.contiguous().view(batch_size * max_len, -1))  # B*T,D -> B*T,D
        energies = energies.view(batch_size, max_len, -1)  # B,T,D
        attn_energies = energies.bmm(hidden).transpose(1, 2)  # B,T,D * B,D,1 --> B,1,T
        attn_energies = attn_energies.squeeze(1).masked_fill(encoder_maskings, -1e12)  # PAD masking

        alpha = F.softmax(attn_energies, dim=-1)  # B,T
        alpha = alpha.unsqueeze(1)  # B,1,T
        context = alpha.bmm(encoder_outputs)  # B,1,T * B,T,D => B,1,D

        return context  # B,1,D
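    # Illustrative sketch (not part of the original source): with the decoder state h flattened to
    # [B, hidden_dim * num_layers, 1] and encoder outputs E of shape [B, T, D], the method above computes
    #     energies = attn(E)                          # [B, T, hidden_dim * num_layers]
    #     scores   = energies.bmm(h).squeeze(-1)      # [B, T], dot product with the decoder state
    #     scores   = scores.masked_fill(pad, -1e12)   # mask padded positions before the softmax
    #     alpha    = softmax(scores)                  # [B, T], attention weights
    #     context  = alpha.unsqueeze(1).bmm(E)        # [B, 1, D], weighted sum of encoder outputs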
    def forward(self, string, string_len, context, encoder_outputs):
        """ process inputs

        Args:
            string (Variable): word ids, [batch_size, seq_len]
            string_len (ndarray): [batch_size]
            context (Variable): [batch_size, 1, input_dim]
            encoder_outputs (Variable): [batch_size, max_seq_len, input_dim]

        Returns:
            Variable: decode scores with shape [batch_size, seq_len, decoder_vocab_size]

        """
        batch_size = string.size(0)

        if torch.cuda.device_count() > 1:
            # otherwise, it would raise an Exception because of inconsistent lengths across devices
            string_mask = torch.ByteTensor(1 - get_seq_mask(string_len, max_seq_len=string.shape[1]))    # [batch_size, max_seq_len]
        else:
            string_mask = torch.ByteTensor(1 - get_seq_mask(string_len))    # [batch_size, max_seq_len]
        decoded = torch.LongTensor([[1] * batch_size])    # NOTE: the id of "<start>" is 1! decoded is a batch of '<start>' at first
        hidden_init = torch.zeros(self.layer_conf.num_layers * 1, batch_size, self.layer_conf.hidden_dim)
        context_init = torch.zeros(self.layer_conf.num_layers * 1, batch_size, self.layer_conf.hidden_dim)
        if self.is_cuda():
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            string_mask = string_mask.to(device)
            decoded = decoded.to(device)
            hidden_init = hidden_init.to(device)
            context_init = context_init.to(device)
        decoded = decoded.transpose(1, 0)  # [batch_size, 1]

        embedded = self.embedding(decoded)
        hidden = (hidden_init, context_init)

        decode = []
        aligns = encoder_outputs.transpose(0, 1)  # [seq_len, batch_size, input_dim]
        length = encoder_outputs.size(1)
        for i in range(length):
            aligned = aligns[i].unsqueeze(1)  # [batch_size, 1, input_dim]
            self.lstm.flatten_parameters()
            _, hidden = self.lstm(torch.cat((embedded, context, aligned), 2), hidden)

            concated = torch.cat((hidden[0].view(1, batch_size, -1), context.transpose(0, 1)), 2)
            score = self.slot_out(concated.squeeze(0))
            softmaxed = F.log_softmax(score, dim=-1)  # [batch_size, decoder_vocab_size]
            decode.append(softmaxed)
            _, decoded = torch.max(softmaxed, 1)
            embedded = self.embedding(decoded.unsqueeze(1))
            context = self.Attention(hidden[0], encoder_outputs, string_mask)
        slot_scores = torch.cat(decode, 1)

        return slot_scores.view(batch_size, length, -1)
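# Illustrative usage sketch (not part of the original source; the conf setup below is an assumption,
# since in NeuronBlocks these attributes are normally filled in by the framework from the model config
# and the upstream encoder):
#
#     conf = SLUDecoderConf()                     # hypothetical: defaults plus manually injected shape info
#     conf.input_dims = [[-1, -1, 256]]           # [batch_size, seq_len, encoder_output_dim]
#     conf.input_context_dims = [[-1, 1, 256]]    # [batch_size, 1, encoder_output_dim]
#     decoder = SLUDecoder(conf)
#
#     string          = torch.randint(0, conf.decoder_vocab_size, (4, 20))  # [batch_size, seq_len] word ids
#     string_len      = np.array([20, 18, 15, 9])                           # true length of each sample
#     encoder_outputs = torch.randn(4, 20, 256)                             # encoder outputs
#     context         = torch.randn(4, 1, 256)                              # initial attention context
#     slot_scores     = decoder(string, string_len, context, encoder_outputs)
#     # slot_scores: [4, 20, conf.decoder_vocab_size] log-probabilities over slot tags per position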