# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import copy
import numpy as np
from block_zoo.BaseLayer import BaseLayer, BaseConf
#from layers.EncoderDecoder import EncoderDecoderConf
from utils.DocInherit import DocInherit
from utils.corpus_utils import get_seq_mask
class SLUDecoderConf(BaseConf):
    """ Configuration of Spoken Language Understanding Decoder
References:
Liu, B., & Lane, I. (2016). Attention-based recurrent neural network models for joint intent detection and slot filling. Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH, (1), 685–689. https://doi.org/10.21437/Interspeech.2016-1352
Args:
hidden_dim (int): dimension of hidden state
dropout (float): dropout rate
        num_layers (int): number of decoder LSTM layers (single direction)
        decoder_emb_dim (int): dimension of the decoder input embedding
        decoder_vocab_size (int): size of the decoder output vocabulary, e.g., the number of slot tags
"""
def __init__(self, **kwargs):
super(SLUDecoderConf, self).__init__(**kwargs)
    @DocInherit
def default(self):
self.hidden_dim = 128
self.dropout = 0.0
self.num_layers = 1
self.decoder_emb_dim = 100
        # number of the decoder's outputs. E.g., for slot tagging, decoder_vocab_size is the number of tags;
        # for machine translation, decoder_vocab_size is the number of words in the target language.
        self.decoder_vocab_size = 10000
        # input_dims and input_context_dims are inferred from the encoder
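        # An illustrative (hypothetical) "conf" block for this layer in a model config
        # file might look like the following; the keys mirror the attributes above:
        #   "conf": {
        #       "hidden_dim": 128,
        #       "dropout": 0.0,
        #       "num_layers": 1,
        #       "decoder_emb_dim": 100,
        #       "decoder_vocab_size": 10000
        #   }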
    @DocInherit
def declare(self):
self.num_of_inputs = 1
self.input_ranks = [3]
    @DocInherit
def inference(self):
self.output_dim = copy.deepcopy(self.input_dims[0])
self.output_dim[-1] = self.decoder_vocab_size
super(SLUDecoderConf, self).inference() # PUT THIS LINE AT THE END OF inference()
    @DocInherit
def verify(self):
super(SLUDecoderConf, self).verify()
necessary_attrs_for_user = ['hidden_dim', 'dropout', 'num_layers', 'decoder_emb_dim', 'decoder_vocab_size']
for attr in necessary_attrs_for_user:
self.add_attr_exist_assertion_for_user(attr)
necessary_attrs_for_dev = ['input_dims', 'input_context_dims']
for attr in necessary_attrs_for_dev:
self.add_attr_exist_assertion_for_dev(attr)
class SLUDecoder(BaseLayer):
    """ Spoken Language Understanding Decoder
References:
Liu, B., & Lane, I. (2016). Attention-based recurrent neural network models for joint intent detection and slot filling. Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH, (1), 685–689. https://doi.org/10.21437/Interspeech.2016-1352
Args:
        layer_conf (SLUDecoderConf): configuration of a layer
"""
def __init__(self, layer_conf):
super(SLUDecoder, self).__init__(layer_conf)
self.layer_conf = layer_conf
self.embedding = nn.Embedding(layer_conf.decoder_vocab_size, layer_conf.decoder_emb_dim)
self.embedding.weight.data.uniform_(-0.1, 0.1) # init
#nn.init.uniform(self.embedding.weight, -0.1, 0.1)
#self.dropout = nn.Dropout(self.dropout_p)
#self.lstm = nn.LSTM(layer_conf.decoder_emb_dim + layer_conf.hidden_dim * 2, layer_conf.hidden_dim, layer_conf.num_layers, batch_first=True)
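        # The LSTM input at each decoding step is the concatenation of the previous
        # prediction's embedding, the attention context vector, and the aligned encoder
        # output at the current position, which gives the input size below.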
self.lstm = nn.LSTM(layer_conf.decoder_emb_dim + layer_conf.input_dims[0][-1] + layer_conf.input_context_dims[0][-1],
layer_conf.hidden_dim, layer_conf.num_layers, batch_first=True) # CAUTION: single direction
        self.attn = nn.Linear(layer_conf.input_context_dims[0][-1], layer_conf.hidden_dim * layer_conf.num_layers)  # Attention
        self.slot_out = nn.Linear(layer_conf.input_context_dims[0][-1] + layer_conf.hidden_dim * 1 * layer_conf.num_layers, layer_conf.decoder_vocab_size)
    def Attention(self, hidden, encoder_outputs, encoder_maskings):
"""
Args:
hidden : 1,B,D
encoder_outputs : B,T,D
encoder_maskings : B,T # ByteTensor
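        Returns:
            Variable : attention context with shape [batch_size, 1, input_dim]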
"""
hidden = hidden.view(hidden.size()[1], -1).unsqueeze(2)
batch_size = encoder_outputs.size(0) # B
max_len = encoder_outputs.size(1) # T
energies = self.attn(encoder_outputs.contiguous().view(batch_size * max_len, -1)) # B*T,D -> B*T,D
energies = energies.view(batch_size, max_len, -1) # B,T,D
attn_energies = energies.bmm(hidden).transpose(1, 2) # B,T,D * B,D,1 --> B,1,T
attn_energies = attn_energies.squeeze(1).masked_fill(encoder_maskings, -1e12) # PAD masking
        alpha = F.softmax(attn_energies, dim=1) # B,T
alpha = alpha.unsqueeze(1) # B,1,T
context = alpha.bmm(encoder_outputs) # B,1,T * B,T,D => B,1,D
return context # B,1,D
    def forward(self, string, string_len, context, encoder_outputs):
""" process inputs
Args:
string (Variable): word ids, [batch_size, seq_len]
string_len (ndarray): [batch_size]
context (Variable): [batch_size, 1, input_dim]
encoder_outputs (Variable): [batch_size, max_seq_len, input_dim]
Returns:
Variable : decode scores with shape [batch_size, seq_len, decoder_vocab_size]
"""
batch_size = string.size(0)
if torch.cuda.device_count() > 1:
            # pass max_seq_len explicitly; otherwise the mask length may be inconsistent with the padded sequences across GPUs and an exception would be raised
string_mask = torch.ByteTensor(1 - get_seq_mask(string_len, max_seq_len=string.shape[1])) # [batch_size, max_seq_len]
else:
string_mask = torch.ByteTensor(1 - get_seq_mask(string_len)) # [batch_size, max_seq_len]
decoded = torch.LongTensor([[1] * batch_size])
hidden_init = torch.zeros(self.layer_conf.num_layers * 1, batch_size, self.layer_conf.hidden_dim)
context_init = torch.zeros(self.layer_conf.num_layers*1, batch_size, self.layer_conf.hidden_dim)
if self.is_cuda():
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
string_mask = string_mask.to(device)
# Note id of "<start>" is 1! decoded is a batch of '<start>' at first
decoded = decoded.to(device)
hidden_init = hidden_init.to(device)
context_init = context_init.to(device)
decoded = decoded.transpose(1, 0) # [batch_size, 1]
embedded = self.embedding(decoded)
hidden = (hidden_init, context_init)
decode = []
aligns = encoder_outputs.transpose(0, 1) #[seq_len, bs, input_dim]
length = encoder_outputs.size(1)
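        # Decode one output per encoder time step: feed the embedding of the previous prediction,
        # the current attention context, and the aligned encoder output into the LSTM,
        # predict a tag, then recompute the attention context from the new hidden state.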
for i in range(length):
aligned = aligns[i].unsqueeze(1) # [bs, 1, input_dim]
self.lstm.flatten_parameters()
_, hidden = self.lstm(torch.cat((embedded, context, aligned), 2), hidden)
concated = torch.cat((hidden[0].view(1, batch_size, -1), context.transpose(0, 1)), 2)
score = self.slot_out(concated.squeeze(0))
            softmaxed = F.log_softmax(score, dim=1) # [batch_size, decoder_vocab_size]
decode.append(softmaxed)
_, decoded = torch.max(softmaxed, 1)
embedded = self.embedding(decoded.unsqueeze(1))
context = self.Attention(hidden[0], encoder_outputs, string_mask)
slot_scores = torch.cat(decode, 1)
return slot_scores.view(batch_size, length, -1)
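

if __name__ == "__main__":
    # Minimal standalone sketch (illustrative only, not part of the library API) of the
    # attention arithmetic used in SLUDecoder.Attention, with made-up sizes:
    # B = batch size, T = encoder sequence length, D = encoder output dim,
    # H = hidden_dim * num_layers.
    B, T, D, H = 2, 5, 6, 4
    attn = nn.Linear(D, H)                               # projects encoder outputs for scoring
    hidden = torch.randn(1, B, H)                        # decoder hidden state: 1,B,H
    encoder_outputs = torch.randn(B, T, D)               # encoder outputs: B,T,D
    mask = torch.zeros(B, T, dtype=torch.bool)           # True marks PAD positions
    mask[:, -1] = True                                   # pretend the last step is padding
    energies = attn(encoder_outputs.view(B * T, -1)).view(B, T, -1)  # B,T,H
    scores = energies.bmm(hidden.view(B, H, 1)).transpose(1, 2)      # B,1,T
    scores = scores.squeeze(1).masked_fill(mask, -1e12)              # mask out PAD positions
    alpha = F.softmax(scores, dim=1).unsqueeze(1)                    # attention weights: B,1,T
    context = alpha.bmm(encoder_outputs)                             # context vector: B,1,D
    print(context.shape)                                             # torch.Size([2, 1, 6])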