# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import autograd
from block_zoo.BaseLayer import BaseLayer, BaseConf
import numpy as np
from utils.DocInherit import DocInherit
from block_zoo.embedding import *
import copy
import logging


class EmbeddingConf(BaseConf):
    """ Configuration of the Embedding layer

    Args:
        conf (dict): a dictionary whose keys are embedding types, such as word embedding, char embedding, part-of-speech (postag) embedding and so on.

    Example::

        "conf": {
            "word": {
                "cols": ["question_text", "answer_text"],
                "dim": 300,
                "fix_weight": true
            },
            "postag": {
                "cols": ["question_postag", "answer_postag"],
                "dim": 20
            },
            "char": {
                "cols": ["question_char", "answer_char"],
                "type": "CNNCharEmbedding",
                "dropout": 0.2,
                "dim": 30,
                "embedding_matrix_dim": 8,
                "stride": 1,
                "window_size": 5,
                "activation": null
            }
        }
    """
    def __init__(self, **kwargs):
        super(EmbeddingConf, self).__init__(**kwargs)

    @DocInherit
    def default(self):
        self.conf = {
            'word': {
                'vocab_size': 1000,
                'dim': 300,
                'init_weights': np.random.randn(1000, 300)  # you can provide an initial weight matrix like this, or assign None
            }
        }
    @DocInherit
    def declare(self):
        self.num_of_inputs = 1
        self.input_ranks = [2]      # [batch size, sequence length]

    @DocInherit
    def inference(self):
        self.output_dim = [-1, -1, 0]
        for emb_type in self.conf:
            # 'position' embeddings are skipped here, so they do not add to the concatenated output dimension
            if emb_type == 'position':
                continue
            self.output_dim[2] += self.conf[emb_type]['dim']
        super(EmbeddingConf, self).inference()
    @DocInherit
    def verify_before_inference(self):
        necessary_attrs_for_user = ['conf']
        for attr in necessary_attrs_for_user:
            self.add_attr_exist_assertion_for_user(attr)

        necessary_attrs_for_dev = ['num_of_inputs', 'input_ranks']
        for attr in necessary_attrs_for_dev:
            self.add_attr_exist_assertion_for_dev(attr)

        type_checks = [('conf', dict),
                       ('num_of_inputs', int),
                       ('input_ranks', list)]
        for attr, attr_type in type_checks:
            self.add_attr_type_assertion(attr, attr_type)
    @DocInherit
    def verify(self):
        #super(EmbeddingConf, self).verify()
        necessary_attrs_for_dev = ['output_dim', 'output_rank']
        for attr in necessary_attrs_for_dev:
            self.add_attr_exist_assertion_for_dev(attr)

        type_checks = [('output_dim', list),
                       ('output_rank', int)]
        for attr, attr_type in type_checks:
            self.add_attr_type_assertion(attr, attr_type)
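

# ---------------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a hypothetical helper that
# assembles an EmbeddingConf by hand, to show how inference() aggregates the output
# dimension over embedding types (here 300 + 20 = 320). The vocabulary sizes and the
# declare() -> inference() -> verify() call sequence are assumptions; in the toolkit
# the configuration is normally built from the model's JSON config.
def _embedding_conf_example():
    conf = EmbeddingConf()
    conf.conf = {
        'word':   {'vocab_size': 2000, 'dim': 300, 'init_weights': None, 'fix_weight': False},
        'postag': {'vocab_size': 50,   'dim': 20,  'init_weights': None, 'fix_weight': False},
    }
    conf.declare()
    conf.inference()     # conf.output_dim becomes [-1, -1, 320]
    conf.verify()
    return conf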


class Embedding(BaseLayer):
    """ Embedding layer

    Args:
        layer_conf (EmbeddingConf): configuration of a layer
    """
    def __init__(self, layer_conf):
        super(Embedding, self).__init__(layer_conf)
        self.layer_conf = layer_conf

        self.embeddings = dict()
        for input_cluster in layer_conf.conf:
            if 'type' in layer_conf.conf[input_cluster]:
                # char embedding
                char_emb_conf_dict = copy.deepcopy(layer_conf.conf[input_cluster])
                # del char_emb_conf_dict['cols'], char_emb_conf_dict['type']
                char_emb_conf_dict['use_gpu'] = layer_conf.use_gpu
                char_emb_conf = eval(layer_conf.conf[input_cluster]['type'] + "Conf")(**char_emb_conf_dict)
                char_emb_conf.inference()
                char_emb_conf.verify()
                self.embeddings[input_cluster] = eval(layer_conf.conf[input_cluster]['type'])(char_emb_conf)
            else:
                # word embedding, postag embedding, and so on
                self.embeddings[input_cluster] = nn.Embedding(layer_conf.conf[input_cluster]['vocab_size'], layer_conf.conf[input_cluster]['dim'], padding_idx=0)
                if 'init_weights' in layer_conf.conf[input_cluster] and layer_conf.conf[input_cluster]['init_weights'] is not None:
                    self.embeddings[input_cluster].weight = nn.Parameter(torch.from_numpy(layer_conf.conf[input_cluster]['init_weights']))

                # freeze the embedding weight if 'fix_weight' is set (a missing key is treated as False)
                if layer_conf.conf[input_cluster].get('fix_weight', False):
                    self.embeddings[input_cluster].weight.requires_grad = False
                    logging.info("Embedding[%s][fix_weight] is true, so the weight of embeddings[%s] is fixed" % (input_cluster, input_cluster))
    def forward(self, inputs, use_gpu=False):
        """ process inputs

        Args:
            inputs (dict): a dictionary describing each model input, e.g.:\n
                'char_emb': [[char ids of word1], [char ids of word2], [...], ...], shape: [batch_size, seq_len, word character num]\n
                'word': word ids (Variable), shape: [batch_size, seq_len],\n
                'postag': postag ids (Variable), shape: [batch_size, seq_len],\n
                ...
            use_gpu (bool): move the embedding output to GPU (True) or not (False)

        Returns:
            Variable: the embedding representation with shape [batch_size, seq_len, emb_dim]
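
        Example::

            # a hedged usage sketch: ``emb_layer`` and ``word_ids`` are illustrative names
            # not defined in this module; word_ids is assumed to be a LongTensor of
            # shape [batch_size, seq_len]
            representation = emb_layer({'word': word_ids}, use_gpu=False)
            # representation has shape [batch_size, seq_len, emb_dim]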
"""
        features = []
        for input_cluster in inputs:
            if 'extra' in input_cluster:
                continue
            input = inputs[input_cluster]
            # if 'type' in self.layer_conf.conf[input_cluster]:
            #     emb = self.embeddings[input_cluster](input, lengths[input]).float()
            # else:
            #     emb = self.embeddings[input_cluster](input).float()
            # the lookup is done on CPU; the resulting embeddings are moved to GPU below if requested
            emb = self.embeddings[input_cluster](input.cpu()).float()
            if use_gpu is True:
                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
                emb = emb.to(device)
            features.append(emb)

        if len(features) > 1:
            return torch.cat(features, 2)
        else:
            return features[0]
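

# ---------------------------------------------------------------------------------
# Minimal, hedged end-to-end sketch (not part of the original module). It wires the
# two classes above together with made-up sizes and random word ids; the exact
# BaseConf/BaseLayer call sequence may differ in the real toolkit, so treat this
# only as an illustration of the intended flow.
if __name__ == '__main__':
    demo_conf = EmbeddingConf()
    demo_conf.conf = {
        'word': {'vocab_size': 100, 'dim': 8, 'init_weights': None, 'fix_weight': False}
    }
    demo_conf.declare()
    demo_conf.inference()
    demo_conf.verify()

    emb_layer = Embedding(demo_conf)
    word_ids = torch.randint(0, 100, (2, 5))        # [batch_size=2, seq_len=5]
    output = emb_layer({'word': word_ids}, use_gpu=False)
    print(output.shape)                             # expected: torch.Size([2, 5, 8])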