Conv1d Model

1. Why

  • As in the image below, a convolution layer is applied over each token's embedding vector to extract features in an n-gram-like fashion, with the goal of capturing n-gram effects

  • This idea is applied to the embeddings produced by the backbone network (a minimal sketch follows the link below)

Useful link: Text classification using CNNs
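As a rough illustration of the idea (all sizes and variable names here are made up for the example), a Conv1d with kernel_size=3 mixes each token's embedding with those of its two neighbors, much like extracting trigram features:

import torch
from torch import nn

batch, seq_len, hidden = 2, 10, 16                # illustrative sizes
embeddings = torch.randn(batch, seq_len, hidden)  # token embeddings

# Conv1d expects (B, channels, length), so the hidden dims act as channels
trigram_conv = nn.Conv1d(hidden, 8, kernel_size=3, padding=1)
features = trigram_conv(embeddings.transpose(1, 2))
print(features.shape)  # torch.Size([2, 8, 10]) -- one feature vector per token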

2. Model structure

  • The embeddings from the backbone network (xlm-roberta-large) are passed through Conv1d layers along the token dimension (kernel sizes: 1, 3, 5)
  • The embeddings coming out of the Conv1d layers are concatenated and passed through the classification layer (see the shape walk-through below)
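A short shape walk-through of this structure (the batch and sequence sizes are illustrative; the filter count of 1024 matches the code in section 3). Padding of k // 2 keeps the sequence length unchanged, so the three outputs can be concatenated along the channel dimension:

import torch
from torch import nn

hidden_size, num_filters, seq_len = 1024, 1024, 384
x = torch.randn(4, hidden_size, seq_len)                  # (B, hidden_size, seq_len)

convs = [nn.Conv1d(hidden_size, num_filters, k, padding=k // 2) for k in (1, 3, 5)]
outs = [conv(x) for conv in convs]                        # each: (B, num_filters, seq_len)
concat = torch.cat(outs, dim=1)                           # (B, num_filters * 3, seq_len)
print(concat.shape)                                       # torch.Size([4, 3072, 384])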

3. Code

import torch
from torch import nn
from torch.nn import functional as F
from transformers import AutoModel

class ConvModel(nn.Module):
    def __init__(self, model_name, model_config, tokenizer_name):
        super().__init__() 
        self.model_name = model_name
        self.tokenizer_name = tokenizer_name
        self.backbone_model = AutoModel.from_pretrained(model_name, config=model_config)
        self.conv1d_layer1 = nn.Conv1d(model_config.hidden_size, 1024, kernel_size=1)
        self.conv1d_layer3 = nn.Conv1d(model_config.hidden_size, 1024, kernel_size=3, padding=1)
        self.conv1d_layer5 = nn.Conv1d(model_config.hidden_size, 1024, kernel_size=5, padding=2)
        self.dropout = nn.Dropout(0.3)
        self.dense_layer = nn.Linear(1024 * 3, 2, bias=True)
        
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, inputs_embeds=None,
                start_positions=None, end_positions=None, output_attentions=None,
                output_hidden_states=None, return_dict=None):
        if "xlm" in self.tokenizer_name:
            outputs = self.backbone_model(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

        else:
            outputs = self.backbone_model(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        
        sequence_output = outputs[0]                          # (B, max_seq_length, hidden_size)
        conv_input = sequence_output.transpose(1, 2)          # transpose for Conv1d: (B, hidden_size, max_seq_length)
        conv_output1 = F.relu(self.conv1d_layer1(conv_input)) # (B, num_conv_filter, max_seq_length)
        conv_output3 = F.relu(self.conv1d_layer3(conv_input)) # (B, num_conv_filter, max_seq_length)
        conv_output5 = F.relu(self.conv1d_layer5(conv_input)) # (B, num_conv_filter, max_seq_length)
        concat_output = torch.cat((conv_output1, conv_output3, conv_output5), dim=1) # (B, num_conv_filter * 3, max_seq_length)

        concat_output = self.dropout(concat_output)
        logits = self.dense_layer(concat_output.transpose(1, 2)) # back to (B, max_seq_length, 2)

        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return {"start_logits": start_logits, "end_logits": end_logits, "hidden_states": outputs.hidden_states, "attentions": outputs.attentions}
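A quick usage sketch (the checkpoint name matches the one mentioned in section 2; the question/context pair is made up for the example):

from transformers import AutoConfig, AutoTokenizer

model_name = "xlm-roberta-large"
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = ConvModel(model_name, config, model_name)

inputs = tokenizer("When was this page edited?", "The page was edited in May 2021.",
                   return_tensors="pt")
outputs = model(**inputs)
print(outputs["start_logits"].shape, outputs["end_logits"].shape)  # both (1, seq_len)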