Text Classification Using Transformers (and implementation using Pytorch) - Zindi.html

<!DOCTYPE html>
<!-- saved from url=(0101)https://zindi.africa/learning/text-classification-using-transformers-and-implementation-using-pytorch -->
<html lang="en" class="wf-nunito-n3-active wf-nunito-n4-active wf-active" inmaintabuse="jlb"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
    <!-- nextgen -->
    
    <meta http-equiv="x-ua-compatible" content="ie=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1, viewport-fit=cover">
    <link rel="apple-touch-icon-precomposed" sizes="57x57" href="https://assets.zindi.africa/apple-touch-icon-57x57.png">
    <link rel="apple-touch-icon-precomposed" sizes="114x114" href="https://assets.zindi.africa/apple-touch-icon-114x114.png">
    <link rel="apple-touch-icon-precomposed" sizes="72x72" href="https://assets.zindi.africa/apple-touch-icon-72x72.png">
    <link rel="apple-touch-icon-precomposed" sizes="144x144" href="https://assets.zindi.africa/apple-touch-icon-144x144.png">
    <link rel="apple-touch-icon-precomposed" sizes="120x120" href="https://assets.zindi.africa/apple-touch-icon-120x120.png">
    <link rel="apple-touch-icon-precomposed" sizes="152x152" href="https://assets.zindi.africa/apple-touch-icon-152x152.png">
    <link rel="icon" type="image/png" href="https://assets.zindi.africa/favicon-32x32.png" sizes="32x32">
    <link rel="icon" type="image/png" href="https://assets.zindi.africa/favicon-16x16.png" sizes="16x16">
    <meta name="twitter:card" content="summary_large_image">
    <meta name="twitter:site" content="@ZindiAfrica">
    <meta name="application-name" content=" ">
    <meta name="msapplication-TileColor" content="#FFFFFF">
    <meta name="msapplication-TileImage" content="https://assets.zindi.africa/mstile-144x144.png">
    <script async="" src="./Text Classification Using Transformers (and implementation using Pytorch) - Zindi_files/tag.js"></script><script async="" src="./Text Classification Using Transformers (and implementation using Pytorch) - Zindi_files/gtm.js"></script><script src="./Text Classification Using Transformers (and implementation using Pytorch) - Zindi_files/webfont.js" async=""></script><script type="text/javascript">
      var WebFontConfig = {
        google: {
          families: ["Nunito:300,400"],
        },
        timeout: 2000,
      }

      ;(function (d) {
        var h = d.documentElement
        var onerror = function () {
          h.className += "wf-inactive"
        }
        var st = setTimeout(onerror, 1000)
        h.className += "wf-loading"
        var wf = d.createElement("script"),
          s = d.scripts[0]
        wf.src = "https://ajax.googleapis.com/ajax/libs/webfont/1.6.26/webfont.js"
        wf.async = true
        wf.onerror = onerror
        wf.onload = function () {
          clearTimeout(st)
        }
        s.parentNode.insertBefore(wf, s)
      })(document)
    </script>
    <title>Text Classification Using Transformers (and implementation using Pytorch) - Zindi</title><meta data-react-helmet="true" name="description" content="Zindi is a data science competition platform with the mission of building the data science ecosystem in Africa. Zindi hosts a community of data scientists dedicated to solving the continent&#39;s most pressing problems through machine learning and artificial intelligence."><meta data-react-helmet="true" property="og:site_name" content="Zindi"><meta data-react-helmet="true" property="twitter:site_name" content="Zindi"><meta data-react-helmet="true" property="og:title" content="Text Classification Using Transformers (and implementation using Pytorch)"><meta data-react-helmet="true" property="og:description" content="‘Attention Is All You Need’New deep learning models are introduced at an increasing rate, and sometimes it’s hard to keep track of all the novelties. In this article we will talk about transformers, a type of neural network architecture that has been gaining popularity, and include some guidance on implementation using a notebook. "><meta data-react-helmet="true" property="og:image" content="https://zindpublic.blob.core.windows.net/public/uploads/blog_post/image/72/big_thumb_1cc19347-ea50-4486-9b1c-6330f13dae22.png"><meta data-react-helmet="true" property="twitter:title" content="Text Classification Using Transformers (and implementation using Pytorch)"><meta data-react-helmet="true" property="twitter:description" content="‘Attention Is All You Need’New deep learning models are introduced at an increasing rate, and sometimes it’s hard to keep track of all the novelties. In this article we will talk about transformers, a type of neural network architecture that has been gaining popularity, and include some guidance on implementation using a notebook. "><meta data-react-helmet="true" property="twitter:image" content="https://zindpublic.blob.core.windows.net/public/uploads/blog_post/image/72/big_thumb_1cc19347-ea50-4486-9b1c-6330f13dae22.png">
    
    <script>
      ;(function (w, d, s, l, i) {
        w[l] = w[l] || []
        w[l].push({ "gtm.start": new Date().getTime(), event: "gtm.js" })
        var f = d.getElementsByTagName(s)[0],
          j = d.createElement(s),
          dl = l != "dataLayer" ? "&l=" + l : ""
        j.async = true
        j.src = "https://www.googletagmanager.com/gtm.js?id=" + i + dl
        f.parentNode.insertBefore(j, f)
      })(window, document, "script", "dataLayer", "GTM-KRG85D8")
    </script>
    
  <link rel="stylesheet" href="./Text Classification Using Transformers (and implementation using Pytorch) - Zindi_files/css" media="all"><link href="./Text Classification Using Transformers (and implementation using Pytorch) - Zindi_files/bundle.d57dc7d8a01460a702c7.css" rel="stylesheet"></head>
  <body _c_t_common="1" data-new-gr-c-s-check-loaded="14.1020.0" data-gr-ext-installed="">
    <noscript> You need to enable JavaScript to run this app. </noscript>
    <div id="app"><div class="App__container___fj0c9"><div class="App__section___1nGff"></div><div class="App__section___1nGff"><div class="App__contained___3emDO"><div class="Header__container___3FtbH"><a href="https://zindi.africa/"><div class="Header__logo___1eRaO">Zindi</div></a><div class="Header__menuContainer___2izgT"><div class="Menu__container___1sjgb"><a class="Menu__link___3x4C4" href="https://zindi.africa/competitions"><span class="Menu__linkInner___3LB5N">Compete</span></a><a aria-current="page" class="Menu__link___3x4C4 Menu__activeLink___1MA6r" href="https://zindi.africa/learning"><span class="Menu__linkInner___3LB5N">Learn</span></a><a class="Menu__link___3x4C4" href="https://zindi.africa/jobs"><span class="Menu__linkInner___3LB5N">Find a Job</span></a></div><div class="Menu__container___1sjgb Header__menuRight___3wiDe Menu__justifyRight___3r6Ws"><a class="Menu__link___3x4C4" href="https://zindi.africa/inbox"><span class="Menu__linkInner___3LB5N"><div class="Inbox__container___3xHCr"><svg class="Inbox__messagesIcon___2XP8V" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M4 4h16c1.1 0 2 .9 2 2v12c0 1.1-.9 2-2 2H4c-1.1 0-2-.9-2-2V6c0-1.1.9-2 2-2z"></path><polyline points="22,6 12,13 2,6"></polyline></svg><div class="Inbox__unseen___31NED"></div></div></span></a></div></div><div class="Header__userMenu___2iOts"><div class="UserMenu__container___ypkko"><button class="Button__base___NhksY Button__blank-normal___1nB5F UserMenu__user___a0zJo"><span class="Button__inner___3jkeF"><span class="User__container___18HoF User__size-normal___26ZPA"><img class="User__avatar___6aNx2" src="./Text Classification Using Transformers (and implementation using Pytorch) - Zindi_files/thumb.default.png" alt=""><span class="User__username___64PE2">Glencode</span></span></span></button><div class=""></div></div></div></div></div></div><div class="App__section___1nGff App__content___WFkDX"><div class="WithSubheader__container___3qd5U"><div class="WithSubheader__header___2o1oX WithSubheader__withHeader___35ECw"><div class="BlogPost__headerImage___2fAz4" style="background-image: url(&quot;https://zindpublic.blob.core.windows.net/public/uploads/blog_post/image/72/header_1cc19347-ea50-4486-9b1c-6330f13dae22.png&quot;);"></div></div><div><div class="App__contained___3emDO"><div class="Paper__paper___2M-1R Paper__padding-1___3sKLR BlogPost__paper___1D3Be"><div class="BlogPost__date___3BhZy">1 Oct 2020, 14:58</div><h2 class="BlogPost__title___RUU5Z">Text Classification Using Transformers (and implementation using Pytorch)</h2><div class="Html__container___1AJFz BlogPost__intro___31fc-"><p>‘Attention Is All You Need’</p><p>New deep learning models are introduced at an increasing rate, and sometimes it’s hard to keep track of all the novelties. In this article we will talk about <span style="font-weight: bold;" class="">transformers</span>, a type of neural network architecture that has been gaining popularity, and include some guidance on implementation using a notebook. </p><p>In this post, we will address the following questions related to transformers:</p><ul class="public-DraftStyleDefault-ul">
<li>why do we need transformers?</li>
<li>transformer and its architecture in detail.</li>
<li>text classification with transformers</li>
<li>useful papers in dealing with transformers</li>
</ul><h3>Why do we need the transformer?</h3><p>Transformers were developed to solve the problem of sequence transduction, or neural machine translation. That means any task that transforms an input sequence to an output sequence. This includes speech recognition, text-to-speech transformation, etc.</p><p>For models to perform sequence transduction, it is necessary to have some sort of memory.</p><p><span style="font-weight: bold;" class="">The limitations of long-term dependencies:</span></p><p>A transformer is an architecture for transforming one sequence into another one with the help of two parts (<span style="font-weight: bold;" class="">encoder</span> and <span style="font-weight: bold;" class="">decoder</span>), but it differs from existing sequence-to-sequence models because it does not imply any recurrent networks (GRU, LSTM, etc.).</p><p>The transformer architecture is well introduced in the paper <a href="https://arxiv.org/abs/1706.03762" target="_blank" rel="noreferrer noopener">Attention is All You Need</a>; as the title indicates, transformer architecture uses the attention mechanism.</p><p>Let’s consider a language model that will predict the next word based on the previous ones:</p><blockquote>Sentence: <span style="font-weight: bold;" class="">“Bitcoin is the best cryptocurrency.”</span></blockquote><p>Here we don’t need an additional context , so obviously the next word will be “cryptocurrency”.</p><p>In this case RNN’s can sove the issue and predict the answer using the past information.</p><p></p><div class="image">
<img src="./Text Classification Using Transformers (and implementation using Pytorch) - Zindi_files/fb761059-d70a-4c22-ab43-9ea0ccde38a0.png"> </div><p></p><p>But in other cases we need more context. For example, let’s say that you are trying to predict the last word of the text: </p><blockquote>Sentence: <span style="font-weight: bold;" class="">"I grew up in Tunisia, I speak fluent ..."</span></blockquote><p> Recent information suggests that the next word is probably a language, but if we want to narrow down which language, we need context of Tunisia, that is further back in the text.</p><p>RNNs become very ineffective when the gap between the relevant information and the point where it is needed becomes very large. That is due to the fact that the information is passed at each step and the longer the chain is, the more probable the information is lost along the chain.</p><p>I recommend the article <a href="https://towardsdatascience.com/transformers-141e32e69591" target="_blank" rel="noreferrer noopener">How Transformers Work</a>, which talks in depth about the difference between seq2seq and transformer.</p><h3>Transformer and its architecture in detail:</h3><p>An image is worth a thousand words, so we will start with that!</p><p></p><div class="image">
<img src="./Text Classification Using Transformers (and implementation using Pytorch) - Zindi_files/31cfeb3b-8893-48ac-b060-6a6127c91700.png"> </div><p></p><p>The first thing that we can see is that it has a sequence-to-sequence encoder-decoder architecture. Much of the literature on transformers uses this very architecture to explain transformers. But this is not the one used in Open AI’s GPT model (or the GPT-2 model, which was just a larger version of its predecessor). The GPT is a 12-layer decoder only transformer with 117M parameters.</p><p>The transformer has a stack of 6 encoders and 6 decoders, unlike seq2seq; the encoder contains two sub-layers: multi-head self-attention layer and a fully connected feed-forward network. The decoder contains three sub-layers, a multi-head self-attention layer, an additional layer that performs multi-head self-attention over encoder outputs, and a fully connected feed-forward network. Each sub-layer in encoder and decoder has a residual connection followed by a layer normalization.</p><p>All input and output tokens to encoder/decoder are converted to vectors using learned embeddings; these input embeddings are then passed to positional encoding. </p><p>The transformers architecture does not contain any recurrence or convolution and hence has no notion of word order. All the words of the input sequence are fed to the network with no special order or position as they all flow simultaneously through the encoder and decoder stack. To understand the meaning of a sentence, it is essential to understand the position and the order of words.</p><h3>III - Text Classification using transformer with Pytorch implementation:</h3><p>It is too simple to use the ClassificationModel from simple transformes:</p><p>ClassificationModel(‘Architecture’, ‘model shortcut name’, use_cuda=True,num_labels=4) </p><p>Architecture: Bert , Roberta , Xlnet , Xlm…</p><p>Shortcut name models for Roberta : roberta-base , roberta-large…</p><p>More details <a href="https://huggingface.co/transformers/pretrained_models.html" target="_blank" rel="noreferrer noopener">here</a></p><p>We create a model that classify text for 4 classes <span style="font-weight: bold;" class="">[‘art’, ‘politics’, ‘health’, ‘tourism’]</span></p><p>We apply this model in our previous project</p><p>Watch video <a href="https://www.youtube.com/watch?v=SRoHaiODQag&amp;feature=emb_title&amp;ab_channel=YassineHAMDAOUI" target="_blank" rel="noreferrer noopener" savefrom_lm_index="0" savefrom_lm="1">here</a><span style="padding: 0; margin: 0; margin-left: 5px;"><a href="http://savefrom.net/?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DSRoHaiODQag%26feature%3Demb_title%26ab_channel%3DYassineHAMDAOUI&amp;utm_source=chameleon&amp;utm_medium=extensions&amp;utm_campaign=link_modifier" target="_blank" title="Get a direct link" savefrom_lm="1" savefrom_lm_is_link="1" style="background-image: url(&quot;data:image/gif;base64,R0lGODlhEAAQAOZ3APf39+Xl5fT09OPj4/Hx8evr6/3+/u7u7uDh4OPi497e3t7e3/z8/P79/X3GbuXl5ubl5eHg4WzFUfb39+Pj4lzGOV7LOPz7+/n6+vn5+ZTLj9/e387Ozt7f3/7+/vv7/ISbePn5+m/JV1nRKXmVbkCnKVrSLDqsCuDh4d/e3uDn3/z7/H6TdVeaV1uSW+bn5v39/eXm5eXm5kyHP/f39pzGmVy7J3yRd9/f3mLEKkXCHJbka2TVM5vaZn6Wdfn6+YG/c/r5+ZO/jeLi41aHTIeageLn4f39/vr6+kzNG2PVM5i+lomdf2CXYKHVmtzo2YXNeDqsBebl5uHh4HDKWN3g3kKqEH6WeZHTXIPKdnSPbv79/pfmbE7PHpe1l4O8dTO5DODg4VDLIlKUUtzo2J7SmEWsLlG4NJbFjkrJHP7+/VK5Nfz8+zmnC3KKa+Hg4OHh4Y63j/3+/eDg4Ojo6P///8DAwP///wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACH5BAEAAHcALAAAAAAQABAAAAfWgHd2g4SFhYJzdYqLjIpzgx5bBgYwHg1Hk2oNDXKDFwwfDF5NLmMtcStsn4MhGT8YS04aGmU1QRhIGYMTADQAQlAODlloAMYTgwICRmRfVBISIkBPKsqDBAREZmcVFhYVayUz2IMHB1dWOmImI2lgUVrmgwUFLzdtXTxKSSduMfSD6Aik48MGlx05SAykM0gKhAAPAhTB0oNFABkPHg5KMIBCxzlMQFQZMGBIggSDpsCJgGDOmzkIUCAIM2dOhEEcNijQuQDHgg4KOqRYwMGOIENIB90JBAA7&quot;); background-repeat: no-repeat; width: 16px; height: 16px; display: inline-block; border: none; text-decoration: none; padding: 0px; position: relative;"></a></span></p><p></p><div class="image">
<img src="./Text Classification Using Transformers (and implementation using Pytorch) - Zindi_files/40c04715-7bef-4923-bed2-c75335d51539.png"> </div><p></p><p>And we integrate it in our flask application <a href="https://www.piecex.com/source-code/NLP-Tasks-with-Bert-Model-sentiment-extraction-text-summarisation-topic-classification-Python3-1825" target="_blank" rel="noreferrer noopener">here</a></p><p><a href="https://github.com/NeuroData-ltd/Transformers_Tuto" target="_blank" rel="noreferrer noopener">Here</a> you will find a commented notebook:</p><ul class="public-DraftStyleDefault-ul"><li>Setup environment &amp; configuration</li></ul><div class="codeblock"><pre>!pip install --upgrade transformers
!pip install simpletransformers
<span style="font-style: italic;" class=""># memory footprint support libraries/code</span>
!In -sf /opt/bin/nvidia-smi /user/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize importing libraries
</pre></div><ul class="public-DraftStyleDefault-ul"><li>Importing Libraries</li></ul><div class="codeblock"><pre><span style="font-weight: bold;" class="">import psutil
import humanize
import os
import GPUtil as GPU

import numpy as np
import pandas as pd
from google.colab import files
from tqdm import tqdm
import warnings</span>
warnings.simplefilter('ignore')
<span style="font-weight: bold;" class="">import gc
from scipy.special import</span> softmax
<span style="font-weight: bold;" class="">from simpletransformers.classification import</span>
ClassificationModel
<span style="font-weight: bold;" class="">from sklearn.model_selection import </span>train_test_split, StratifiedKFold, KFold
<span style="font-weight: bold;" class="">import sklearn
from sklearn.metircs import</span> log_loss
<span style="font-weight: bold;" class="">from sklearn.metrics import *
import re
import random
import torch</span>
pd.options.display.max_colwidth = 200

<span style="font-style: italic;" class="">#choose the same seed to assure that our model will be reproducible</span>

<span style="font-weight: bold;" class="">def</span> seed_all (seed_value) :
    random.seed (seed_value) <span style="font-style: italic;" class=""># Python</span>
    np.random.seed (seed_value) <span style="font-style: italic;" class=""># cpu vars</span>
    torch.manual_seed (seed_value) <span style="font-style: italic;" class=""># cpu vars</span>
    
    <span style="font-weight: bold;" class="">if</span> torch.cuda.is_available () :
        torch.cuda.manual_seed (seed_value)
        torch.cuda.manual_seed_all (seed_value) <span style="font-style: italic;" class=""># gpu vars</span>
        torch.backends.cudnn.deterministic = <span style="font-weight: bold;" class="">True</span> <span style="font-style: italic;" class="">#needed</span>
        torch.backends.cudnn.benchmark = <span style="font-weight: bold;" class="">False</span>

seed_all (2)
                                                                                                                                </pre></div><ul class="public-DraftStyleDefault-ul"><li>Reading Data</li></ul><div class="codeblock"><pre><span style="font-weight: bold;" class="">import pandas as pd</span>
<span style="font-style: italic;" class="">#We consider that our data is a csv file (2 columns : text and label)</span>
<span style="font-style: italic;" class="">#using pandas function (read_csv) to read the file</span>
train=pd.read_csv()
</pre></div><ul class="public-DraftStyleDefault-ul"><li>Verify the topic classes in the data</li></ul><div class="codeblock"><pre>train.label.unique()
</pre></div><ul class="public-DraftStyleDefault-ul"><li>train the model</li></ul><div class="codeblock"><pre>label_cols = ['art', 'politics', 'health', 'tourism']
train.head()
L=['art', 'politics', 'health', 'tourism']
<span style="font-style: italic;" class=""># Get the numerical ids of coloumn label</span>
train['label']=train.label.astype('category')

Y = train.label.cat.codes
train['label']=Y
<span style="font-style: italic;" class=""># Print initial shape</span>
print(Y.shape)
<span style="font-weight: bold;" class="">from keras.utils import</span> to_categorical
<span style="font-style: italic;" class=""># One-hot encode the indexes</span>
Y = to_categorical (Y)

<span style="font-style: italic;" class=""># Check the new shape of the variable</span>
print (Y.shape)

# Print the first 5 rows
print (Y[0:5])
<span style="font-weight: bold;" class="">for</span> i <span style="font-weight: bold;" class="">in</span> range (len(l)) :
    train[l[i]] = Y[:,i]
    
<span style="font-style: italic;" class="">#using KFOLD Cross Validation is important to test our model </span>  

%%time
err=[]
y_pred_tot=[] 

fold=StratifiedKFold(n_splits=5, shuffle=<span style="font-weight: bold;" class="">True</span>, random_state=1997)
i=1
<span style="font-weight: bold;" class="">for</span> train_index, test_index <span style="font-weight: bold;" class="">in</span>
fold.split (train,train['label']):
    train1_trn, train1_val = train.iloc[train_index], train.iloc[test_index]
    model = ClassificationModel ('roberta', 'roberta-base', use_cuda=True,num_labels=4, args={
    
'train_batch_size':16,
'reprocess_input_data': <span style="font-weight: bold;" class="">True,</span>
'overwrite_output_dir': <span style="font-weight: bold;" class="">True</span>,
'fp16': <span style="font-weight: bold;" class="">False,</span>
'do_lower_case':<span style="font-weight: bold;" class=""> False,</span>
'num_train_epochs': 4,
'max_seq_length': 128,
'regression': <span style="font-weight: bold;" class="">False,</span>
'manual_seed': 1997,
"learning_rate":2e-5,
'weight_decay':0,
"save_eval_checkpoints": <span style="font-weight: bold;" class="">True,</span>
"svae_model_every_epoch": <span style="font-weight: bold;" class="">False,</span>
"silent": <span style="font-weight: bold;" class="">True</span>})
    model.train_model (train1_trn)
    raw_outputs_val = model.eval_model{train1_val)[1]
    raw_outputs_vals = softmax(raw_outputs_val,axis=1)
    print(f"Log_loss: {log_loss(train1_val['label'], raw_outputs_vals)}")
    err.aprend(log_loss(train1_val['label'], raw_outputs_vals))    </pre></div><p><span style="font-weight: bold;" class="">Output:</span></p><p>Log_Loss: 0.35637871529928816</p><p>CPU times: user 11min 2s, sys: 4min 21s,</p><p>total: 15min 23s Wall time: 16min 7s</p><p><span style="font-weight: bold;" class="">Log Loss:</span></p><div class="codeblock"><pre>print("Mean LogLoss: ",np.mean(err))</pre></div><p><span style="font-weight: bold;" class="">Output:</span></p><p>Mean LogLoss: 0.34930175561484067</p><div class="codeblock"><pre>raw_outputs_vals</pre></div><p><span style="font-weight: bold;" class="">Output:</span></p><p>array([[9.9822301e-01, 3.4856689e-04, 3.8243082e-04, 1.0458552e-03],</p><p>[9.9695909e-01, 1.1522240e-03, 5.9563853e-04, 1.2927916e-03],</p><p>[9.9910539e-01, 2.3084633e-04, 2.5905663e-04, 4.0465154e-04],</p><p>...,</p><p>[3.6545596e-04, 2.8826005e-04, 4.3145564e-04, 9.9891484e-01],</p><p>[4.0789684e-03, 9.9224585e-01, 1.2752400e-03, 2.3997365e-03],</p><p>[3.7382307e-04, 3.4797701e-04, 3.6257200e-04, 9.9891579e-01]],</p><p>dtype=float32)</p><ul class="public-DraftStyleDefault-ul"><li>test our Model</li></ul><div class="codeblock"><pre>pred = model.predict(['i want to travel to thailand'])[1]
oreds = softmax(pred,axis=1)
preds</pre></div><p><span style="font-weight: bold;" class="">Output:</span></p><p>array([[6.0461409e-04, 3.6119239e-04, 3.3729596e-04, 9.9869716e-01]],</p><p>dtype=float32)</p><blockquote>We create a function which calculate the maximum probability and detect the topic</blockquote><blockquote>for example if we have 0.6 politics 0.1 art 0.15 health 0.15 tourism &gt;&gt;&gt;&gt; topic = politics</blockquote><div class="codeblock"><pre><span style="font-weight: bold;" class="">def</span> estm(raw_outputs_vals):
    <span style="font-weight: bold;" class="">for</span> i <span style="font-weight: bold;" class="">in</span> range (len(raw_outputs_vals)):
        <span style="font-weight: bold;" class="">for</span> j <span style="font-weight: bold;" class="">in</span> range (4):                      
            <span style="font-weight: bold;" class="">if </span>(max(raw_outputs_vals[i])==raw_outputs_vals[i][j]):
                raw_outputs_vals[i][j]=1
            <span style="font-weight: bold;" class="">else</span> :
                raw_outputs_vals[i][j]=0
    <span style="font-weight: bold;" class="">return</span>(raw_outputs_vals)
    
estm(preds)                                                            </pre></div><p><span style="font-weight: bold;" class="">Output:</span></p><p>array([[0., 0., 0., 1.]], dtype=float32)</p><blockquote>Our labels are :['art', 'politics', 'health', 'tourism']</blockquote><blockquote>so that's correct ;)</blockquote><p>I hope you find it useful &amp; helpful!</p><p>Download source <a href="https://github.com/NeuroData-ltd/Transformers_Tuto/blob/master/simpletransformers-tuto.ipynb" target="_blank" rel="noreferrer noopener">code</a> from our github.</p><h3>Useful papers to read more about transformers:</h3><p>Here a list of recommended papers to get in depth with transformers (mainly Bert Model):</p><ul class="public-DraftStyleDefault-ul">
<li>Cross-Linguistic Syntactic Evaluation of Word Prediction Models</li>
<li>Emerging Cross-lingual Structure in Pretrained Language Models</li>
<li>Finding Universal Grammatical Relations in Multilingual BERT</li>
<li>On the Cross-lingual Transferability of Monolingual Representations</li>
<li>How multilingual is Multilingual BERT?</li>
<li>Is Multilingual BERT Fluent in Language Generation?</li>
<li>Are All Languages Created Equal in Multilingual BERT?</li>
<li>What’s so special about BERT’s layers? A closer look at the NLP pipeline in monolingual and multilingual models</li>
<li>A Study of Cross-Lingual Ability and Language-specific Information in Multilingual BERT</li>
<li>Cross-Lingual Ability of Multilingual BERT: An Empirical Study</li>
<li>Multilingual is not enough: BERT for Finnish</li>
</ul><p>Download all article files from our github repo.</p><h3>Summary:</h3><p>Transformers present the next front in NLP. In just a few years since its introduction, this new architectural trend has surpassed the feats of RNN-based architectures. This exciting pace of invention is perhaps the best part of being early to a new field like Deep Learning.</p><p>If you have any suggestions or a questions please contact NeuroData Team:</p><p><a href="https://www.facebook.com/NeuroData.tn" target="_blank" rel="noreferrer noopener">Facebook</a></p><p><a href="https://www.linkedin.com/company/neurodata/" target="_blank" rel="noreferrer noopener">Linkedin</a></p><p><a href="https://github.com/NeuroData-ltd" target="_blank" rel="noreferrer noopener">Github</a></p><p>This article was written by <a href="https://www.linkedin.com/in/yassine-hamdaoui/" target="_blank" rel="noreferrer noopener">Yassine Hamdaoui</a> and first appeared on <a href="https://medium.com/swlh/text-classification-using-transformers-pytorch-implementation-5ff9f21bd106" target="_blank" rel="noreferrer noopener">Medium</a>. Code credits goes to <a href="https://www.linkedin.com/in/med-helmi-klai-933068176/" target="_blank" rel="noreferrer noopener">Med Klai Helmi</a>, NeuroData Data Scientist and Zindi Mentor.</p></div></div></div></div></div></div><div class="App__section___1nGff"><div class="Footer__container___3vGXM"><div class="App__contained___3emDO"><div class="Footer__links___dDoS-"><div class="Footer__column___1yO21"><div><a href="https://zindi.africa/competitions">Competitions</a></div><div><a href="https://zindi.africa/hackathons">Hackathons</a></div><div><a href="https://zindi.africa/data_scientists">Data Scientists</a></div><div><a href="https://zindi.africa/discussions">Discussions</a></div><div><a href="https://zindi.africa/jobs">Jobs Board</a></div></div><div class="Footer__column___1yO21"><div><a href="https://zindi.africa/hosting_competition">Host competition</a></div><div><a href="https://zindi.africa/about">About Us</a></div><div><a href="https://zindi.africa/partners">Our Partners</a></div><div><a href="https://zindi.africa/contact_us">Contact Us</a></div></div><div class="Footer__column___1yO21"><div><a href="https://zindi.africa/rules">Rules and Guidelines</a></div><div><a href="https://zindi.africa/terms">Terms of Use</a></div><div><a href="https://zindi.africa/privacy">Privacy Policy</a></div><div><a href="https://zindi.africa/faq">FAQs</a></div></div><div class="Footer__column___1yO21"><div><a target="_blank" rel="noopener noreferrer" href="https://www.linkedin.com/company/zindi-africa">LinkedIn</a></div><div><a target="_blank" rel="noopener noreferrer" href="https://www.facebook.com/ZindiAfrica-311192052980655">Facebook</a></div><div><a target="_blank" rel="noopener noreferrer" href="https://twitter.com/ZindiAfrica">Twitter</a></div><div><a target="_blank" rel="noopener noreferrer" href="https://www.instagram.com/zindi.africa">Instagram</a></div><div><a target="_blank" rel="noopener noreferrer" href="https://zindi.medium.com/">Medium</a></div><div><a target="_blank" rel="noopener noreferrer" href="https://www.youtube.com/channel/UCQHq2JY2BqY2UTDCmVWyGBw">Youtube</a></div><div><a target="_blank" rel="noopener noreferrer" href="https://github.com/zindiafrica">Github</a></div></div></div><div class="Footer__logo___ZtNwP">Zindi</div></div></div></div></div></div><script>window.__INITIAL_STATE__ = {"blogPosts":{"data":{},"queries":{}},"comments":{"data":{},"queries":{}},"competitionTags":{"data":{},"queries":{}},"competitions":{"data":{},"queries":{}},"conspiracyParticipations":{"data":{},"queries":{}},"discussions":{"data":{},"queries":{}},"fullBlogPosts":{"data":{"text-classification-using-transformers-and-implementation-using-pytorch":{"id":"text-classification-using-transformers-and-implementation-using-pytorch","image":"https://zindpublic.blob.core.windows.net/public/uploads/blog_post/image/72/header_1cc19347-ea50-4486-9b1c-6330f13dae22.png","big_image":"https://zindpublic.blob.core.windows.net/public/uploads/blog_post/image/72/big_thumb_1cc19347-ea50-4486-9b1c-6330f13dae22.png","header_image":"https://zindpublic.blob.core.windows.net/public/uploads/blog_post/image/72/header_1cc19347-ea50-4486-9b1c-6330f13dae22.png","title":"Text Classification Using Transformers (and implementation using Pytorch)","intro_html":"<p>&lsquo;Attention Is All You Need&rsquo;</p><p>New deep learning models are introduced at an increasing rate, and sometimes it&rsquo;s hard to keep track of all the novelties. In this article we will talk about <span style=\"font-weight: bold;\" class=\"\">transformers</span>, a type of neural network architecture that has been gaining popularity, and include some guidance on implementation using a notebook. </p>","intro_plain":"‘Attention Is All You Need’New deep learning models are introduced at an increasing rate, and sometimes it’s hard to keep track of all the novelties. In this article we will talk about transformers, a type of neural network architecture that has been gaining popularity, and include some guidance on implementation using a notebook. ","content_html":"<p>In this post, we will address the following questions related to transformers:</p><ul class=\"public-DraftStyleDefault-ul\">\n<li>why do we need transformers?</li>\n<li>transformer and its architecture in detail.</li>\n<li>text classification with transformers</li>\n<li>useful papers in dealing with transformers</li>\n</ul><h3>Why do we need the transformer?</h3><p>Transformers were developed to solve the problem of sequence transduction, or neural machine translation. That means any task that transforms an input sequence to an output sequence. This includes speech recognition, text-to-speech transformation, etc.</p><p>For models to perform sequence transduction, it is necessary to have some sort of memory.</p><p><span style=\"font-weight: bold;\" class=\"\">The limitations of long-term dependencies:</span></p><p>A transformer is an architecture for transforming one sequence into another one with the help of two parts (<span style=\"font-weight: bold;\" class=\"\">encoder</span> and <span style=\"font-weight: bold;\" class=\"\">decoder</span>), but it differs from existing sequence-to-sequence models because it does not imply any recurrent networks (GRU, LSTM, etc.).</p><p>The transformer architecture is well introduced in the paper <a href=\"https://arxiv.org/abs/1706.03762\" target=\"_blank\" rel=\"noreferrer noopener\">Attention is All You Need</a>; as the title indicates, transformer architecture uses the attention mechanism.</p><p>Let&rsquo;s consider a language model that will predict the next word based on the previous ones:</p><blockquote>Sentence: <span style=\"font-weight: bold;\" class=\"\">&ldquo;Bitcoin is the best cryptocurrency.&rdquo;</span></blockquote><p>Here we don&rsquo;t need an additional context , so obviously the next word will be &ldquo;cryptocurrency&rdquo;.</p><p>In this case RNN&rsquo;s can sove the issue and predict the answer using the past information.</p><p><div class=\"image\">\n<img src=\"https://zindpublic.blob.core.windows.net/public/uploads/image_attachment/image/515/fb761059-d70a-4c22-ab43-9ea0ccde38a0.png\"> </div></p><p>But in other cases we need more context. For example, let&rsquo;s say that you are trying to predict the last word of the text: </p><blockquote>Sentence: <span style=\"font-weight: bold;\" class=\"\">\"I grew up in Tunisia, I speak fluent ...\"</span></blockquote><p> Recent information suggests that the next word is probably a language, but if we want to narrow down which language, we need context of Tunisia, that is further back in the text.</p><p>RNNs become very ineffective when the gap between the relevant information and the point where it is needed becomes very large. That is due to the fact that the information is passed at each step and the longer the chain is, the more probable the information is lost along the chain.</p><p>I recommend the article <a href=\"https://towardsdatascience.com/transformers-141e32e69591\" target=\"_blank\" rel=\"noreferrer noopener\">How Transformers Work</a>, which talks in depth about the difference between seq2seq and transformer.</p><h3>Transformer and its architecture in detail:</h3><p>An image is worth a thousand words, so we will start with that!</p><p><div class=\"image\">\n<img src=\"https://zindpublic.blob.core.windows.net/public/uploads/image_attachment/image/516/31cfeb3b-8893-48ac-b060-6a6127c91700.png\"> </div></p><p>The first thing that we can see is that it has a sequence-to-sequence encoder-decoder architecture. Much of the literature on transformers uses this very architecture to explain transformers. But this is not the one used in Open AI&rsquo;s GPT model (or the GPT-2 model, which was just a larger version of its predecessor). The GPT is a 12-layer decoder only transformer with 117M parameters.</p><p>The transformer has a stack of 6 encoders and 6 decoders, unlike seq2seq; the encoder contains two sub-layers: multi-head self-attention layer and a fully connected feed-forward network. The decoder contains three sub-layers, a multi-head self-attention layer, an additional layer that performs multi-head self-attention over encoder outputs, and a fully connected feed-forward network. Each sub-layer in encoder and decoder has a residual connection followed by a layer normalization.</p><p>All input and output tokens to encoder/decoder are converted to vectors using learned embeddings; these input embeddings are then passed to positional encoding. </p><p>The transformers architecture does not contain any recurrence or convolution and hence has no notion of word order. All the words of the input sequence are fed to the network with no special order or position as they all flow simultaneously through the encoder and decoder stack. To understand the meaning of a sentence, it is essential to understand the position and the order of words.</p><h3>III - Text Classification using transformer with Pytorch implementation:</h3><p>It is too simple to use the ClassificationModel from simple transformes:</p><p>ClassificationModel(&lsquo;Architecture&rsquo;, &lsquo;model shortcut name&rsquo;, use_cuda=True,num_labels=4) </p><p>Architecture: Bert , Roberta , Xlnet , Xlm&hellip;</p><p>Shortcut name models for Roberta : roberta-base , roberta-large&hellip;</p><p>More details <a href=\"https://huggingface.co/transformers/pretrained_models.html\" target=\"_blank\" rel=\"noreferrer noopener\">here</a></p><p>We create a model that classify text for 4 classes <span style=\"font-weight: bold;\" class=\"\">[&lsquo;art&rsquo;, &lsquo;politics&rsquo;, &lsquo;health&rsquo;, &lsquo;tourism&rsquo;]</span></p><p>We apply this model in our previous project</p><p>Watch video <a href=\"https://www.youtube.com/watch?v=SRoHaiODQag&amp;feature=emb_title&amp;ab_channel=YassineHAMDAOUI\" target=\"_blank\" rel=\"noreferrer noopener\">here</a></p><p><div class=\"image\">\n<img src=\"https://zindpublic.blob.core.windows.net/public/uploads/image_attachment/image/517/40c04715-7bef-4923-bed2-c75335d51539.png\"> </div></p><p>And we integrate it in our flask application <a href=\"https://www.piecex.com/source-code/NLP-Tasks-with-Bert-Model-sentiment-extraction-text-summarisation-topic-classification-Python3-1825\" target=\"_blank\" rel=\"noreferrer noopener\">here</a></p><p><a href=\"https://github.com/NeuroData-ltd/Transformers_Tuto\" target=\"_blank\" rel=\"noreferrer noopener\">Here</a> you will find a commented notebook:</p><ul class=\"public-DraftStyleDefault-ul\"><li>Setup environment &amp; configuration</li></ul><div class=\"codeblock\"><pre>!pip install --upgrade transformers\n!pip install simpletransformers\n<span style=\"font-style: italic;\" class=\"\"># memory footprint support libraries/code</span>\n!In -sf /opt/bin/nvidia-smi /user/bin/nvidia-smi\n!pip install gputil\n!pip install psutil\n!pip install humanize importing libraries\n</pre></div><ul class=\"public-DraftStyleDefault-ul\"><li>Importing Libraries</li></ul><div class=\"codeblock\"><pre><span style=\"font-weight: bold;\" class=\"\">import psutil\nimport humanize\nimport os\nimport GPUtil as GPU\n\nimport numpy as np\nimport pandas as pd\nfrom google.colab import files\nfrom tqdm import tqdm\nimport warnings</span>\nwarnings.simplefilter('ignore')\n<span style=\"font-weight: bold;\" class=\"\">import gc\nfrom scipy.special import</span> softmax\n<span style=\"font-weight: bold;\" class=\"\">from simpletransformers.classification import</span>\nClassificationModel\n<span style=\"font-weight: bold;\" class=\"\">from sklearn.model_selection import </span>train_test_split, StratifiedKFold, KFold\n<span style=\"font-weight: bold;\" class=\"\">import sklearn\nfrom sklearn.metircs import</span> log_loss\n<span style=\"font-weight: bold;\" class=\"\">from sklearn.metrics import *\nimport re\nimport random\nimport torch</span>\npd.options.display.max_colwidth = 200\n\n<span style=\"font-style: italic;\" class=\"\">#choose the same seed to assure that our model will be reproducible</span>\n\n<span style=\"font-weight: bold;\" class=\"\">def</span> seed_all (seed_value) :\n    random.seed (seed_value) <span style=\"font-style: italic;\" class=\"\"># Python</span>\n    np.random.seed (seed_value) <span style=\"font-style: italic;\" class=\"\"># cpu vars</span>\n    torch.manual_seed (seed_value) <span style=\"font-style: italic;\" class=\"\"># cpu vars</span>\n    \n    <span style=\"font-weight: bold;\" class=\"\">if</span> torch.cuda.is_available () :\n        torch.cuda.manual_seed (seed_value)\n        torch.cuda.manual_seed_all (seed_value) <span style=\"font-style: italic;\" class=\"\"># gpu vars</span>\n        torch.backends.cudnn.deterministic = <span style=\"font-weight: bold;\" class=\"\">True</span> <span style=\"font-style: italic;\" class=\"\">#needed</span>\n        torch.backends.cudnn.benchmark = <span style=\"font-weight: bold;\" class=\"\">False</span>\n\nseed_all (2)\n                                                                                                                                </pre></div><ul class=\"public-DraftStyleDefault-ul\"><li>Reading Data</li></ul><div class=\"codeblock\"><pre><span style=\"font-weight: bold;\" class=\"\">import pandas as pd</span>\n<span style=\"font-style: italic;\" class=\"\">#We consider that our data is a csv file (2 columns : text and label)</span>\n<span style=\"font-style: italic;\" class=\"\">#using pandas function (read_csv) to read the file</span>\ntrain=pd.read_csv()\n</pre></div><ul class=\"public-DraftStyleDefault-ul\"><li>Verify the topic classes in the data</li></ul><div class=\"codeblock\"><pre>train.label.unique()\n</pre></div><ul class=\"public-DraftStyleDefault-ul\"><li>train the model</li></ul><div class=\"codeblock\"><pre>label_cols = ['art', 'politics', 'health', 'tourism']\ntrain.head()\nL=['art', 'politics', 'health', 'tourism']\n<span style=\"font-style: italic;\" class=\"\"># Get the numerical ids of coloumn label</span>\ntrain['label']=train.label.astype('category')\n\nY = train.label.cat.codes\ntrain['label']=Y\n<span style=\"font-style: italic;\" class=\"\"># Print initial shape</span>\nprint(Y.shape)\n<span style=\"font-weight: bold;\" class=\"\">from keras.utils import</span> to_categorical\n<span style=\"font-style: italic;\" class=\"\"># One-hot encode the indexes</span>\nY = to_categorical (Y)\n\n<span style=\"font-style: italic;\" class=\"\"># Check the new shape of the variable</span>\nprint (Y.shape)\n\n# Print the first 5 rows\nprint (Y[0:5])\n<span style=\"font-weight: bold;\" class=\"\">for</span> i <span style=\"font-weight: bold;\" class=\"\">in</span> range (len(l)) :\n    train[l[i]] = Y[:,i]\n    \n<span style=\"font-style: italic;\" class=\"\">#using KFOLD Cross Validation is important to test our model </span>  \n\n%%time\nerr=[]\ny_pred_tot=[] \n\nfold=StratifiedKFold(n_splits=5, shuffle=<span style=\"font-weight: bold;\" class=\"\">True</span>, random_state=1997)\ni=1\n<span style=\"font-weight: bold;\" class=\"\">for</span> train_index, test_index <span style=\"font-weight: bold;\" class=\"\">in</span>\nfold.split (train,train['label']):\n    train1_trn, train1_val = train.iloc[train_index], train.iloc[test_index]\n    model = ClassificationModel ('roberta', 'roberta-base', use_cuda=True,num_labels=4, args={\n    \n'train_batch_size':16,\n'reprocess_input_data': <span style=\"font-weight: bold;\" class=\"\">True,</span>\n'overwrite_output_dir': <span style=\"font-weight: bold;\" class=\"\">True</span>,\n'fp16': <span style=\"font-weight: bold;\" class=\"\">False,</span>\n'do_lower_case':<span style=\"font-weight: bold;\" class=\"\"> False,</span>\n'num_train_epochs': 4,\n'max_seq_length': 128,\n'regression': <span style=\"font-weight: bold;\" class=\"\">False,</span>\n'manual_seed': 1997,\n\"learning_rate\":2e-5,\n'weight_decay':0,\n\"save_eval_checkpoints\": <span style=\"font-weight: bold;\" class=\"\">True,</span>\n\"svae_model_every_epoch\": <span style=\"font-weight: bold;\" class=\"\">False,</span>\n\"silent\": <span style=\"font-weight: bold;\" class=\"\">True</span>})\n    model.train_model (train1_trn)\n    raw_outputs_val = model.eval_model{train1_val)[1]\n    raw_outputs_vals = softmax(raw_outputs_val,axis=1)\n    print(f\"Log_loss: {log_loss(train1_val['label'], raw_outputs_vals)}\")\n    err.aprend(log_loss(train1_val['label'], raw_outputs_vals))    </pre></div><p><span style=\"font-weight: bold;\" class=\"\">Output:</span></p><p>Log_Loss: 0.35637871529928816</p><p>CPU times: user 11min 2s, sys: 4min 21s,</p><p>total: 15min 23s Wall time: 16min 7s</p><p><span style=\"font-weight: bold;\" class=\"\">Log Loss:</span></p><div class=\"codeblock\"><pre>print(\"Mean LogLoss: \",np.mean(err))</pre></div><p><span style=\"font-weight: bold;\" class=\"\">Output:</span></p><p>Mean LogLoss: 0.34930175561484067</p><div class=\"codeblock\"><pre>raw_outputs_vals</pre></div><p><span style=\"font-weight: bold;\" class=\"\">Output:</span></p><p>array([[9.9822301e-01, 3.4856689e-04, 3.8243082e-04, 1.0458552e-03],</p><p>[9.9695909e-01, 1.1522240e-03, 5.9563853e-04, 1.2927916e-03],</p><p>[9.9910539e-01, 2.3084633e-04, 2.5905663e-04, 4.0465154e-04],</p><p>...,</p><p>[3.6545596e-04, 2.8826005e-04, 4.3145564e-04, 9.9891484e-01],</p><p>[4.0789684e-03, 9.9224585e-01, 1.2752400e-03, 2.3997365e-03],</p><p>[3.7382307e-04, 3.4797701e-04, 3.6257200e-04, 9.9891579e-01]],</p><p>dtype=float32)</p><ul class=\"public-DraftStyleDefault-ul\"><li>test our Model</li></ul><div class=\"codeblock\"><pre>pred = model.predict(['i want to travel to thailand'])[1]\noreds = softmax(pred,axis=1)\npreds</pre></div><p><span style=\"font-weight: bold;\" class=\"\">Output:</span></p><p>array([[6.0461409e-04, 3.6119239e-04, 3.3729596e-04, 9.9869716e-01]],</p><p>dtype=float32)</p><blockquote>We create a function which calculate the maximum probability and detect the topic</blockquote><blockquote>for example if we have 0.6 politics 0.1 art 0.15 health 0.15 tourism &gt;&gt;&gt;&gt; topic = politics</blockquote><div class=\"codeblock\"><pre><span style=\"font-weight: bold;\" class=\"\">def</span> estm(raw_outputs_vals):\n    <span style=\"font-weight: bold;\" class=\"\">for</span> i <span style=\"font-weight: bold;\" class=\"\">in</span> range (len(raw_outputs_vals)):\n        <span style=\"font-weight: bold;\" class=\"\">for</span> j <span style=\"font-weight: bold;\" class=\"\">in</span> range (4):                      \n            <span style=\"font-weight: bold;\" class=\"\">if </span>(max(raw_outputs_vals[i])==raw_outputs_vals[i][j]):\n                raw_outputs_vals[i][j]=1\n            <span style=\"font-weight: bold;\" class=\"\">else</span> :\n                raw_outputs_vals[i][j]=0\n    <span style=\"font-weight: bold;\" class=\"\">return</span>(raw_outputs_vals)\n    \nestm(preds)                                                            </pre></div><p><span style=\"font-weight: bold;\" class=\"\">Output:</span></p><p>array([[0., 0., 0., 1.]], dtype=float32)</p><blockquote>Our labels are :['art', 'politics', 'health', 'tourism']</blockquote><blockquote>so that's correct ;)</blockquote><p>I hope you find it useful &amp; helpful!</p><p>Download source <a href=\"https://github.com/NeuroData-ltd/Transformers_Tuto/blob/master/simpletransformers-tuto.ipynb\" target=\"_blank\" rel=\"noreferrer noopener\">code</a> from our github.</p><h3>Useful papers to read more about transformers:</h3><p>Here a list of recommended papers to get in depth with transformers (mainly Bert Model):</p><ul class=\"public-DraftStyleDefault-ul\">\n<li>Cross-Linguistic Syntactic Evaluation of Word Prediction Models</li>\n<li>Emerging Cross-lingual Structure in Pretrained Language Models</li>\n<li>Finding Universal Grammatical Relations in Multilingual BERT</li>\n<li>On the Cross-lingual Transferability of Monolingual Representations</li>\n<li>How multilingual is Multilingual BERT?</li>\n<li>Is Multilingual BERT Fluent in Language Generation?</li>\n<li>Are All Languages Created Equal in Multilingual BERT?</li>\n<li>What&rsquo;s so special about BERT&rsquo;s layers? A closer look at the NLP pipeline in monolingual and multilingual models</li>\n<li>A Study of Cross-Lingual Ability and Language-specific Information in Multilingual BERT</li>\n<li>Cross-Lingual Ability of Multilingual BERT: An Empirical Study</li>\n<li>Multilingual is not enough: BERT for Finnish</li>\n</ul><p>Download all article files from our github repo.</p><h3>Summary:</h3><p>Transformers present the next front in NLP. In just a few years since its introduction, this new architectural trend has surpassed the feats of RNN-based architectures. This exciting pace of invention is perhaps the best part of being early to a new field like Deep Learning.</p><p>If you have any suggestions or a questions please contact NeuroData Team:</p><p><a href=\"https://www.facebook.com/NeuroData.tn\" target=\"_blank\" rel=\"noreferrer noopener\">Facebook</a></p><p><a href=\"https://www.linkedin.com/company/neurodata/\" target=\"_blank\" rel=\"noreferrer noopener\">Linkedin</a></p><p><a href=\"https://github.com/NeuroData-ltd\" target=\"_blank\" rel=\"noreferrer noopener\">Github</a></p><p>This article was written by <a href=\"https://www.linkedin.com/in/yassine-hamdaoui/\" target=\"_blank\" rel=\"noreferrer noopener\">Yassine Hamdaoui</a> and first appeared on <a href=\"https://medium.com/swlh/text-classification-using-transformers-pytorch-implementation-5ff9f21bd106\" target=\"_blank\" rel=\"noreferrer noopener\">Medium</a>. Code credits goes to <a href=\"https://www.linkedin.com/in/med-helmi-klai-933068176/\" target=\"_blank\" rel=\"noreferrer noopener\">Med Klai Helmi</a>, NeuroData Data Scientist and Zindi Mentor.</p>","published_at":"2020-10-01T11:58:18.184Z"}},"queries":{"\"text-classification-using-transformers-and-implementation-using-pytorch\"":{"data":"text-classification-using-transformers-and-implementation-using-pytorch","loading":false,"error":null}}},"fullCompetitions":{},"fullDiscussions":{"data":{},"queries":{"default":{"loading":false,"error":null}}},"fullJobs":{"data":{},"queries":{}},"jobs":{"data":{},"queries":{}},"jobApplications":{"data":{},"queries":{}},"myTeams":{},"notificationSubscriptions":{"data":{},"queries":{}},"participations":{"data":{},"queries":{}},"submissions":{"data":{},"queries":{}},"submissionLimits":{"data":{},"queries":{}},"teams":{"data":{},"queries":{}},"userDiscussions":{"data":{},"queries":{}},"userParticipations":{"data":{},"queries":{}},"userProfiles":{"users":{}},"users":{"data":{},"queries":{}}}</script>
    
    <script>
      window.ga =
        window.ga ||
        function () {
          ;(ga.q = ga.q || []).push(arguments)
        }
      ga.l = +new Date()
      ga("create", "UA-125419148-1", "auto")
      ga("send", "pageview")
      ga('set', 'appName', 'zindi.web')
      ga('set', 'dimension1', 'nextgen');
    </script>
    <script async="" src="./Text Classification Using Transformers (and implementation using Pytorch) - Zindi_files/analytics.js"></script>
     
    <noscript><iframe
        src="https://www.googletagmanager.com/ns.html?id=GTM-KRG85D8"
        height="0"
        width="0"
        style="display: none; visibility: hidden"
      ></iframe
    ></noscript>
    
  <script>!function(l){function e(e){for(var r,t,n=e[0],o=e[1],u=e[2],i=0,a=[];i<n.length;i++)t=n[i],Object.prototype.hasOwnProperty.call(p,t)&&p[t]&&a.push(p[t][0]),p[t]=0;for(r in o)Object.prototype.hasOwnProperty.call(o,r)&&(l[r]=o[r]);for(s&&s(e);a.length;)a.shift()();return c.push.apply(c,u||[]),f()}function f(){for(var e,r=0;r<c.length;r++){for(var t=c[r],n=!0,o=1;o<t.length;o++){var u=t[o];0!==p[u]&&(n=!1)}n&&(c.splice(r--,1),e=i(i.s=t[0]))}return e}var t={},p={runtime:0},c=[];function i(e){if(t[e])return t[e].exports;var r=t[e]={i:e,l:!1,exports:{}};return l[e].call(r.exports,r,r.exports,i),r.l=!0,r.exports}i.m=l,i.c=t,i.d=function(e,r,t){i.o(e,r)||Object.defineProperty(e,r,{enumerable:!0,get:t})},i.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},i.t=function(r,e){if(1&e&&(r=i(r)),8&e)return r;if(4&e&&"object"==typeof r&&r&&r.__esModule)return r;var t=Object.create(null);if(i.r(t),Object.defineProperty(t,"default",{enumerable:!0,value:r}),2&e&&"string"!=typeof r)for(var n in r)i.d(t,n,function(e){return r[e]}.bind(null,n));return t},i.n=function(e){var r=e&&e.__esModule?function(){return e.default}:function(){return e};return i.d(r,"a",r),r},i.o=function(e,r){return Object.prototype.hasOwnProperty.call(e,r)},i.p="https://assets.zindi.africa/";var r=(n=window.webpackJsonp=window.webpackJsonp||[]).push.bind(n);n.push=e;for(var n=n.slice(),o=0;o<n.length;o++)e(n[o]);var s=r;f()}([]);
//# sourceMappingURL=runtime.8d4eb9324d7c3d54849b.js.map</script><script type="text/javascript" src="./Text Classification Using Transformers (and implementation using Pytorch) - Zindi_files/vendor.019171b12285f6597772.js" defer=""></script><script type="text/javascript" src="./Text Classification Using Transformers (and implementation using Pytorch) - Zindi_files/bundle.9c8d652f1bf8225cb2ca.js" defer=""></script>

<div class="ReactModalPortal"></div>
<script type="text/javascript" id="">(function(a,e,f,g,b,c,d){a[b]=a[b]||function(){(a[b].a=a[b].a||[]).push(arguments)};a[b].l=1*new Date;c=e.createElement(f);d=e.getElementsByTagName(f)[0];c.async=1;c.src=g;d.parentNode.insertBefore(c,d)})(window,document,"script","https://mc.yandex.ru/metrika/tag.js","ym");ym(67869277,"init",{clickmap:!0,trackLinks:!0,accurateTrackBounce:!0,webvisor:!0});</script>
<noscript><div><img src="https://mc.yandex.ru/watch/67869277" style="position:absolute; left:-9999px;" alt=""></div></noscript>
<div id="fatkun-drop-panel">
        <a id="fatkun-drop-panel-close-btn">×</a>
            <div id="fatkun-drop-panel-inner">
                <div class="fatkun-content">
                    <svg class="fatkun-icon" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg" p-id="5892"><path d="M494.933333 782.933333c2.133333 2.133333 4.266667 4.266667 8.533334 6.4h8.533333c6.4 0 10.666667-2.133333 14.933333-6.4l2.133334-2.133333 275.2-275.2c8.533333-8.533333 8.533333-21.333333 0-29.866667-8.533333-8.533333-21.333333-8.533333-29.866667 0L533.333333 716.8V128c0-12.8-8.533333-21.333333-21.333333-21.333333s-21.333333 8.533333-21.333333 21.333333v588.8L249.6 475.733333c-8.533333-8.533333-21.333333-8.533333-29.866667 0-8.533333 8.533333-8.533333 21.333333 0 29.866667l275.2 277.333333zM853.333333 874.666667H172.8c-12.8 0-21.333333 8.533333-21.333333 21.333333s8.533333 21.333333 21.333333 21.333333H853.333333c12.8 0 21.333333-8.533333 21.333334-21.333333s-10.666667-21.333333-21.333334-21.333333z" p-id="5893"></path></svg>
                    <div class="fatkun-title">Drag and Drop</div>
                    <div class="fatkun-desc">The image will be downloaded</div>
                </div>
            </div>
    </div></body></html>