-
Notifications
You must be signed in to change notification settings - Fork 0
/
How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi.html
228 lines (195 loc) · 60.4 KB
/
How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
<!DOCTYPE html>
<!-- saved from url=(0099)https://zindi.africa/learning/how-to-use-texthero-to-prep-a-text-based-dataset-for-your-nlp-project -->
<html lang="en" class="wf-loadingwf-inactive wf-nunito-n3-active wf-nunito-n4-active wf-active"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<!-- nextgen -->
<meta http-equiv="x-ua-compatible" content="ie=edge">
<meta name="viewport" content="width=device-width, initial-scale=1, viewport-fit=cover">
<link rel="apple-touch-icon-precomposed" sizes="57x57" href="https://assets.zindi.africa/apple-touch-icon-57x57.png">
<link rel="apple-touch-icon-precomposed" sizes="114x114" href="https://assets.zindi.africa/apple-touch-icon-114x114.png">
<link rel="apple-touch-icon-precomposed" sizes="72x72" href="https://assets.zindi.africa/apple-touch-icon-72x72.png">
<link rel="apple-touch-icon-precomposed" sizes="144x144" href="https://assets.zindi.africa/apple-touch-icon-144x144.png">
<link rel="apple-touch-icon-precomposed" sizes="120x120" href="https://assets.zindi.africa/apple-touch-icon-120x120.png">
<link rel="apple-touch-icon-precomposed" sizes="152x152" href="https://assets.zindi.africa/apple-touch-icon-152x152.png">
<link rel="icon" type="image/png" href="https://assets.zindi.africa/favicon-32x32.png" sizes="32x32">
<link rel="icon" type="image/png" href="https://assets.zindi.africa/favicon-16x16.png" sizes="16x16">
<meta name="twitter:card" content="summary_large_image">
<meta name="twitter:site" content="@ZindiAfrica">
<meta name="application-name" content=" ">
<meta name="msapplication-TileColor" content="#FFFFFF">
<meta name="msapplication-TileImage" content="https://assets.zindi.africa/mstile-144x144.png">
<script async="" src="./How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi_files/tag.js"></script><script async="" src="./How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi_files/gtm.js"></script><script src="./How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi_files/webfont.js" async=""></script><script type="text/javascript">
// Configuration for Google's Web Font Loader (webfont.js, injected below):
// request Nunito weights 300/400; give the loader its own 2s timeout.
var WebFontConfig = {
  google: {
    families: ["Nunito:300,400"],
  },
  timeout: 2000,
}
;(function (d) {
  var h = d.documentElement
  var onerror = function () {
    // FIX: prepend a space so the class is appended as a separate token.
    // The original `+= "wf-inactive"` concatenated onto the previous class
    // and produced "wf-loadingwf-inactive" on <html> (visible in the saved
    // markup's class attribute).
    h.className += " wf-inactive"
  }
  // Fall back to wf-inactive if webfont.js hasn't arrived within 1s.
  var st = setTimeout(onerror, 1000)
  // FIX: same missing-separator bug as above for "wf-loading".
  h.className += " wf-loading"
  var wf = d.createElement("script"),
    s = d.scripts[0]
  wf.src = "https://ajax.googleapis.com/ajax/libs/webfont/1.6.26/webfont.js"
  wf.async = true
  wf.onerror = onerror
  wf.onload = function () {
    // Loader arrived: cancel the fallback timer. webfont.js itself swaps
    // in wf-active / per-font classes based on WebFontConfig.
    clearTimeout(st)
  }
  s.parentNode.insertBefore(wf, s)
})(document)
</script>
<title>How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi</title><meta data-react-helmet="true" name="description" content="Zindi is a data science competition platform with the mission of building the data science ecosystem in Africa. Zindi hosts a community of data scientists dedicated to solving the continent's most pressing problems through machine learning and artificial intelligence."><meta data-react-helmet="true" property="og:site_name" content="Zindi"><meta data-react-helmet="true" property="twitter:site_name" content="Zindi"><meta data-react-helmet="true" property="og:title" content="How to Use Texthero to Prep a Text-based Dataset for Your NLP Project"><meta data-react-helmet="true" property="og:description" content="Natural Language Processing (NLP) is one of the most important fields of study and research in today’s world. It has many applications in the business sector such as chatbots, sentiment analysis, and document classification."><meta data-react-helmet="true" property="og:image" content="https://zindpublic.blob.core.windows.net/public/uploads/blog_post/image/71/big_thumb_0762d4d0-b258-42b7-bcfb-80007420af85.jpg"><meta data-react-helmet="true" property="twitter:title" content="How to Use Texthero to Prep a Text-based Dataset for Your NLP Project"><meta data-react-helmet="true" property="twitter:description" content="Natural Language Processing (NLP) is one of the most important fields of study and research in today’s world. It has many applications in the business sector such as chatbots, sentiment analysis, and document classification."><meta data-react-helmet="true" property="twitter:image" content="https://zindpublic.blob.core.windows.net/public/uploads/blog_post/image/71/big_thumb_0762d4d0-b258-42b7-bcfb-80007420af85.jpg">
<script>
// Google Tag Manager bootstrap (standard snippet, code kept verbatim):
// initialises the dataLayer, records the load start time, then injects the
// async gtm.js loader ahead of the first <script> on the page.
;(function (w, d, s, l, i) {
// Ensure the dataLayer array exists before GTM loads.
w[l] = w[l] || []
w[l].push({ "gtm.start": new Date().getTime(), event: "gtm.js" })
// f = first existing <script>; j = the new loader tag inserted before it.
var f = d.getElementsByTagName(s)[0],
j = d.createElement(s),
// Append "&l=<name>" only when a non-default dataLayer name is used.
dl = l != "dataLayer" ? "&l=" + l : ""
j.async = true
j.src = "https://www.googletagmanager.com/gtm.js?id=" + i + dl
f.parentNode.insertBefore(j, f)
})(window, document, "script", "dataLayer", "GTM-KRG85D8")
</script>
<link rel="stylesheet" href="./How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi_files/css" media="all"><link href="./How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi_files/bundle.d57dc7d8a01460a702c7.css" rel="stylesheet"></head>
<body _c_t_common="1" data-new-gr-c-s-check-loaded="14.1020.0" data-gr-ext-installed="">
<noscript> You need to enable JavaScript to run this app. </noscript>
<div id="app"><div class="App__container___fj0c9"><div class="App__section___1nGff"></div><div class="App__section___1nGff"><div class="App__contained___3emDO"><div class="Header__container___3FtbH"><a href="https://zindi.africa/"><div class="Header__logo___1eRaO">Zindi</div></a><div class="Header__menuContainer___2izgT"><div class="Menu__container___1sjgb"><a class="Menu__link___3x4C4" href="https://zindi.africa/competitions"><span class="Menu__linkInner___3LB5N">Compete</span></a><a aria-current="page" class="Menu__link___3x4C4 Menu__activeLink___1MA6r" href="https://zindi.africa/learning"><span class="Menu__linkInner___3LB5N">Learn</span></a><a class="Menu__link___3x4C4" href="https://zindi.africa/jobs"><span class="Menu__linkInner___3LB5N">Find a Job</span></a></div><div class="Menu__container___1sjgb Header__menuRight___3wiDe Menu__justifyRight___3r6Ws"><a class="Menu__link___3x4C4" href="https://zindi.africa/inbox"><span class="Menu__linkInner___3LB5N"><div class="Inbox__container___3xHCr"><svg class="Inbox__messagesIcon___2XP8V" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M4 4h16c1.1 0 2 .9 2 2v12c0 1.1-.9 2-2 2H4c-1.1 0-2-.9-2-2V6c0-1.1.9-2 2-2z"></path><polyline points="22,6 12,13 2,6"></polyline></svg><div class="Inbox__unseen___31NED"></div></div></span></a></div></div><div class="Header__userMenu___2iOts"><div class="UserMenu__container___ypkko"><button class="Button__base___NhksY Button__blank-normal___1nB5F UserMenu__user___a0zJo"><span class="Button__inner___3jkeF"><span class="User__container___18HoF User__size-normal___26ZPA"><img class="User__avatar___6aNx2" src="./How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi_files/thumb.default.png" alt=""><span class="User__username___64PE2">Glencode</span></span></span></button><div class=""></div></div></div></div></div></div><div class="App__section___1nGff 
App__content___WFkDX"><div class="WithSubheader__container___3qd5U"><div class="WithSubheader__header___2o1oX WithSubheader__withHeader___35ECw"><div class="BlogPost__headerImage___2fAz4" style="background-image: url("https://zindpublic.blob.core.windows.net/public/uploads/blog_post/image/71/header_0762d4d0-b258-42b7-bcfb-80007420af85.jpg");"></div></div><div><div class="App__contained___3emDO"><div class="Paper__paper___2M-1R Paper__padding-1___3sKLR BlogPost__paper___1D3Be"><div class="BlogPost__date___3BhZy">20 Aug 2020, 17:34</div><h2 class="BlogPost__title___RUU5Z">How to Use Texthero to Prep a Text-based Dataset for Your NLP Project</h2><div class="Html__container___1AJFz BlogPost__intro___31fc-"><p>Natural Language Processing (NLP) is one of the most important fields of study and research in today’s world. It has many applications in the business sector such as chatbots, sentiment analysis, and document classification.</p><p>
</p><p></p><div class="image">
<img src="./How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi_files/93ded517-60dc-426d-b560-ed235d0e086d.png"> </div><p></p><p>Preprocessing and representing text is one of the trickiest and most annoying parts of working on an NLP project. Text-based datasets can be incredibly thorny and difficult to preprocess. But fortunately, the latest Python package called Texthero can help you solve these challenges.</p><h2>What is Texthero?</h2><p>Texthero is a simple Python toolkit that helps you work with a text-based dataset. It provides quick and easy functionalities that let you <span style="font-style: italic;" class="">preprocess, represent, map into vectors </span>and<span style="font-style: italic;" class=""> visualize</span> text data in just a couple of lines of code.</p><p></p><div class="image">
<img src="./How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi_files/dfd9c45f-0c28-4b73-9e51-2b3d93a33b4a.png"> </div><p></p><p>Texthero is designed to be used on top of pandas, so it makes it easier to preprocess and analyze text-based Pandas Series or Dataframes.</p><p>If you are working on an NLP project, Texthero can help you get things done faster than before and gives you more time to focus on important tasks.</p><p><span style="font-weight: bold;" class="">NOTE:</span> The Texthero library is still in the beta version. You might face some bugs and pipelines might change. A faster and better version will be released and it will bring some major changes.</p><h2>Texthero Overview</h2><p></p><p></p><div class="image">
<img src="./How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi_files/628c0356-b7cc-44e5-a6c2-d6547e8dffbe.jpg"> </div><p></p><p>Texthero has four useful modules that handle different functionalities that you can apply in your text-based dataset.</p><ol class="public-DraftStyleDefault-ol">
<li>
<a href="https://texthero.org/docs/api-preprocessing" target="_blank" rel="noreferrer noopener"><span style="font-weight: bold;" class="">Preprocessing</span></a>
This module allows for the efficient pre-processing of text-based Pandas Series or DataFrames. It has different methods to clean your text dataset such as lowercase(), remove_html_tags() and remove_urls().</li>
<li>
<a href="https://texthero.org/docs/api-nlp" target="_blank" rel="noreferrer noopener"><span style="font-weight: bold;" class="">NLP</span></a>
This module has a few NLP tasks such as named_entities, noun_chunks, and so on.</li>
<li>
<a href="https://texthero.org/docs/api-representation" target="_blank" rel="noreferrer noopener"><span style="font-weight: bold;" class="">Representation</span></a>
This module has different algorithms to map words into vectors such as TF-IDF, GloVe, Principal Component Analysis(PCA), and term_frequency.</li>
<li>
<a href="https://texthero.org/docs/api-visualization" target="_blank" rel="noreferrer noopener"><span style="font-weight: bold;" class="">Visualization</span></a>
The last module has three different methods to visualize the insights and statistics of a text-based Pandas DataFrame. It can plot a scatter plot and word cloud.</li>
</ol><h3>Install Texthero</h3><p>Texthero is free, open-source, and well documented. To install it open a terminal and execute the following command:</p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline">pip install texthero</span></pre></div><p>The package uses a lot of other libraries on the back-end such as Gensim, SpaCy, scikit-learn, and NLTK. You don't need to install them all separately, pip will take care of that.</p><h3>How to use Texthero</h3><p>In this article I will use a news dataset to show you how you can use different methods provided by texthero's modules in your own NLP project.</p><p>We will start by importing important Python packages that we are going to use.</p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline">#import important packages
</span></pre></div><p></p><p><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline">
import texthero as hero
import pandas as pd</span></p><p>Then we'll load a dataset from the data directory. The dataset for this article focuses on news in the <a href="https://medium.com/@Davis_David/meet-the-winners-of-swahili-news-classification-challenge-60f5edd7aa9" target="_blank" rel="noreferrer noopener">Swahili </a>Language.</p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline">#load dataset
data = pd.read_csv("data/swahili_news_dataset.csv")</span></pre></div><p>Let's look at the top 5 rows of the dataset:</p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline"># show top 5 rows
data.head()</span>
</pre></div><p></p><div class="image">
<img src="./How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi_files/eff614d8-e4fb-402c-a7a4-30061f5f5c37.png"> </div><p></p><p>As you can see, in our dataset we have three columns (id, content, and category). For this article we will focus on the content feature.</p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline"># select news content only and show top 5 rows
news_content = data[["content"]]
news_content.head()</span></pre></div><p>We have created a new dataframe focused on content only, and then we'll show the top 5 rows.</p><p></p><div class="image">
<img src="./How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi_files/e54305eb-8d27-42c1-9c6f-480b1b9643c1.png"> </div><p></p><h3>Preprocessing with Texthero</h3><p>We can use the <span style="font-weight: bold;" class="">clean().</span> method to pre-process a text-based Pandas Series.</p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline"># clean the news content by using clean method from hero package
news_content['clean_content'] = hero.clean(news_content['content'])</span></pre></div><p>The <span style="font-weight: bold;" class="">clean()</span> method runs seven functions when you pass a pandas series. These seven functions are:</p><ul class="public-DraftStyleDefault-ul">
<li>lowercase(s): Lowercases all text.</li>
<li>remove_diacritics(): Removes all accents from strings.</li>
<li>remove_stopwords(): Removes all stop words.</li>
<li>remove_digits(): Removes all blocks of digits.</li>
<li>remove_punctuation(): Removes all string.punctuation (!"#$%&'()*+,-./:;<=>?@[]^_`{|}~).</li>
<li>fillna(s): Replaces unassigned values with empty spaces.</li>
<li>remove_whitespace(): Removes all white space between words</li>
</ul><p>Now we can see the cleaned news content.</p><p><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline">#show unclean and clean news content
news_content.head()</span></p><p></p><div class="image">
<img src="./How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi_files/452945b3-751b-4634-b911-d3bfd1343d24.png"> </div><p></p><h3>Custom Cleaning</h3><p>If the default pipeline from the <span style="font-weight: bold;" class="">clean()</span> method does not fit your needs, you can create a custom pipeline with the list of functions that you want to apply in your dataset.</p><p>As an example, I created a custom pipeline with only 5 functions to clean my dataset.</p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline">#create custom pipeline
from texthero import preprocessing
custom_pipeline = [preprocessing.fillna,
preprocessing.lowercase,
preprocessing.remove_whitespace,
preprocessing.remove_punctuation,
preprocessing.remove_urls,
]</span></pre></div><p>Now I can use the custom_pipeline to clean my dataset.</p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline">#alternative for custom pipeline
news_content['clean_custom_content'] = news_content['content'].pipe(hero.clean, custom_pipeline)</span></pre></div><p>You can see the clean dataset we have created by using the custom pipeline.</p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline"># show output of custom pipeline
news_content.clean_custom_content.head()</span></pre></div><p></p><div class="image">
<img src="./How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi_files/09b5f647-55cd-49d1-bc32-82164d1eb179.png"> </div><p></p><h2>Useful preprocessing methods</h2><p>Here are some other useful functions from preprocessing modules that you can try to clean your text-based dataset.</p><h3>Remove digits</h3><p>You can use the <span style="font-weight: bold;" class="">remove_digits() </span>function to remove digits in your text-based datasets.</p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline">text = pd.Series("Hi my phone number is +255 711 111 111 call me at 09:00 am")
clean_text = hero.preprocessing.remove_digits(text)
print(clean_text)</span></pre></div><p>output: Hi my phone number is + call me at : am
dtype: object</p><h3>Remove stopwords</h3><p>You can use the <span style="font-weight: bold;" class="">remove_stopwords() </span>function to remove stopwords in your text-based datasets.</p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline">text = pd.Series("you need to know NLP to develop the chatbot that you desire")
clean_text = hero.remove_stopwords(text)
print(clean_text)</span></pre></div><p>output: need know NLP develop chatbot desire
dtype: object</p><h3>Remove URLs</h3><p>You can use the <span style="font-weight: bold;" class="">remove_urls() </span>function to remove links in your text-based datasets.</p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline">text = pd.Series("Go to https://www.freecodecamp.org/news/ to read more articles you like")
clean_text = hero.remove_urls(text)
print(clean_text)</span></pre></div><p>output: Go to to read more articles you like
dtype: object</p><h3>Tokenize</h3><p>Tokenize each row of the given Pandas Series by using the <span style="font-weight: bold;" class="">tokenize() </span>method and return a Pandas Series where each row contains a list of tokens.</p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline">text = pd.Series(["You can think of Texthero as a tool to help you understand and work with text-based dataset. "])
clean_text = hero.tokenize(text)
print(clean_text)</span></pre></div><p>output: [You, can, think, of, Texthero, as, a, tool, to, help, you, understand, and, work, with, text, based, dataset]
dtype: object</p><h3>Remove HTML tags</h3><p>You can remove html tags from the given Pandas Series by using the <span style="font-weight: bold;" class="">remove_html_tags()</span> method.</p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline">text = pd.Series("<html><body><h2>hello world</h2></body></html>")
clean_text = hero.remove_html_tags(text)
print(clean_text)</span></pre></div><p>output: hello world
dtype: object</p><h2>Useful visualization methods</h2><p>Texthero contains different methods to visualize insights and statistics of a text-based Pandas DataFrame.</p><h3>Top words</h3><p>If you want to know the top words in your text-based dataset, you can use the <span style="font-weight: bold;" class="">top_words() </span>method from the visualization module. This method is useful if you want to see additional words that you can add to the stop words lists.</p><p>This method does not return a bar graph, so I will use <span style="font-weight: bold;" class="">matplotlib</span> to visualize the top words in a bar graph.</p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline">import matplotlib.pyplot as plt
NUM_TOP_WORDS = 20
top_20 = hero.visualization.top_words(news_content['clean_content']).head(NUM_TOP_WORDS)
# Draw the bar chart
top_20.plot.bar(rot=90, title="Top 20 words");
plt.show(block=True);</span></pre></div><p></p><div class="image">
<img src="./How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi_files/288523b4-10c6-4b59-934c-bdcaaa194249.png"> </div><p></p><p>In the graph above we can visualize the top 20 words from our news dataset.</p><h3>Wordclouds</h3><p>The <span style="font-weight: bold;" class="">wordcloud()</span> method from the visualization module plots an image using WordCloud from the word_cloud package.</p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline">#Plot wordcloud image using WordCloud method
hero.wordcloud(news_content.clean_content, max_words=100,)</span></pre></div><p>We passed the dataframe series and number of maximum words (for this example, it is 100 words) in the <span style="font-weight: bold;" class="">wordcloud()</span> method.</p><p></p><div class="image">
<img src="./How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi_files/02636136-6f44-4871-8061-e2565724f9a3.png"> </div><p></p><h2>Useful representation methods</h2><p>Texthero contains different methods from the representation module that help you map words into vectors using different algorithms such as TF-IDF, word2vec or GloVe. In this section I will show you how you can use these methods.</p><h3>TF-IDF</h3><p>You can represent a text-based Pandas Series using TF-IDF. I created a new pandas series with two pieces of news content and represented them in TF_IDF features by using the <span style="font-weight: bold;" class="">tfidf() </span>method.</p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline"># Create a new text-based Pandas Series.
news = pd.Series(["mkuu wa mkoa wa tabora aggrey mwanri amesitisha likizo za viongozi wote mkoani humo kutekeleza maazimio ya jukwaa la fursa za biashara la mkoa huo", "serikali imetoa miezi sita kwa taasisi zote za umma ambazo hazitumii mfumo wa gepg katika ukusanyaji wa fedha kufanya hivyo na baada ya hapo itafanya ukaguzi na kuwawajibisha"])
#convert into tfidf features
hero.tfidf(news)</span></pre></div><p>output: [0.187132760851739, 0.0, 0.187132760851739, 0....
[0.0, 0.18557550845969953, 0.0, 0.185575508459...
dtype: object</p><p><span style="font-weight: bold;" class="">NOTE:</span> TF-IDF stands for<span style="font-style: italic;" class=""> term frequency-inverse document frequency.</span></p><h3>Term Frequency</h3><p>You can represent a text-based Pandas Series using the <span style="font-weight: bold;" class="">term_frequency()</span> method. Term frequency (TF) is used to show how frequently an expression (term or word) occurs in a document or text content.</p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline">news = pd.Series(["mkuu wa mkoa wa tabora aggrey mwanri amesitisha likizo za viongozi wote mkoani humo kutekeleza maazimio ya jukwaa la fursa za biashara la mkoa huo", "serikali imetoa miezi sita kwa taasisi zote za umma ambazo hazitumii mfumo wa gepg katika ukusanyaji wa fedha kufanya hivyo na baada ya hapo itafanya ukaguzi na kuwawajibisha"])
# Represent a text-based Pandas Series using term_frequency.
hero.term_frequency(news)</span></pre></div><p>output: [1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, ...
[0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, ...
dtype: object</p><h3>K-means</h3><p>Texthero can perform K-means clustering algorithm by using the <span style="font-weight: bold;" class="">kmeans() </span>method. If you have an unlabeled text-based dataset, you can use this method to group content according to their similarities.</p><p>In this example, I will create a new pandas dataframe called <span style="font-weight: bold;" class="">news</span> with the following columns <span style="font-style: italic;" class="">content,tfidf and kmeans_labels.</span></p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline">column_names = ["content","tfidf", "kmeans_labels"]
news = pd.DataFrame(columns = column_names)</span></pre></div><p>We will use only the first 30 pieces of cleaned content from our <span style="font-style: italic;" class="">news_content dataframe</span> and cluster them into groups by using the <span style="font-weight: bold;" class="">kmeans() </span>method.</p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline"># collect 30 clean content.
news["content"] = news_content.clean_content[:30]
# convert them into tf-idf features.
news['tfidf'] = (
news['content']
.pipe(hero.tfidf)
)
# perform clustering algorithm by using kmeans()
news['kmeans_labels'] = (
news['tfidf']
.pipe(hero.kmeans, n_clusters=5)
.astype(str)
)</span></pre></div><p>In the above source code, in the pipeline of the k-means method we passed the number of clusters which is 5. This means we will group these contents into 5 groups.</p><p>Now the selected news content has been labeled into five groups.</p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline"># show content and their labels
news[["content","kmeans_labels"]].head()</span></pre></div><p></p><div class="image">
<img src="./How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi_files/530c2b83-c5b5-4057-aa33-4d6948e0722f.png"> </div><p></p><h3>PCA</h3><p>You can also use the <span style="font-weight: bold;" class="">pca() </span>method to perform principal component analysis on the given Pandas Series. <span style="font-weight: bold;" class="">Principal component analysis</span> (<span style="font-weight: bold;" class="">PCA</span>) is a technique for reducing the dimensionality of your datasets. This increases interpretability but at the same time minimizes information loss.</p><p>In this example we use the tfidf features from the news dataframe and represent them into two components by using the <span style="font-weight: bold;" class="">pca()</span> method. Finally we will show a scatterplot by using the <span style="font-weight: bold;" class="">scatterplot()</span> method.</p><div class="codeblock"><pre><span style="font-family: "SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;" class="codeinline">#perform pca
news['pca'] = news['tfidf'].pipe(hero.pca)
#show scatterplot
hero.scatterplot(news, 'pca', color='kmeans_labels', title="news")</span></pre></div><p></p><div class="image">
<img src="./How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi_files/83680430-1bff-4f7f-87b3-537038abd950.png"> </div><p></p><h2>Wrap up</h2><p>In this article, you've learned the basics of how to use the Texthero toolkit Python package in your NLP project. You can learn more about the methods available in the <a href="https://texthero.org/docs/api-preprocessing" target="_blank" rel="noreferrer noopener">documentation</a>.</p><p>You can download the dataset and notebook used in this article here: <a href="https://github.com/Davisy/Texthero-Python-Toolkit" target="_blank" rel="noreferrer noopener">https://github.com/Davisy/Texthero-Python-Toolkit</a> .</p><p>If you learned something new or enjoyed reading this article, please share it so that others can see it. Until then, see you in the next post!</p><h3>About the author</h3><p>Davis David is Zindi Ambassador for Tanzania and a data scientist at ParrotAI. He is passionate about artificial intelligence, machine learning, deep learning and big data. He is a co-organizer and facilitator of the AI movement in Tanzania; conducting AI meetups, workshops and events with a passion to build a community of data scientists to solve local problems. 
He can be reached on Twitter <a href="https://twitter.com/Davis_McDavid" target="_blank" rel="noreferrer noopener">@Davis_McDavid</a>.</p><p>You can read the original blog post <a href="https://www.freecodecamp.org/news/how-to-work-and-understand-text-based-dataset-with-texthero/" target="_blank" rel="noreferrer noopener">here</a>.</p></div></div></div></div></div></div><div class="App__section___1nGff"><div class="Footer__container___3vGXM"><div class="App__contained___3emDO"><div class="Footer__links___dDoS-"><div class="Footer__column___1yO21"><div><a href="https://zindi.africa/competitions">Competitions</a></div><div><a href="https://zindi.africa/hackathons">Hackathons</a></div><div><a href="https://zindi.africa/data_scientists">Data Scientists</a></div><div><a href="https://zindi.africa/discussions">Discussions</a></div><div><a href="https://zindi.africa/jobs">Jobs Board</a></div></div><div class="Footer__column___1yO21"><div><a href="https://zindi.africa/hosting_competition">Host competition</a></div><div><a href="https://zindi.africa/about">About Us</a></div><div><a href="https://zindi.africa/partners">Our Partners</a></div><div><a href="https://zindi.africa/contact_us">Contact Us</a></div></div><div class="Footer__column___1yO21"><div><a href="https://zindi.africa/rules">Rules and Guidelines</a></div><div><a href="https://zindi.africa/terms">Terms of Use</a></div><div><a href="https://zindi.africa/privacy">Privacy Policy</a></div><div><a href="https://zindi.africa/faq">FAQs</a></div></div><div class="Footer__column___1yO21"><div><a target="_blank" rel="noopener noreferrer" href="https://www.linkedin.com/company/zindi-africa">LinkedIn</a></div><div><a target="_blank" rel="noopener noreferrer" href="https://www.facebook.com/ZindiAfrica-311192052980655">Facebook</a></div><div><a target="_blank" rel="noopener noreferrer" href="https://twitter.com/ZindiAfrica">Twitter</a></div><div><a target="_blank" rel="noopener noreferrer" 
href="https://www.instagram.com/zindi.africa">Instagram</a></div><div><a target="_blank" rel="noopener noreferrer" href="https://zindi.medium.com/">Medium</a></div><div><a target="_blank" rel="noopener noreferrer" href="https://www.youtube.com/channel/UCQHq2JY2BqY2UTDCmVWyGBw">Youtube</a></div><div><a target="_blank" rel="noopener noreferrer" href="https://github.com/zindiafrica">Github</a></div></div></div><div class="Footer__logo___ZtNwP">Zindi</div></div></div></div></div></div><script>window.__INITIAL_STATE__ = {"blogPosts":{"data":{},"queries":{}},"comments":{"data":{},"queries":{}},"competitionTags":{"data":{},"queries":{}},"competitions":{"data":{},"queries":{}},"conspiracyParticipations":{"data":{},"queries":{}},"discussions":{"data":{},"queries":{}},"fullBlogPosts":{"data":{"how-to-use-texthero-to-prep-a-text-based-dataset-for-your-nlp-project":{"id":"how-to-use-texthero-to-prep-a-text-based-dataset-for-your-nlp-project","image":"https://zindpublic.blob.core.windows.net/public/uploads/blog_post/image/71/header_0762d4d0-b258-42b7-bcfb-80007420af85.jpg","big_image":"https://zindpublic.blob.core.windows.net/public/uploads/blog_post/image/71/big_thumb_0762d4d0-b258-42b7-bcfb-80007420af85.jpg","header_image":"https://zindpublic.blob.core.windows.net/public/uploads/blog_post/image/71/header_0762d4d0-b258-42b7-bcfb-80007420af85.jpg","title":"How to Use Texthero to Prep a Text-based Dataset for Your NLP Project","intro_html":"<p>Natural Language Processing (NLP) is one of the most important fields of study and research in today’s world. It has many applications in the business sector such as chatbots, sentiment analysis, and document classification.</p>","intro_plain":"Natural Language Processing (NLP) is one of the most important fields of study and research in today’s world. 
It has many applications in the business sector such as chatbots, sentiment analysis, and document classification.","content_html":"<p>\n</p><p><div class=\"image\">\n<img src=\"https://zindpublic.blob.core.windows.net/public/uploads/image_attachment/image/482/93ded517-60dc-426d-b560-ed235d0e086d.png\"> </div></p><p>Preprocessing and representing text is one of the trickiest and most annoying parts of working on an NLP project. Text-based datasets can be incredibly thorny and difficult to preprocess. But fortunately, the latest Python package called Texthero can help you solve these challenges.</p><h2>What is Texthero?</h2><p>Texthero is a simple Python toolkit that helps you work with a text-based dataset. It provides quick and easy functionalities that let you <span style=\"font-style: italic;\" class=\"\">preprocess, represent, map into vectors </span>and<span style=\"font-style: italic;\" class=\"\"> visualize</span> text data in just a couple of lines of code.</p><p><div class=\"image\">\n<img src=\"https://zindpublic.blob.core.windows.net/public/uploads/image_attachment/image/474/dfd9c45f-0c28-4b73-9e51-2b3d93a33b4a.png\"> </div></p><p>Texthero is designed to be used on top of pandas, so it makes it easier to preprocess and analyze text-based Pandas Series or Dataframes.</p><p>If you are working on an NLP project, Texthero can help you get things done faster than before and gives you more time to focus on important tasks.</p><p><span style=\"font-weight: bold;\" class=\"\">NOTE:</span> The Texthero library is still in the beta version. You might face some bugs and pipelines might change. 
A faster and better version will be released and it will bring some major changes.</p><h2>Texthero Overview</h2><p></p><p><div class=\"image\">\n<img src=\"https://zindpublic.blob.core.windows.net/public/uploads/image_attachment/image/480/628c0356-b7cc-44e5-a6c2-d6547e8dffbe.jpg\"> </div></p><p>Texthero has four useful modules that handle different functionalities that you can apply in your text-based dataset.</p><ol class=\"public-DraftStyleDefault-ol\">\n<li>\n<a href=\"https://texthero.org/docs/api-preprocessing\" target=\"_blank\" rel=\"noreferrer noopener\"><span style=\"font-weight: bold;\" class=\"\">Preprocessing</span></a>\nThis module allows for the efficient pre-processing of text-based Pandas Series or DataFrames. It has different methods to clean your text dataset such as lowercase(), remove_html_tags() and remove_urls().</li>\n<li>\n<a href=\"https://texthero.org/docs/api-nlp\" target=\"_blank\" rel=\"noreferrer noopener\"><span style=\"font-weight: bold;\" class=\"\">NLP</span></a>\nThis module has a few NLP tasks such as named_entities, noun_chunks, and so on.</li>\n<li>\n<a href=\"https://texthero.org/docs/api-representation\" target=\"_blank\" rel=\"noreferrer noopener\"><span style=\"font-weight: bold;\" class=\"\">Representation</span></a>\nThis module has different algorithms to map words into vectors such as TF-IDF, GloVe, Principal Component Analysis(PCA), and term_frequency.</li>\n<li>\n<a href=\"https://texthero.org/docs/api-visualization\" target=\"_blank\" rel=\"noreferrer noopener\"><span style=\"font-weight: bold;\" class=\"\">Visualization</span></a>\nThe last module has three different methods to visualize the insights and statistics of a text-based Pandas DataFrame. It can plot a scatter plot and word cloud.</li>\n</ol><h3>Install Texthero</h3><p>Texthero is free, open-source, and well documented. 
To install it open a terminal and execute the following command:</p><div class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\">pip install texthero</span></pre></div><p>The package uses a lot of other libraries on the back-end such as Gensim, SpaCy, scikit-learn, and NLTK. You don't need to install them all separately, pip will take care of that.</p><h3>How to use Texthero</h3><p>In this article I will use a news dataset to show you how you can use different methods provided by texthero's modules in your own NLP project.</p><p>We will start by importing important Python packages that we are going to use.</p><div class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\">#import important packages\n</span></pre></div><p></p><p><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\">\nimport texthero as hero\nimport pandas as pd</span></p><p>Then we'll load a dataset from the data directory. 
The dataset for this article focuses on news in the <a href=\"https://medium.com/@Davis_David/meet-the-winners-of-swahili-news-classification-challenge-60f5edd7aa9\" target=\"_blank\" rel=\"noreferrer noopener\">Swahili </a>Language.</p><div class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\">#load dataset \n\ndata = pd.read_csv(\"data/swahili_news_dataset.csv\")</span></pre></div><p>Let's look at the top 5 rows of the dataset:</p><div class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\"># show top 5 rows \n\ndata.head()</span>\n</pre></div><p><div class=\"image\">\n<img src=\"https://zindpublic.blob.core.windows.net/public/uploads/image_attachment/image/481/eff614d8-e4fb-402c-a7a4-30061f5f5c37.png\"> </div></p><p>As you can see, in our dataset we have three columns (id, content, and category). For this article we will focus on the content feature.</p><div class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\"># select news content only and show top 5 rows\n\nnews_content = data[[\"content\"]]\nnews_content.head()</span></pre></div><p>We have created a new dataframe focused on content only, and then we'll show the top 5 rows.</p><p><div class=\"image\">\n<img src=\"https://zindpublic.blob.core.windows.net/public/uploads/image_attachment/image/483/e54305eb-8d27-42c1-9c6f-480b1b9643c1.png\"> </div></p><h3>Preprocessing with Texthero</h3><p>We can use the <span style=\"font-weight: bold;\" class=\"\">clean().</span> method to pre-process a text-based Pandas Series.</p><div class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\"># clean the news content by using clean method from hero 
package\n\nnews_content['clean_content'] = hero.clean(news_content['content'])</span></pre></div><p>The <span style=\"font-weight: bold;\" class=\"\">clean()</span> method runs seven functions when you pass a pandas series. These seven functions are:</p><ul class=\"public-DraftStyleDefault-ul\">\n<li>lowercase(s): Lowercases all text.</li>\n<li>remove_diacritics(): Removes all accents from strings.</li>\n<li>remove_stopwords(): Removes all stop words.</li>\n<li>remove_digits(): Removes all blocks of digits.</li>\n<li>remove_punctuation(): Removes all string.punctuation (!\"#$%&'()*+,-./:;<=>?@[]^_`{|}~).</li>\n<li>fillna(s): Replaces unassigned values with empty spaces.</li>\n<li>remove_whitespace(): Removes all white space between words</li>\n</ul><p>Now we can see the cleaned news content.</p><p><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\">#show unclean and clean news content\n\nnews_content.head()</span></p><p><div class=\"image\">\n<img src=\"https://zindpublic.blob.core.windows.net/public/uploads/image_attachment/image/484/452945b3-751b-4634-b911-d3bfd1343d24.png\"> </div></p><h3>Custom Cleaning</h3><p>If the default pipeline from the <span style=\"font-weight: bold;\" class=\"\">clean()</span> method does not fit your needs, you can create a custom pipeline with the list of functions that you want to apply in your dataset.</p><p>As an example, I created a custom pipeline with only 5 functions to clean my dataset.</p><div class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\">#create custom pipeline\nfrom texthero import preprocessing\n\ncustom_pipeline = [preprocessing.fillna,\n preprocessing.lowercase,\n preprocessing.remove_whitespace,\n preprocessing.remove_punctuation,\n preprocessing.remove_urls,\n ]</span></pre></div><p>Now I can use the custome_pipeline to clean my dataset.</p><div 
class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\">#altearnative for custom pipeline\n\nnews_content['clean_custom_content'] = news_content['content'].pipe(hero.clean, custom_pipeline)</span></pre></div><p>You can see the clean dataset we have created by using custom pipeline .</p><div class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\"># show output of custome pipeline\n\nnews_content.clean_custom_content.head()</span></pre></div><p><div class=\"image\">\n<img src=\"https://zindpublic.blob.core.windows.net/public/uploads/image_attachment/image/475/09b5f647-55cd-49d1-bc32-82164d1eb179.png\"> </div></p><h2>Useful preprocessing methods</h2><p>Here are some other useful functions from preprocessing modules that you can try to clean you text-based dataset.</p><h3>Remove digits</h3><p>You can use the <span style=\"font-weight: bold;\" class=\"\">remove_digits() </span>function to remove digits in your text-based datasets.</p><div class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\">text = pd.Series(\"Hi my phone number is +255 711 111 111 call me at 09:00 am\")\nclean_text = hero.preprocessing.remove_digits(text)\n\nprint(clean_text)</span></pre></div><p>output: Hi my phone number is + call me at : am\ndtype: object</p><h3>Remove stopwords</h3><p>You can use the <span style=\"font-weight: bold;\" class=\"\">remove_stopwords() </span>function to remove stopwords in your text-based datasets.</p><div class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\">text = pd.Series(\"you need to know NLP to develop the chatbot that you desire\")\nclean_text = 
hero.remove_stopwords(text)\n\nprint(clean_text)</span></pre></div><p>output: need know NLP develop chatbot desire\ndtype: object</p><h3>Remove URLs</h3><p>You can use the <span style=\"font-weight: bold;\" class=\"\">remove_urls() </span>function to remove links in your text-based datasets.</p><div class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\">text = pd.Series(\"Go to https://www.freecodecamp.org/news/ to read more articles you like\")\nclean_text = hero.remove_urls(text)\n\nprint(clean_text)</span></pre></div><p>output: Go to to read more articles you like\ndtype: object</p><h3>Tokenize</h3><p>Tokenize each row of the given Pandas Series by using the <span style=\"font-weight: bold;\" class=\"\">tokenize() </span>method and return a Pandas Series where each row contains a list of tokens.</p><div class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\">text = pd.Series([\"You can think of Texthero as a tool to help you understand and work with text-based dataset. 
\"])\nclean_text = hero.tokenize(text)\n\nprint(clean_text)</span></pre></div><p>output: [You, can, think, of, Texthero, as, a, tool, to, help, you, understand, and, work, with, text, based, dataset]\ndtype: object</p><h3>Remove HTML tags</h3><p>You can remove html tags from the given Pandas Series by using the <span style=\"font-weight: bold;\" class=\"\">remove_html_tags()</span> method.</p><div class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\">text = pd.Series(\"<html><body><h2>hello world</h2></body></html>\")\nclean_text = hero.remove_html_tags(text)\n\nprint(clean_text)</span></pre></div><p>output: hello world\ndtype: object</p><h2>Useful visualization methods</h2><p>Texthero contains different method to visualize insights and statistics of a text-based Pandas DataFrame.</p><h3>Top words</h3><p>If you want to know the top words in your text-based dataset, you can use the <span style=\"font-weight: bold;\" class=\"\">top_words() </span>method from the visualization module. 
This method is useful if you want see additional words that you can add to the stop words lists.</p><p>This method does not return a bar graph, so I will use <span style=\"font-weight: bold;\" class=\"\">matplotlib</span> to visualize the top words in a bar graph.</p><div class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\">import matplotlib.pyplot as plt\n\nNUM_TOP_WORDS = 20\n\ntop_20 = hero.visualization.top_words(news_content['clean_content']).head(NUM_TOP_WORDS)\n\n# Draw the bar chart\n\ntop_20.plot.bar(rot=90, title=\"Top 20 words\");\n\nplt.show(block=True);</span></pre></div><p><div class=\"image\">\n<img src=\"https://zindpublic.blob.core.windows.net/public/uploads/image_attachment/image/476/288523b4-10c6-4b59-934c-bdcaaa194249.png\"> </div></p><p>In the graph above we can visualize the top 20 words from our news dataset.</p><h3>Wordclouds</h3><p>The <span style=\"font-weight: bold;\" class=\"\">wordcloud()</span> method from the visualization module plots an image using WordCloud from the word_cloud package.</p><div class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\">#Plot wordcloud image using WordCloud method\nhero.wordcloud(news_content.clean_content, max_words=100,)</span></pre></div><p>We passed the dataframe series and number of maximum words (for this example, it is 100 words) in the <span style=\"font-weight: bold;\" class=\"\">wordcloud()</span> method.</p><p><div class=\"image\">\n<img src=\"https://zindpublic.blob.core.windows.net/public/uploads/image_attachment/image/485/02636136-6f44-4871-8061-e2565724f9a3.png\"> </div></p><h2>Useful representation methods</h2><p>Texthero contains different methods from the representation module that help you map words into vectors using different algorithms such as TF-IDF, word2vec or GloVe. 
In this section I will show you how you can use these methods.</p><h3>TF-IDF</h3><p>You can represent a text-based Pandas Series using TF-IDF. I created a new pandas series with two pieces of news content and represented them in TF_IDF features by using the <span style=\"font-weight: bold;\" class=\"\">tfidf() </span>method.</p><div class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\"># Create a new text-based Pandas Series.\n\nnews = pd.Series([\"mkuu wa mkoa wa tabora aggrey mwanri amesitisha likizo za viongozi wote mkoani humo kutekeleza maazimio ya jukwaa la fursa za biashara la mkoa huo\", \"serikali imetoa miezi sita kwa taasisi zote za umma ambazo hazitumii mfumo wa gepg katika ukusanyaji wa fedha kufanya hivyo na baada ya hapo itafanya ukaguzi na kuwawajibisha\"])\n\n#convert into tfidf features \nhero.tfidf(news)</span></pre></div><p>output: [0.187132760851739, 0.0, 0.187132760851739, 0....\n [0.0, 0.18557550845969953, 0.0, 0.185575508459...\ndtype: object</p><p><span style=\"font-weight: bold;\" class=\"\">NOTE:</span> TF-IDF stands for<span style=\"font-style: italic;\" class=\"\"> term frequency-inverse document frequency.</span></p><h3>Term Frequency</h3><p>You can represent a text-based Pandas Series using the <span style=\"font-weight: bold;\" class=\"\">term_frequency()</span> method. 
Term frequency (TF) is used to show how frequently an expression (term or word) occurs in a document or text content.</p><div class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\">news = pd.Series([\"mkuu wa mkoa wa tabora aggrey mwanri amesitisha likizo za viongozi wote mkoani humo kutekeleza maazimio ya jukwaa la fursa za biashara la mkoa huo\", \"serikali imetoa miezi sita kwa taasisi zote za umma ambazo hazitumii mfumo wa gepg katika ukusanyaji wa fedha kufanya hivyo na baada ya hapo itafanya ukaguzi na kuwawajibisha\"])\n\n# Represent a text-based Pandas Series using term_frequency.\nhero.term_frequency(news)</span></pre></div><p>output: [1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, ...\n [0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, ...\ndtype: object</p><h3>K-means</h3><p>Texthero can perform K-means clustering algorithm by using the <span style=\"font-weight: bold;\" class=\"\">kmeans() </span>method. 
If you have an unlabeled text-based dataset, you can use this method to group content according to their similarities.</p><p>In this example, I will create a new pandas dataframe called <span style=\"font-weight: bold;\" class=\"\">news</span> with the following columns <span style=\"font-style: italic;\" class=\"\">content,tfidf and kmeans_labels.</span></p><div class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\">column_names = [\"content\",\"tfidf\", \"kmeans_labels\"]\n\nnews = pd.DataFrame(columns = column_names)</span></pre></div><p>We will use only the first 30 pieces of cleaned content from our <span style=\"font-style: italic;\" class=\"\">news_content dataframe</span> and cluster them into groups by using the <span style=\"font-weight: bold;\" class=\"\">kmeans() </span>method.</p><div class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\"># collect 30 clean content.\nnews[\"content\"] = news_content.clean_content[:30]\n\n# convert them into tf-idf features.\nnews['tfidf'] = (\n news['content']\n .pipe(hero.tfidf)\n)\n\n# perform clustering algorithm by using kmeans() \nnews['kmeans_labels'] = (\n news['tfidf']\n .pipe(hero.kmeans, n_clusters=5)\n .astype(str)\n)</span></pre></div><p>In the above source code, in the pipeline of the k-means method we passed the number of clusters which is 5. 
This means we will group these contents into 5 groups.</p><p>Now the selected news content has been labeled into five groups.</p><div class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\"># show content and their labels\nnews[[\"content\",\"kmeans_labels\"]].head()</span></pre></div><p><div class=\"image\">\n<img src=\"https://zindpublic.blob.core.windows.net/public/uploads/image_attachment/image/478/530c2b83-c5b5-4057-aa33-4d6948e0722f.png\"> </div></p><h3>PCA</h3><p>You can also use the <span style=\"font-weight: bold;\" class=\"\">pca() </span>method to perform principal component analysis on the given Pandas Series. <span style=\"font-weight: bold;\" class=\"\">Principal component analysis</span> (<span style=\"font-weight: bold;\" class=\"\">PCA</span>) is a technique for reducing the dimensionality of your datasets. This increases interpretability but at the same time minimizes information loss.</p><p>In this example we use the tfidf features from the news dataframe and represent them into two components by using the <span style=\"font-weight: bold;\" class=\"\">pca()</span> method. Finally we will show a scatterplot by using the <span style=\"font-weight: bold;\" class=\"\">scatterplot()</span> method.</p><div class=\"codeblock\"><pre><span style='font-family: \"SFMono-Regular\",Consolas,\"Liberation Mono\",Menlo,Courier,monospace;' class=\"codeinline\">#perform pca\nnews['pca'] = news['tfidf'].pipe(hero.pca)\n\n#show scatterplot\nhero.scatterplot(news, 'pca', color='kmeans_labels', title=\"news\")</span></pre></div><p><div class=\"image\">\n<img src=\"https://zindpublic.blob.core.windows.net/public/uploads/image_attachment/image/479/83680430-1bff-4f7f-87b3-537038abd950.png\"> </div></p><h2>Wrap up</h2><p>In this article, you've learned the basics of how to use the Texthero toolkit Python package in your NLP project. 
You can learn more about the methods available in the <a href=\"https://texthero.org/docs/api-preprocessing\" target=\"_blank\" rel=\"noreferrer noopener\">documentation</a>.</p><p>You can download the dataset and notebook used in this article here: <a href=\"https://github.com/Davisy/Texthero-Python-Toolkit\" target=\"_blank\" rel=\"noreferrer noopener\">https://github.com/Davisy/Texthero-Python-Toolkit</a> .</p><p>If you learned something new or enjoyed reading this article, please share it so that others can see it. Until then, see you in the next post!</p><h3>About the author</h3><p>Davis David is Zindi Ambassador for Tanzania and a data scientist at ParrotAI. He is passionate about artificial intelligence, machine learning, deep learning and big data. He is a co-organizer and facilitator of the AI movement in Tanzania; conducting AI meetups, workshops and events with a passion to build a community of data scientists to solve local problems. He can be reached on Twitter <a href=\"https://twitter.com/Davis_McDavid\" target=\"_blank\" rel=\"noreferrer noopener\">@Davis_McDavid</a>.</p><p>You can read the original blog post <a href=\"https://www.freecodecamp.org/news/how-to-work-and-understand-text-based-dataset-with-texthero/\" target=\"_blank\" rel=\"noreferrer 
noopener\">here</a>.</p>","published_at":"2020-08-20T14:34:29.748Z"}},"queries":{"\"how-to-use-texthero-to-prep-a-text-based-dataset-for-your-nlp-project\"":{"data":"how-to-use-texthero-to-prep-a-text-based-dataset-for-your-nlp-project","loading":false,"error":null}}},"fullCompetitions":{},"fullDiscussions":{"data":{},"queries":{"default":{"loading":false,"error":null}}},"fullJobs":{"data":{},"queries":{}},"jobs":{"data":{},"queries":{}},"jobApplications":{"data":{},"queries":{}},"myTeams":{},"notificationSubscriptions":{"data":{},"queries":{}},"participations":{"data":{},"queries":{}},"submissions":{"data":{},"queries":{}},"submissionLimits":{"data":{},"queries":{}},"teams":{"data":{},"queries":{}},"userDiscussions":{"data":{},"queries":{}},"userParticipations":{"data":{},"queries":{}},"userProfiles":{"users":{}},"users":{"data":{},"queries":{}}}</script>
<script>
      // Google Analytics (analytics.js) command-queue stub: window.ga queues
      // commands into ga.q until the async analytics.js script (loaded below)
      // arrives and drains the queue. ga.l records the stub-creation time.
      window.ga =
        window.ga ||
        function () {
          ;(ga.q = ga.q || []).push(arguments)
        }
      ga.l = +new Date()
      ga("create", "UA-125419148-1", "auto")
      // Fix: set custom fields BEFORE sending the pageview. analytics.js only
      // applies set() values to hits sent afterwards, so in the original order
      // the pageview hit was dispatched without appName/dimension1.
      ga("set", "appName", "zindi.web")
      ga("set", "dimension1", "nextgen")
      ga("send", "pageview")
    </script>
<script async="" src="./How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi_files/analytics.js"></script>
<noscript><iframe
src="https://www.googletagmanager.com/ns.html?id=GTM-KRG85D8"
height="0"
width="0"
style="display: none; visibility: hidden"
></iframe
></noscript>
<script>/* webpack JSONP chunk-loading runtime (build artifact, minified — do not edit by hand; see the source map referenced below). It installs window.webpackJsonp, wires the module cache, and bootstraps the vendor/bundle chunks loaded by the following script tags. */!function(l){function e(e){for(var r,t,n=e[0],o=e[1],u=e[2],i=0,a=[];i<n.length;i++)t=n[i],Object.prototype.hasOwnProperty.call(p,t)&&p[t]&&a.push(p[t][0]),p[t]=0;for(r in o)Object.prototype.hasOwnProperty.call(o,r)&&(l[r]=o[r]);for(s&&s(e);a.length;)a.shift()();return c.push.apply(c,u||[]),f()}function f(){for(var e,r=0;r<c.length;r++){for(var t=c[r],n=!0,o=1;o<t.length;o++){var u=t[o];0!==p[u]&&(n=!1)}n&&(c.splice(r--,1),e=i(i.s=t[0]))}return e}var t={},p={runtime:0},c=[];function i(e){if(t[e])return t[e].exports;var r=t[e]={i:e,l:!1,exports:{}};return l[e].call(r.exports,r,r.exports,i),r.l=!0,r.exports}i.m=l,i.c=t,i.d=function(e,r,t){i.o(e,r)||Object.defineProperty(e,r,{enumerable:!0,get:t})},i.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},i.t=function(r,e){if(1&e&&(r=i(r)),8&e)return r;if(4&e&&"object"==typeof r&&r&&r.__esModule)return r;var t=Object.create(null);if(i.r(t),Object.defineProperty(t,"default",{enumerable:!0,value:r}),2&e&&"string"!=typeof r)for(var n in r)i.d(t,n,function(e){return r[e]}.bind(null,n));return t},i.n=function(e){var r=e&&e.__esModule?function(){return e.default}:function(){return e};return i.d(r,"a",r),r},i.o=function(e,r){return Object.prototype.hasOwnProperty.call(e,r)},i.p="https://assets.zindi.africa/";var r=(n=window.webpackJsonp=window.webpackJsonp||[]).push.bind(n);n.push=e;for(var n=n.slice(),o=0;o<n.length;o++)e(n[o]);var s=r;f()}([]);
//# sourceMappingURL=runtime.8d4eb9324d7c3d54849b.js.map</script><script type="text/javascript" src="./How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi_files/vendor.019171b12285f6597772.js" defer=""></script><script type="text/javascript" src="./How to Use Texthero to Prep a Text-based Dataset for Your NLP Project - Zindi_files/bundle.9c8d652f1bf8225cb2ca.js" defer=""></script>
<div class="ReactModalPortal"></div>
<!-- Yandex Metrika analytics loader (vendor-minified tag.js snippet, counter
     67869277) plus its noscript tracking-pixel fallback. Do not hand-edit. -->
<script type="text/javascript" id="">(function(a,e,f,g,b,c,d){a[b]=a[b]||function(){(a[b].a=a[b].a||[]).push(arguments)};a[b].l=1*new Date;c=e.createElement(f);d=e.getElementsByTagName(f)[0];c.async=1;c.src=g;d.parentNode.insertBefore(c,d)})(window,document,"script","https://mc.yandex.ru/metrika/tag.js","ym");ym(67869277,"init",{clickmap:!0,trackLinks:!0,accurateTrackBounce:!0,webvisor:!0});</script>
<noscript><div><img src="https://mc.yandex.ru/watch/67869277" style="position:absolute; left:-9999px;" alt=""></div></noscript>
<!-- Drag-and-drop panel injected by the Fatkun image-downloader browser
     extension (artifact of saving the page). IDs and classes are the
     extension's CSS/JS hooks and must be preserved. Fixes: the close
     control now has an accessible name, and the purely decorative SVG is
     hidden from assistive technology. -->
<div id="fatkun-drop-panel">
  <a id="fatkun-drop-panel-close-btn" aria-label="Close">×</a>
  <div id="fatkun-drop-panel-inner">
    <div class="fatkun-content">
      <!-- Decorative download-arrow icon -->
      <svg class="fatkun-icon" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg" p-id="5892" aria-hidden="true"><path d="M494.933333 782.933333c2.133333 2.133333 4.266667 4.266667 8.533334 6.4h8.533333c6.4 0 10.666667-2.133333 14.933333-6.4l2.133334-2.133333 275.2-275.2c8.533333-8.533333 8.533333-21.333333 0-29.866667-8.533333-8.533333-21.333333-8.533333-29.866667 0L533.333333 716.8V128c0-12.8-8.533333-21.333333-21.333333-21.333333s-21.333333 8.533333-21.333333 21.333333v588.8L249.6 475.733333c-8.533333-8.533333-21.333333-8.533333-29.866667 0-8.533333 8.533333-8.533333 21.333333 0 29.866667l275.2 277.333333zM853.333333 874.666667H172.8c-12.8 0-21.333333 8.533333-21.333333 21.333333s8.533333 21.333333 21.333333 21.333333H853.333333c12.8 0 21.333333-8.533333 21.333334-21.333333s-10.666667-21.333333-21.333334-21.333333z" p-id="5893"></path></svg>
      <div class="fatkun-title">Drag and Drop</div>
      <div class="fatkun-desc">The image will be downloaded</div>
    </div>
  </div>
</div></body></html>