Skip to content

Commit

Permalink
Created using Colaboratory
Browse files Browse the repository at this point in the history
  • Loading branch information
hkha4640 committed Oct 16, 2020
1 parent 85d0051 commit 3b480e4
Showing 1 changed file with 222 additions and 69 deletions.
291 changes: 222 additions & 69 deletions PreprocessingFinal.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
"colab": {
"name": "PreprocessingFinal.ipynb",
"provenance": [],
"authorship_tag": "ABX9TyMgZo25t5kp1nELT9FOxLtd",
"include_colab_link": true
},
"kernelspec": {
Expand All @@ -32,6 +31,8 @@
"source": [
"!pip install contractions\n",
"!pip install emoji\n",
"!pip install ekphrasis\n",
"\n",
"import pandas as pd\n",
"import re\n",
"import emoji\n",
Expand Down Expand Up @@ -64,24 +65,153 @@
"import json\n",
"import random\n",
"import string\n",
"import sys"
"import sys\n",
"from ekphrasis.classes.segmenter import Segmenter\n",
"import itertools\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "z70r1eYrnoB_"
"id": "vFOsh5pJq2QZ",
"outputId": "5e5b78f9-48e8-4757-a2a1-356258c3d736",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"source": [
"from google.colab import drive\n",
"drive.mount('/content/drive')"
],
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"text": [
"Mounted at /content/drive\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "BVXGLwXaq5I5",
"outputId": "874c4c78-14a8-49d2-f1bf-d68c621db7bd",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
}
},
"source": [
"df_comb = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Capstone/Labelled (Percentage change)/AAPL_label2.1.csv')\n",
"combine_ds = df_comb.sample(frac=1)\n",
"combine_ds.head()"
],
"execution_count": 4,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>message</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>563747</th>\n",
" <td>$AAPL sold my 250 April puts! BAM</td>\n",
" <td>-1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>378786</th>\n",
" <td>Stocks from the SMB Scanner: $FB $EXPE $FEYE $...</td>\n",
" <td>-1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>295238</th>\n",
" <td>$AAPL wish apple would just buyback to 125 to ...</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110904</th>\n",
" <td>@zspecs @SlopeOfHope @JRNavarro75 Options can ...</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1117553</th>\n",
" <td>$SPY $AAPL \\n\\nJesús Christ. That was some day!</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" message label\n",
"563747 $AAPL sold my 250 April puts! BAM -1.0\n",
"378786 Stocks from the SMB Scanner: $FB $EXPE $FEYE $... -1.0\n",
"295238 $AAPL wish apple would just buyback to 125 to ... 0.0\n",
"110904 @zspecs @SlopeOfHope @JRNavarro75 Options can ... 1.0\n",
"1117553 $SPY $AAPL \\n\\nJesús Christ. That was some day! 0.0"
]
},
"metadata": {
"tags": []
},
"execution_count": 4
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "z70r1eYrnoB_",
"outputId": "bd2bd23f-5c39-4eb8-9169-5e18df89a7e9",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 54
}
},
"source": [
"combine_ds['message'] = combine_ds['message'].str.lower()\n",
"message = combine_ds['message'].tolist()\n",
"\n",
"print(message[:10])\n"
],
"execution_count": null,
"outputs": []
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"text": [
"['$aapl sold my 250 april puts! bam', 'stocks from the smb scanner: $fb $expe $feye $blue $nvo $uso $aapl $expe', '$aapl wish apple would just buyback to 125 to get over the 120/700 mental barrier', '@zspecs @slopeofhope @jrnavarro75 options can also be used to reduce risk, as in these examples: http://stks.co/gne2 $aapl', '$spy $aapl \\n\\njesús christ. that was some day!', '$aapl forging ahead with an upward range every day...whether closing at new highs or just small gains..very bullish signal over all.', '$aapl trump supporters can&#39;t afford apple products anyway so this boycott isn&#39;t going to affect sales. rather, it will boost sales and attn', 'today&#39;s iphone event will be a test of trust for apple $aapl http://goo.gl/fpwxke', 'it&#39;s pretty clear that supply &gt;&gt; demand out there right now.. $spy $dia $qqq $aapl $nflx $nvda $pypl etc', 'the industry average profit margin is 1.64%. $aapl outperforms 97% of its industry peers. https://www.chartmill.com/stock/quote/aapl/fundamental-analysis?key=bb853040-a4ac-41c6-b549-d218d2f21b32&amp;utm_source=stocktwits&amp;utm_medium=fa&amp;utm_content=aapl&amp;utm_campaign=social_tracking']\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
Expand All @@ -90,7 +220,7 @@
},
"source": [
"def remove_stopwords(msg):\n",
" filtered_sentence = [w for w in tokens if not w in stop_words]\n",
" filtered_sentence = [w for w in msg_tokens if not w in stop_words]\n",
" return filtered_sentence\n",
"\n",
"def remove_punctuation_re(x):\n",
Expand All @@ -103,103 +233,126 @@
"\n",
" x = ' '.join(re.sub(r'_',\" \",x).split()) #Removing _ from emojis text\n",
"\n",
" return x"
" return x\n",
"\n",
"# collapse a run of a repeated character down to two occurrences\n",
"def rpt_replace(match):\n",
" # print(match.group(1))\n",
" return match.group(1)+match.group(1)\n",
"\n",
"# apply rpt_replace to every run matched by message_rpt in the input string\n",
"def processRepeatings(data):\n",
" re_t= re.sub(message_rpt, rpt_replace, data )\n",
" # print(re_t)\n",
" return re_t"
],
"execution_count": null,
"execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "4LRFAxgcn_fi"
"id": "4LRFAxgcn_fi",
"outputId": "a10b4eae-a37d-4694-fe94-f127beeebfcf",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
}
},
"source": [
"message_er = []\n",
"ps = PorterStemmer()\n",
"stop_words = sw.words(\"english\")\n",
"tweet_tokenizer = TweetTokenizer()\n",
"detokenizer = TreebankWordDetokenizer()\n",
"message_p = []\n",
"\n",
"# for repeating characters in words\n",
"message_rpt = re.compile(r\"(.)\\1{2,}\", re.IGNORECASE)\n",
"\n",
"# segmenter using the word statistics from Twitter\n",
"seg_tw = Segmenter(corpus=\"twitter\")\n",
"\n",
"for msg in message:\n",
" \n",
" # convert emojis to their text aliases\n",
" msg = emoji.demojize(msg)\n",
" \n",
"\n",
" # fix contractions\n",
" msg = contractions.fix(msg)\n",
"\n",
" # remove punctuations\n",
" msg = remove_punctuation_re(msg) \n",
" message_p.append(msg)\n",
" #tokenize\n",
" tokens = tweet_tokenizer.tokenize(msg)\n",
" msg_tokens = tweet_tokenizer.tokenize(msg)\n",
"\n",
" # split hashtags and run-together words with the word segmenter\n",
" message_seg = []\n",
" for w in msg_tokens:\n",
" message_seg.append(seg_tw.segment(w))\n",
"\n",
" # remove stopwords\n",
" msg = remove_stopwords(msg)\n",
" msg = remove_stopwords(message_seg)\n",
"\n",
" if 'rt' in msg:\n",
" # flag retweets with the placeholder '-1' instead of keeping the text\n",
" message_p.append('-1')\n",
" else: \n",
" # detokenize\n",
" msg = detokenizer.detokenize(msg)\n",
" \n",
" # fix contractions\n",
" msg = contractions.fix(msg)\n",
"\n",
" # remove punctuations\n",
" msg = remove_punctuation_re(msg) \n",
" message_p.append(msg)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "pXWjHAcQoIEh"
},
"source": [
"#Removing repeating words like hurrrryyyyyy-- worrks on tokenized list\n",
"\n",
"strOfMsg = \" \".join(itertools.chain.from_iterable(message_tok))\n",
"message_rpt = re.compile(r\"(.)\\1{2,}\", re.IGNORECASE)\n",
"\n",
"def rpt_replace(match):\n",
" return match.group(1)+match.group(1)\n",
"\n",
"# t = 'amzn dip buyer fulll attack boooyaaaaaaaaaaaaaaaah'\n",
"re_t = ''\n",
"message_nrp = []\n",
" # collapse characters repeated 3+ times (e.g. hurrrryyyyyy -> hurryy); works on the detokenized string\n",
" msg = processRepeatings(msg)\n",
"\n",
"def processRepeatings(data):\n",
" re_t= re.sub(message_rpt, rpt_replace, data )\n",
" return message_nrp.append(re_t)\n",
"\n",
"processRepeatings(strOfMsg)"
" message_p.append(msg)\n"
],
"execution_count": null,
"outputs": []
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"text": [
"Reading twitter - 1grams ...\n",
"Reading twitter - 2grams ...\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "pwVS7SDnoaIt"
"id": "pXWjHAcQoIEh",
"outputId": "79b08153-5f77-4203-a1bf-308ca594ab36",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 187
}
},
"source": [
"#For Hashtags elongated words using Word segmenter\n",
"!pip install ekphrasis\n",
"from ekphrasis.classes.segmenter import Segmenter\n",
"\n",
"# segmenter using the word statistics from english Wikipedia\n",
"# seg_eng = Segmenter(corpus=\"english\") \n",
"message_seg = []\n",
"\n",
"# segmenter using the word statistics from Twitter\n",
"seg_tw = Segmenter(corpus=\"twitter\")\n",
"\n",
"# words = [\"exponentialbackoff\", \"gamedev\", \"retrogaming\", \"thewatercooler\", \"panpsychism\"]\n",
"for w in message_sw:\n",
" # print(w)\n",
" message_seg.append(seg_tw.segment(w))\n",
" # print(\"(tw):\", seg_tw.segment(w))\n",
" # print()"
"message_p[:10]"
],
"execution_count": null,
"outputs": []
"execution_count": 12,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['aapl sold my 250 april puts bam',\n",
" 'aapl sold 250 april puts bam',\n",
" 'stocks from the smb scanner fb expe feye blue nvo uso aapl expe',\n",
" 'stocks smb scanner fb expe feye blue nvo uso aapl expe',\n",
" 'aapl wish apple would just buyback to 125 to get over the 120 700 mental barrier',\n",
" 'aapl wish apple would buyback 125 get 120 700 mental barrier',\n",
" 'options can also be used to reduce risk as in these examples aapl',\n",
" 'options also used reduce risk examples aapl',\n",
" 'spy aapl jesús christ that was some day',\n",
" 'spy aapl jesús christ day']"
]
},
"metadata": {
"tags": []
},
"execution_count": 12
}
]
}
]
}

0 comments on commit 3b480e4

Please sign in to comment.