diff --git a/PreprocessingFinal.ipynb b/PreprocessingFinal.ipynb index 8e586cc..84ba3a5 100644 --- a/PreprocessingFinal.ipynb +++ b/PreprocessingFinal.ipynb @@ -5,7 +5,6 @@ "colab": { "name": "PreprocessingFinal.ipynb", "provenance": [], - "authorship_tag": "ABX9TyMgZo25t5kp1nELT9FOxLtd", "include_colab_link": true }, "kernelspec": { @@ -32,6 +31,8 @@ "source": [ "!pip install contractions\n", "!pip install emoji\n", + "!pip install ekphrasis\n", + "\n", "import pandas as pd\n", "import re\n", "import emoji\n", @@ -64,7 +65,9 @@ "import json\n", "import random\n", "import string\n", - "import sys" + "import sys\n", + "from ekphrasis.classes.segmenter import Segmenter\n", + "import itertools\n" ], "execution_count": null, "outputs": [] @@ -72,7 +75,126 @@ { "cell_type": "code", "metadata": { - "id": "z70r1eYrnoB_" + "id": "vFOsh5pJq2QZ", + "outputId": "5e5b78f9-48e8-4757-a2a1-356258c3d736", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ], + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Mounted at /content/drive\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "BVXGLwXaq5I5", + "outputId": "874c4c78-14a8-49d2-f1bf-d68c621db7bd", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + } + }, + "source": [ + "df_comb = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Capstone/Labelled (Percentage change)/AAPL_label2.1.csv')\n", + "combine_ds = df_comb.sample(frac=1)\n", + "combine_ds.head()" + ], + "execution_count": 4, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
messagelabel
563747$AAPL sold my 250 April puts! BAM-1.0
378786Stocks from the SMB Scanner: $FB $EXPE $FEYE $...-1.0
295238$AAPL wish apple would just buyback to 125 to ...0.0
110904@zspecs @SlopeOfHope @JRNavarro75 Options can ...1.0
1117553$SPY $AAPL \\n\\nJesús Christ. That was some day!0.0
\n", + "
" + ], + "text/plain": [ + " message label\n", + "563747 $AAPL sold my 250 April puts! BAM -1.0\n", + "378786 Stocks from the SMB Scanner: $FB $EXPE $FEYE $... -1.0\n", + "295238 $AAPL wish apple would just buyback to 125 to ... 0.0\n", + "110904 @zspecs @SlopeOfHope @JRNavarro75 Options can ... 1.0\n", + "1117553 $SPY $AAPL \\n\\nJesús Christ. That was some day! 0.0" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 4 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "z70r1eYrnoB_", + "outputId": "bd2bd23f-5c39-4eb8-9169-5e18df89a7e9", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 54 + } }, "source": [ "combine_ds['message'] = combine_ds['message'].str.lower()\n", @@ -80,8 +202,16 @@ "\n", "print(message[:10])\n" ], - "execution_count": null, - "outputs": [] + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "text": [ + "['$aapl sold my 250 april puts! bam', 'stocks from the smb scanner: $fb $expe $feye $blue $nvo $uso $aapl $expe', '$aapl wish apple would just buyback to 125 to get over the 120/700 mental barrier', '@zspecs @slopeofhope @jrnavarro75 options can also be used to reduce risk, as in these examples: http://stks.co/gne2 $aapl', '$spy $aapl \\n\\njesús christ. that was some day!', '$aapl forging ahead with an upward range every day...whether closing at new highs or just small gains..very bullish signal over all.', '$aapl trump supporters can't afford apple products anyway so this boycott isn't going to affect sales. rather, it will boost sales and attn', 'today's iphone event will be a test of trust for apple $aapl http://goo.gl/fpwxke', 'it's pretty clear that supply >> demand out there right now.. $spy $dia $qqq $aapl $nflx $nvda $pypl etc', 'the industry average profit margin is 1.64%. $aapl outperforms 97% of its industry peers. https://www.chartmill.com/stock/quote/aapl/fundamental-analysis?key=bb853040-a4ac-41c6-b549-d218d2f21b32&utm_source=stocktwits&utm_medium=fa&utm_content=aapl&utm_campaign=social_tracking']\n" + ], + "name": "stdout" + } + ] }, { "cell_type": "code", @@ -90,7 +220,7 @@ }, "source": [ "def remove_stopwords(msg):\n", - " filtered_sentence = [w for w in tokens if not w in stop_words]\n", + " filtered_sentence = [w for w in msg_tokens if not w in stop_words]\n", " return filtered_sentence\n", "\n", "def remove_punctuation_re(x):\n", @@ -103,103 +233,126 @@ "\n", " x = ' '.join(re.sub(r'_',\" \",x).split()) #Removing _ from emojis text\n", "\n", - " return x" + " return x\n", + "\n", + "# replace repeating letter\n", + "def rpt_replace(match):\n", + " # print(match.group(1))\n", + " return match.group(1)+match.group(1)\n", + "\n", + "# substitute original word with replaced word, if any\n", + "def processRepeatings(data):\n", + " re_t= re.sub(message_rpt, rpt_replace, data )\n", + " # print(re_t)\n", + " return re_t" ], - "execution_count": null, + "execution_count": 8, "outputs": [] }, { "cell_type": "code", "metadata": { - "id": "4LRFAxgcn_fi" + "id": "4LRFAxgcn_fi", + "outputId": "a10b4eae-a37d-4694-fe94-f127beeebfcf", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + } }, "source": [ - "message_er = []\n", - "ps = PorterStemmer()\n", "stop_words = sw.words(\"english\")\n", "tweet_tokenizer = TweetTokenizer()\n", "detokenizer = TreebankWordDetokenizer()\n", "message_p = []\n", + "\n", + "# for repeating characters in words\n", + "message_rpt = re.compile(r\"(.)\\1{2,}\", re.IGNORECASE)\n", + "\n", + "# segmenter using the word statistics from Twitter\n", + "seg_tw = Segmenter(corpus=\"twitter\")\n", + "\n", "for msg in message:\n", - " \n", " # remove emojis\n", " msg = emoji.demojize(msg)\n", - " \n", + "\n", + " # fix contractions\n", + " msg = contractions.fix(msg)\n", + "\n", + " # remove punctuations\n", + " msg = remove_punctuation_re(msg) \n", + " message_p.append(msg)\n", " #tokenize\n", - " tokens = tweet_tokenizer.tokenize(msg)\n", + " msg_tokens = tweet_tokenizer.tokenize(msg)\n", + "\n", + " #For Hashtags elongated words using Word segmenter\n", + " message_seg = []\n", + " for w in msg_tokens:\n", + " message_seg.append(seg_tw.segment(w))\n", "\n", " # remove stopwords\n", - " msg = remove_stopwords(msg)\n", + " msg = remove_stopwords(message_seg)\n", + "\n", " if 'rt' in msg:\n", " # remove retweets\n", " message_p.append('-1')\n", " else: \n", " # detokenize\n", " msg = detokenizer.detokenize(msg)\n", - " \n", - " # fix contractions\n", - " msg = contractions.fix(msg)\n", - "\n", - " # remove punctuations\n", - " msg = remove_punctuation_re(msg) \n", - " message_p.append(msg)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "pXWjHAcQoIEh" - }, - "source": [ - "#Removing repeating words like hurrrryyyyyy-- worrks on tokenized list\n", - "\n", - "strOfMsg = \" \".join(itertools.chain.from_iterable(message_tok))\n", - "message_rpt = re.compile(r\"(.)\\1{2,}\", re.IGNORECASE)\n", - "\n", - "def rpt_replace(match):\n", - " return match.group(1)+match.group(1)\n", "\n", - "# t = 'amzn dip buyer fulll attack boooyaaaaaaaaaaaaaaaah'\n", - "re_t = ''\n", - "message_nrp = []\n", + " # removing repeating words like hurrrryyyyyy-- worrks on tokenized list\n", + " msg = processRepeatings(msg)\n", "\n", - "def processRepeatings(data):\n", - " re_t= re.sub(message_rpt, rpt_replace, data )\n", - " return message_nrp.append(re_t)\n", - "\n", - "processRepeatings(strOfMsg)" + " message_p.append(msg)\n" ], - "execution_count": null, - "outputs": [] + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Reading twitter - 1grams ...\n", + "Reading twitter - 2grams ...\n" + ], + "name": "stdout" + } + ] }, { "cell_type": "code", "metadata": { - "id": "pwVS7SDnoaIt" + "id": "pXWjHAcQoIEh", + "outputId": "79b08153-5f77-4203-a1bf-308ca594ab36", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 187 + } }, "source": [ - "#For Hashtags elongated words using Word segmenter\n", - "!pip install ekphrasis\n", - "from ekphrasis.classes.segmenter import Segmenter\n", - "\n", - "# segmenter using the word statistics from english Wikipedia\n", - "# seg_eng = Segmenter(corpus=\"english\") \n", - "message_seg = []\n", - "\n", - "# segmenter using the word statistics from Twitter\n", - "seg_tw = Segmenter(corpus=\"twitter\")\n", - "\n", - "# words = [\"exponentialbackoff\", \"gamedev\", \"retrogaming\", \"thewatercooler\", \"panpsychism\"]\n", - "for w in message_sw:\n", - " # print(w)\n", - " message_seg.append(seg_tw.segment(w))\n", - " # print(\"(tw):\", seg_tw.segment(w))\n", - " # print()" + "message_p[:10]" ], - "execution_count": null, - "outputs": [] + "execution_count": 12, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['aapl sold my 250 april puts bam',\n", + " 'aapl sold 250 april puts bam',\n", + " 'stocks from the smb scanner fb expe feye blue nvo uso aapl expe',\n", + " 'stocks smb scanner fb expe feye blue nvo uso aapl expe',\n", + " 'aapl wish apple would just buyback to 125 to get over the 120 700 mental barrier',\n", + " 'aapl wish apple would buyback 125 get 120 700 mental barrier',\n", + " 'options can also be used to reduce risk as in these examples aapl',\n", + " 'options also used reduce risk examples aapl',\n", + " 'spy aapl jesús christ that was some day',\n", + " 'spy aapl jesús christ day']" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 12 + } + ] } ] } \ No newline at end of file