diff --git a/PreprocessingFinal.ipynb b/PreprocessingFinal.ipynb index a89d5af..c57f098 100644 --- a/PreprocessingFinal.ipynb +++ b/PreprocessingFinal.ipynb @@ -5,12 +5,14 @@ "colab": { "name": "PreprocessingFinal.ipynb", "provenance": [], + "toc_visible": true, "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" - } + }, + "accelerator": "GPU" }, "cells": [ { @@ -36,10 +38,10 @@ "cell_type": "code", "metadata": { "id": "_gXtdFlzndWh", - "outputId": "534bcbcb-1ea4-4c2c-e8db-8bd6b96d8cc1", + "outputId": "4eef95d3-6530-4ab1-e777-cef79611e6d8", "colab": { "base_uri": "https://localhost:8080/", - "height": 445 + "height": 1000 } }, "source": [ @@ -47,6 +49,8 @@ "!pip install emoji\n", "!pip install ekphrasis\n", "\n", + "import sys\n", + "import math\n", "import pandas as pd\n", "import re\n", "import emoji\n", @@ -88,30 +92,69 @@ { "output_type": "stream", "text": [ - "Requirement already satisfied: contractions in /usr/local/lib/python3.6/dist-packages (0.0.25)\n", - "Requirement already satisfied: textsearch in /usr/local/lib/python3.6/dist-packages (from contractions) (0.0.17)\n", - "Requirement already satisfied: Unidecode in /usr/local/lib/python3.6/dist-packages (from textsearch->contractions) (1.1.1)\n", - "Requirement already satisfied: pyahocorasick in /usr/local/lib/python3.6/dist-packages (from textsearch->contractions) (1.4.0)\n", - "Requirement already satisfied: emoji in /usr/local/lib/python3.6/dist-packages (0.6.0)\n", - "Requirement already satisfied: ekphrasis in /usr/local/lib/python3.6/dist-packages (0.5.1)\n", - "Requirement already satisfied: ujson in /usr/local/lib/python3.6/dist-packages (from ekphrasis) (4.0.1)\n", - "Requirement already satisfied: ftfy in /usr/local/lib/python3.6/dist-packages (from ekphrasis) (5.8)\n", + "Collecting contractions\n", + " Downloading 
https://files.pythonhosted.org/packages/00/92/a05b76a692ac08d470ae5c23873cf1c9a041532f1ee065e74b374f218306/contractions-0.0.25-py2.py3-none-any.whl\n", + "Collecting textsearch\n", + " Downloading https://files.pythonhosted.org/packages/42/a8/03407021f9555043de5492a2bd7a35c56cc03c2510092b5ec018cae1bbf1/textsearch-0.0.17-py2.py3-none-any.whl\n", + "Collecting pyahocorasick\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz (312kB)\n", + "\u001b[K |████████████████████████████████| 317kB 13.0MB/s \n", + "\u001b[?25hCollecting Unidecode\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)\n", + "\u001b[K |████████████████████████████████| 245kB 42.0MB/s \n", + "\u001b[?25hBuilding wheels for collected packages: pyahocorasick\n", + " Building wheel for pyahocorasick (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for pyahocorasick: filename=pyahocorasick-1.4.0-cp36-cp36m-linux_x86_64.whl size=81698 sha256=5b9f31992a58143cd2196c4bf8b9c9146f9f059c397e2567cd79ab13e29dcbf2\n", + " Stored in directory: /root/.cache/pip/wheels/0a/90/61/87a55f5b459792fbb2b7ba6b31721b06ff5cf6bde541b40994\n", + "Successfully built pyahocorasick\n", + "Installing collected packages: pyahocorasick, Unidecode, textsearch, contractions\n", + "Successfully installed Unidecode-1.1.1 contractions-0.0.25 pyahocorasick-1.4.0 textsearch-0.0.17\n", + "Collecting emoji\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ff/1c/1f1457fe52d0b30cbeebfd578483cedb3e3619108d2d5a21380dfecf8ffd/emoji-0.6.0.tar.gz (51kB)\n", + "\u001b[K |████████████████████████████████| 51kB 4.7MB/s \n", + "\u001b[?25hBuilding wheels for collected packages: emoji\n", + " Building wheel for emoji (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for emoji: filename=emoji-0.6.0-cp36-none-any.whl size=49716 sha256=2dcc4fd17dc129eebe5e44402a5f00fdfd0e75e52f7009c3765ed6597b6f3d24\n", + " Stored in directory: /root/.cache/pip/wheels/46/2c/8b/9dcf5216ca68e14e0320e283692dce8ae321cdc01e73e17796\n", + "Successfully built emoji\n", + "Installing collected packages: emoji\n", + "Successfully installed emoji-0.6.0\n", + "Collecting ekphrasis\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/92/e6/37c59d65e78c3a2aaf662df58faca7250eb6b36c559b912a39a7ca204cfb/ekphrasis-0.5.1.tar.gz (80kB)\n", + "\u001b[K |████████████████████████████████| 81kB 6.5MB/s \n", + "\u001b[?25hRequirement already satisfied: termcolor in /usr/local/lib/python3.6/dist-packages (from ekphrasis) (1.1.0)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from ekphrasis) (4.41.1)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from ekphrasis) (1.18.5)\n", - "Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from ekphrasis) (3.2.2)\n", - "Requirement already satisfied: termcolor in /usr/local/lib/python3.6/dist-packages (from ekphrasis) (1.1.0)\n", + "Collecting colorama\n", + " Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl\n", + "Collecting ujson\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f1/84/e039c6ffc6603f2dfe966972d345d4f650a4ffd74b18c852ece645de12ac/ujson-4.0.1-cp36-cp36m-manylinux1_x86_64.whl (179kB)\n", + "\u001b[K |████████████████████████████████| 184kB 18.5MB/s \n", + "\u001b[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from ekphrasis) (3.2.2)\n", "Requirement already satisfied: nltk in /usr/local/lib/python3.6/dist-packages (from ekphrasis) (3.2.5)\n", - "Requirement already satisfied: 
colorama in /usr/local/lib/python3.6/dist-packages (from ekphrasis) (0.4.4)\n", - "Requirement already satisfied: wcwidth in /usr/local/lib/python3.6/dist-packages (from ftfy->ekphrasis) (0.2.5)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->ekphrasis) (1.2.0)\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->ekphrasis) (0.10.0)\n", + "Collecting ftfy\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ff/e2/3b51c53dffb1e52d9210ebc01f1fb9f2f6eba9b3201fa971fd3946643c71/ftfy-5.8.tar.gz (64kB)\n", + "\u001b[K |████████████████████████████████| 71kB 10.3MB/s \n", + "\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from ekphrasis) (1.18.5)\n", "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->ekphrasis) (2.8.1)\n", "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->ekphrasis) (2.4.7)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->ekphrasis) (1.2.0)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->ekphrasis) (0.10.0)\n", "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from nltk->ekphrasis) (1.15.0)\n", + "Requirement already satisfied: wcwidth in /usr/local/lib/python3.6/dist-packages (from ftfy->ekphrasis) (0.2.5)\n", + "Building wheels for collected packages: ekphrasis, ftfy\n", + " Building wheel for ekphrasis (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for ekphrasis: filename=ekphrasis-0.5.1-cp36-none-any.whl size=82844 sha256=fbde4943ba562ff50d306a66827bf8ea0013b6b153eeb5442c5dc5c6508f4e9c\n", + " Stored in directory: /root/.cache/pip/wheels/2f/c5/9b/c9b60f535a2cf9fdbc92d84c4801a010c35a9cd348011ed2a1\n", + " Building wheel for ftfy (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for ftfy: filename=ftfy-5.8-cp36-none-any.whl size=45612 sha256=1b281ae0046213239ab5a904f64063ec25e883354a5142e72f17da880d2d47c0\n", + " Stored in directory: /root/.cache/pip/wheels/ba/c0/ef/f28c4da5ac84a4e06ac256ca9182fc34fa57fefffdbc68425b\n", + "Successfully built ekphrasis ftfy\n", + "Installing collected packages: colorama, ujson, ftfy, ekphrasis\n", + "Successfully installed colorama-0.4.4 ekphrasis-0.5.1 ftfy-5.8 ujson-4.0.1\n", "[nltk_data] Downloading package punkt to /root/nltk_data...\n", - "[nltk_data] Package punkt is already up-to-date!\n", + "[nltk_data] Unzipping tokenizers/punkt.zip.\n", "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", - "[nltk_data] Package stopwords is already up-to-date!\n" + "[nltk_data] Unzipping corpora/stopwords.zip.\n" ], "name": "stdout" } @@ -121,10 +164,10 @@ "cell_type": "code", "metadata": { "id": "vFOsh5pJq2QZ", - "outputId": "d8e8d313-417d-4b07-af4b-3195d01cc237", + "outputId": "bf03225b-0a06-4198-f0c1-54a36f77b397", "colab": { "base_uri": "https://localhost:8080/", - "height": 54 + "height": 34 } }, "source": [ @@ -136,7 +179,7 @@ { "output_type": "stream", "text": [ - "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" + "Mounted at /content/drive\n" ], "name": "stdout" } @@ -155,14 +198,14 @@ "cell_type": "code", "metadata": { "id": "BVXGLwXaq5I5", - "outputId": "07e36ae8-912a-482f-e70a-6b1276a3aae7", + "outputId": "b23b1b2e-696d-4af2-98dd-5e2201e13cd5", "colab": { "base_uri": "https://localhost:8080/", - "height": 419 + 
"height": 623 } }, "source": [ - "df_comb = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Capstone/Labelled (Percentage change)/WMT_label2.1.csv')\n", + "df_comb = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Capstone/Labelled (Binary - same day open close)/labelled_DIA.csv')\n", "combine_ds = df_comb.sample(frac=1)\n", "combine_ds" ], @@ -190,86 +233,182 @@ " \n", " \n", " \n", + " Unnamed: 0\n", + " symbol\n", " message\n", - " label\n", + " datetime\n", + " user\n", + " message_id\n", + " Date\n", + " Time\n", + " Day_of_week\n", + " polarity\n", " \n", " \n", " \n", " \n", - " 10779\n", - " $WMT how are you bearish fools doing?\n", - " 1\n", + " 16590\n", + " 16590\n", + " DIA\n", + " $DIA BETS on market tomorrow?\n", + " 2020-04-07 18:12:46\n", + " 1117334\n", + " 205129040\n", + " 2020-04-07\n", + " 18:12:46\n", + " Tuesday\n", + " 0\n", " \n", " \n", - " 7731\n", - " $WMT more pain for stubborn shorts. -signed st...\n", + " 154988\n", + " 154988\n", + " DIA\n", + " Municipal bond funds reported an inflow for th...\n", + " 2014-10-17 12:15:41\n", + " 237666\n", + " 28110605\n", + " 2014-10-17\n", + " 12:15:41\n", + " Friday\n", " 1\n", " \n", " \n", - " 3159\n", - " $WMT layoffs and cost cutting measures coming ...\n", - " -1\n", + " 174141\n", + " 174141\n", + " DIA\n", + " TS TradePlan for YM http://stks.co/hbRm - Frid...\n", + " 2013-06-28 08:55:34\n", + " 13803\n", + " 14345522\n", + " 2013-06-28\n", + " 08:55:34\n", + " Friday\n", + " 0\n", " \n", " \n", - " 561\n", - " fyi: Huge (over 1 million) call vol in $T and ...\n", - " 1\n", + " 94680\n", + " 94680\n", + " DIA\n", + " $SPX $SPY $QQQ $NDX $IWM $VIX $DIA $DJIA NASDA...\n", + " 2018-03-14 17:18:12\n", + " 917666\n", + " 116238736\n", + " 2018-03-14\n", + " 17:18:12\n", + " Wednesday\n", + " 0\n", " \n", " \n", - " 12165\n", - " The 6 Most Shorted Dow Stocks\\n\\n$MSFT $CSCO ...\n", + " 64506\n", + " 64506\n", + " DIA\n", + " Although the technical rating is bad, $DIA doe...\n", + " 
2018-12-19 15:39:54\n", + " 47688\n", + " 148382242\n", + " 2018-12-19\n", + " 15:39:54\n", + " Wednesday\n", " 0\n", " \n", " \n", " ...\n", " ...\n", " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", - " 7153\n", - " Yesterdays Sell On Close Closing Order Imbal...\n", - " -1\n", + " 166025\n", + " 166025\n", + " DIA\n", + " $DIA overly bearish sentiment tells me we coul...\n", + " 2014-02-03 20:08:06\n", + " 170632\n", + " 19676228\n", + " 2014-02-03\n", + " 20:08:06\n", + " Monday\n", + " 0\n", " \n", " \n", - " 7790\n", - " Why XPO Logistics Inc Stock Is More Than a Spe...\n", - " 1\n", + " 182008\n", + " 182008\n", + " DIA\n", + " If $SPY $DIA $QQQ continue to fall. Look for b...\n", + " 2012-09-26 15:08:40\n", + " 161307\n", + " 9693821\n", + " 2012-09-26\n", + " 15:08:40\n", + " Wednesday\n", + " 0\n", " \n", " \n", - " 10965\n", - " Newport Wealth Strategies Inc.,has filed Form ...\n", - " 1\n", + " 36692\n", + " 36692\n", + " DIA\n", + " $SPY $DIA green \\n \\nSo looks like the rever...\n", + " 2019-11-21 19:36:44\n", + " 349810\n", + " 184600235\n", + " 2019-11-21\n", + " 19:36:44\n", + " Thursday\n", + " 0\n", " \n", " \n", - " 12201\n", - " $WMT Walmart needs to show us some action today\n", - " 1\n", + " 144330\n", + " 144330\n", + " DIA\n", + " "@ep_capital: $INTC -0.70% the lone decli...\n", + " 2015-07-09 14:02:15\n", + " 8115\n", + " 39428873\n", + " 2015-07-09\n", + " 14:02:15\n", + " Thursday\n", + " 0\n", " \n", " \n", - " 1225\n", - " #Tesco 2nd of Top 3 global grocers to quit Jap...\n", - " 1\n", + " 49227\n", + " 49227\n", + " DIA\n", + " $spy #spy $qqq #qqq $dia #dia worth attempting...\n", + " 2019-05-13 15:08:04\n", + " 1874654\n", + " 164013252\n", + " 2019-05-13\n", + " 15:08:04\n", + " Monday\n", + " 0\n", " \n", " \n", "\n", - "

12231 rows × 2 columns

\n", + "

189685 rows × 10 columns

\n", "" ], "text/plain": [ - " message label\n", - "10779 $WMT how are you bearish fools doing? 1\n", - "7731 $WMT more pain for stubborn shorts. -signed st... 1\n", - "3159 $WMT layoffs and cost cutting measures coming ... -1\n", - "561 fyi: Huge (over 1 million) call vol in $T and ... 1\n", - "12165 The 6 Most Shorted Dow Stocks\\n\\n$MSFT $CSCO ... 0\n", - "... ... ...\n", - "7153 Yesterdays Sell On Close Closing Order Imbal... -1\n", - "7790 Why XPO Logistics Inc Stock Is More Than a Spe... 1\n", - "10965 Newport Wealth Strategies Inc.,has filed Form ... 1\n", - "12201 $WMT Walmart needs to show us some action today 1\n", - "1225 #Tesco 2nd of Top 3 global grocers to quit Jap... 1\n", + " Unnamed: 0 symbol ... Day_of_week polarity\n", + "16590 16590 DIA ... Tuesday 0\n", + "154988 154988 DIA ... Friday 1\n", + "174141 174141 DIA ... Friday 0\n", + "94680 94680 DIA ... Wednesday 0\n", + "64506 64506 DIA ... Wednesday 0\n", + "... ... ... ... ... ...\n", + "166025 166025 DIA ... Monday 0\n", + "182008 182008 DIA ... Wednesday 0\n", + "36692 36692 DIA ... Thursday 0\n", + "144330 144330 DIA ... Thursday 0\n", + "49227 49227 DIA ... 
Monday 0\n", "\n", - "[12231 rows x 2 columns]" + "[189685 rows x 10 columns]" ] }, "metadata": { @@ -288,11 +427,24 @@ "## Preprocess data" ] }, + { + "cell_type": "code", + "metadata": { + "id": "i8TteZhWQ0Ba" + }, + "source": [ + "# fill nan values in file with '0'\n", + "combine_ds.isna().values.any()\n", + "combine_ds['message'] = combine_ds['message'].fillna('0')" + ], + "execution_count": 8, + "outputs": [] + }, { "cell_type": "code", "metadata": { "id": "z70r1eYrnoB_", - "outputId": "ea37ebdc-ea6a-4fa9-ce8c-610b7bbac64f", + "outputId": "62b8b512-a7fa-411d-de03-cebd62463cb4", "colab": { "base_uri": "https://localhost:8080/", "height": 54 @@ -304,12 +456,12 @@ "\n", "print(message[:10])" ], - "execution_count": 4, + "execution_count": 9, "outputs": [ { "output_type": "stream", "text": [ - "['$wmt how are you bearish fools doing?', '$wmt more pain for stubborn shorts. -signed stubborn long', '$wmt layoffs and cost cutting measures coming -brazil is a disaster.no position. if it gets to 45, maybe..lol', 'fyi: huge (over 1 million) call vol in $t and $vz itm july calls yesterday. chain: http://stk.ly/owk4r5 , http://stk.ly/mqiyye > what's up?', 'the 6 most shorted dow stocks\\n\\n$msft $csco $vz $ko $v \\n\\nhttps://247wallst.com/investing/2019/12/26/6-most-shorted-dow-stocks-microsoft-is-king-of-the-hill-in-december/1/', '02:57:30 pm manual exit. closing my $wmt position of 672 shares that was opened oct 29 for a 0.07% gain.', '@sspencer_smb tmobile/sprint still a foreign owed enterprise, either way.. buy american $t and $vz ..', '@scarborory agree .... $s owns the most spectrum in the us comparing w $t , $vz and $tmus', '$baba just had some positive pr on fox news radio. talked about how $baba is destroying $wmt due to it's success.', '$wmt $ostk $msft employees chant this in the bathroom']\n" + "['$dia bets on market tomorrow?', 'municipal bond funds reported an inflow for the 23rd time in the past 24wks (+$444mn). 
$dia $qqq $spy $mub $hyd $tlt $tbt', 'ts tradeplan for ym http://stks.co/hbrm - friday june 28 2013 $es_f $ym_f $6e_f $cl_f $gc_f sent to members $dia $indu #futures #trading', '$spx $spy $qqq $ndx $iwm $vix $dia $djia nasdaq market internals summary', 'although the technical rating is bad, $dia does present a nice setup opportunity. https://www.chartmill.com/analyze.php?utm_source=stocktwits&utm_medium=ta&utm_content=setup&utm_campaign=social_tracking#/dia?key=810cb939-dc70-485b-b7ec-a9a1fe911f70', 'new hod on $tick on that move over 15459 as well $dia $indu', '$iwm $spy $qqq $dia $spx markets makers move prices flat into the sun. vol. event. as close to middle as possible.', '$dia they will not rollback tariffs . c'mon look at the news man! the asian markets usa everyone this will be no deal!!!!', '$spy $dia panic selling in overall markets however not all stocks selling off here and continue to focus on stronger positions #livetradepro', 'bearish persistence on the aaii survey http://stks.co/dmvy via @ppearlman $spy $dia']\n" ], "name": "stdout" } @@ -348,19 +500,46 @@ " # print(re_t)\n", " return re_t" ], - "execution_count": 5, + "execution_count": 10, "outputs": [] }, { "cell_type": "code", "metadata": { - "id": "_3YWRwzLBGs1", - "outputId": "25a76e0b-f43d-4419-bfc5-77f4da53ff49", + "id": "gfz_YC4eQ6nk", + "outputId": "3faa5c5c-5dd2-4a21-b2c7-664c044b4887", "colab": { "base_uri": "https://localhost:8080/", "height": 51 } }, + "source": [ + "print(sys.getrecursionlimit())\n", + "sys.setrecursionlimit(1800)\n", + "print(sys.getrecursionlimit())" + ], + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "text": [ + "1000\n", + "1800\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "_3YWRwzLBGs1", + "outputId": "3127d551-45f8-4435-f173-92abbd01342d", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 170 + } + }, "source": [ "stop_words = sw.words(\"english\")\n", "tweet_tokenizer = 
TweetTokenizer()\n", @@ -373,13 +552,20 @@ "# segmenter using the word statistics from Twitter\n", "seg_tw = Segmenter(corpus=\"twitter\")\n" ], - "execution_count": 6, + "execution_count": 12, "outputs": [ { "output_type": "stream", "text": [ + "Word statistics files not found!\n", + "Downloading... done!\n", + "Unpacking... done!\n", "Reading twitter - 1grams ...\n", - "Reading twitter - 2grams ...\n" + "generating cache file for faster loading...\n", + "reading ngrams /root/.ekphrasis/stats/twitter/counts_1grams.txt\n", + "Reading twitter - 2grams ...\n", + "generating cache file for faster loading...\n", + "reading ngrams /root/.ekphrasis/stats/twitter/counts_2grams.txt\n" ], "name": "stdout" } @@ -393,46 +579,51 @@ "source": [ "message_p = []\n", "for msg in message:\n", - " # remove emojis\n", - " msg = emoji.demojize(msg)\n", "\n", - " # fix contractions\n", - " msg = contractions.fix(msg)\n", + " if msg == '0': #nan replaced by '0'\n", + " message_p.append('-1')\n", "\n", - " # remove punctuations\n", - " msg = remove_punctuation_re(msg) \n", + " else:\n", + " # remove emojis\n", + " msg = emoji.demojize(msg)\n", "\n", - " #tokenize\n", - " msg_tokens = tweet_tokenizer.tokenize(msg)\n", + " # fix contractions\n", + " msg = contractions.fix(msg)\n", "\n", - " #For Hashtags elongated words using Word segmenter\n", - " message_seg = []\n", - " for w in msg_tokens:\n", - " message_seg.append(seg_tw.segment(w))\n", + " # remove punctuations\n", + " msg = remove_punctuation_re(msg) \n", "\n", - " # remove stopwords\n", - " msg = remove_stopwords(message_seg)\n", + " #tokenize\n", + " msg_tokens = tweet_tokenizer.tokenize(msg)\n", "\n", - " if 'rt' in msg:\n", - " # remove retweets\n", - " message_p.append('-1')\n", - " else: \n", - " # detokenize\n", - " msg = detokenizer.detokenize(msg)\n", + " #For Hashtags elongated words using Word segmenter\n", + " message_seg = []\n", + " for w in msg_tokens:\n", + " message_seg.append(seg_tw.segment(w))\n", + "\n", + " # 
remove stopwords\n", + " msg = remove_stopwords(message_seg)\n", "\n", - " # removing repeating characters like hurrrryyyyyy-- worrks on tokenized list\n", - " msg = processRepeatings(msg)\n", + " if 'rt' in msg:\n", + " # remove retweets\n", + " message_p.append('-1')\n", + " else: \n", + " # detokenize\n", + " msg = detokenizer.detokenize(msg)\n", "\n", - " message_p.append(msg)\n" + " # removing repeating characters like hurrrryyyyyy-- worrks on tokenized list\n", + " msg = processRepeatings(msg)\n", + "\n", + " message_p.append(msg)\n" ], - "execution_count": 7, + "execution_count": 13, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "pXWjHAcQoIEh", - "outputId": "346a4824-d1fd-485f-8280-64eb64327499", + "outputId": "da41adaf-1b9a-46ee-80a0-7bcb935a31c6", "colab": { "base_uri": "https://localhost:8080/", "height": 187 @@ -441,28 +632,28 @@ "source": [ "message_p[:10]" ], - "execution_count": 8, + "execution_count": 14, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ - "['wmt bearish fools',\n", - " 'wmt pain stubborn shorts signed stubborn long',\n", - " 'wmt layoffs cost cutting measures coming brazil disaster position gets 45 maybe lol',\n", - " 'fyi huge 1 million call vol vz itm july calls yesterday chain gt 39',\n", - " '6 shorted dow stocks msft csco vz ko v',\n", - " '02 57 30 pm manual exit closing wmt position 672 shares opened oct 29 0 07 gain',\n", - " 'tmobile sprint still foreign owed enterprise either way buy american vz',\n", - " 'agree owns spectrum us comparing w vz tmus',\n", - " 'baba positive pr fox news radio talked baba destroying wmt due 39 success',\n", - " 'wmt ostk msft employees chant bathroom']" + "['dia bets market tomorrow',\n", + " 'municipal bond funds reported inflow 23rd time past 24wks 44mn dia qq spy mub hyd tlt tbt',\n", + " 'ts tradeplan ym friday june 28 2013 es f ym f 6e f cl f gc f sent members dia indu futures trading',\n", + " 'spx spy qq ndx iwm vix dia djia nasdaq market 
internals summary',\n", + " 'although technical rating bad dia present nice setup opportunity',\n", + " 'new hod tick move 15459 well dia indu',\n", + " 'iwm spy qq dia spx markets makers move prices flat sun vol event close middle possible',\n", + " 'dia rollback tariffs c 39 mon look news man asian markets usa everyone deal',\n", + " 'spy dia panic selling overall markets however stocks selling continue focus stronger positions livetradepro',\n", + " 'bearish persistence aaii survey via spy dia']" ] }, "metadata": { "tags": [] }, - "execution_count": 8 + "execution_count": 14 } ] }, @@ -470,17 +661,17 @@ "cell_type": "code", "metadata": { "id": "KkSpMpJWAG4u", - "outputId": "434cb601-02ad-4600-d726-93487cc11064", + "outputId": "7ac5fba5-e2a0-4599-af4f-8d41f5dcd91f", "colab": { "base_uri": "https://localhost:8080/", - "height": 419 + "height": 623 } }, "source": [ "combine_ds['message'] = message_p\n", "combine_ds" ], - "execution_count": 10, + "execution_count": 15, "outputs": [ { "output_type": "execute_result", @@ -504,92 +695,219 @@ " \n", " \n", " \n", + " Unnamed: 0\n", + " symbol\n", " message\n", - " label\n", + " datetime\n", + " user\n", + " message_id\n", + " Date\n", + " Time\n", + " Day_of_week\n", + " polarity\n", " \n", " \n", " \n", " \n", - " 10779\n", - " wmt bearish fools\n", - " 1\n", + " 16590\n", + " 16590\n", + " DIA\n", + " dia bets market tomorrow\n", + " 2020-04-07 18:12:46\n", + " 1117334\n", + " 205129040\n", + " 2020-04-07\n", + " 18:12:46\n", + " Tuesday\n", + " 0\n", " \n", " \n", - " 7731\n", - " wmt pain stubborn shorts signed stubborn long\n", + " 154988\n", + " 154988\n", + " DIA\n", + " municipal bond funds reported inflow 23rd time...\n", + " 2014-10-17 12:15:41\n", + " 237666\n", + " 28110605\n", + " 2014-10-17\n", + " 12:15:41\n", + " Friday\n", " 1\n", " \n", " \n", - " 3159\n", - " wmt layoffs cost cutting measures coming brazi...\n", - " -1\n", + " 174141\n", + " 174141\n", + " DIA\n", + " ts tradeplan ym friday june 28 
2013 es f ym f ...\n", + " 2013-06-28 08:55:34\n", + " 13803\n", + " 14345522\n", + " 2013-06-28\n", + " 08:55:34\n", + " Friday\n", + " 0\n", " \n", " \n", - " 561\n", - " fyi huge 1 million call vol vz itm july calls ...\n", - " 1\n", + " 94680\n", + " 94680\n", + " DIA\n", + " spx spy qq ndx iwm vix dia djia nasdaq market ...\n", + " 2018-03-14 17:18:12\n", + " 917666\n", + " 116238736\n", + " 2018-03-14\n", + " 17:18:12\n", + " Wednesday\n", + " 0\n", " \n", " \n", - " 12165\n", - " 6 shorted dow stocks msft csco vz ko v\n", + " 64506\n", + " 64506\n", + " DIA\n", + " although technical rating bad dia present nice...\n", + " 2018-12-19 15:39:54\n", + " 47688\n", + " 148382242\n", + " 2018-12-19\n", + " 15:39:54\n", + " Wednesday\n", " 0\n", " \n", " \n", " ...\n", " ...\n", " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", - " 7153\n", - " yesterdays sell close closing order imbalance ...\n", - " -1\n", + " 166025\n", + " 166025\n", + " DIA\n", + " dia overly bearish sentiment tells could see b...\n", + " 2014-02-03 20:08:06\n", + " 170632\n", + " 19676228\n", + " 2014-02-03\n", + " 20:08:06\n", + " Monday\n", + " 0\n", " \n", " \n", - " 7790\n", - " xpo logistics inc stock speculative buy xpo tg...\n", - " 1\n", + " 182008\n", + " 182008\n", + " DIA\n", + " spy dia qq continue fall look biotechs like am...\n", + " 2012-09-26 15:08:40\n", + " 161307\n", + " 9693821\n", + " 2012-09-26\n", + " 15:08:40\n", + " Wednesday\n", + " 0\n", " \n", " \n", - " 10965\n", - " newport wealth strategies inc filed form 13f q...\n", - " 1\n", + " 36692\n", + " 36692\n", + " DIA\n", + " spy dia green looks like reversal day release ...\n", + " 2019-11-21 19:36:44\n", + " 349810\n", + " 184600235\n", + " 2019-11-21\n", + " 19:36:44\n", + " Thursday\n", + " 0\n", " \n", " \n", - " 12201\n", - " wmt walmart needs show us action today\n", - " 1\n", + " 144330\n", + " 144330\n", + " DIA\n", + " quot ep capital intc 
0 70 lone decliner dji di...\n", + " 2015-07-09 14:02:15\n", + " 8115\n", + " 39428873\n", + " 2015-07-09\n", + " 14:02:15\n", + " Thursday\n", + " 0\n", " \n", " \n", - " 1225\n", - " tesco 2nd top 3 global grocers quit japan carr...\n", - " 1\n", + " 49227\n", + " 49227\n", + " DIA\n", + " spy spy qq qq dia dia worth attempting day tra...\n", + " 2019-05-13 15:08:04\n", + " 1874654\n", + " 164013252\n", + " 2019-05-13\n", + " 15:08:04\n", + " Monday\n", + " 0\n", " \n", " \n", "\n", - "

12231 rows × 2 columns

\n", + "

189685 rows × 10 columns

\n", "" ], "text/plain": [ - " message label\n", - "10779 wmt bearish fools 1\n", - "7731 wmt pain stubborn shorts signed stubborn long 1\n", - "3159 wmt layoffs cost cutting measures coming brazi... -1\n", - "561 fyi huge 1 million call vol vz itm july calls ... 1\n", - "12165 6 shorted dow stocks msft csco vz ko v 0\n", - "... ... ...\n", - "7153 yesterdays sell close closing order imbalance ... -1\n", - "7790 xpo logistics inc stock speculative buy xpo tg... 1\n", - "10965 newport wealth strategies inc filed form 13f q... 1\n", - "12201 wmt walmart needs show us action today 1\n", - "1225 tesco 2nd top 3 global grocers quit japan carr... 1\n", + " Unnamed: 0 symbol ... Day_of_week polarity\n", + "16590 16590 DIA ... Tuesday 0\n", + "154988 154988 DIA ... Friday 1\n", + "174141 174141 DIA ... Friday 0\n", + "94680 94680 DIA ... Wednesday 0\n", + "64506 64506 DIA ... Wednesday 0\n", + "... ... ... ... ... ...\n", + "166025 166025 DIA ... Monday 0\n", + "182008 182008 DIA ... Wednesday 0\n", + "36692 36692 DIA ... Thursday 0\n", + "144330 144330 DIA ... Thursday 0\n", + "49227 49227 DIA ... 
Monday 0\n", "\n", - "[12231 rows x 2 columns]" + "[189685 rows x 10 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 15 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "nTSMxP_VRRQu", + "outputId": "cf9d2165-28df-45d8-a04e-6b5122e98f83", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 68 + } + }, + "source": [ + "combine_ds['polarity'].value_counts()" + ], + "execution_count": 16, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "1 99652\n", + "0 90033\n", + "Name: polarity, dtype: int64" ] }, "metadata": { "tags": [] }, - "execution_count": 10 + "execution_count": 16 } ] }, @@ -597,10 +915,10 @@ "cell_type": "code", "metadata": { "id": "3DraQK_uDcBU", - "outputId": "1413d0b6-54b1-49e4-8fcb-26c8302a2c30", + "outputId": "e80c6e22-7af4-4050-cbcb-c7c7f5393a90", "colab": { "base_uri": "https://localhost:8080/", - "height": 419 + "height": 623 } }, "source": [ @@ -608,7 +926,7 @@ "combine_ds.drop(combine_ds[combine_ds['message'] == '-1'].index, inplace = True) \n", "combine_ds" ], - "execution_count": 12, + "execution_count": 17, "outputs": [ { "output_type": "execute_result", @@ -632,94 +950,272 @@ " \n", " \n", " \n", + " Unnamed: 0\n", + " symbol\n", " message\n", - " label\n", + " datetime\n", + " user\n", + " message_id\n", + " Date\n", + " Time\n", + " Day_of_week\n", + " polarity\n", " \n", " \n", " \n", " \n", - " 10779\n", - " wmt bearish fools\n", - " 1\n", + " 16590\n", + " 16590\n", + " DIA\n", + " dia bets market tomorrow\n", + " 2020-04-07 18:12:46\n", + " 1117334\n", + " 205129040\n", + " 2020-04-07\n", + " 18:12:46\n", + " Tuesday\n", + " 0\n", " \n", " \n", - " 7731\n", - " wmt pain stubborn shorts signed stubborn long\n", + " 154988\n", + " 154988\n", + " DIA\n", + " municipal bond funds reported inflow 23rd time...\n", + " 2014-10-17 12:15:41\n", + " 237666\n", + " 28110605\n", + " 2014-10-17\n", + " 12:15:41\n", + " Friday\n", " 1\n", " \n", " \n", - " 
3159\n", - " wmt layoffs cost cutting measures coming brazi...\n", - " -1\n", + " 174141\n", + " 174141\n", + " DIA\n", + " ts tradeplan ym friday june 28 2013 es f ym f ...\n", + " 2013-06-28 08:55:34\n", + " 13803\n", + " 14345522\n", + " 2013-06-28\n", + " 08:55:34\n", + " Friday\n", + " 0\n", " \n", " \n", - " 561\n", - " fyi huge 1 million call vol vz itm july calls ...\n", - " 1\n", + " 94680\n", + " 94680\n", + " DIA\n", + " spx spy qq ndx iwm vix dia djia nasdaq market ...\n", + " 2018-03-14 17:18:12\n", + " 917666\n", + " 116238736\n", + " 2018-03-14\n", + " 17:18:12\n", + " Wednesday\n", + " 0\n", " \n", " \n", - " 12165\n", - " 6 shorted dow stocks msft csco vz ko v\n", + " 64506\n", + " 64506\n", + " DIA\n", + " although technical rating bad dia present nice...\n", + " 2018-12-19 15:39:54\n", + " 47688\n", + " 148382242\n", + " 2018-12-19\n", + " 15:39:54\n", + " Wednesday\n", " 0\n", " \n", " \n", " ...\n", " ...\n", " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", - " 7153\n", - " yesterdays sell close closing order imbalance ...\n", - " -1\n", + " 166025\n", + " 166025\n", + " DIA\n", + " dia overly bearish sentiment tells could see b...\n", + " 2014-02-03 20:08:06\n", + " 170632\n", + " 19676228\n", + " 2014-02-03\n", + " 20:08:06\n", + " Monday\n", + " 0\n", " \n", " \n", - " 7790\n", - " xpo logistics inc stock speculative buy xpo tg...\n", - " 1\n", + " 182008\n", + " 182008\n", + " DIA\n", + " spy dia qq continue fall look biotechs like am...\n", + " 2012-09-26 15:08:40\n", + " 161307\n", + " 9693821\n", + " 2012-09-26\n", + " 15:08:40\n", + " Wednesday\n", + " 0\n", " \n", " \n", - " 10965\n", - " newport wealth strategies inc filed form 13f q...\n", - " 1\n", + " 36692\n", + " 36692\n", + " DIA\n", + " spy dia green looks like reversal day release ...\n", + " 2019-11-21 19:36:44\n", + " 349810\n", + " 184600235\n", + " 2019-11-21\n", + " 19:36:44\n", + " Thursday\n", + " 
0\n", " \n", " \n", - " 12201\n", - " wmt walmart needs show us action today\n", - " 1\n", + " 144330\n", + " 144330\n", + " DIA\n", + " quot ep capital intc 0 70 lone decliner dji di...\n", + " 2015-07-09 14:02:15\n", + " 8115\n", + " 39428873\n", + " 2015-07-09\n", + " 14:02:15\n", + " Thursday\n", + " 0\n", " \n", " \n", - " 1225\n", - " tesco 2nd top 3 global grocers quit japan carr...\n", - " 1\n", + " 49227\n", + " 49227\n", + " DIA\n", + " spy spy qq qq dia dia worth attempting day tra...\n", + " 2019-05-13 15:08:04\n", + " 1874654\n", + " 164013252\n", + " 2019-05-13\n", + " 15:08:04\n", + " Monday\n", + " 0\n", " \n", " \n", "\n", - "

12082 rows × 2 columns

\n", + "

188599 rows × 10 columns

\n", "" ], "text/plain": [ - " message label\n", - "10779 wmt bearish fools 1\n", - "7731 wmt pain stubborn shorts signed stubborn long 1\n", - "3159 wmt layoffs cost cutting measures coming brazi... -1\n", - "561 fyi huge 1 million call vol vz itm july calls ... 1\n", - "12165 6 shorted dow stocks msft csco vz ko v 0\n", - "... ... ...\n", - "7153 yesterdays sell close closing order imbalance ... -1\n", - "7790 xpo logistics inc stock speculative buy xpo tg... 1\n", - "10965 newport wealth strategies inc filed form 13f q... 1\n", - "12201 wmt walmart needs show us action today 1\n", - "1225 tesco 2nd top 3 global grocers quit japan carr... 1\n", + " Unnamed: 0 symbol ... Day_of_week polarity\n", + "16590 16590 DIA ... Tuesday 0\n", + "154988 154988 DIA ... Friday 1\n", + "174141 174141 DIA ... Friday 0\n", + "94680 94680 DIA ... Wednesday 0\n", + "64506 64506 DIA ... Wednesday 0\n", + "... ... ... ... ... ...\n", + "166025 166025 DIA ... Monday 0\n", + "182008 182008 DIA ... Wednesday 0\n", + "36692 36692 DIA ... Thursday 0\n", + "144330 144330 DIA ... Thursday 0\n", + "49227 49227 DIA ... 
Monday 0\n", "\n", - "[12082 rows x 2 columns]" + "[188599 rows x 10 columns]" ] }, "metadata": { "tags": [] }, - "execution_count": 12 + "execution_count": 17 } ] + }, + { + "cell_type": "code", + "metadata": { + "id": "0L1EiPK6RTOJ", + "outputId": "143de1eb-8beb-470f-8ca9-13b16d3df5f1", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 68 + } + }, + "source": [ + "combine_ds['polarity'].value_counts()" + ], + "execution_count": 18, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "1 99069\n", + "0 89530\n", + "Name: polarity, dtype: int64" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 18 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ExuSBMADRVWx" + }, + "source": [ + "# Uncomment for FB\n", + "# FB_IPO = '2012-05-21'\n", + "# combine_ds.drop(combine_ds[combine_ds['Date'] < FB_IPO].index, inplace = True) \n", + "# combine_ds" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "SzbU2bZgRWfg" + }, + "source": [ + "# Uncomment for TSLA\n", + "# TSLA_IPO = '2010-06-30'\n", + "# combine_ds.drop(combine_ds[combine_ds['Date'] < TSLA_IPO].index, inplace = True) \n", + "# combine_ds" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "hMWwq5dKRX2X" + }, + "source": [ + "# combine_ds['polarity'].value_counts()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "uOx_0zmyRZbT" + }, + "source": [ + "combine_ds.to_csv('prep_WMT1.csv')\n", + "!cp prep_WMT1.csv \"/content/drive/My Drive\"" + ], + "execution_count": 19, + "outputs": [] } ] } \ No newline at end of file