diff --git a/PreprocessingFinal.ipynb b/PreprocessingFinal.ipynb index 84ba3a5..a89d5af 100644 --- a/PreprocessingFinal.ipynb +++ b/PreprocessingFinal.ipynb @@ -23,10 +23,24 @@ "" ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "HkqDSyOVDpRD" + }, + "source": [ + "## Install, Import, Mount statements" + ] + }, { "cell_type": "code", "metadata": { - "id": "_gXtdFlzndWh" + "id": "_gXtdFlzndWh", + "outputId": "534bcbcb-1ea4-4c2c-e8db-8bd6b96d8cc1", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 445 + } }, "source": [ "!pip install contractions\n", @@ -69,50 +83,90 @@ "from ekphrasis.classes.segmenter import Segmenter\n", "import itertools\n" ], - "execution_count": null, - "outputs": [] + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Requirement already satisfied: contractions in /usr/local/lib/python3.6/dist-packages (0.0.25)\n", + "Requirement already satisfied: textsearch in /usr/local/lib/python3.6/dist-packages (from contractions) (0.0.17)\n", + "Requirement already satisfied: Unidecode in /usr/local/lib/python3.6/dist-packages (from textsearch->contractions) (1.1.1)\n", + "Requirement already satisfied: pyahocorasick in /usr/local/lib/python3.6/dist-packages (from textsearch->contractions) (1.4.0)\n", + "Requirement already satisfied: emoji in /usr/local/lib/python3.6/dist-packages (0.6.0)\n", + "Requirement already satisfied: ekphrasis in /usr/local/lib/python3.6/dist-packages (0.5.1)\n", + "Requirement already satisfied: ujson in /usr/local/lib/python3.6/dist-packages (from ekphrasis) (4.0.1)\n", + "Requirement already satisfied: ftfy in /usr/local/lib/python3.6/dist-packages (from ekphrasis) (5.8)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from ekphrasis) (4.41.1)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from ekphrasis) (1.18.5)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from ekphrasis) (3.2.2)\n", + "Requirement already satisfied: termcolor in /usr/local/lib/python3.6/dist-packages (from ekphrasis) (1.1.0)\n", + "Requirement already satisfied: nltk in /usr/local/lib/python3.6/dist-packages (from ekphrasis) (3.2.5)\n", + "Requirement already satisfied: colorama in /usr/local/lib/python3.6/dist-packages (from ekphrasis) (0.4.4)\n", + "Requirement already satisfied: wcwidth in /usr/local/lib/python3.6/dist-packages (from ftfy->ekphrasis) (0.2.5)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->ekphrasis) (1.2.0)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->ekphrasis) (0.10.0)\n", + "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->ekphrasis) (2.8.1)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->ekphrasis) (2.4.7)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from nltk->ekphrasis) (1.15.0)\n", + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n", + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ], + "name": "stdout" + } + ] }, { "cell_type": "code", "metadata": { "id": "vFOsh5pJq2QZ", - "outputId": "5e5b78f9-48e8-4757-a2a1-356258c3d736", + "outputId": "d8e8d313-417d-4b07-af4b-3195d01cc237", "colab": { "base_uri": "https://localhost:8080/", - "height": 34 + "height": 54 } }, "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], - "execution_count": 3, + "execution_count": 2, "outputs": [ { "output_type": "stream", "text": [ - "Mounted at /content/drive\n" + "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" ], "name": "stdout" } ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "T1O0ewkADyRM" + }, + "source": [ + "## Load file" + ] + }, { "cell_type": "code", "metadata": { "id": "BVXGLwXaq5I5", - "outputId": "874c4c78-14a8-49d2-f1bf-d68c621db7bd", + "outputId": "07e36ae8-912a-482f-e70a-6b1276a3aae7", "colab": { "base_uri": "https://localhost:8080/", - "height": 204 + "height": 419 } }, "source": [ - "df_comb = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Capstone/Labelled (Percentage change)/AAPL_label2.1.csv')\n", + "df_comb = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Capstone/Labelled (Percentage change)/WMT_label2.1.csv')\n", "combine_ds = df_comb.sample(frac=1)\n", - "combine_ds.head()" + "combine_ds" ], - "execution_count": 4, + "execution_count": 3, "outputs": [ { "output_type": "execute_result", @@ -142,55 +196,103 @@ " \n", "
\n", "12231 rows × 2 columns
\n", "" ], "text/plain": [ - " message label\n", - "563747 $AAPL sold my 250 April puts! BAM -1.0\n", - "378786 Stocks from the SMB Scanner: $FB $EXPE $FEYE $... -1.0\n", - "295238 $AAPL wish apple would just buyback to 125 to ... 0.0\n", - "110904 @zspecs @SlopeOfHope @JRNavarro75 Options can ... 1.0\n", - "1117553 $SPY $AAPL \\n\\nJesús Christ. That was some day! 0.0" + " message label\n", + "10779 $WMT how are you bearish fools doing? 1\n", + "7731 $WMT more pain for stubborn shorts. -signed st... 1\n", + "3159 $WMT layoffs and cost cutting measures coming ... -1\n", + "561 fyi: Huge (over 1 million) call vol in $T and ... 1\n", + "12165 The 6 Most Shorted Dow Stocks\\n\\n$MSFT $CSCO ... 0\n", + "... ... ...\n", + "7153 Yesterdays Sell On Close Closing Order Imbal... -1\n", + "7790 Why XPO Logistics Inc Stock Is More Than a Spe... 1\n", + "10965 Newport Wealth Strategies Inc.,has filed Form ... 1\n", + "12201 $WMT Walmart needs to show us some action today 1\n", + "1225 #Tesco 2nd of Top 3 global grocers to quit Jap... 1\n", + "\n", + "[12231 rows x 2 columns]" ] }, "metadata": { "tags": [] }, - "execution_count": 4 + "execution_count": 3 } ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "B8G7WU8SD08e" + }, + "source": [ + "## Preprocess data" + ] + }, { "cell_type": "code", "metadata": { "id": "z70r1eYrnoB_", - "outputId": "bd2bd23f-5c39-4eb8-9169-5e18df89a7e9", + "outputId": "ea37ebdc-ea6a-4fa9-ce8c-610b7bbac64f", "colab": { "base_uri": "https://localhost:8080/", "height": 54 @@ -200,14 +302,14 @@ "combine_ds['message'] = combine_ds['message'].str.lower()\n", "message = combine_ds['message'].tolist()\n", "\n", - "print(message[:10])\n" + "print(message[:10])" ], - "execution_count": 5, + "execution_count": 4, "outputs": [ { "output_type": "stream", "text": [ - "['$aapl sold my 250 april puts! bam', 'stocks from the smb scanner: $fb $expe $feye $blue $nvo $uso $aapl $expe', '$aapl wish apple would just buyback to 125 to get over the 120/700 mental barrier', '@zspecs @slopeofhope @jrnavarro75 options can also be used to reduce risk, as in these examples: http://stks.co/gne2 $aapl', '$spy $aapl \\n\\njesús christ. that was some day!', '$aapl forging ahead with an upward range every day...whether closing at new highs or just small gains..very bullish signal over all.', '$aapl trump supporters can't afford apple products anyway so this boycott isn't going to affect sales. rather, it will boost sales and attn', 'today's iphone event will be a test of trust for apple $aapl http://goo.gl/fpwxke', 'it's pretty clear that supply >> demand out there right now.. $spy $dia $qqq $aapl $nflx $nvda $pypl etc', 'the industry average profit margin is 1.64%. $aapl outperforms 97% of its industry peers. https://www.chartmill.com/stock/quote/aapl/fundamental-analysis?key=bb853040-a4ac-41c6-b549-d218d2f21b32&utm_source=stocktwits&utm_medium=fa&utm_content=aapl&utm_campaign=social_tracking']\n" + "['$wmt how are you bearish fools doing?', '$wmt more pain for stubborn shorts. -signed stubborn long', '$wmt layoffs and cost cutting measures coming -brazil is a disaster.no position. if it gets to 45, maybe..lol', 'fyi: huge (over 1 million) call vol in $t and $vz itm july calls yesterday. chain: http://stk.ly/owk4r5 , http://stk.ly/mqiyye > what's up?', 'the 6 most shorted dow stocks\\n\\n$msft $csco $vz $ko $v \\n\\nhttps://247wallst.com/investing/2019/12/26/6-most-shorted-dow-stocks-microsoft-is-king-of-the-hill-in-december/1/', '02:57:30 pm manual exit. closing my $wmt position of 672 shares that was opened oct 29 for a 0.07% gain.', '@sspencer_smb tmobile/sprint still a foreign owed enterprise, either way.. buy american $t and $vz ..', '@scarborory agree .... $s owns the most spectrum in the us comparing w $t , $vz and $tmus', '$baba just had some positive pr on fox news radio. talked about how $baba is destroying $wmt due to it's success.', '$wmt $ostk $msft employees chant this in the bathroom']\n" ], "name": "stdout" } @@ -246,14 +348,14 @@ " # print(re_t)\n", " return re_t" ], - "execution_count": 8, + "execution_count": 5, "outputs": [] }, { "cell_type": "code", "metadata": { - "id": "4LRFAxgcn_fi", - "outputId": "a10b4eae-a37d-4694-fe94-f127beeebfcf", + "id": "_3YWRwzLBGs1", + "outputId": "25a76e0b-f43d-4419-bfc5-77f4da53ff49", "colab": { "base_uri": "https://localhost:8080/", "height": 51 @@ -263,14 +365,33 @@ "stop_words = sw.words(\"english\")\n", "tweet_tokenizer = TweetTokenizer()\n", "detokenizer = TreebankWordDetokenizer()\n", - "message_p = []\n", + "# message_p = []\n", "\n", "# for repeating characters in words\n", "message_rpt = re.compile(r\"(.)\\1{2,}\", re.IGNORECASE)\n", "\n", "# segmenter using the word statistics from Twitter\n", - "seg_tw = Segmenter(corpus=\"twitter\")\n", - "\n", + "seg_tw = Segmenter(corpus=\"twitter\")\n" + ], + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Reading twitter - 1grams ...\n", + "Reading twitter - 2grams ...\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "4LRFAxgcn_fi" + }, + "source": [ + "message_p = []\n", "for msg in message:\n", " # remove emojis\n", " msg = emoji.demojize(msg)\n", @@ -280,7 +401,7 @@ "\n", " # remove punctuations\n", " msg = remove_punctuation_re(msg) \n", - " message_p.append(msg)\n", + "\n", " #tokenize\n", " msg_tokens = tweet_tokenizer.tokenize(msg)\n", "\n", @@ -299,52 +420,298 @@ " # detokenize\n", " msg = detokenizer.detokenize(msg)\n", "\n", - " # removing repeating words like hurrrryyyyyy-- worrks on tokenized list\n", + " # removing repeating characters like hurrrryyyyyy-- worrks on tokenized list\n", " msg = processRepeatings(msg)\n", "\n", " message_p.append(msg)\n" ], - "execution_count": 9, + "execution_count": 7, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "pXWjHAcQoIEh", + "outputId": "346a4824-d1fd-485f-8280-64eb64327499", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 187 + } + }, + "source": [ + "message_p[:10]" + ], + "execution_count": 8, "outputs": [ { - "output_type": "stream", - "text": [ - "Reading twitter - 1grams ...\n", - "Reading twitter - 2grams ...\n" - ], - "name": "stdout" + "output_type": "execute_result", + "data": { + "text/plain": [ + "['wmt bearish fools',\n", + " 'wmt pain stubborn shorts signed stubborn long',\n", + " 'wmt layoffs cost cutting measures coming brazil disaster position gets 45 maybe lol',\n", + " 'fyi huge 1 million call vol vz itm july calls yesterday chain gt 39',\n", + " '6 shorted dow stocks msft csco vz ko v',\n", + " '02 57 30 pm manual exit closing wmt position 672 shares opened oct 29 0 07 gain',\n", + " 'tmobile sprint still foreign owed enterprise either way buy american vz',\n", + " 'agree owns spectrum us comparing w vz tmus',\n", + " 'baba positive pr fox news radio talked baba destroying wmt due 39 success',\n", + " 'wmt ostk msft employees chant bathroom']" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 8 } ] }, { "cell_type": "code", "metadata": { - "id": "pXWjHAcQoIEh", - "outputId": "79b08153-5f77-4203-a1bf-308ca594ab36", + "id": "KkSpMpJWAG4u", + "outputId": "434cb601-02ad-4600-d726-93487cc11064", "colab": { "base_uri": "https://localhost:8080/", - "height": 187 + "height": 419 } }, "source": [ - "message_p[:10]" + "combine_ds['message'] = message_p\n", + "combine_ds" + ], + "execution_count": 10, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + " | message | \n", + "label | \n", + "
---|---|---|
10779 | \n", + "wmt bearish fools | \n", + "1 | \n", + "
7731 | \n", + "wmt pain stubborn shorts signed stubborn long | \n", + "1 | \n", + "
3159 | \n", + "wmt layoffs cost cutting measures coming brazi... | \n", + "-1 | \n", + "
561 | \n", + "fyi huge 1 million call vol vz itm july calls ... | \n", + "1 | \n", + "
12165 | \n", + "6 shorted dow stocks msft csco vz ko v | \n", + "0 | \n", + "
... | \n", + "... | \n", + "... | \n", + "
7153 | \n", + "yesterdays sell close closing order imbalance ... | \n", + "-1 | \n", + "
7790 | \n", + "xpo logistics inc stock speculative buy xpo tg... | \n", + "1 | \n", + "
10965 | \n", + "newport wealth strategies inc filed form 13f q... | \n", + "1 | \n", + "
12201 | \n", + "wmt walmart needs show us action today | \n", + "1 | \n", + "
1225 | \n", + "tesco 2nd top 3 global grocers quit japan carr... | \n", + "1 | \n", + "
12231 rows × 2 columns
\n", + "\n", + " | message | \n", + "label | \n", + "
---|---|---|
10779 | \n", + "wmt bearish fools | \n", + "1 | \n", + "
7731 | \n", + "wmt pain stubborn shorts signed stubborn long | \n", + "1 | \n", + "
3159 | \n", + "wmt layoffs cost cutting measures coming brazi... | \n", + "-1 | \n", + "
561 | \n", + "fyi huge 1 million call vol vz itm july calls ... | \n", + "1 | \n", + "
12165 | \n", + "6 shorted dow stocks msft csco vz ko v | \n", + "0 | \n", + "
... | \n", + "... | \n", + "... | \n", + "
7153 | \n", + "yesterdays sell close closing order imbalance ... | \n", + "-1 | \n", + "
7790 | \n", + "xpo logistics inc stock speculative buy xpo tg... | \n", + "1 | \n", + "
10965 | \n", + "newport wealth strategies inc filed form 13f q... | \n", + "1 | \n", + "
12201 | \n", + "wmt walmart needs show us action today | \n", + "1 | \n", + "
1225 | \n", + "tesco 2nd top 3 global grocers quit japan carr... | \n", + "1 | \n", + "
12082 rows × 2 columns
\n", + "