diff --git a/PreprocessingFinal.ipynb b/PreprocessingFinal.ipynb
index 8e586cc..84ba3a5 100644
--- a/PreprocessingFinal.ipynb
+++ b/PreprocessingFinal.ipynb
@@ -5,7 +5,6 @@
"colab": {
"name": "PreprocessingFinal.ipynb",
"provenance": [],
- "authorship_tag": "ABX9TyMgZo25t5kp1nELT9FOxLtd",
"include_colab_link": true
},
"kernelspec": {
@@ -32,6 +31,8 @@
"source": [
"!pip install contractions\n",
"!pip install emoji\n",
+ "!pip install ekphrasis\n",
+ "\n",
"import pandas as pd\n",
"import re\n",
"import emoji\n",
@@ -64,7 +65,9 @@
"import json\n",
"import random\n",
"import string\n",
- "import sys"
+ "import sys\n",
+ "from ekphrasis.classes.segmenter import Segmenter\n",
+ "import itertools\n"
],
"execution_count": null,
"outputs": []
@@ -72,7 +75,126 @@
{
"cell_type": "code",
"metadata": {
- "id": "z70r1eYrnoB_"
+ "id": "vFOsh5pJq2QZ",
+ "outputId": "5e5b78f9-48e8-4757-a2a1-356258c3d736",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 34
+ }
+ },
+ "source": [
+ "from google.colab import drive\n",
+ "drive.mount('/content/drive')"
+ ],
+ "execution_count": 3,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "text": [
+ "Mounted at /content/drive\n"
+ ],
+ "name": "stdout"
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "BVXGLwXaq5I5",
+ "outputId": "874c4c78-14a8-49d2-f1bf-d68c621db7bd",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 204
+ }
+ },
+ "source": [
+ "df_comb = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Capstone/Labelled (Percentage change)/AAPL_label2.1.csv')\n",
+ "combine_ds = df_comb.sample(frac=1)\n",
+ "combine_ds.head()"
+ ],
+ "execution_count": 4,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " message | \n",
+ " label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 563747 | \n",
+ " $AAPL sold my 250 April puts! BAM | \n",
+ " -1.0 | \n",
+ "
\n",
+ " \n",
+ " 378786 | \n",
+ " Stocks from the SMB Scanner: $FB $EXPE $FEYE $... | \n",
+ " -1.0 | \n",
+ "
\n",
+ " \n",
+ " 295238 | \n",
+ " $AAPL wish apple would just buyback to 125 to ... | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 110904 | \n",
+ " @zspecs @SlopeOfHope @JRNavarro75 Options can ... | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 1117553 | \n",
+ " $SPY $AAPL \\n\\nJesús Christ. That was some day! | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " message label\n",
+ "563747 $AAPL sold my 250 April puts! BAM -1.0\n",
+ "378786 Stocks from the SMB Scanner: $FB $EXPE $FEYE $... -1.0\n",
+ "295238 $AAPL wish apple would just buyback to 125 to ... 0.0\n",
+ "110904 @zspecs @SlopeOfHope @JRNavarro75 Options can ... 1.0\n",
+ "1117553 $SPY $AAPL \\n\\nJesús Christ. That was some day! 0.0"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "execution_count": 4
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "z70r1eYrnoB_",
+ "outputId": "bd2bd23f-5c39-4eb8-9169-5e18df89a7e9",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 54
+ }
},
"source": [
"combine_ds['message'] = combine_ds['message'].str.lower()\n",
@@ -80,8 +202,16 @@
"\n",
"print(message[:10])\n"
],
- "execution_count": null,
- "outputs": []
+ "execution_count": 5,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "text": [
+ "['$aapl sold my 250 april puts! bam', 'stocks from the smb scanner: $fb $expe $feye $blue $nvo $uso $aapl $expe', '$aapl wish apple would just buyback to 125 to get over the 120/700 mental barrier', '@zspecs @slopeofhope @jrnavarro75 options can also be used to reduce risk, as in these examples: http://stks.co/gne2 $aapl', '$spy $aapl \\n\\njesús christ. that was some day!', '$aapl forging ahead with an upward range every day...whether closing at new highs or just small gains..very bullish signal over all.', '$aapl trump supporters can't afford apple products anyway so this boycott isn't going to affect sales. rather, it will boost sales and attn', 'today's iphone event will be a test of trust for apple $aapl http://goo.gl/fpwxke', 'it's pretty clear that supply >> demand out there right now.. $spy $dia $qqq $aapl $nflx $nvda $pypl etc', 'the industry average profit margin is 1.64%. $aapl outperforms 97% of its industry peers. https://www.chartmill.com/stock/quote/aapl/fundamental-analysis?key=bb853040-a4ac-41c6-b549-d218d2f21b32&utm_source=stocktwits&utm_medium=fa&utm_content=aapl&utm_campaign=social_tracking']\n"
+ ],
+ "name": "stdout"
+ }
+ ]
},
{
"cell_type": "code",
@@ -90,7 +220,7 @@
},
"source": [
"def remove_stopwords(msg):\n",
- " filtered_sentence = [w for w in tokens if not w in stop_words]\n",
+ " filtered_sentence = [w for w in msg_tokens if not w in stop_words]\n",
" return filtered_sentence\n",
"\n",
"def remove_punctuation_re(x):\n",
@@ -103,103 +233,126 @@
"\n",
" x = ' '.join(re.sub(r'_',\" \",x).split()) #Removing _ from emojis text\n",
"\n",
- " return x"
+ " return x\n",
+ "\n",
+ "# replace repeating letter\n",
+ "def rpt_replace(match):\n",
+ " # print(match.group(1))\n",
+ " return match.group(1)+match.group(1)\n",
+ "\n",
+ "# substitute original word with replaced word, if any\n",
+ "def processRepeatings(data):\n",
+ " re_t= re.sub(message_rpt, rpt_replace, data )\n",
+ " # print(re_t)\n",
+ " return re_t"
],
- "execution_count": null,
+ "execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
- "id": "4LRFAxgcn_fi"
+ "id": "4LRFAxgcn_fi",
+ "outputId": "a10b4eae-a37d-4694-fe94-f127beeebfcf",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 51
+ }
},
"source": [
- "message_er = []\n",
- "ps = PorterStemmer()\n",
"stop_words = sw.words(\"english\")\n",
"tweet_tokenizer = TweetTokenizer()\n",
"detokenizer = TreebankWordDetokenizer()\n",
"message_p = []\n",
+ "\n",
+ "# for repeating characters in words\n",
+ "message_rpt = re.compile(r\"(.)\\1{2,}\", re.IGNORECASE)\n",
+ "\n",
+ "# segmenter using the word statistics from Twitter\n",
+ "seg_tw = Segmenter(corpus=\"twitter\")\n",
+ "\n",
"for msg in message:\n",
- " \n",
" # remove emojis\n",
" msg = emoji.demojize(msg)\n",
- " \n",
+ "\n",
+ " # fix contractions\n",
+ " msg = contractions.fix(msg)\n",
+ "\n",
+ " # remove punctuations\n",
+ " msg = remove_punctuation_re(msg) \n",
+ " message_p.append(msg)\n",
" #tokenize\n",
- " tokens = tweet_tokenizer.tokenize(msg)\n",
+ " msg_tokens = tweet_tokenizer.tokenize(msg)\n",
+ "\n",
+ " #For Hashtags elongated words using Word segmenter\n",
+ " message_seg = []\n",
+ " for w in msg_tokens:\n",
+ " message_seg.append(seg_tw.segment(w))\n",
"\n",
" # remove stopwords\n",
- " msg = remove_stopwords(msg)\n",
+ " msg = remove_stopwords(message_seg)\n",
+ "\n",
" if 'rt' in msg:\n",
" # remove retweets\n",
" message_p.append('-1')\n",
" else: \n",
" # detokenize\n",
" msg = detokenizer.detokenize(msg)\n",
- " \n",
- " # fix contractions\n",
- " msg = contractions.fix(msg)\n",
- "\n",
- " # remove punctuations\n",
- " msg = remove_punctuation_re(msg) \n",
- " message_p.append(msg)"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "pXWjHAcQoIEh"
- },
- "source": [
- "#Removing repeating words like hurrrryyyyyy-- worrks on tokenized list\n",
- "\n",
- "strOfMsg = \" \".join(itertools.chain.from_iterable(message_tok))\n",
- "message_rpt = re.compile(r\"(.)\\1{2,}\", re.IGNORECASE)\n",
- "\n",
- "def rpt_replace(match):\n",
- " return match.group(1)+match.group(1)\n",
"\n",
- "# t = 'amzn dip buyer fulll attack boooyaaaaaaaaaaaaaaaah'\n",
- "re_t = ''\n",
- "message_nrp = []\n",
+ " # removing repeating words like hurrrryyyyyy-- worrks on tokenized list\n",
+ " msg = processRepeatings(msg)\n",
"\n",
- "def processRepeatings(data):\n",
- " re_t= re.sub(message_rpt, rpt_replace, data )\n",
- " return message_nrp.append(re_t)\n",
- "\n",
- "processRepeatings(strOfMsg)"
+ " message_p.append(msg)\n"
],
- "execution_count": null,
- "outputs": []
+ "execution_count": 9,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "text": [
+ "Reading twitter - 1grams ...\n",
+ "Reading twitter - 2grams ...\n"
+ ],
+ "name": "stdout"
+ }
+ ]
},
{
"cell_type": "code",
"metadata": {
- "id": "pwVS7SDnoaIt"
+ "id": "pXWjHAcQoIEh",
+ "outputId": "79b08153-5f77-4203-a1bf-308ca594ab36",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 187
+ }
},
"source": [
- "#For Hashtags elongated words using Word segmenter\n",
- "!pip install ekphrasis\n",
- "from ekphrasis.classes.segmenter import Segmenter\n",
- "\n",
- "# segmenter using the word statistics from english Wikipedia\n",
- "# seg_eng = Segmenter(corpus=\"english\") \n",
- "message_seg = []\n",
- "\n",
- "# segmenter using the word statistics from Twitter\n",
- "seg_tw = Segmenter(corpus=\"twitter\")\n",
- "\n",
- "# words = [\"exponentialbackoff\", \"gamedev\", \"retrogaming\", \"thewatercooler\", \"panpsychism\"]\n",
- "for w in message_sw:\n",
- " # print(w)\n",
- " message_seg.append(seg_tw.segment(w))\n",
- " # print(\"(tw):\", seg_tw.segment(w))\n",
- " # print()"
+ "message_p[:10]"
],
- "execution_count": null,
- "outputs": []
+ "execution_count": 12,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "['aapl sold my 250 april puts bam',\n",
+ " 'aapl sold 250 april puts bam',\n",
+ " 'stocks from the smb scanner fb expe feye blue nvo uso aapl expe',\n",
+ " 'stocks smb scanner fb expe feye blue nvo uso aapl expe',\n",
+ " 'aapl wish apple would just buyback to 125 to get over the 120 700 mental barrier',\n",
+ " 'aapl wish apple would buyback 125 get 120 700 mental barrier',\n",
+ " 'options can also be used to reduce risk as in these examples aapl',\n",
+ " 'options also used reduce risk examples aapl',\n",
+ " 'spy aapl jesús christ that was some day',\n",
+ " 'spy aapl jesús christ day']"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "execution_count": 12
+ }
+ ]
}
]
}
\ No newline at end of file