
Commit

Created using Colaboratory
Priyankamandal7719 committed Oct 15, 2020
1 parent dddadc4 commit 85d0051
Showing 1 changed file with 205 additions and 0 deletions.
205 changes: 205 additions & 0 deletions PreprocessingFinal.ipynb
@@ -0,0 +1,205 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "PreprocessingFinal.ipynb",
"provenance": [],
"authorship_tag": "ABX9TyMgZo25t5kp1nELT9FOxLtd",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/mjag7682/CS9-1-NLP-for-Twitter-Data-for-predicting-stocks/blob/master/PreprocessingFinal.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "_gXtdFlzndWh"
},
"source": [
"!pip install contractions\n",
"!pip install emoji\n",
"import pandas as pd\n",
"import re\n",
"import emoji\n",
"import nltk\n",
"import contractions\n",
"import torch\n",
"\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"# from nltk.tokenize import word_tokenize\n",
"from nltk.tokenize import TweetTokenizer \n",
"from nltk.corpus import stopwords as sw\n",
"from nltk.tokenize import word_tokenize \n",
"from nltk.tokenize.treebank import TreebankWordDetokenizer\n",
"import requests \n",
"from pprint import pprint\n",
"import numpy as np\n",
"from nltk.stem import PorterStemmer\n",
"\n",
"import tensorflow as tf\n",
"import tensorflow_hub as hub\n",
"from datetime import datetime\n",
"from tensorflow import keras\n",
"# import bert\n",
"# from bert import run_classifier\n",
"# from bert import optimization\n",
"# from bert import tokenization\n",
"import os\n",
"import pprint\n",
"import json\n",
"import random\n",
"import string\n",
"import sys"
],
"execution_count": null,
"outputs": []
},
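{
"cell_type": "markdown",
"metadata": {},
"source": [
"`combine_ds` is referenced in the next cell but never defined in this commit. Below is a minimal sketch with hypothetical sample tweets so the cells that follow can run standalone; the real notebook presumably loads the scraped Twitter dataset here."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Hypothetical stand-in: the original data-loading cell is not part of this commit.\n",
"combine_ds = pd.DataFrame({\n",
"    'message': [\n",
"        'RT @trader: $AMZN dip buyer fulll attack boooyaaaah',\n",
"        'Check https://example.com for #gamedev news!!!',\n",
"        \"I can't believe $TSLA closed green today\"\n",
"    ]\n",
"})"
],
"execution_count": null,
"outputs": []
},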
{
"cell_type": "code",
"metadata": {
"id": "z70r1eYrnoB_"
},
"source": [
"combine_ds['message'] = combine_ds['message'].str.lower()\n",
"message = combine_ds['message'].tolist()\n",
"\n",
"print(message[:10])\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "C65MA9lJnwEv"
},
"source": [
"def remove_stopwords(msg):\n",
" filtered_sentence = [w for w in tokens if not w in stop_words]\n",
" return filtered_sentence\n",
"\n",
"def remove_punctuation_re(x):\n",
" x = ' '.join(re.sub(\"https?://\\S+\",\"\",x).split()) #Removing URLs\n",
"\n",
" x = ' '.join(re.sub(\"^@\\S+|\\s@\\S+\",\"\",x).split()) #Removing Mentions\n",
"\n",
" # x = ' '.join(re.sub(r'[^$\\w\\s]',\" \",x).split())\n",
" x = ' '.join(re.sub(r'[^\\w\\s]',\" \",x).split()) #Removes Hashtags\n",
"\n",
" x = ' '.join(re.sub(r'_',\" \",x).split()) #Removing _ from emojis text\n",
"\n",
" return x"
],
"execution_count": null,
"outputs": []
},
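{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of `remove_punctuation_re` on an illustrative string (not from the dataset): the URL, the mention, and all punctuation should disappear."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"sample = 'check https://example.com @user great #gamedev day!!!'\n",
"print(remove_punctuation_re(sample))  # -> 'check great gamedev day'"
],
"execution_count": null,
"outputs": []
},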
{
"cell_type": "code",
"metadata": {
"id": "4LRFAxgcn_fi"
},
"source": [
"message_er = []\n",
"ps = PorterStemmer()\n",
"stop_words = sw.words(\"english\")\n",
"tweet_tokenizer = TweetTokenizer()\n",
"detokenizer = TreebankWordDetokenizer()\n",
"message_p = []\n",
"for msg in message:\n",
" \n",
" # remove emojis\n",
" msg = emoji.demojize(msg)\n",
" \n",
" #tokenize\n",
" tokens = tweet_tokenizer.tokenize(msg)\n",
"\n",
" # remove stopwords\n",
" msg = remove_stopwords(msg)\n",
" if 'rt' in msg:\n",
" # remove retweets\n",
" message_p.append('-1')\n",
" else: \n",
" # detokenize\n",
" msg = detokenizer.detokenize(msg)\n",
" \n",
" # fix contractions\n",
" msg = contractions.fix(msg)\n",
"\n",
" # remove punctuations\n",
" msg = remove_punctuation_re(msg) \n",
" message_p.append(msg)"
],
"execution_count": null,
"outputs": []
},
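{
"cell_type": "markdown",
"metadata": {},
"source": [
"The two library calls doing most of the work above, shown in isolation. Exact demojized names can vary across `emoji` package versions; the expansions in the comments are indicative."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"print(emoji.demojize('🙂'))  # e.g. ':slightly_smiling_face:'\n",
"print(contractions.fix(\"can't won't it's\"))  # -> 'cannot will not it is'"
],
"execution_count": null,
"outputs": []
},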
{
"cell_type": "code",
"metadata": {
"id": "pXWjHAcQoIEh"
},
"source": [
"#Removing repeating words like hurrrryyyyyy-- worrks on tokenized list\n",
"\n",
"strOfMsg = \" \".join(itertools.chain.from_iterable(message_tok))\n",
"message_rpt = re.compile(r\"(.)\\1{2,}\", re.IGNORECASE)\n",
"\n",
"def rpt_replace(match):\n",
" return match.group(1)+match.group(1)\n",
"\n",
"# t = 'amzn dip buyer fulll attack boooyaaaaaaaaaaaaaaaah'\n",
"re_t = ''\n",
"message_nrp = []\n",
"\n",
"def processRepeatings(data):\n",
" re_t= re.sub(message_rpt, rpt_replace, data )\n",
" return message_nrp.append(re_t)\n",
"\n",
"processRepeatings(strOfMsg)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "pwVS7SDnoaIt"
},
"source": [
"#For Hashtags elongated words using Word segmenter\n",
"!pip install ekphrasis\n",
"from ekphrasis.classes.segmenter import Segmenter\n",
"\n",
"# segmenter using the word statistics from english Wikipedia\n",
"# seg_eng = Segmenter(corpus=\"english\") \n",
"message_seg = []\n",
"\n",
"# segmenter using the word statistics from Twitter\n",
"seg_tw = Segmenter(corpus=\"twitter\")\n",
"\n",
"# words = [\"exponentialbackoff\", \"gamedev\", \"retrogaming\", \"thewatercooler\", \"panpsychism\"]\n",
"for w in message_sw:\n",
" # print(w)\n",
" message_seg.append(seg_tw.segment(w))\n",
" # print(\"(tw):\", seg_tw.segment(w))\n",
" # print()"
],
"execution_count": null,
"outputs": []
}
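{
"cell_type": "markdown",
"metadata": {},
"source": [
"The commented-out example words from the cell above, run through the Twitter-statistics segmenter. Segmentations depend on the corpus statistics ekphrasis ships with, so the split shown in the comment is indicative."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"for w in [\"exponentialbackoff\", \"gamedev\", \"retrogaming\"]:\n",
"    print(w, '->', seg_tw.segment(w))  # e.g. 'gamedev -> game dev'"
],
"execution_count": null,
"outputs": []
}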
]
}
