From 85d005123c4b5fdbac31ceec99d691826a7d253b Mon Sep 17 00:00:00 2001
From: Priyankamandal7719 <71107020+Priyankamandal7719@users.noreply.github.com>
Date: Thu, 15 Oct 2020 20:45:50 +1100
Subject: [PATCH] Created using Colaboratory

---
 PreprocessingFinal.ipynb | 209 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 209 insertions(+)
 create mode 100644 PreprocessingFinal.ipynb

diff --git a/PreprocessingFinal.ipynb b/PreprocessingFinal.ipynb
new file mode 100644
index 0000000..8e586cc
--- /dev/null
+++ b/PreprocessingFinal.ipynb
@@ -0,0 +1,209 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "PreprocessingFinal.ipynb",
+      "provenance": [],
+      "authorship_tag": "ABX9TyMgZo25t5kp1nELT9FOxLtd",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "\"Open"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "_gXtdFlzndWh"
+      },
+      "source": [
+        "!pip install contractions\n",
+        "!pip install emoji\n",
+        "import pandas as pd\n",
+        "import re\n",
+        "import emoji\n",
+        "import nltk\n",
+        "import contractions\n",
+        "import torch\n",
+        "\n",
+        "nltk.download('punkt')\n",
+        "nltk.download('stopwords')\n",
+        "# from nltk.tokenize import word_tokenize\n",
+        "from nltk.tokenize import TweetTokenizer \n",
+        "from nltk.corpus import stopwords as sw\n",
+        "from nltk.tokenize import word_tokenize \n",
+        "from nltk.tokenize.treebank import TreebankWordDetokenizer\n",
+        "import requests \n",
+        "from pprint import pprint\n",
+        "import numpy as np\n",
+        "from nltk.stem import PorterStemmer\n",
+        "\n",
+        "import tensorflow as tf\n",
+        "import tensorflow_hub as hub\n",
+        "from datetime import datetime\n",
+        "from tensorflow import keras\n",
+        "# import bert\n",
+        "# from bert import run_classifier\n",
+        "# from bert import optimization\n",
+        "# from bert import tokenization\n",
+        "import os\n",
+        "import pprint\n",
+        "import json\n",
+        "import random\n",
+        "import string\n",
+        "import sys"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "z70r1eYrnoB_"
+      },
+      "source": [
+        "# combine_ds is assumed to be a pandas DataFrame with a 'message' column, loaded in an earlier cell\n",
+        "combine_ds['message'] = combine_ds['message'].str.lower()\n",
+        "message = combine_ds['message'].tolist()\n",
+        "\n",
+        "print(message[:10])\n"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "C65MA9lJnwEv"
+      },
+      "source": [
+        "def remove_stopwords(tokens):\n",
+        "    filtered_sentence = [w for w in tokens if w not in stop_words]\n",
+        "    return filtered_sentence\n",
+        "\n",
+        "def remove_punctuation_re(x):\n",
+        "    x = ' '.join(re.sub(r\"https?://\\S+\",\"\",x).split()) # Removing URLs\n",
+        "\n",
+        "    x = ' '.join(re.sub(r\"^@\\S+|\\s@\\S+\",\"\",x).split()) # Removing mentions\n",
+        "\n",
+        "    # x = ' '.join(re.sub(r'[^$\\w\\s]',\" \",x).split())\n",
+        "    x = ' '.join(re.sub(r'[^\\w\\s]',\" \",x).split()) # Removing remaining punctuation (strips the '#' from hashtags)\n",
+        "\n",
+        "    x = ' '.join(re.sub(r'_',\" \",x).split()) # Removing _ from demojized emoji names\n",
+        "\n",
+        "    return x"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "4LRFAxgcn_fi"
+      },
+      "source": [
+        "message_er = []\n",
+        "ps = PorterStemmer()\n",
+        "stop_words = sw.words(\"english\")\n",
+        "tweet_tokenizer = TweetTokenizer()\n",
+        "detokenizer = TreebankWordDetokenizer()\n",
+        "message_p = []\n",
+        "for msg in message:\n",
+        "\n",
+        "    # convert emojis to their text names\n",
+        "    msg = emoji.demojize(msg)\n",
+        "\n",
+        "    # tokenize\n",
+        "    tokens = tweet_tokenizer.tokenize(msg)\n",
+        "\n",
+        "    # remove stopwords\n",
+        "    msg = remove_stopwords(tokens)\n",
+        "    if 'rt' in msg:\n",
+        "        # mark retweets for removal\n",
+        "        message_p.append('-1')\n",
+        "    else:\n",
+        "        # detokenize\n",
+        "        msg = detokenizer.detokenize(msg)\n",
+        "\n",
+        "        # fix contractions\n",
+        "        msg = contractions.fix(msg)\n",
+        "\n",
+        "        # remove punctuation\n",
+        "        msg = remove_punctuation_re(msg)\n",
+        "        message_p.append(msg)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "pXWjHAcQoIEh"
+      },
+      "source": [
+        "# Removing repeated characters in elongated words like hurrrryyyyyy -- works on the tokenized list\n",
+        "import itertools\n",
+        "\n",
+        "# message_tok is assumed to hold the tokenized messages built in the cells above\n",
+        "strOfMsg = \" \".join(itertools.chain.from_iterable(message_tok))\n",
+        "message_rpt = re.compile(r\"(.)\\1{2,}\", re.IGNORECASE)\n",
+        "\n",
+        "def rpt_replace(match):\n",
+        "    return match.group(1)+match.group(1)\n",
+        "\n",
+        "# t = 'amzn dip buyer fulll attack boooyaaaaaaaaaaaaaaaah'\n",
+        "re_t = ''\n",
+        "message_nrp = []\n",
+        "\n",
+        "def processRepeatings(data):\n",
+        "    re_t = re.sub(message_rpt, rpt_replace, data)\n",
+        "    message_nrp.append(re_t)\n",
+        "\n",
+        "processRepeatings(strOfMsg)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "pwVS7SDnoaIt"
+      },
+      "source": [
+        "# For hashtags and elongated words, use the ekphrasis word segmenter\n",
+        "!pip install ekphrasis\n",
+        "from ekphrasis.classes.segmenter import Segmenter\n",
+        "\n",
+        "# segmenter using the word statistics from English Wikipedia\n",
+        "# seg_eng = Segmenter(corpus=\"english\") \n",
+        "message_seg = []\n",
+        "\n",
+        "# segmenter using the word statistics from Twitter\n",
+        "seg_tw = Segmenter(corpus=\"twitter\")\n",
+        "\n",
+        "# words = [\"exponentialbackoff\", \"gamedev\", \"retrogaming\", \"thewatercooler\", \"panpsychism\"]\n",
+        "# message_sw is assumed to hold the stop-word-filtered messages from the cells above\n",
+        "for w in message_sw:\n",
+        "    # print(w)\n",
+        "    message_seg.append(seg_tw.segment(w))\n",
+        "    # print(\"(tw):\", seg_tw.segment(w))\n",
+        "    # print()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file