From 5a8253fbd678b5dfb2a3f40c5a8bc866d4cb3ac8 Mon Sep 17 00:00:00 2001 From: Omj69 <144990466+Omj69@users.noreply.github.com> Date: Thu, 17 Apr 2025 09:58:31 +0530 Subject: [PATCH] Add files via upload --- Fake_News_Detector_Improved.ipynb | 647 ++++++++++++++++++++++++++++++ 1 file changed, 647 insertions(+) create mode 100644 Fake_News_Detector_Improved.ipynb diff --git a/Fake_News_Detector_Improved.ipynb b/Fake_News_Detector_Improved.ipynb new file mode 100644 index 0000000..dbe7c65 --- /dev/null +++ b/Fake_News_Detector_Improved.ipynb @@ -0,0 +1,647 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 9, + "id": "b6edf3ba", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import re\n", + "import pickle\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.metrics import accuracy_score, confusion_matrix\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a8b8f15a", + "metadata": {}, + "outputs": [], + "source": [ + "def preprocess(text):\n", + " text = text.lower()\n", + " text = re.sub(r'\\W', ' ', text)\n", + " text = re.sub(r'\\s+', ' ', text)\n", + " return text.strip()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "cb9f81aa", + "metadata": {}, + "outputs": [], + "source": [ + "# Load datasets\n", + "true = pd.read_csv('data/True.csv')\n", + "fake = pd.read_csv('data/Fake.csv')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "610f29ba", + "metadata": {}, + "outputs": [], + "source": [ + "# Add labels\n", + "true['label'] = 1\n", + "fake['label'] = 0\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e5bd1bae", + "metadata": {}, + "outputs": [], + "source": [ + "# Preprocess\n", + "true['text'] = true['text'].apply(preprocess)\n", + "fake['text'] = fake['text'].apply(preprocess)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "7013c671", + "metadata": {}, + "outputs": [], + "source": [ + "# Balance dataset\n", + "min_len = min(len(true), len(fake))\n", + "true = true.sample(min_len, random_state=42)\n", + "fake = fake.sample(min_len, random_state=42)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "90325fb1", + "metadata": {}, + "outputs": [], + "source": [ + "# Combine and shuffle\n", + "df = pd.concat([true, fake])\n", + "df = df.sample(frac=1, random_state=42).reset_index(drop=True)\n", + "\n", + "X = df['text']\n", + "y = df['label']" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "2565508c", + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "\n", + "# Improved vectorizer settings\n", + "vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9, min_df=2, ngram_range=(1,2))\n", + "X_train_vec = vectorizer.fit_transform(X_train)\n", + "X_test_vec = vectorizer.transform(X_test)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "2fdcccc5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
MultinomialNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
MultinomialNB()