# .Rhistory

# Run from the project folder so the relative file names used below resolve.
# NOTE(review): absolute, machine-specific path — edit before running elsewhere.
setwd("C:/Users/axays/Downloads/United_Airlines_Analysis")
# Packages for the review analysis. The attach order is left untouched because
# it decides which package wins when two of them export the same name.
library(tidyverse)
library(tidytext)
library(readr)
library(syuzhet)
library(tm)
library(stringr)
# Load the CSV file containing all the passenger reviews of 4 airline companies
# into `reviews`; later steps read its Airline.Name, Review and Review.Date
# columns.
reviews <- read.csv("Airline_Reviews.csv")
# Prepare the reviews of a single airline for sentiment analysis.
#
# Args:
#   airline_name: Value of the `Airline.Name` column to keep.
#   data: Data frame with `Airline.Name`, `Review` and `Review.Date` columns.
#     `Review.Date` is parsed with the format "%d-%b-%y" (e.g. "05-Jan-23");
#     "%b" is locale dependent, so month abbreviations must match the locale.
#
# Returns:
#   The matching rows of `data` with `Review.Date` converted to Date and four
#   added columns: `sentiment` (syuzhet score of `Review`), `Year`, `Month`
#   (both numeric) and `Mentioned_Luggage` (1L when the review text contains
#   "luggage" in any letter case, otherwise 0L).
process_airline_reviews <- function(airline_name, data) {
  # Plain indexing instead of subset(): subset() would evaluate `airline_name`
  # inside `data` first, so a column of that name would mask the argument.
  # which() drops rows whose airline is NA, as subset() did.
  keep_rows <- which(data$Airline.Name == airline_name)
  reviews_filtered <- data[keep_rows, , drop = FALSE]

  # Using Syuzhet package, determine the sentiment of each passenger review
  reviews_filtered$sentiment <- get_sentiment(
    reviews_filtered$Review,
    method = "syuzhet"
  )

  # Year and month are derived with base R so the function does not depend on
  # lubridate being attached (it is only loaded further down the script).
  review_dates <- as.Date(reviews_filtered$Review.Date, format = "%d-%b-%y")
  reviews_filtered$Review.Date <- review_dates
  reviews_filtered$Year <- as.numeric(format(review_dates, "%Y"))
  reviews_filtered$Month <- as.numeric(format(review_dates, "%m"))

  # Literal (non-regex) match on the lower-cased text
  reviews_filtered$Mentioned_Luggage <- as.integer(
    grepl("luggage", tolower(reviews_filtered$Review), fixed = TRUE)
  )

  reviews_filtered
}
# Build one prepared review table per airline from the combined `reviews` data
reviews_united <- process_airline_reviews(
  airline_name = "United Airlines",
  data = reviews
)
reviews_delta <- process_airline_reviews(
  airline_name = "Delta Air Lines",
  data = reviews
)
reviews_american <- process_airline_reviews(
  airline_name = "American Airlines",
  data = reviews
)
reviews_southwest <- process_airline_reviews(
  airline_name = "Southwest Airlines",
  data = reviews
)
# Plot, per year, how many negatively scored reviews mention "luggage".
#
# Args:
#   data: Data frame with `Review`, `sentiment` and `Year` columns.
#   airline_name: Label appended to the plot title.
#   color: Fill colour of the bars.
#
# Returns:
#   A ggplot bar chart with one bar per year; it is drawn when the call is
#   evaluated at top level.
analyze_luggage_mentions <- function(data, airline_name, color) {
  # Reviews that mention "luggage" (any letter case) and score below zero
  luggage_count <- data %>%
    filter(grepl("luggage", Review, ignore.case = TRUE), sentiment < 0) %>%
    group_by(Year) %>%
    summarise(Count = n())

  # Year is shown as a discrete axis so every year gets its own labelled bar.
  # The counts are aggregated by year only, hence the "Year" axis label.
  ggplot(luggage_count, aes(x = factor(Year), y = Count)) +
    geom_col(fill = color) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    labs(
      title = paste(
        "Number of Times 'Luggage' Was Mentioned Negatively Per Year Over Time for",
        airline_name
      ),
      x = "Year",
      y = "Number of Mentions"
    )
}
# Draw the yearly negative-luggage-mention chart for each airline in turn
analyze_luggage_mentions(
  data = reviews_united,
  airline_name = "United Airlines",
  color = "blue"
)
analyze_luggage_mentions(
  data = reviews_delta,
  airline_name = "Delta Airlines",
  color = "red"
)
analyze_luggage_mentions(
  data = reviews_american,
  airline_name = "American Airlines",
  color = "yellow"
)
analyze_luggage_mentions(
  data = reviews_southwest,
  airline_name = "Southwest Airlines",
  color = "green"
)
# Stop-word table: the default get_stopwords() list followed by the custom
# words read line by line from custom_stopwords.txt, reduced to one row per
# distinct `word`.
all_stopwords <- get_stopwords() %>%
  bind_rows(tibble(word = read_lines("custom_stopwords.txt"))) %>%
  distinct(word)
# Print the most frequent words found in negatively scored reviews.
#
# Args:
#   data: Data frame with a `Review` text column and a numeric `sentiment`
#     column.
#   stopword_table: Data frame with a `word` column listing the words to drop.
#     Defaults to the global `all_stopwords`, looked up when the function is
#     called. NOTE: further down this script `all_stopwords` is reassigned to
#     a plain character vector; pass the table explicitly when calling after
#     that point.
#   n_words: Number of top words to print (default 10).
#
# Returns:
#   The printed table of `word` and its count `n`, invisibly (the value of
#   print()).
display_word_count <- function(data, stopword_table = all_stopwords,
                               n_words = 10) {
  word_count <- data %>%
    filter(sentiment < 0) %>%
    unnest_tokens(word, Review) %>%
    anti_join(stopword_table, by = "word") %>%
    count(word, sort = TRUE)

  print(head(word_count, n = n_words))
}
# Display word count for negative reviews of United Airlines
display_word_count(reviews_united)

# Compare mean ratings for reviews that do / do not mention luggage.
# na.rm = TRUE keeps the group means consistent with the t-test below, which
# drops missing ratings.
# NOTE(review): assumes Overall_Rating is numeric — confirm after read.csv().
tapply(
  reviews_united$Overall_Rating,
  reviews_united$Mentioned_Luggage,
  mean,
  na.rm = TRUE
)

# Perform a t-test (the data frame is supplied through the `data` argument of
# the formula interface)
t.test(Overall_Rating ~ Mentioned_Luggage, data = reviews_united)

# Box plot of the same two groups
boxplot(
  Overall_Rating ~ Mentioned_Luggage,
  data = reviews_united,
  xlab = "Luggage Mentioned? (0=No, 1=Yes)",
  ylab = "Overall Rating"
)
# Print the mean sentiment of an airline's reviews and, optionally, how that
# mean would change if the negative reviews mentioning "luggage" were absent.
#
# Args:
#   data: Data frame with a `Review` text column and a numeric `sentiment`
#     column.
#   airline_name: Label used in the printed output.
#   show_luggage_impact: Whether to also print the mean without negative
#     luggage reviews and the resulting increase. Defaults to TRUE only for
#     "United Airlines", which reproduces the previously hard-coded behaviour.
#
# Returns:
#   NULL, invisibly; the function is called for its printed output.
calculate_sentiment_stats <- function(
    data,
    airline_name,
    show_luggage_impact = isTRUE(airline_name == "United Airlines")) {
  current_mean_sentiment <- mean(data$sentiment, na.rm = TRUE)
  cat("Current Mean Sentiment of", airline_name, ":", current_mean_sentiment, "\n")

  # Nothing else to report: skip the extra filtering work
  if (!show_luggage_impact) {
    return(invisible(NULL))
  }

  # Negative reviews that mention "luggage" in any letter case
  negative_luggage <- grepl("luggage", data$Review, ignore.case = TRUE) &
    data$sentiment < 0
  # which() discards rows where the condition is NA (missing sentiment)
  remaining_sentiment <- data$sentiment[which(!negative_luggage)]
  new_mean_sentiment <- mean(remaining_sentiment, na.rm = TRUE)

  # Gain in mean sentiment in the absence of negative luggage reviews
  increase_in_mean_sentiment <- new_mean_sentiment - current_mean_sentiment

  cat("Increase in Mean Sentiment in absence of reviews containing 'Luggage' having Negative sentiment:", increase_in_mean_sentiment, "\n")
  cat("Mean Sentiment in absence of Reviews containing 'Luggage' having negative sentiment:", new_mean_sentiment, "\n")
  invisible(NULL)
}
# Print the mean sentiment of every airline for comparison; the United call
# additionally reports the gain in the absence of negative luggage reviews
calculate_sentiment_stats(
  data = reviews_united,
  airline_name = "United Airlines"
)
calculate_sentiment_stats(
  data = reviews_delta,
  airline_name = "Delta Airlines"
)
calculate_sentiment_stats(
  data = reviews_american,
  airline_name = "American Airlines"
)
calculate_sentiment_stats(
  data = reviews_southwest,
  airline_name = "Southwest Airlines"
)
# For Recommendations to United to resolve their Luggage issues based on Reviews:
# Keep the United reviews that mention "luggage" and have sentiment < 0
filtered_data <- subset(
  reviews_united,
  grepl("luggage", Review, ignore.case = TRUE) & sentiment < 0
)

# Read custom stop words (one per line)
custom_stopwords <- readLines("custom_stopwords_luggage.txt")

# Combine with English stop words.
# NOTE: this replaces the stop-word table of the same name built earlier with
# a plain character vector, so display_word_count() must not rely on the
# global `all_stopwords` after this line.
all_stopwords <- c(stopwords("english"), custom_stopwords)

# Prepare the corpus: lower-case, strip punctuation, drop stop words
# NOTE(review): punctuation is removed before the stop words, so stop words
# that contain an apostrophe presumably no longer match — confirm whether the
# two steps should be swapped.
corpus <- Corpus(VectorSource(filtered_data$Review))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, all_stopwords)

# Create a term-document matrix (terms in rows) and total each term's count
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
word_freq <- sort(rowSums(m), decreasing = TRUE)

# Get the most frequent words causing Luggage handling issues
top_luggage_handling_issues <- head(word_freq, 10)

# Blank line before the table; cat() emits a real newline, whereas print()
# would show the escaped string [1] "\n"
cat("\n")
print(top_luggage_handling_issues)

# Create a pie chart showing top luggage handling issues
pie(
  top_luggage_handling_issues,
  labels = names(top_luggage_handling_issues),
  main = "Top 10 Words in Luggage Handling Issues",
  col = rainbow(length(top_luggage_handling_issues))
)
library(syuzhet)
library(ggplot2)
library(dplyr)
library(lubridate)
# Read the United Airlines reviews
reviews <- read.csv("United_All.csv")
# Score every review text with the syuzhet method
reviews$sentiment <- get_sentiment(reviews$Review, method = "syuzhet")
# Parse the Review.Date column (day-month-year order) into dates
reviews$Review.Date <- dmy(reviews$Review.Date)
# One row per review date holding that date's mean sentiment
daily_sentiment <- summarise(
  group_by(reviews, Review.Date),
  mean_sentiment = mean(sentiment, na.rm = TRUE)
)
# Show only the fitted straight line: no points and no confidence band
ggplot(
  data = daily_sentiment,
  mapping = aes(x = Review.Date, y = mean_sentiment)
) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  theme_minimal() +
  labs(
    title = "Average Sentiment Over Review Date",
    x = "Review Date",
    y = "Average Sentiment Score"
  )
library(syuzhet)
library(ggplot2)
library(dplyr)
library(lubridate)
# Read the United Airlines reviews
reviews <- read.csv("United_All.csv")
# Score every review text with the syuzhet method
reviews$sentiment <- get_sentiment(reviews$Review, method = "syuzhet")
# Parse the Review.Date column (day-month-year order) into dates
reviews$Review.Date <- dmy(reviews$Review.Date)
# One row per review date holding that date's mean sentiment
daily_sentiment <- summarise(
  group_by(reviews, Review.Date),
  mean_sentiment = mean(sentiment, na.rm = TRUE)
)
# Show only the fitted straight line: no points and no confidence band
ggplot(
  data = daily_sentiment,
  mapping = aes(x = Review.Date, y = mean_sentiment)
) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  theme_minimal() +
  labs(
    title = "Average Sentiment Over Review Date",
    x = "Review Date",
    y = "Average Sentiment Score"
  )
# Same project folder as at the top of this history; the relative path
# "United_All.csv" read below depends on it.
# NOTE(review): absolute, machine-specific path — edit before running elsewhere.
setwd("C:/Users/axays/Downloads/United_Airlines_Analysis")
# Packages for the sentiment-trend plot that follows.
library(syuzhet)
library(ggplot2)
library(dplyr)
library(lubridate)
# Read the United Airlines reviews
reviews <- read.csv("United_All.csv")
# Score every review text with the syuzhet method
reviews$sentiment <- get_sentiment(reviews$Review, method = "syuzhet")
# Parse the Review.Date column (day-month-year order) into dates
reviews$Review.Date <- dmy(reviews$Review.Date)
# One row per review date holding that date's mean sentiment
daily_sentiment <- summarise(
  group_by(reviews, Review.Date),
  mean_sentiment = mean(sentiment, na.rm = TRUE)
)
# Show only the fitted straight line: no points and no confidence band
ggplot(
  data = daily_sentiment,
  mapping = aes(x = Review.Date, y = mean_sentiment)
) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  theme_minimal() +
  labs(
    title = "Average Sentiment Over Review Date",
    x = "Review Date",
    y = "Average Sentiment Score"
  )