-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path.Rhistory
172 lines (172 loc) · 8.25 KB
/
.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# Make the project folder the working directory so that the relative file
# names used below (e.g. "Airline_Reviews.csv") resolve against it.
# NOTE(review): this is a machine-specific absolute path; the script will stop
# here on any machine where the folder does not exist.
setwd("C:/Users/axays/Downloads/United_Airlines_Analysis")
# Attach packages. Keep this order: a package attached later masks same-named
# functions exported by packages attached earlier.
library(tidyverse)
library(tidytext)
library(readr)
library(syuzhet)
library(tm)
library(stringr)
# Load the CSV file containing all the passenger reviews of 4 airline companies
reviews <- read.csv("Airline_Reviews.csv")
#' Select one airline's reviews and add sentiment, date and luggage columns
#'
#' @param airline_name Single string compared for exact equality with
#'   `data$Airline.Name`.
#' @param data Data frame with `Airline.Name`, `Review` and `Review.Date`
#'   columns. `Review.Date` is parsed with the format "%d-%b-%y".
#' @return The matching rows of `data`, with `Review.Date` converted to `Date`
#'   and four added columns: `sentiment` (syuzhet score of `Review`), `Year`
#'   and `Month` (numeric, `NA` where the date did not parse) and
#'   `Mentioned_Luggage` (integer 1 if the review contains "luggage" in any
#'   letter case, otherwise 0).
process_airline_reviews <- function(airline_name, data) {
  # Explicit indexing instead of subset(): a column of `data` that happens to
  # be called `airline_name` can no longer shadow the argument. which() drops
  # rows whose airline name is NA, exactly as subset() did.
  is_airline <- data$Airline.Name == airline_name
  reviews_filtered <- data[which(is_airline), , drop = FALSE]

  # Using Syuzhet package, determine the sentiment of each passenger Review
  reviews_filtered$sentiment <- get_sentiment(
    reviews_filtered$Review,
    method = "syuzhet"
  )

  # NOTE(review): "%b" matches month abbreviations of the current locale, so
  # English abbreviations parse to NA under a non-English LC_TIME.
  reviews_filtered$Review.Date <- as.Date(
    reviews_filtered$Review.Date,
    format = "%d-%b-%y"
  )

  # Base format() rather than lubridate's year()/month(): lubridate is not
  # attached explicitly before this point, so those calls only worked when
  # library(tidyverse) happened to attach it.
  reviews_filtered$Year <- as.numeric(format(reviews_filtered$Review.Date, "%Y"))
  reviews_filtered$Month <- as.numeric(format(reviews_filtered$Review.Date, "%m"))

  reviews_filtered$Mentioned_Luggage <- as.integer(
    grepl("luggage", tolower(reviews_filtered$Review))
  )

  reviews_filtered
}
# Build one enriched review table per airline from the combined data set
reviews_united <- process_airline_reviews(
  airline_name = "United Airlines",
  data = reviews
)
reviews_delta <- process_airline_reviews(
  airline_name = "Delta Air Lines",
  data = reviews
)
reviews_american <- process_airline_reviews(
  airline_name = "American Airlines",
  data = reviews
)
reviews_southwest <- process_airline_reviews(
  airline_name = "Southwest Airlines",
  data = reviews
)
#' Bar chart of negative "luggage" reviews per year for one airline
#'
#' @param data Data frame with `Review`, `sentiment` and `Year` columns.
#' @param airline_name String appended to the plot title.
#' @param color Fill colour of the bars.
#' @return A ggplot object with one bar per year; bar height is the number of
#'   reviews that contain "luggage" (any letter case) and have `sentiment < 0`.
analyze_luggage_mentions <- function(data, airline_name, color) {
  mentions_luggage <- grepl("luggage", data$Review, ignore.case = TRUE)

  luggage_count <- data[mentions_luggage, , drop = FALSE] %>%
    filter(sentiment < 0) %>%
    count(Year, name = "Count")

  # Year is turned into a factor so every year gets its own discrete bar.
  # The x-axis label says "Year" because the counts are grouped by year only.
  ggplot(luggage_count, aes(x = factor(Year), y = Count)) +
    geom_col(fill = color) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    labs(
      title = paste(
        "Number of Times 'Luggage' Was Mentioned Negatively Per Year Over Time for",
        airline_name
      ),
      x = "Year",
      y = "Number of Mentions"
    )
}
# Draw the yearly negative-luggage chart for every airline
analyze_luggage_mentions(
  data = reviews_united,
  airline_name = "United Airlines",
  color = "blue"
)
analyze_luggage_mentions(
  data = reviews_delta,
  airline_name = "Delta Airlines",
  color = "red"
)
analyze_luggage_mentions(
  data = reviews_american,
  airline_name = "American Airlines",
  color = "yellow"
)
analyze_luggage_mentions(
  data = reviews_southwest,
  airline_name = "Southwest Airlines",
  color = "green"
)
# Stop-word table: the default English list followed by the custom words
# (one per line in the text file), reduced to a single de-duplicated `word`
# column
all_stopwords <- distinct(
  bind_rows(
    get_stopwords(),
    tibble(word = read_lines("custom_stopwords.txt"))
  ),
  word
)
#' Print the most frequent non-stop-words in negative reviews
#'
#' @param data Data frame with `Review` (text) and `sentiment` columns.
#' @param stopword_list Words to exclude: either a data frame with a `word`
#'   column or a plain character vector. Defaults to the script-level
#'   `all_stopwords` object, looked up when the function is called.
#' @param n_words Number of top words to print.
#' @return The printed table (columns `word` and `n`), invisibly.
display_word_count <- function(data, stopword_list = all_stopwords, n_words = 10) {
  # `all_stopwords` is reassigned to a character vector later in this script;
  # accept that form too instead of failing inside anti_join().
  if (is.character(stopword_list)) {
    stopword_list <- tibble(word = stopword_list)
  }

  word_count <- data %>%
    filter(sentiment < 0) %>%
    unnest_tokens(word, Review) %>%
    anti_join(stopword_list, by = "word") %>%
    count(word, sort = TRUE)

  print(head(word_count, n = n_words))
}
# Display word count for negative reviews of United Airlines
display_word_count(reviews_united)
# Compare mean ratings for mentioned and not mentioned luggage
# NOTE(review): mean() is called without na.rm, so a group containing a
# missing rating yields NA; assumes Overall_Rating was read as numeric.
tapply(reviews_united$Overall_Rating, reviews_united$Mentioned_Luggage, mean)
# Perform a t-test of rating by luggage mention. The data frame is passed via
# the formula method's `data` argument.
t.test(Overall_Rating ~ Mentioned_Luggage, data = reviews_united)
# Box plot of the same two groups
boxplot(
  Overall_Rating ~ Mentioned_Luggage,
  data = reviews_united,
  xlab = "Luggage Mentioned? (0=No, 1=Yes)",
  ylab = "Overall Rating"
)
#' Print the mean sentiment for an airline
#'
#' Always prints the current mean of `data$sentiment`. When `airline_name` is
#' exactly "United Airlines" it also prints the mean obtained after leaving
#' out reviews that contain "luggage" (any letter case) and have negative
#' sentiment, together with the difference between the two means.
#'
#' @param data Data frame with `Review` and `sentiment` columns.
#' @param airline_name Label used in the printed lines.
#' @return `NULL`, invisibly; called for its printed output.
calculate_sentiment_stats <- function(data, airline_name) {
  overall_mean <- mean(data$sentiment, na.rm = TRUE)
  cat("Current Mean Sentiment of", airline_name, ":", overall_mean, "\n")

  # The "what if" figures are reported for United Airlines only
  if (airline_name != "United Airlines") {
    return(invisible(NULL))
  }

  negative_luggage <- grepl("luggage", data$Review, ignore.case = TRUE) &
    data$sentiment < 0
  # which() discards rows where the mask is NA (missing sentiment)
  remaining_scores <- data$sentiment[which(!negative_luggage)]
  mean_without <- mean(remaining_scores, na.rm = TRUE)
  gain <- mean_without - overall_mean

  cat("Increase in Mean Sentiment in absence of reviews containing 'Luggage' having Negative sentiment:", gain, "\n")
  cat("Mean Sentiment in absence of Reviews containing 'Luggage' having negative sentiment:", mean_without, "\n")
}
# Print the mean sentiment of every airline for comparison; the United call
# additionally reports the mean without negative luggage reviews
calculate_sentiment_stats(
  data = reviews_united,
  airline_name = "United Airlines"
)
calculate_sentiment_stats(
  data = reviews_delta,
  airline_name = "Delta Airlines"
)
calculate_sentiment_stats(
  data = reviews_american,
  airline_name = "American Airlines"
)
calculate_sentiment_stats(
  data = reviews_southwest,
  airline_name = "Southwest Airlines"
)
# For Recommendations to United to resolve their Luggage issues based on Reviews:
# Filter the data for reviews mentioning Luggage and Sentiment < 0
filtered_data <- subset(
  reviews_united,
  grepl("luggage", reviews_united$Review, ignore.case = TRUE) &
    reviews_united$sentiment < 0
)
# Read custom stop words (one per line)
custom_stopwords <- readLines("custom_stopwords_luggage.txt")
# Combine with English stop words. Stored under its own name so the
# `all_stopwords` table that display_word_count() relies on is not overwritten.
luggage_stopwords <- c(stopwords("english"), custom_stopwords)
# Prepare the corpus: lower-case, strip punctuation, drop stop words
corpus <- Corpus(VectorSource(filtered_data$Review))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, luggage_stopwords)
# Create a term-document matrix (terms in rows), so the row sums below are
# the total count of each term across all reviews
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
word_freq <- sort(rowSums(m), decreasing = TRUE)
# Get the most frequent words causing Luggage handling issues
top_luggage_handling_issues <- head(word_freq, 10)
# Blank separator line before the table
cat("\n")
print(top_luggage_handling_issues)
# Create a pie chart showing top luggage handling issues
pie(
  top_luggage_handling_issues,
  labels = names(top_luggage_handling_issues),
  main = "Top 10 Words in Luggage Handling Issues",
  col = rainbow(length(top_luggage_handling_issues))
)
# ---- United Airlines: linear trend of average sentiment over time ----
# Relative file names below resolve against this folder.
# NOTE(review): machine-specific absolute path.
setwd("C:/Users/axays/Downloads/United_Airlines_Analysis")
library(syuzhet)
library(ggplot2)
library(dplyr)
library(lubridate)

# Load the dataset
# NOTE(review): this reuses the name `reviews`, replacing the table loaded
# from "Airline_Reviews.csv" earlier in the script.
reviews <- read.csv("United_All.csv")

# Calculate the sentiment
reviews$sentiment <- get_sentiment(reviews$Review, method = "syuzhet")

# Convert the Review Date column to Date type (day-month-year order)
reviews$Review.Date <- dmy(reviews$Review.Date)

# Aggregate sentiment scores by date
daily_sentiment <- reviews %>%
  group_by(Review.Date) %>%
  summarise(mean_sentiment = mean(sentiment, na.rm = TRUE))

# Plot only the linear trend line: no points layer is added, and se = FALSE
# hides the confidence band. The fit is on the per-date means.
ggplot(daily_sentiment, aes(x = Review.Date, y = mean_sentiment)) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "blue") +
  labs(
    title = "Average Sentiment Over Review Date",
    x = "Review Date",
    y = "Average Sentiment Score"
  ) +
  theme_minimal()