-
Notifications
You must be signed in to change notification settings - Fork 0
/
Academic Article Organizer and Analyzer.Rmd
140 lines (120 loc) · 5.81 KB
/
Academic Article Organizer and Analyzer.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
---
title: "Academic Article Organizer and Analyzer"
author: "Your Name"
date: "2023-01-04"
output: html_document
---
```{r setup, include=FALSE}
# Global chunk option: echo = TRUE shows each chunk's source code in the
# rendered document.
# The workspace is deliberately not cleared with rm(list = ls()): doing so
# deletes every object in the caller's environment when this chunk is run
# interactively or rendered into an existing session. Render from a fresh R
# session when a clean state is required.
knitr::opts_chunk$set(echo = TRUE)
```
```{r Load all packages}
# Loading necessary libraries
library(RPostgreSQL)
library(dplyr)
library(dbplyr)
library(data.table)
library(lubridate)
library(reshape2)
library(stringr)
library(tidyverse)
library(readxl)
library(knitr)
library(ggplot2)
library(udpipe)
```
```{r Get a list of existing articles in PDF format}
# Root directory that holds every article folder used in this document.
base_path <- "path/to/your/articles/directory"

# Every entry in the systematic-review folder. all.files = TRUE also returns
# hidden entries and no pattern is applied, so the listing is not limited to
# PDF files.
file_list <- as.data.frame(list.files(
  file.path(base_path, "Deep Learning-Based Recommendation System Systematic Review and Classification"),
  all.files = TRUE,
  full.names = FALSE
))
colnames(file_list) <- c("article_title")

# Full paths of the PDFs found anywhere below the classification folder.
# The dot is escaped so that only a literal ".pdf" suffix matches.
article_list <- as.data.frame(list.files(
  file.path(base_path, "Article Classification"),
  pattern = "\\.pdf$",
  full.names = TRUE,
  recursive = TRUE,
  include.dirs = TRUE
))

# Folder whose files are renamed below.
thesis_path <- file.path(base_path, "Reviewed Articles/Thesis")

# Current file names in that folder, without directory and housekeeping
# entries.
old_names <- as.data.frame(list.files(
  thesis_path,
  all.files = TRUE,
  full.names = FALSE
))
colnames(old_names) <- c("article_title")
old_names <- old_names %>%
  filter(!article_title %in% c(".", "..", ".DS_Store"))
old.names <- old_names$article_title

# Build the cleaned title for each file. Punctuation (including the dot of
# the extension) and digits become spaces, so the extension survives as a
# trailing " Pdf" word that is stripped again before ".pdf" is re-appended.
# tolower() is applied to the vector directly so the values stay a character
# vector rather than becoming a list.
cleaned_titles <- gsub("[[:punct:]]", " ", old.names)
cleaned_titles <- gsub("[[:digit:]]+", " ", cleaned_titles)
cleaned_titles <- trimws(tolower(cleaned_titles))
cleaned_titles <- gsub("\\s+", " ", cleaned_titles)
cleaned_titles <- str_to_title(cleaned_titles)
cleaned_titles <- gsub("E Commerce", "eCommerce", cleaned_titles)
cleaned_titles <- gsub(" Pdf", "", cleaned_titles)

# paste0() treats a zero-length vector as "", which would produce the single
# name ".pdf" for an empty folder; keep the result empty in that case.
new_names <- if (length(cleaned_titles) > 0) {
  paste0(cleaned_titles, ".pdf")
} else {
  character(0)
}

# Stripping digits and punctuation can map two files onto the same name, and
# renaming onto an existing file may replace it. Stop before touching
# anything.
if (anyDuplicated(new_names) > 0) {
  stop(
    "Cleaned file names are not unique; aborting rename: ",
    paste(unique(new_names[duplicated(new_names)]), collapse = ", "),
    call. = FALSE
  )
}

# Rename in place. file.path() inserts the separator between directory and
# file name; the result is one TRUE/FALSE per file.
file.rename(
  from = file.path(thesis_path, old.names),
  to = file.path(thesis_path, new_names)
)
```
```{r Rename the pre-processing files}
# Rename the files in the pre-processing folder to cleaned, sentence-case
# titles. Relies on base_path being defined earlier in the document.
preprocess_path <- file.path(base_path, "pre-processing/round 2")

# Current file names (without directory).
old_names <- as.data.frame(list.files(preprocess_path, full.names = FALSE))
colnames(old_names) <- c("article_title")
old.names <- old_names$article_title

# Build the cleaned title for each file. Everything from the first
# underscore onwards is dropped; punctuation and digits become spaces, so a
# surviving extension shows up as a trailing " pdf" word that is stripped
# before ".pdf" is re-appended. tolower() is applied to the vector directly
# so the values stay a character vector rather than becoming a list.
cleaned_titles <- gsub("_.*", "", old.names)
cleaned_titles <- gsub("[[:punct:]]", " ", cleaned_titles)
cleaned_titles <- gsub("[[:digit:]]+", " ", cleaned_titles)
cleaned_titles <- trimws(tolower(cleaned_titles))
cleaned_titles <- gsub("\\s+", " ", cleaned_titles)
cleaned_titles <- gsub(" pdf", "", cleaned_titles)
cleaned_titles <- str_to_sentence(cleaned_titles)
# Restore the hyphen that the punctuation step removed from "e-commerce".
cleaned_titles <- gsub("e commerce|ecommerce", "e-commerce", cleaned_titles)
cleaned_titles <- gsub("E commerce|Ecommerce", "E-commerce", cleaned_titles)

# paste0() treats a zero-length vector as "", which would produce the single
# name ".pdf" for an empty folder; keep the result empty in that case.
new_names <- if (length(cleaned_titles) > 0) {
  paste0(cleaned_titles, ".pdf")
} else {
  character(0)
}

# Truncating at "_" and stripping digits can map two files onto the same
# name, and renaming onto an existing file may replace it. Stop before
# touching anything.
if (anyDuplicated(new_names) > 0) {
  stop(
    "Cleaned file names are not unique; aborting rename: ",
    paste(unique(new_names[duplicated(new_names)]), collapse = ", "),
    call. = FALSE
  )
}

# Rename in place. file.path() inserts the separator between directory and
# file name; the result is one TRUE/FALSE per file.
file.rename(
  from = file.path(preprocess_path, old.names),
  to = file.path(preprocess_path, new_names)
)
```
```{r Mendeley citation analysis}
# Load the BibTeX export and keep the fields used in the analysis.
library(bib2df)
citation_path <- "path/to/your/citation/file"
thesis_articles <- as_tibble(bib2df(citation_path))
thesis_articles <- thesis_articles %>%
  select(
    CATEGORY, BIBTEXKEY, AUTHOR, JOURNAL, PUBLISHER, TITLE, VOLUME, YEAR,
    ABSTRACT, DOI, ISSN, KEYWORDS, URL, ISBN
  )

# Abstract table: title, key and abstract, with the row number stored in a
# doc_id column.
abstract <- thesis_articles %>% select(TITLE, BIBTEXKEY, ABSTRACT)
colnames(abstract) <- c("title", "bibtexkey", "abstract")
abstract <- data.frame(
  tibble::rownames_to_column(abstract, "doc_id"),
  stringsAsFactors = FALSE
)
# NOTE(review): unnest_sentences_() is not provided by any package attached
# in the visible chunks; it looks like the sentence splitter from lexRankr.
# Confirm and attach that package before knitting.
abstract_f <- unnest_sentences_(abstract, "sents", "abstract")

# Tidy the sentence table: trim whitespace, blank out missing values, and
# normalise the capitalisation of sentences and titles.
abstract_f$sents <- trimws(abstract_f$sents)
abstract_f[is.na(abstract_f)] <- ""
abstract_f$sents <- str_to_sentence(abstract_f$sents)
abstract_f$title <- str_to_title(abstract_f$title)

# Convert one column to a plain character vector. A list column (presumably
# AUTHOR, which bib2df appears to return as one character vector per entry;
# confirm against its documentation) is collapsed to a single "; "-separated
# string per row, and empty or all-missing entries stay NA. This replaces
# as.character() followed by gsub("c()", ""), whose pattern is a regular
# expression with an empty group and therefore deleted every letter "c"
# from the author names.
flatten_column <- function(column) {
  if (!is.list(column)) {
    return(as.character(column))
  }
  vapply(
    column,
    function(entry) {
      if (length(entry) == 0 || all(is.na(entry))) {
        NA_character_
      } else {
        paste(entry, collapse = "; ")
      }
    },
    character(1),
    USE.NAMES = FALSE
  )
}

# Flatten every column to text and normalise capitalisation for the export.
thesis_articles <- as.data.frame(
  lapply(thesis_articles, flatten_column),
  stringsAsFactors = FALSE
)
thesis_articles$TITLE <- str_to_title(thesis_articles$TITLE)
thesis_articles$ABSTRACT <- str_to_sentence(thesis_articles$ABSTRACT)
thesis_articles$KEYWORDS <- trimws(tolower(thesis_articles$KEYWORDS))

# Write both tables as separate sheets of one workbook.
# NOTE(review): write.xlsx() with sheetName/append/row.names matches the
# signature of the xlsx package, which is not attached in the visible
# chunks (readxl only reads). Confirm and attach it before knitting.
output_path <- "path/to/your/output/directory"
workbook_file <- file.path(output_path, "thesis_article_analysis_v4.xlsx")
write.xlsx(
  abstract_f, workbook_file,
  sheetName = "abstract_sentence_classification",
  append = TRUE, row.names = FALSE
)
write.xlsx(
  thesis_articles, workbook_file,
  sheetName = "thesis_articles",
  append = TRUE, row.names = FALSE
)
```