From 0eda1fafd2432712863172a1cfebdbd37bfb7f4d Mon Sep 17 00:00:00 2001
From: Chen Wang <cwang138@illinois.edu>
Date: Thu, 22 Aug 2024 17:03:05 -0500
Subject: [PATCH] fix id field and update changelog (#125)

---
 .../smile/preprocessing/CHANGELOG.md          |  4 ++
 .../smile/preprocessing/preprocessing.py      | 47 ++++++-------------
 2 files changed, 19 insertions(+), 32 deletions(-)

diff --git a/containerized_analytics/smile/preprocessing/CHANGELOG.md b/containerized_analytics/smile/preprocessing/CHANGELOG.md
index 7d167e72..aedf8dc9 100644
--- a/containerized_analytics/smile/preprocessing/CHANGELOG.md
+++ b/containerized_analytics/smile/preprocessing/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.1.6] - 08-22-2024
+
+### Fixed
+- ID field not extracted correctly from the data source: `_source.id` was never selected, and data with no known ID column now falls back to the row index [#121](https://github.com/ncsa/standalone-smm-analytics/issues/121)
 
 ## [0.1.5] - 01-23-2024
 
diff --git a/containerized_analytics/smile/preprocessing/preprocessing.py b/containerized_analytics/smile/preprocessing/preprocessing.py
index 6e520cd7..cc98a1e1 100644
--- a/containerized_analytics/smile/preprocessing/preprocessing.py
+++ b/containerized_analytics/smile/preprocessing/preprocessing.py
@@ -10,38 +10,21 @@ class Preprocess:
 
     def __init__(self, df, column):
 
-        self.id_column = "id"
-        if 'id_str' in df.columns:
-            self.id_column = 'id_str'
-            df_new = df[df[column] != ''][[self.id_column, column]].dropna()
-            sentences = df_new[column].astype('str').tolist()
-            self.id = df_new[self.id_column].astype('str').tolist()
-        elif 'id' in df.columns:
-            self.id_column = 'id'
-            df_new = df[df[column] != ''][[self.id_column, column]].dropna()
-            sentences = df_new[column].astype('str').tolist()
-            self.id = df_new[self.id_column].astype('str').tolist()
-        elif 'comment_id' in df.columns:
-            self.id_column = 'comment_id'
-            df_new = df[df[column] != ''][[self.id_column, column]].dropna()
-            sentences = df_new[column].astype('str').tolist()
-            self.id = df_new[self.id_column].astype('str').tolist()
-        elif '_source.id_str':
-            self.id_column = '_source.id_str'
-            df_new = df[df[column] != ''][[self.id_column, column]].dropna()
-            sentences = df_new[column].astype('str').tolist()
-            self.id = df_new[self.id_column].astype('str').tolist()
-        elif '_source.id':
-            self.id_column = '_source.id_str'
-            df_new = df[df[column] != ''][[self.id_column, column]].dropna()
-            sentences = df_new[column].astype('str').tolist()
-            self.id = df_new[self.id_column].astype('str').tolist()
-        else:
-            sentences = df[df[column] != ''][column].dropna().astype(
-                'str').tolist()
-            self.id = []
-
-        sentences = [re.sub(r"http\S+", "", tweet) for tweet in sentences]
+        # Define potential id columns in order of precedence
+        potential_id_columns = ['id_str', 'id', 'comment_id', '_source.id_str', '_source.id']
+
+        # Find the first available id column from the potential list
+        self.id_column = next((col for col in potential_id_columns if col in df.columns), 'index')
+
+        # No known id column found: fall back to the row index (note: this adds an 'index' column to the df passed in)
+        if self.id_column == 'index':
+            df[self.id_column] = df.index.astype('str')
+
+        # Filter the dataframe based on the column condition
+        df_new = df[df[column] != ''][[self.id_column, column]].dropna()
+        sentences = [re.sub(r"http\S+", "", str(tweet)) for tweet in df_new[column].tolist()]
+
+        self.id = df_new[self.id_column].astype('str').tolist()
         self.sentences = sentences
 
     def get_phrases(self):