fix id field and update changelog

longshuicy · longshuicy · commit 27a8846805af · 2024-08-22T13:42:40.000-05:00
diff --git a/containerized_analytics/smile/preprocessing/CHANGELOG.md b/containerized_analytics/smile/preprocessing/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.1.6] - 08-22-2024
+
+### Fixed
+- ID field not extracted correctly from the data source [#121](https://github.com/ncsa/standalone-smm-analytics/issues/121)
 
 ## [0.1.5] - 01-23-2024
 
diff --git a/containerized_analytics/smile/preprocessing/preprocessing.py b/containerized_analytics/smile/preprocessing/preprocessing.py
@@ -10,38 +10,21 @@ class Preprocess:
 
     def __init__(self, df, column):
 
-        self.id_column = "id"
-        if 'id_str' in df.columns:
-            self.id_column = 'id_str'
-            df_new = df[df[column] != ''][[self.id_column, column]].dropna()
-            sentences = df_new[column].astype('str').tolist()
-            self.id = df_new[self.id_column].astype('str').tolist()
-        elif 'id' in df.columns:
-            self.id_column = 'id'
-            df_new = df[df[column] != ''][[self.id_column, column]].dropna()
-            sentences = df_new[column].astype('str').tolist()
-            self.id = df_new[self.id_column].astype('str').tolist()
-        elif 'comment_id' in df.columns:
-            self.id_column = 'comment_id'
-            df_new = df[df[column] != ''][[self.id_column, column]].dropna()
-            sentences = df_new[column].astype('str').tolist()
-            self.id = df_new[self.id_column].astype('str').tolist()
-        elif '_source.id_str':
-            self.id_column = '_source.id_str'
-            df_new = df[df[column] != ''][[self.id_column, column]].dropna()
-            sentences = df_new[column].astype('str').tolist()
-            self.id = df_new[self.id_column].astype('str').tolist()
-        elif '_source.id':
-            self.id_column = '_source.id_str'
-            df_new = df[df[column] != ''][[self.id_column, column]].dropna()
-            sentences = df_new[column].astype('str').tolist()
-            self.id = df_new[self.id_column].astype('str').tolist()
-        else:
-            sentences = df[df[column] != ''][column].dropna().astype(
-                'str').tolist()
-            self.id = []
-
-        sentences = [re.sub(r"http\S+", "", tweet) for tweet in sentences]
+        # Define potential id columns in order of precedence
+        potential_id_columns = ['id_str', 'id', 'comment_id', '_source.id_str', '_source.id']
+
+        # Find the first available id column from the potential list
+        self.id_column = next((col for col in potential_id_columns if col in df.columns), 'index')
+
+        # If using index as the id_column, create a new column based on the index
+        if self.id_column == 'index':
+            df[self.id_column] = df.index.astype('str')
+
+        # Keep rows with non-empty text, then drop rows whose id or text is missing (NaN)
+        df_new = df[df[column] != ''][[self.id_column, column]].dropna()
+        sentences = [re.sub(r"http\S+", "", str(tweet)) for tweet in df_new[column].tolist()]
+
+        self.id = df_new[self.id_column].astype('str').tolist()
         self.sentences = sentences
 
     def get_phrases(self):