From 0eda1fafd2432712863172a1cfebdbd37bfb7f4d Mon Sep 17 00:00:00 2001 From: Chen Wang Date: Thu, 22 Aug 2024 17:03:05 -0500 Subject: [PATCH] fix id field and update changelog (#125) --- .../smile/preprocessing/CHANGELOG.md | 4 ++ .../smile/preprocessing/preprocessing.py | 47 ++++++------------- 2 files changed, 19 insertions(+), 32 deletions(-) diff --git a/containerized_analytics/smile/preprocessing/CHANGELOG.md b/containerized_analytics/smile/preprocessing/CHANGELOG.md index 7d167e72..aedf8dc9 100644 --- a/containerized_analytics/smile/preprocessing/CHANGELOG.md +++ b/containerized_analytics/smile/preprocessing/CHANGELOG.md @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.1.6] - 08-22-2024 + +### Fixed +- ID field not extracted correctly from the data source [#121](https://github.com/ncsa/standalone-smm-analytics/issues/121) ## [0.1.5] - 01-23-2024 diff --git a/containerized_analytics/smile/preprocessing/preprocessing.py b/containerized_analytics/smile/preprocessing/preprocessing.py index 6e520cd7..cc98a1e1 100644 --- a/containerized_analytics/smile/preprocessing/preprocessing.py +++ b/containerized_analytics/smile/preprocessing/preprocessing.py @@ -10,38 +10,21 @@ class Preprocess: def __init__(self, df, column): - self.id_column = "id" - if 'id_str' in df.columns: - self.id_column = 'id_str' - df_new = df[df[column] != ''][[self.id_column, column]].dropna() - sentences = df_new[column].astype('str').tolist() - self.id = df_new[self.id_column].astype('str').tolist() - elif 'id' in df.columns: - self.id_column = 'id' - df_new = df[df[column] != ''][[self.id_column, column]].dropna() - sentences = df_new[column].astype('str').tolist() - self.id = df_new[self.id_column].astype('str').tolist() - elif 'comment_id' in df.columns: - self.id_column = 'comment_id' - df_new = df[df[column] != ''][[self.id_column, column]].dropna() - sentences = df_new[column].astype('str').tolist() - self.id = df_new[self.id_column].astype('str').tolist() - elif '_source.id_str': - self.id_column = '_source.id_str' - df_new = df[df[column] != ''][[self.id_column, column]].dropna() - sentences = df_new[column].astype('str').tolist() - self.id = df_new[self.id_column].astype('str').tolist() - elif '_source.id': - self.id_column = '_source.id_str' - df_new = df[df[column] != ''][[self.id_column, column]].dropna() - sentences = df_new[column].astype('str').tolist() - self.id = df_new[self.id_column].astype('str').tolist() - else: - sentences = df[df[column] != ''][column].dropna().astype( - 'str').tolist() - self.id = [] - - sentences = [re.sub(r"http\S+", "", tweet) for tweet in sentences] + # Define potential id columns in order of precedence + potential_id_columns = ['id_str', 'id', 'comment_id', '_source.id_str', '_source.id'] + + # Find the first available id column from the potential list + self.id_column = next((col for col in potential_id_columns if col in df.columns), 'index') + + # If using index as the id_column, create a new column based on the index + if self.id_column == 'index': + df[self.id_column] = df.index.astype('str') + + # Filter the dataframe based on the column condition + df_new = df[df[column] != ''][[self.id_column, column]].dropna() + sentences = [re.sub(r"http\S+", "", str(tweet)) for tweet in df_new[column].tolist()] + + self.id = df_new[self.id_column].astype('str').tolist() self.sentences = sentences def get_phrases(self):