Skip to content

Commit 27a8846

Browse files
committed
fix id field and update changelog
1 parent 8dc9137 commit 27a8846

File tree

2 files changed

+19
-32
lines changed

2 files changed

+19
-32
lines changed

containerized_analytics/smile/preprocessing/CHANGELOG.md

+4
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
44
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
55
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
66

7+
## [0.1.6] - 08-22-2024
8+
9+
### Fixed
10+
- ID field not extracted correctly from the data source [#121](https://github.com/ncsa/standalone-smm-analytics/issues/121)
711

812
## [0.1.5] - 01-23-2024
913

containerized_analytics/smile/preprocessing/preprocessing.py

+15-32
Original file line number | Diff line number | Diff line change
@@ -10,38 +10,21 @@ class Preprocess:
1010

1111
def __init__(self, df, column):
1212

13-
self.id_column = "id"
14-
if 'id_str' in df.columns:
15-
self.id_column = 'id_str'
16-
df_new = df[df[column] != ''][[self.id_column, column]].dropna()
17-
sentences = df_new[column].astype('str').tolist()
18-
self.id = df_new[self.id_column].astype('str').tolist()
19-
elif 'id' in df.columns:
20-
self.id_column = 'id'
21-
df_new = df[df[column] != ''][[self.id_column, column]].dropna()
22-
sentences = df_new[column].astype('str').tolist()
23-
self.id = df_new[self.id_column].astype('str').tolist()
24-
elif 'comment_id' in df.columns:
25-
self.id_column = 'comment_id'
26-
df_new = df[df[column] != ''][[self.id_column, column]].dropna()
27-
sentences = df_new[column].astype('str').tolist()
28-
self.id = df_new[self.id_column].astype('str').tolist()
29-
elif '_source.id_str':
30-
self.id_column = '_source.id_str'
31-
df_new = df[df[column] != ''][[self.id_column, column]].dropna()
32-
sentences = df_new[column].astype('str').tolist()
33-
self.id = df_new[self.id_column].astype('str').tolist()
34-
elif '_source.id':
35-
self.id_column = '_source.id_str'
36-
df_new = df[df[column] != ''][[self.id_column, column]].dropna()
37-
sentences = df_new[column].astype('str').tolist()
38-
self.id = df_new[self.id_column].astype('str').tolist()
39-
else:
40-
sentences = df[df[column] != ''][column].dropna().astype(
41-
'str').tolist()
42-
self.id = []
43-
44-
sentences = [re.sub(r"http\S+", "", tweet) for tweet in sentences]
13+
# Define potential id columns in order of precedence
14+
potential_id_columns = ['id_str', 'id', 'comment_id', '_source.id_str', '_source.id']
15+
16+
# Find the first available id column from the potential list
17+
self.id_column = next((col for col in potential_id_columns if col in df.columns), 'index')
18+
19+
# If using index as the id_column, create a new column based on the index
20+
if self.id_column == 'index':
21+
df[self.id_column] = df.index.astype('str')
22+
23+
# Filter the dataframe based on the column condition
24+
df_new = df[df[column] != ''][[self.id_column, column]].dropna()
25+
sentences = [re.sub(r"http\S+", "", str(tweet)) for tweet in df_new[column].tolist()]
26+
27+
self.id = df_new[self.id_column].astype('str').tolist()
4528
self.sentences = sentences
4629

4730
def get_phrases(self):

0 commit comments

Comments
 (0)