@@ -10,38 +10,21 @@ class Preprocess:
10
10
11
11
def __init__(self, df, column):
    """Collect the non-empty texts of ``column`` together with their row ids.

    Args:
        df: pandas DataFrame that holds the text column and, optionally,
            one of the known id columns. It is only read, never modified.
        column: name of the column in ``df`` that contains the text.

    Attributes set:
        id_column: name of the id column that was used, or ``'index'``
            when none of the known id columns is present in ``df``.
        id: ids (as strings) of the rows that were kept.
        sentences: texts (as strings) of the kept rows with every
            ``http...`` token removed.
    """
    # Known id columns, in order of precedence.
    potential_id_columns = (
        'id_str', 'id', 'comment_id', '_source.id_str', '_source.id',
    )
    self.id_column = next(
        (col for col in potential_id_columns if col in df.columns),
        'index',
    )

    # Rows whose text is the empty string are discarded first; rows with
    # missing values are discarded by dropna() below.
    non_empty = df[df[column] != '']

    if self.id_column == 'index':
        # Fall back to the index labels. They are read from the filtered
        # frame instead of being written into ``df`` as a new 'index'
        # column, so the caller's DataFrame is left untouched and a real
        # column that happens to be called 'index' is never overwritten.
        df_new = non_empty[[column]].dropna()
        self.id = df_new.index.astype('str').tolist()
    else:
        df_new = non_empty[[self.id_column, column]].dropna()
        self.id = df_new[self.id_column].astype('str').tolist()

    # Strip URLs; str() guards against non-string cell values.
    url_pattern = re.compile(r"http\S+")
    self.sentences = [
        url_pattern.sub("", str(tweet)) for tweet in df_new[column].tolist()
    ]
46
29
47
30
def get_phrases (self ):
0 commit comments