Merge pull request #46 from fact-project/fix_nans

Do not add runs with nans to the erna database
fact-project · Oct 23, 2017 · 234da34 · 234da34
2 parents 7155096 + ccff5cc
commit 234da34
Show file tree

Hide file tree

Showing 2 changed files with 25 additions and 5 deletions.
diff --git a/erna/automatic_processing/database_utils.py b/erna/automatic_processing/database_utils.py
@@ -3,7 +3,7 @@
 import os
 from tqdm import tqdm
 
-from .database import(
+from .database import (
     RawDataFile, DrsFile, Job,
     ProcessingState, Jar, XML,
     requires_database_connection
@@ -34,7 +34,6 @@ def fill_data_runs(df, database):
         },
         inplace=True,
     )
-    df.drop('fDrsStep', axis=1, inplace=True)
     with database.atomic():
         query = (
             RawDataFile
@@ -182,11 +181,18 @@ def insert_new_job(
 @requires_database_connection
 def insert_new_jobs(raw_data_files, jar, xml, queue, progress=True, **kwargs):
 
+    failed_files = []
     for f in tqdm(raw_data_files, total=raw_data_files.count(), disable=not progress):
         try:
             insert_new_job(f, jar=jar, xml=xml, queue=queue, **kwargs)
         except peewee.IntegrityError:
-            log.warning('Job already submitted: {} {}'.format(f.night, f.run_id))
+            log.warning('Job already submitted: {}_{:03d}'.format(f.night, f.run_id))
+        except ValueError as e:
+            log.warning('Could not submit {}_{:03d}: {}'.format(
+                f.night, f.run_id, e,
+            ))
+            failed_files.append(f)
+    return failed_files
 
 
 @requires_database_connection

diff --git a/erna/scripts/fill_database.py b/erna/scripts/fill_database.py
@@ -56,9 +56,23 @@ def main(start, end, config):
     runs['fNight'] = pd.to_datetime(runs.fNight.astype(str), format='%Y%m%d')
 
     # fill all non drs runs into raw_data_files
-    fill_data_runs(runs.query('fDrsStep != 2'), database=database)
+    data_runs = runs.query('fDrsStep != 2').drop('fDrsStep', axis=1)
+    nan_entries = data_runs.isnull().any(axis=1)
+    if len(data_runs[nan_entries]) != 0:
+        print('Found invalid entries, skipping:')
+        print(data_runs[nan_entries])
+        data_runs.dropna(inplace=True)
+
+    fill_data_runs(data_runs, database=database)
+
     # fill all drs runs into drs_files
-    fill_drs_runs(runs.query('(fRunTypeKey == 2) & (fDrsStep == 2)'), database=database)
+    drs_runs = runs.query('(fRunTypeKey == 2) & (fDrsStep == 2)')
+    nan_entries = drs_runs.isnull().any(axis=1)
+    if len(drs_runs[nan_entries]) != 0:
+        print('Found invalid entries, skipping:')
+        print(drs_runs[nan_entries])
+        drs_runs.dropna(inplace=True)
+    fill_drs_runs(drs_runs, database=database)
 
     database.close()