-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcleanup.py
73 lines (52 loc) · 1.76 KB
/
cleanup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
"""
Delete rows where the measurement is the same at the previous and next
timestamp. Assuming linearity, these can be removed without loss of
information.
Also prints some rough estimates for how much smaller the results are.
- bpm
- stress
- level
- spo2
---
Currently this operates on one file at a time, so here's a shell
script for running it on multiple files:
```
#!/usr/bin/env bash
for file in data/stress/*.csv; do
python cleanup.py -c stress $file
done
for file in data/heart_rate/*.csv; do
python cleanup.py -c bpm $file
done
for file in data/sleep/*.csv; do
python cleanup.py -c level $file
done
for file in data/blood_oxygenation/*.csv; do
python cleanup.py -c spo2 $file
done
```
"""
import argparse
import pandas as pd
import logging
import json
def compress(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Return *df* without rows whose *column* value equals both the
    previous and the next row's value.

    Assuming linear interpolation between samples, such interior rows
    carry no information and can be dropped losslessly. The first and
    last rows are always kept: the NaN produced by ``shift()`` never
    compares equal to a real value.
    """
    keep = (df[column].shift(1) != df[column]) | (df[column] != df[column].shift(-1))
    return df.loc[keep]


def main() -> None:
    """Parse CLI arguments, compress one CSV in place, print stats."""
    parser = argparse.ArgumentParser()
    # required=True: previously a missing -c crashed later with an
    # opaque KeyError(None) on the column lookup; fail fast instead.
    parser.add_argument("-c", "--column-name", type=str, required=True,
                        help="Name of the column (e.g. bpm, stress)")
    parser.add_argument("-d", "--dry-run", action="store_true",
                        help="Show stats, but don't overwrite")
    parser.add_argument("file", help="Input/Output file to compress")
    args = parser.parse_args()

    logging.basicConfig(filename='cleanup.log', level=logging.INFO,
                        format='%(message)s')

    df = pd.read_csv(args.file)
    n_rows_orig, _ = df.shape
    df = compress(df, args.column_name)
    n_rows_new, _ = df.shape

    orig_str = "{:,}".format(n_rows_orig).rjust(15)
    new_str = "{:,}".format(n_rows_new).rjust(15)
    print(f"# rows original: {orig_str}")
    print(f"# rows updated: {new_str}")
    # Guard against an empty input file (would otherwise be 0 / 0).
    if n_rows_new:
        print(" {0}x smaller".format(round(n_rows_orig / n_rows_new, 2)))

    if not args.dry_run:
        print("Writing to csv")
        # Log a machine-readable record of each compression run.
        logging.info(json.dumps([args.file, n_rows_orig, n_rows_new]))
        df.to_csv(args.file, index=False)


if __name__ == "__main__":
    main()