diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 00000000..528f30c7 --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 00000000..41833397 --- /dev/null +++ b/.dvc/config @@ -0,0 +1,5 @@ +[core] + analytics = false + remote = remote_storage +['remote "remote_storage"'] + url = /home/leo/estudodvc/dvc_remote diff --git a/.dvc/plots/confusion.json b/.dvc/plots/confusion.json new file mode 100644 index 00000000..af1b48d0 --- /dev/null +++ b/.dvc/plots/confusion.json @@ -0,0 +1,107 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "facet": { + "field": "rev", + "type": "nominal" + }, + "spec": { + "transform": [ + { + "aggregate": [ + { + "op": "count", + "as": "xy_count" + } + ], + "groupby": [ + "", + "" + ] + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "joinaggregate": [ + { + "op": "max", + "field": "xy_count", + "as": "max_count" + } + ], + "groupby": [] + }, + { + "calculate": "datum.xy_count / datum.max_count", + "as": "percent_of_max" + } + ], + "encoding": { + "x": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + }, + "y": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + } + }, + "layer": [ + { + "mark": "rect", + "width": 300, + "height": 300, + "encoding": { + "color": { + "field": "xy_count", + "type": "quantitative", + "title": "", + "scale": { + "domainMin": 0, + "nice": true + } + } + } + }, + { + "mark": "text", + "encoding": { + "text": { + "field": "xy_count", + "type": "quantitative" + }, + "color": { + "condition": { + "test": "datum.percent_of_max > 0.5", + "value": "white" + }, + "value": "black" + } + } + } + ] + } +} diff --git a/.dvc/plots/confusion_normalized.json b/.dvc/plots/confusion_normalized.json new file mode 100644 index 00000000..1d38849f --- /dev/null +++ b/.dvc/plots/confusion_normalized.json @@ -0,0 +1,112 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "facet": { + "field": "rev", + "type": "nominal" + }, + "spec": { + "transform": [ + { + "aggregate": [ + { + "op": "count", + "as": "xy_count" + } + ], + "groupby": [ + "", + "" + ] + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "joinaggregate": [ + { + "op": "sum", + "field": "xy_count", + "as": "sum_y" + } + ], + "groupby": [ + "" + ] + }, + { + "calculate": "datum.xy_count / datum.sum_y", + "as": "percent_of_y" + } + ], + "encoding": { + "x": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + }, + "y": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + } + }, + "layer": [ + { + "mark": "rect", + "width": 300, + "height": 300, + "encoding": { + "color": { + "field": "percent_of_y", + "type": "quantitative", + "title": "", + "scale": { + "domain": [ + 0, + 1 + ] + } + } + } + }, + { + "mark": "text", + "encoding": { + "text": { + "field": "percent_of_y", + "type": "quantitative", + "format": ".2f" + }, + "color": { + "condition": { + "test": "datum.percent_of_y > 0.5", + "value": "white" + }, + "value": "black" + } + } + } + ] + } +} diff --git a/.dvc/plots/default.json b/.dvc/plots/default.json new file mode 100644 index 00000000..9cf71ce0 --- /dev/null +++ b/.dvc/plots/default.json @@ -0,0 +1,31 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "width": 300, + "height": 300, + "mark": { + "type": "line" + }, + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + } +} diff --git a/.dvc/plots/linear.json b/.dvc/plots/linear.json new file mode 100644 index 00000000..65549f9e --- /dev/null +++ b/.dvc/plots/linear.json @@ -0,0 +1,116 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "width": 300, + "height": 300, + "layer": [ + { + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + }, + "layer": [ + { + "mark": "line" + }, + { + "selection": { + "label": { + "type": "single", + "nearest": true, + "on": "mouseover", + "encodings": [ + "x" + ], + "empty": "none", + "clear": "mouseout" + } + }, + "mark": "point", + "encoding": { + "opacity": { + "condition": { + "selection": "label", + "value": 1 + }, + "value": 0 + } + } + } + ] + }, + { + "transform": [ + { + "filter": { + "selection": "label" + } + } + ], + "layer": [ + { + "mark": { + "type": "rule", + "color": "gray" + }, + "encoding": { + "x": { + "field": "", + "type": "quantitative" + } + } + }, + { + "encoding": { + "text": { + "type": "quantitative", + "field": "" + }, + "x": { + "field": "", + "type": "quantitative" + }, + "y": { + "field": "", + "type": "quantitative" + } + }, + "layer": [ + { + "mark": { + "type": "text", + "align": "left", + "dx": 5, + "dy": -5 + }, + "encoding": { + "color": { + "type": "nominal", + "field": "rev" + } + } + } + ] + } + ] + } + ] +} diff --git a/.dvc/plots/scatter.json b/.dvc/plots/scatter.json new file mode 100644 index 00000000..9af9304c --- /dev/null +++ b/.dvc/plots/scatter.json @@ -0,0 +1,104 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "width": 300, + "height": 300, + "layer": [ + { + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + }, + "layer": [ + { + "mark": "point" + }, + { + "selection": { + "label": { + "type": "single", + "nearest": true, + "on": "mouseover", + "encodings": [ + "x" + ], + "empty": "none", + "clear": "mouseout" + } + }, + "mark": "point", + "encoding": { + "opacity": { + "condition": { + "selection": "label", + "value": 1 + }, + "value": 0 + } + } + } + ] + }, + { + "transform": [ + { + "filter": { + "selection": "label" + } + } + ], + "layer": [ + { + "encoding": { + "text": { + "type": "quantitative", + "field": "" + }, + "x": { + "field": "", + "type": "quantitative" + }, + "y": { + "field": "", + "type": "quantitative" + } + }, + "layer": [ + { + "mark": { + "type": "text", + "align": "left", + "dx": 5, + "dy": -5 + }, + "encoding": { + "color": { + "type": "nominal", + "field": "rev" + } + } + } + ] + } + ] + } + ] +} diff --git a/.dvc/plots/smooth.json b/.dvc/plots/smooth.json new file mode 100644 index 00000000..d497ce75 --- /dev/null +++ b/.dvc/plots/smooth.json @@ -0,0 +1,39 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "mark": { + "type": "line" + }, + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + }, + "transform": [ + { + "loess": "", + "on": "", + "groupby": [ + "rev" + ], + "bandwidth": 0.3 + } + ] +} diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 00000000..51973055 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/data/prepared/.gitignore b/data/prepared/.gitignore index e69de29b..22a65dd9 100644 --- a/data/prepared/.gitignore +++ b/data/prepared/.gitignore @@ -0,0 +1,2 @@ +/train.csv +/test.csv diff --git a/data/prepared/test.csv.dvc b/data/prepared/test.csv.dvc new file mode 100644 index 00000000..1cf9fbc4 --- /dev/null +++ b/data/prepared/test.csv.dvc @@ -0,0 +1,4 @@ +outs: +- md5: 97295a44585b235c1a4a4620b9c303c1 + size: 77803 + path: test.csv diff --git a/data/prepared/train.csv.dvc b/data/prepared/train.csv.dvc new file mode 100644 index 00000000..eaa770b9 --- /dev/null +++ b/data/prepared/train.csv.dvc @@ -0,0 +1,4 @@ +outs: +- md5: d9ffb53c26a0a73b17a45bd0f21d4677 + size: 193348 + path: train.csv diff --git a/data/raw/.gitignore b/data/raw/.gitignore index e69de29b..a5d9d98f 100644 --- a/data/raw/.gitignore +++ b/data/raw/.gitignore @@ -0,0 +1,2 @@ +/train +/val diff --git a/data/raw/train.dvc b/data/raw/train.dvc new file mode 100644 index 00000000..f8e77548 --- /dev/null +++ b/data/raw/train.dvc @@ -0,0 +1,5 @@ +outs: +- md5: b673168a4895bd24aef44f431ecb39a4.dir + size: 75302934 + nfiles: 9469 + path: train diff --git a/data/raw/val.dvc b/data/raw/val.dvc new file mode 100644 index 00000000..237148f4 --- /dev/null +++ b/data/raw/val.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 0ad4dcf197b452735726bf8d8777201d.dir + size: 31248080 + nfiles: 3925 + path: val diff --git a/metrics/accuracy.json b/metrics/accuracy.json new file mode 100644 index 00000000..7e85b934 --- /dev/null +++ b/metrics/accuracy.json @@ -0,0 +1 @@ +{"accuracy": 0.7870722433460076} \ No newline at end of file diff --git a/model/.gitignore b/model/.gitignore index e69de29b..565a9d50 100644 --- a/model/.gitignore +++ b/model/.gitignore @@ -0,0 +1 @@ +/model.joblib diff --git a/model/model.joblib.dvc b/model/model.joblib.dvc new file mode 100644 index 00000000..33b0cf4c --- /dev/null +++ b/model/model.joblib.dvc @@ -0,0 +1,4 @@ +outs: +- md5: 2011bd3a9f08c3535d0396eb7e5aa0b7 + size: 241086 + path: model.joblib diff --git a/src/train.py b/src/train.py index e5feeda6..5d3c7c2c 100644 --- a/src/train.py +++ b/src/train.py @@ -37,7 +37,7 @@ def load_data(data_path): def main(repo_path): train_csv_path = repo_path / "data/prepared/train.csv" train_data, labels = load_data(train_csv_path) - sgd = SGDClassifier(max_iter=10) + sgd = SGDClassifier(max_iter=100) trained_model = sgd.fit(train_data, labels) dump(trained_model, repo_path / "model/model.joblib")