-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathtaskfile.yml
94 lines (83 loc) · 2.57 KB
/
taskfile.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# Taskfile schema version; quoted so it is read as a string, not a number.
version: '3'
# Task definitions, keyed by task name.
tasks:
index:
  desc: Creates the simsity/lunr indices.
  # The two lunr index tasks are declared as dependencies of this one.
  deps:
    - lunr-sentence
    - lunr-abstract
  # Input files tracked for this task.
  sources:
    - 'data/cleaned/*.jsonl'
  # One simsity index per granularity: sentence first, then abstract.
  cmds:
    - python -m frontpage index simsity sentence
    - python -m frontpage index simsity abstract
lunr-sentence:
  desc: Creates the sentence-level lunr index.
  # Input files tracked for this task.
  sources:
    - 'data/cleaned/*.jsonl'
  # Output file declared for this task.
  generates:
    - 'indices/lunr/sentence.json'
  cmds:
    - python -m frontpage index lunr sentence
lunr-abstract:
  desc: Creates the abstract-level lunr index.
  # Input files tracked for this task.
  sources:
    - 'data/cleaned/*.jsonl'
  # Output file declared for this task.
  generates:
    - 'indices/lunr/abstract.json'
  cmds:
    - python -m frontpage index lunr abstract
clean:
  desc: Cleans the downloaded data.
  # Raw download files tracked as inputs; no `generates` entry is declared.
  sources:
    - 'data/downloads/*.jsonl'
  # Calls DataStream().save_clean_download_stream() through an inline
  # Python one-liner.
  cmds:
    - python -c "from frontpage.datastream import DataStream; DataStream().save_clean_download_stream()"
build-learned-streams:
  desc: Generates files for the active-learn-y annotation recipes.
  # Tracked inputs: the cleaned data files plus the `training` path.
  sources:
    - 'data/cleaned/*.jsonl'
    - 'training'
  # Two inline Python one-liners: the active-learn stream is built first,
  # then the second-opinion stream.
  cmds:
    - python -c "from frontpage.datastream import DataStream; DataStream().build_active_learn_stream()"
    - python -c "from frontpage.datastream import DataStream; DataStream().build_second_opinion_stream()"
stats-out:
  desc: Runs `prodigy stats`, which could indicate that there are new annotations.
  # The shell redirect writes the stats output to /tmp/stats.jsonl.
  cmds:
    - python -m prodigy stats -nf > /tmp/stats.jsonl
  # Must be the same absolute path the command writes to; a relative
  # `tmp/stats.jsonl` would point at a file this task never produces.
  generates:
    - /tmp/stats.jsonl
prepare-annotations:
  desc: Export the annotations from Prodigy so ML can be trained on it.
  # The stats dump in /tmp is the tracked input for this task.
  sources:
    - '/tmp/stats.jsonl'
  # Output file declared for this task.
  generates:
    - 'data/annot/annotations.jsonl'
  cmds:
    - python -m frontpage annotprep
train:
  desc: Trains new classification heads on top of pretrained SBERT layer.
  # Tracked inputs: the exported annotations and the custom SBERT
  # embedding path under `training/`.
  sources:
    - 'data/annot/annotations.jsonl'
    - 'training/custom-sbert-emb'
  cmds:
    - python -m frontpage train
pretrain:
  desc: 'Trains new SBERT representations. Can be expensive. Will also upload to wandb.'
  # No sources/generates are declared for this task.
  # Step 1 pretrains; step 2 uploads the resulting artifact.
  cmds:
    - python -m frontpage pretrain
    - python -m frontpage artifact upload
prepare:
  desc: 'Runs all the steps required to update the streams for annotation.'
  # Pure orchestration: every entry calls another task from this file,
  # in the order listed.
  cmds:
    - task: clean
    - task: index
    - task: stats-out
    - task: prepare-annotations
    - task: train
    - task: build-learned-streams
build:
  desc: Construct a new frontpage. Always retrains heads just in case.
  # NOTE(review): `train` declares `sources`, so Task may skip it when those
  # inputs are unchanged -- confirm that "always retrains" actually holds.
  cmds:
    - task: clean
    # Artifact download is currently disabled.
    # - python -m frontpage artifact download
    - task: train
    - python -m frontpage build
    # Final step starts Python's built-in HTTP server; it is listed last
    # because nothing follows it.
    - python -m http.server