# taskfile.yml

version: '3'

tasks:
  # Aggregate indexing task: the two lunr indices are delegated to the
  # dependency tasks, while the simsity indices are built by the commands
  # listed here. Fingerprinted on the cleaned JSONL files.
  index:
    desc: Creates the simsity/lunr indices.
    deps:
      - lunr-sentence
      - lunr-abstract
    sources:
      - data/cleaned/*.jsonl
    cmds:
      - python -m frontpage index simsity sentence
      - python -m frontpage index simsity abstract

  # Builds the lunr index at sentence granularity from the cleaned data.
  # Declares both its inputs and its output file for up-to-date checks.
  lunr-sentence:
    desc: Creates the sentence-level lunr index.
    sources:
      - data/cleaned/*.jsonl
    generates:
      - indices/lunr/sentence.json
    cmds:
      - python -m frontpage index lunr sentence
  
  # Builds the lunr index at abstract granularity from the cleaned data.
  # Declares both its inputs and its output file for up-to-date checks.
  lunr-abstract:
    desc: Creates the abstract-level lunr index.
    sources:
      - data/cleaned/*.jsonl
    generates:
      - indices/lunr/abstract.json
    cmds:
      - python -m frontpage index lunr abstract
  
  # Runs DataStream.save_clean_download_stream() via an inline Python call.
  # Fingerprinted on the raw downloaded JSONL files.
  clean:
    desc: Cleans the downloaded data.
    sources:
      - data/downloads/*.jsonl
    cmds:
      - python -c "from frontpage.datastream import DataStream; DataStream().save_clean_download_stream()"

  # Calls DataStream.build_active_learn_stream() and then
  # DataStream.build_second_opinion_stream(), one inline Python call each.
  # NOTE(review): `training` looks like a directory rather than a file glob;
  # Task's source fingerprinting may ignore directory entries, in which case
  # changes inside it would not trigger a re-run - confirm.
  build-learned-streams:
    desc: Generates files for the active-learn-y annotation recipes.
    sources:
      - data/cleaned/*.jsonl
      - training
    cmds:
      - python -c "from frontpage.datastream import DataStream; DataStream().build_active_learn_stream()"
      - python -c "from frontpage.datastream import DataStream; DataStream().build_second_opinion_stream()"
  
  # Redirects the output of `prodigy stats -nf` into /tmp/stats.jsonl.
  # The declared output uses the same absolute path the command writes to
  # (and that `prepare-annotations` lists as its source); it was previously
  # declared as the relative path `tmp/stats.jsonl`, which nothing produces.
  # No `sources` are declared, so there are no inputs to fingerprint.
  stats-out:
    desc: Runs `prodigy stats`, which could indicate that there are new annotations.
    cmds:
      - python -m prodigy stats -nf > /tmp/stats.jsonl
    generates:
      - /tmp/stats.jsonl

  # Runs the `annotprep` subcommand of the frontpage CLI.
  # Input fingerprint is the Prodigy stats dump at /tmp/stats.jsonl; the
  # declared output is the exported annotations file.
  prepare-annotations:
    desc: Export the annotations from Prodigy so ML can be trained on it.
    sources:
      - /tmp/stats.jsonl
    generates:
      - data/annot/annotations.jsonl
    cmds:
      - python -m frontpage annotprep
  
  # Runs the `train` subcommand of the frontpage CLI.
  # Fingerprinted on the exported annotations and the custom SBERT embedding.
  # NOTE(review): if `training/custom-sbert-emb` is a directory, Task's
  # source fingerprinting may not track its contents - confirm.
  train:
    desc: Trains new classification heads on top of pretrained SBERT layer.
    sources:
      - data/annot/annotations.jsonl
      - training/custom-sbert-emb
    cmds:
      - python -m frontpage train
  
  # Two sequential steps: `frontpage pretrain`, then `frontpage artifact upload`.
  # No sources are declared for this task.
  pretrain:
    desc: Trains new SBERT representations. Can be expensive. Will also upload to wandb.
    cmds:
      - python -m frontpage pretrain
      - python -m frontpage artifact upload

  # Pipeline task: invokes the sub-tasks below one after another, in the
  # listed order (clean -> index -> stats-out -> prepare-annotations ->
  # train -> build-learned-streams). It runs no shell commands of its own.
  prepare:
    desc: Runs all the steps required to update the streams for annotation.
    cmds:
      - task: clean
      - task: index
      - task: stats-out
      - task: prepare-annotations
      - task: train
      - task: build-learned-streams

  # Cleans the data, runs the `train` task, builds the frontpage, and finally
  # starts Python's built-in `http.server`.
  # NOTE(review): the `train` task declares `sources`, so Task may skip it
  # when those are unchanged - confirm that "always retrains" still holds.
  build:
    desc: Construct a new frontpage. Always retrains heads just in case.
    cmds:
      - task: clean
      # Artifact download is currently disabled.
      # - python -m frontpage artifact download
      - task: train
      - python -m frontpage build
      - python -m http.server