Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Upload segmentation models after segmentation training completes #31

Merged
merged 2 commits into from
Dec 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion src/htr2hpc/api_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,13 +228,21 @@ def model_update(self, model_id: int, model_file: pathlib.Path):
# on successful update, returns the model object
return to_namedtuple("model", resp.json())

def model_create(self, model_file: pathlib.Path, model_name: str, job: str):
def model_create(
self,
model_file: pathlib.Path,
job: str,
model_name: Optional[str] = None,
):
"""Add a new model to eScriptorium. Takes a model file, name, and job
(Segment or Recognize)."""
if job not in {"Segment", "Recognize"}:
raise ValueError(f"{job} is not a valid model job name")

api_url = "models/"
# if model name is unset, use filename stem
if model_name is None:
model_name = model_file.stem

with open(model_file, "rb") as mfile:
files = {"file": mfile}
Expand Down
28 changes: 28 additions & 0 deletions src/htr2hpc/train/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pathlib
import subprocess
from collections import defaultdict
from tqdm import tqdm

from kraken.containers import BaselineLine, Region, Segmentation

Expand Down Expand Up @@ -173,5 +174,32 @@ def get_best_model(model_dir: pathlib.Path) -> pathlib.Path | None:
return best[0] if best else None


def upload_models(
    api, model_dir: pathlib.Path, model_type: str, show_progress=True
) -> int:
    """Upload all model files in the specified model directory to eScriptorium
    with the specified job type (Segment/Recognize). Returns a count of the
    number of models created.

    :param api: eScriptorium API client exposing a ``model_create`` method
    :param model_dir: directory containing ``*.mlmodel`` files to upload
    :param model_type: eScriptorium job name ("Segment" or "Recognize")
    :param show_progress: display a progress bar while uploading (default: True)
    :returns: number of models successfully created
    """
    import logging

    logger = logging.getLogger(__name__)
    uploaded = 0

    def _model_sort_key(path: pathlib.Path):
        # segtrain creates models based on modelname with _0, _1, _2 ... _49;
        # sort numerically on the latter portion of the name. A file whose
        # stem does not end in an integer (e.g. a "_best" model produced by
        # transcription training) sorts after the numbered checkpoints
        # instead of raising ValueError.
        suffix = path.stem.rsplit("_", 1)[-1]
        if suffix.isdigit():
            return (0, int(suffix), "")
        return (1, 0, path.stem)

    modelfiles = sorted(model_dir.glob("*.mlmodel"), key=_model_sort_key)
    for model_file in tqdm(
        modelfiles,
        desc=f"Uploading {model_type} models",
        disable=not show_progress,
    ):
        # best-effort: a failure on one model (connection error, eScriptorium
        # instance temporarily down, storage quota exceeded) should not abort
        # the remaining uploads; log it and continue with the next file
        try:
            created = api.model_create(model_file, job=model_type)
        except Exception as err:
            logger.warning("Error uploading model %s: %s", model_file, err)
            continue
        if created:
            uploaded += 1

    return uploaded


# use api.update_model with model id and pathlib.Path to model file
# to update existing model record with new file
36 changes: 28 additions & 8 deletions src/htr2hpc/train/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from tqdm import tqdm

from htr2hpc.api_client import eScriptoriumAPIClient
from htr2hpc.train.data import get_training_data, get_model
from htr2hpc.train.data import get_training_data, get_model, upload_models
from htr2hpc.train.slurm import segtrain, slurm_job_status, slurm_job_queue_status


Expand Down Expand Up @@ -99,6 +99,14 @@ def main():
action="store_true",
default=False,
)
# control progress bar display (on by default)
parser.add_argument(
"--progress",
help="Show progress",
action=argparse.BooleanOptionalAction,
default=True,
dest="show_progress",
)

# training for transcription requires a transcription id
transcription_parser.add_argument(
Expand Down Expand Up @@ -145,7 +153,7 @@ def main():

logging.basicConfig(encoding="utf-8", level=logging.WARN)
logger_upscope = logging.getLogger("htr2hpc")
logger_upscope.setLevel(logging.DEBUG)
logger_upscope.setLevel(logging.INFO)

api = eScriptoriumAPIClient(args.base_url, api_token=api_token)

Expand Down Expand Up @@ -174,11 +182,6 @@ def main():
# kraken default defs are path objects
model_file = default_model[args.mode]

# - get input data for that job
# - run the bash app with the slurm provider
# - get the result from the bash app and check for failure/success
#

# create a directory and path for the output model file
output_model_dir = args.work_dir / "output_model"
# currently assuming model dir is empty
Expand All @@ -192,6 +195,9 @@ def main():
abs_model_file = model_file.absolute()
abs_output_modelfile = output_modelfile.absolute()

# store the path to original working directory before changing directory
orig_working_dir = pathlib.Path.cwd()

# change directory to working directory, since by default,
# slurm executes the job from the directory where it was submitted
os.chdir(args.work_dir)
Expand All @@ -215,6 +221,7 @@ def main():
with tqdm(
desc=f"Slurm job {job_id}",
bar_format="{desc} | total time: {elapsed}{postfix} ",
disable=not args.show_progress,
) as statusbar:
running = False
while job_status:
Expand All @@ -239,8 +246,21 @@ def main():
job_output = args.work_dir / f"segtrain_{job_id}.out"
print(f"Job output should be in {job_output}")

# TODO: after run completes, check for results
# change back to original working directory
os.chdir(orig_working_dir)

# after run completes, check for results
# - for segmentation, upload all models to eScriptorium as new models
upload_count = upload_models(
api,
output_modelfile.parent,
es_model_jobs[args.mode],
show_progress=args.show_progress,
)
# - does this behavior depend on job exit status?
    # reasonable to assume any model files created should be uploaded?
Comment on lines +260 to +261
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for segmentation, yes any model files in the directory should be uploaded, even if the training process exits early for some reason. we will want to discuss best practices for handling transcription results when a _best model is not produced.


print(f"Uploaded {upload_count} segmentation models to eScriptorium")

# TODO: handle transcription training

Expand Down