.compute

#!/bin/bash

# this script gets executed on every node of an allocation;
# in our case we use the provided COMPUTE_* env variables
# to construct a distributed TensorFlow cluster definition

# a pipeline should report failure as soon as any of its stages fails
set -o pipefail

# switch to the standard TensorFlow Python virtual environment
. /usr/local/tensorflow/bin/activate

# script to execute: the default one, unless a local ".run" file exists
base_cmd="bin/run-wer-automation.sh"
if [[ -f .run ]]; then
  base_cmd="./.run"
fi

# timestamp format that prefixes every log line
time_format='[%Y-%m-%d %H:%M:%.S]'
# number of background sub-processes that will be started on this node
sub_processes=1
# name of this host, only used in debug messages
hostname=$(hostname)

# split the comma separated COMPUTE_NODES list into the array "nodes"
IFS=',' read -r -a nodes <<< "$COMPUTE_NODES"
# keep the first node at hand for convenience
first="${nodes[0]}"

# Launch the compute process(es) of this node in the background.
# Reads:  nodes, first, hostname, time_format, base_cmd, COMPUTE_JOB_NUMBER, COMPUTE_NODE_INDEX, "$@"
# Writes: sub_processes (set to 2 on the node that also runs the parameter server), logfile
# Every started pipeline sends stdout and stderr through "ts" (with the time_format prefix)
# into a log file in the parent directory.
if ((${#nodes[@]} == 1)); then
    # there is only one (this) node - so we are alone and we don't need a cluster definition
    echo "Starting single node process on $hostname ..."
    logfile=../single.log
    touch "$logfile" # guarantees existence for "tail -f *.log"
    "$base_cmd" --job_name localhost "$@" 2>&1 | ts "$time_format [single  ]" >"$logfile" &
else
    # there is more than one node so we will build a cluster definition

    # defining all cluster ports in a way that avoids collisions with other cluster allocations
    # (that could eventually get scheduled on the same node):
    # the base port is derived from the job number, coordinator and parameter server take
    # the first two ports and every worker gets its own consecutive port after them
    ((port_base = 10000 + (COMPUTE_JOB_NUMBER * 100) % 50000))
    ((coord_port = port_base))
    ((ps_port = port_base + 1))
    ((worker_port = port_base + 2))

    # one host:port pair per allocated node, in node order
    worker_endpoints=()
    for node in "${nodes[@]}"; do
        worker_endpoints+=("$node:$worker_port")
        ((worker_port += 1))
    done

    # joining the host:port pairs into a comma separated list;
    # "${array[*]}" joins with the first character of IFS, which is only changed inside the subshell
    worker_hosts=$(IFS=','; printf '%s' "${worker_endpoints[*]}")

    # shared cluster configuration, kept as an argument array so no word splitting is needed
    # assert: for this job it should be exactly the same on all allocated nodes
    cluster=(
        --coord_host "$first"
        --coord_port "$coord_port"
        "--ps_hosts=$first:$ps_port"
        "--worker_hosts=$worker_hosts"
    )

    # helpful for debugging potential networking issues
    echo "Starting allocated node no. $COMPUTE_NODE_INDEX on $hostname of cluster (coordinator: $first:$coord_port, ps: $first:$ps_port, workers: $worker_hosts) ..."

    # starting the parameter server side by side with first worker on the first node;
    # so for the moment we only run one ps per allocation
    if ((COMPUTE_NODE_INDEX == 0)); then
        # CUDA_VISIBLE_DEVICES="" - as the parameter server does not require a GPU;
        # the GPU would be shared with the worker on the same machine;
        # it turned out that this would reduce available GPU memory for the worker by almost 50%
        sub_processes=2
        logfile="../ps_${COMPUTE_NODE_INDEX}.log"
        touch "$logfile" # guarantees existence for "tail -f *.log"
        CUDA_VISIBLE_DEVICES="" "$base_cmd" "${cluster[@]}" --job_name ps --task_index 0 "$@" 2>&1 | ts "$time_format [ps     $COMPUTE_NODE_INDEX]" >"$logfile" &
    fi

    # starting the worker
    logfile="../worker_${COMPUTE_NODE_INDEX}.log"
    touch "$logfile" # guarantees existence for "tail -f *.log"
    "$base_cmd" "${cluster[@]}" --job_name worker --task_index "$COMPUTE_NODE_INDEX" "$@" 2>&1 | ts "$time_format [worker $COMPUTE_NODE_INDEX]" >"$logfile" &
fi

# Reap the background sub-processes one at a time.
# "wait -n" returns as soon as any one sub-process exits, so calling it
# sub_processes times covers all of them; the first non-zero exit status
# ends this script immediately with that same status.
pending=$sub_processes
while ((pending > 0)); do
  wait -n
  code=$?
  ((pending -= 1))
  if ((code == 0)); then
    echo "One compute process succeeded."
    continue
  fi
  echo "One compute process failed with exit code $code."
  exit $code
done