Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

planner: introduce SPOT VMs policy #433

Merged
merged 1 commit into from
May 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .env
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FAABRIC_VERSION=0.18.0
FAABRIC_CLI_IMAGE=faasm.azurecr.io/faabric:0.18.0
FAABRIC_VERSION=0.19.0
FAABRIC_CLI_IMAGE=faasm.azurecr.io/faabric:0.19.0
COMPOSE_PROJECT_NAME=faabric-dev
CONAN_CACHE_MOUNT_SOURCE=./conan-cache/
12 changes: 6 additions & 6 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
if: github.event.pull_request.draft == false
runs-on: ubuntu-latest
container:
image: faasm.azurecr.io/faabric:0.18.0
image: faasm.azurecr.io/faabric:0.19.0
env:
DEPLOYMENT_TYPE: gha-ci
steps:
Expand All @@ -34,7 +34,7 @@ jobs:
if: github.event.pull_request.draft == false
runs-on: ubuntu-latest
container:
image: faasm.azurecr.io/faabric:0.18.0
image: faasm.azurecr.io/faabric:0.19.0
steps:
- name: "Check out code"
uses: actions/checkout@v4
Expand All @@ -45,7 +45,7 @@ jobs:
if: github.event.pull_request.draft == false
runs-on: ubuntu-latest
container:
image: faasm.azurecr.io/faabric:0.18.0
image: faasm.azurecr.io/faabric:0.19.0
steps:
- name: "Check out code"
uses: actions/checkout@v4
Expand All @@ -65,7 +65,7 @@ jobs:
REDIS_QUEUE_HOST: redis
REDIS_STATE_HOST: redis
container:
image: faasm.azurecr.io/faabric:0.18.0
image: faasm.azurecr.io/faabric:0.19.0
options: --privileged
services:
redis:
Expand Down Expand Up @@ -104,7 +104,7 @@ jobs:
REDIS_QUEUE_HOST: redis
REDIS_STATE_HOST: redis
container:
image: faasm.azurecr.io/faabric:0.18.0
image: faasm.azurecr.io/faabric:0.19.0
options: --privileged
services:
redis:
Expand Down Expand Up @@ -156,7 +156,7 @@ jobs:
REDIS_QUEUE_HOST: redis
REDIS_STATE_HOST: redis
container:
image: faasm.azurecr.io/faabric:0.18.0
image: faasm.azurecr.io/faabric:0.19.0
services:
redis:
image: redis
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.18.0
0.19.0
5 changes: 5 additions & 0 deletions include/faabric/batch-scheduler/BatchScheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
#define NOT_ENOUGH_SLOTS_DECISION \
faabric::batch_scheduler::SchedulingDecision(NOT_ENOUGH_SLOTS, \
NOT_ENOUGH_SLOTS)
// Sentinel value used to build MUST_FREEZE_DECISION below.
// NOTE(review): presumably signals that a request's messages must be frozen
// (see the SPOT batch scheduler) -- confirm against the planner implementation.
#define MUST_FREEZE -97
// Scheduling decision constructed with MUST_FREEZE for both of its arguments,
// following the same pattern as NOT_ENOUGH_SLOTS_DECISION.
#define MUST_FREEZE_DECISION \
faabric::batch_scheduler::SchedulingDecision(MUST_FREEZE, MUST_FREEZE)

// Placeholder string that is deliberately not a valid IP address.
// NOTE(review): presumably used as the eviction mark on hosts/messages that
// must be evicted -- confirm at the usage sites.
#define MUST_EVICT_IP "E.VI.CT.ME"

namespace faabric::batch_scheduler {

Expand Down
33 changes: 33 additions & 0 deletions include/faabric/batch-scheduler/SpotScheduler.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#pragma once

#include <faabric/batch-scheduler/BatchScheduler.h>
#include <faabric/batch-scheduler/SchedulingDecision.h>
#include <faabric/util/batch.h>

namespace faabric::batch_scheduler {

// This batch scheduler behaves in the same way as BinPack for NEW and
// SCALE_CHANGE requests, but for DIST_CHANGE it considers whether any of the
// hosts in the Host Map have been tainted with the eviction mark. If so, it
// first tries to migrate them to other running hosts and, if not enough
// hosts are available, freezes the messages.
// BatchScheduler implementation for the SPOT policy. The class is final and
// overrides three BatchScheduler methods; their definitions live in the
// corresponding .cpp file and are not visible from this header.
class SpotScheduler final : public BatchScheduler
{
public:
// Produces a scheduling decision for `req`, given the current host map and
// the requests that are in flight.
// NOTE(review): hostMap is taken by non-const reference, so the
// implementation may modify it -- confirm in SpotScheduler.cpp.
std::shared_ptr<SchedulingDecision> makeSchedulingDecision(
HostMap& hostMap,
const InFlightReqs& inFlightReqs,
std::shared_ptr<faabric::BatchExecuteRequest> req) override;

private:
// Compares two candidate decisions.
// NOTE(review): presumably returns true when decisionA is preferable to
// decisionB; the comparison criterion is defined in the implementation.
bool isFirstDecisionBetter(
std::shared_ptr<SchedulingDecision> decisionA,
std::shared_ptr<SchedulingDecision> decisionB) override;

// Returns the hosts to consider for `req` as a vector, for the given
// decision type.
// NOTE(review): the sort order and any filtering (e.g. of hosts tainted for
// eviction) are defined in the implementation -- confirm there.
std::vector<Host> getSortedHosts(
HostMap& hostMap,
const InFlightReqs& inFlightReqs,
std::shared_ptr<faabric::BatchExecuteRequest> req,
const DecisionType& decisionType) override;
};
}
13 changes: 13 additions & 0 deletions include/faabric/planner/Planner.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ class Planner

void printConfig() const;

std::string getPolicy();

void setPolicy(const std::string& newPolicy);

// ----------
Expand Down Expand Up @@ -87,10 +89,21 @@ class Planner
// the planner was last reset
int getNumMigrations();

// Helper method to get the IPs of the hosts that will be evicted next
std::set<std::string> getNextEvictedHostIps();

std::map<int32_t, std::shared_ptr<BatchExecuteRequest>> getEvictedReqs();

// Main entrypoint to request the execution of batches
std::shared_ptr<faabric::batch_scheduler::SchedulingDecision> callBatch(
std::shared_ptr<BatchExecuteRequest> req);

// ----------
// API exclusive to SPOT policy mode
// ----------

void setNextEvictedVm(const std::set<std::string>& vmIp);

private:
// There's a singleton instance of the planner running, but it must allow
// concurrent requests
Expand Down
18 changes: 18 additions & 0 deletions include/faabric/planner/PlannerState.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ namespace faabric::planner {
*/
struct PlannerState
{
// Policy to operate the planner in. Mostly determines the batch scheduler
// behaviour, but also the planner's in some cases
std::string policy;

// Accounting of the hosts that are registered in the system and responsive
// We deliberately use the host's IP as unique key, but assign a unique host
// id for redundancy
Expand All @@ -36,5 +40,19 @@ struct PlannerState

// Helper counter of the total number of migrations
std::atomic<int> numMigrations = 0;

// -----
// Data structures used only under the SPOT policy
// -----

// Map containing the BER that have been evicted due to a SPOT VM eviction.
// All messages in the VM have been checkpointed, are in the snapshot
// registry in the planner, and are ready to be scheduled when capacity
// appears
std::map<int, std::shared_ptr<BatchExecuteRequest>> evictedRequests;

// This variable simulates the values we would get from a cloud provider's
// API indicating the (set of) VMs to be evicted next
std::set<std::string> nextEvictedHostIps;
};
}
9 changes: 9 additions & 0 deletions include/faabric/util/func.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,18 @@
#include <vector>

#define MIGRATED_FUNCTION_RETURN_VALUE -99
#define FROZEN_FUNCTION_RETURN_VALUE -98

namespace faabric::util {

// Exception type derived from FaabricException; the constructor forwards its
// message to the base class.
// NOTE(review): presumably thrown when a function is frozen (cf.
// FROZEN_FUNCTION_RETURN_VALUE) -- confirm at the throw sites.
class FunctionFrozenException : public faabric::util::FaabricException
{
public:
explicit FunctionFrozenException(std::string message)
: FaabricException(std::move(message))
{}

Check warning on line 18 in include/faabric/util/func.h

View check run for this annotation

Codecov / codecov/patch

include/faabric/util/func.h#L18

Added line #L18 was not covered by tests
};

class FunctionMigratedException : public faabric::util::FaabricException
{
public:
Expand Down
3 changes: 3 additions & 0 deletions src/batch-scheduler/BatchScheduler.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <faabric/batch-scheduler/BatchScheduler.h>
#include <faabric/batch-scheduler/BinPackScheduler.h>
#include <faabric/batch-scheduler/CompactScheduler.h>
#include <faabric/batch-scheduler/SpotScheduler.h>
#include <faabric/util/config.h>
#include <faabric/util/logging.h>

Expand All @@ -23,6 +24,8 @@ std::shared_ptr<BatchScheduler> getBatchScheduler()
batchScheduler = std::make_shared<BinPackScheduler>();
} else if (mode == "compact") {
batchScheduler = std::make_shared<CompactScheduler>();
} else if (mode == "spot") {
batchScheduler = std::make_shared<SpotScheduler>();
} else {
SPDLOG_ERROR("Unrecognised batch scheduler mode: {}", mode);
throw std::runtime_error("Unrecognised batch scheduler mode");
Expand Down
1 change: 1 addition & 0 deletions src/batch-scheduler/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ faabric_lib(batch_scheduler
BatchScheduler.cpp
BinPackScheduler.cpp
CompactScheduler.cpp
SpotScheduler.cpp
)

target_link_libraries(batch_scheduler PRIVATE
Expand Down
8 changes: 4 additions & 4 deletions src/batch-scheduler/CompactScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ bool CompactScheduler::isFirstDecisionBetter(
throw std::runtime_error("Method not supported for COMPACT scheduler");
}

HostMap deepCopyHostMap(const HostMap& hostMap)
static HostMap deepCopyHostMap(const HostMap& hostMap)
{
HostMap newHostMap;

Expand Down Expand Up @@ -173,9 +173,9 @@ bool CompactScheduler::isFirstDecisionBetter(

// Filter-out from the host map all nodes that are executing requests from a
// different user
void filterHosts(HostMap& hostMap,
const InFlightReqs& inFlightReqs,
std::shared_ptr<faabric::BatchExecuteRequest> req)
static void filterHosts(HostMap& hostMap,
const InFlightReqs& inFlightReqs,
std::shared_ptr<faabric::BatchExecuteRequest> req)
{
// We temporarily use the request subtype field to attach a user id for our
// multi-tenant simulations
Expand Down
Loading