From 7f3a3cdd42c78b53d37f5d65a1791b4f32c825bc Mon Sep 17 00:00:00 2001
From: Darragh Griffin
Date: Wed, 4 Apr 2018 11:22:47 +0100
Subject: [PATCH 01/58] Net-tools are not necessary for running the worker

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index 2ffaaf17..13c689b1 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,6 @@ CA4019 Project
 #### Development
 - Rust Nightly
 - Protobuf Compiler
-- net-tools (if running worker locally)
 
 #### Deployment
 - Docker

From 1c12864623cdf0011fd10f0e90a046285b18d7dc Mon Sep 17 00:00:00 2001
From: Darragh Griffin
Date: Wed, 4 Apr 2018 11:37:40 +0100
Subject: [PATCH 02/58] Update review requirements

---
 CONTRIBUTING.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 29d1f66c..b4fedc26 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -50,6 +50,6 @@ All code must be submitted via pull requests, *not* checked straight into the re
 request will be from a single feature branch, which will not be used for any other features
 after merging. Ideally, it will be deleted after a merge.
 
-All pull requests must be reviewed by at least two people who are not the submitter. You can ask
+All pull requests must be reviewed by at least one person who is not the submitter. You can ask
 in Slack for a review, or use Github's assign feature to assign the pull request to a specific
 person.

From ef9ac76b6ec3e7b2ad41a5cfc81ae756eb75b6b3 Mon Sep 17 00:00:00 2001
From: Conor Griffin
Date: Wed, 4 Apr 2018 12:20:33 +0100
Subject: [PATCH 03/58] Remove TODOs for large features and unplanned changes

---
 proto/mapreduce.proto                          | 7 -------
 worker/src/server/intermediate_data_service.rs | 2 --
 2 files changed, 9 deletions(-)

diff --git a/proto/mapreduce.proto b/proto/mapreduce.proto
index 3a9711ac..55796d65 100644
--- a/proto/mapreduce.proto
+++ b/proto/mapreduce.proto
@@ -7,8 +7,6 @@ package mapreduce;
 service MapReduceService {
     // Schedules a map reduce. Returns various gRPC error codes if the
     // operation failed before even starting.
-    // TODO(voy): Decide do we want to return more information about scheduled
-    //            request.
    rpc PerformMapReduce (MapReduceRequest) returns (MapReduceResponse);

    // Gets the status of the map reduce.
@@ -25,7 +23,6 @@ service MapReduceService {
 message EmptyMessage {}
 
 message MapReduceRequest {
-    // TODO(voy): Rename/Remove this field once we actually send a library.
    string binary_path = 1;

    // Location of the input data on an accessible filesystem.
@@ -37,7 +34,6 @@ message MapReduceRequest {
 
    // ID of the client. Generated to be used for easier recognising who is
    // making the request. Generated on a single machine for now
-    // TODO(voy): Convert to unique identity once we have authentication.
    string client_id = 4;

    // Priority of the MapReduce
@@ -57,9 +53,6 @@ message MapReduceStatusRequest {
 
    // Optionally instead of the client_id the specific ID of the map reduce
    // can be given to get its status.
-    // TODO(voy): Add restrictions on the type of data that is returned if the
-    //            client performing the MapReduce is not the same as the client
-    //            requesting the status.
    string mapreduce_id = 2;
 }

diff --git a/worker/src/server/intermediate_data_service.rs b/worker/src/server/intermediate_data_service.rs
index 12a54093..fc988262 100644
--- a/worker/src/server/intermediate_data_service.rs
+++ b/worker/src/server/intermediate_data_service.rs
@@ -16,8 +16,6 @@ impl grpc_pb::IntermediateDataService for IntermediateDataService {
         _o: RequestOptions,
         req: pb::IntermediateDataRequest,
     ) -> SingleResponse {
-        // TODO: After the unnecessary stuff is removed from the request path, add the absolute
-        //       path for this worker.
         info!("Serving file {}", &req.get_path());
         match io::read_local(req.get_path()) {
             Ok(data) => {

From 4ea7d32f7bed9778ba3c697c5c8667bfdd98f1b4 Mon Sep 17 00:00:00 2001
From: Conor Griffin
Date: Wed, 4 Apr 2018 14:58:36 +0100
Subject: [PATCH 04/58] Handle tracking of the in-progress task better

---
 worker/src/operations/map.rs               | 42 ++++++++----
 worker/src/operations/operation_handler.rs | 75 +++++++---------------
 worker/src/operations/reduce.rs            | 29 ++++++---
 worker/src/operations/state.rs             | 15 +----
 4 files changed, 72 insertions(+), 89 deletions(-)

diff --git a/worker/src/operations/map.rs b/worker/src/operations/map.rs
index 9a4699ab..ea969127 100644
--- a/worker/src/operations/map.rs
+++ b/worker/src/operations/map.rs
@@ -29,9 +29,13 @@ pub struct MapInput {
     pub value: String,
 }
 
-fn log_map_operation_err(err: Error, operation_state_arc: &Arc<Mutex<OperationState>>) {
+fn log_map_operation_err(
+    err: Error,
+    operation_state_arc: &Arc<Mutex<OperationState>>,
+    task_id: &str,
+) {
     output_error(&err.chain_err(|| "Error running map operation."));
-    operation_handler::set_failed_status(operation_state_arc);
+    operation_handler::set_failed_status(operation_state_arc, task_id);
 }
 
 fn send_map_result(
@@ -131,15 +135,20 @@ pub fn perform_map(
 
     debug!("Input files: {:?}", input_files);
 
-    if operation_handler::get_worker_status(&resources.operation_state) == pb::WorkerStatus::BUSY {
-        warn!("Map operation requested while worker is busy");
-        return Err("Worker is busy.".into());
+    {
+        let mut state = resources.operation_state.lock().unwrap();
+        if state.current_task_id != "" {
+            warn!("Map operation requested while worker is busy");
+            return Err("Worker is busy.".into());
+        }
+
+        state.current_task_id = map_options.task_id.clone();
+        state.operation_status = pb::OperationStatus::IN_PROGRESS;
     }
-    operation_handler::set_busy_status(&resources.operation_state);
 
     let result = internal_perform_map(map_options, resources, output_dir_uuid);
     if let Err(err) = result {
-        log_map_operation_err(err, &resources.operation_state);
+        log_map_operation_err(err, &resources.operation_state, &map_options.task_id);
         return Err("Error starting map operation.".into());
     }
 
@@ -157,6 +166,12 @@ fn combine_map_results(
     let mut map_results_vec = Vec::new();
     {
         let mut operation_state = resources.operation_state.lock().unwrap();
+
+        // Task has been cancelled
+        if operation_state.current_task_id != task_id {
+            return Ok(());
+        }
+
         for map_result in operation_state.intermediate_map_results.clone() {
             map_results_vec.push(map_result);
         }
@@ -192,10 +207,10 @@ fn combine_map_results(
     map_result.set_task_id(task_id.to_owned());
 
     if let Err(err) = send_map_result(&resources.master_interface, map_result) {
-        log_map_operation_err(err, &resources.operation_state);
+        log_map_operation_err(err, &resources.operation_state, task_id);
     } else {
         info!("Map operation completed sucessfully.");
-        operation_handler::set_complete_status(&resources.operation_state);
+        operation_handler::set_complete_status(&resources.operation_state, task_id);
     }
 
     Ok(())
 }

@@ -209,11 +224,11 @@ fn process_map_operation_error(
 ) {
     {
         let mut operation_state = resources.operation_state.lock().unwrap();
-        if operation_state.map_operation_failed {
+        if operation_state.current_task_id != task_id {
             // Already processed an error, no need to send status again.
             return;
         }
-        operation_state.map_operation_failed = true;
+        operation_state.current_task_id = String::new();
     }
 
     let mut map_result = pb::MapResult::new();
@@ -225,7 +240,7 @@ fn process_map_operation_error(
     if let Err(err) = send_map_result(&resources.master_interface, map_result) {
         error!("Could not send map operation failed: {}", err);
     }
-    log_map_operation_err(err, &resources.operation_state);
+    log_map_operation_err(err, &resources.operation_state, task_id);
 }
 
 fn process_map_result(
@@ -245,7 +260,7 @@ fn process_map_result(
     let finished = {
         let mut operation_state = resources.operation_state.lock().unwrap();
-        if operation_state.map_operation_failed {
+        if operation_state.current_task_id != task_id {
             // The map operation has failed, no need to continue.
             return;
         }
@@ -291,7 +306,6 @@ fn internal_perform_map(
     {
         let mut operation_state = resources.operation_state.lock().unwrap();
         operation_state.waiting_map_operations = input_locations.len();
-        operation_state.map_operation_failed = false;
         operation_state.intermediate_map_results.clear();
 
         initial_cpu_time = operation_state.initial_cpu_time;
diff --git a/worker/src/operations/operation_handler.rs b/worker/src/operations/operation_handler.rs
index 1fbacb0d..ad7b5531 100644
--- a/worker/src/operations/operation_handler.rs
+++ b/worker/src/operations/operation_handler.rs
@@ -37,44 +37,20 @@ pub struct OperationHandler {
     data_abstraction_layer: Arc<AbstractionLayer + Send + Sync>,
 }
 
-pub fn get_worker_status(operation_state_arc: &Arc<Mutex<OperationState>>) -> pb::WorkerStatus {
-    let operation_state = operation_state_arc.lock().unwrap();
-
-    operation_state.worker_status
-}
-
-fn set_operation_handler_status(
-    operation_state_arc: &Arc<Mutex<OperationState>>,
-    worker_status: pb::WorkerStatus,
-    operation_status: pb::OperationStatus,
-) {
+pub fn set_complete_status(operation_state_arc: &Arc<Mutex<OperationState>>, task_id: &str) {
     let mut operation_state = operation_state_arc.lock().unwrap();
-    operation_state.worker_status = worker_status;
-    operation_state.operation_status = operation_status;
-}
-
-pub fn set_complete_status(operation_state_arc: &Arc<Mutex<OperationState>>) {
-    set_operation_handler_status(
-        operation_state_arc,
-        pb::WorkerStatus::AVAILABLE,
-        pb::OperationStatus::COMPLETE,
-    );
-}
-
-pub fn set_failed_status(operation_state_arc: &Arc<Mutex<OperationState>>) {
-    set_operation_handler_status(
-        operation_state_arc,
-        pb::WorkerStatus::AVAILABLE,
-        pb::OperationStatus::FAILED,
-    );
+    if operation_state.current_task_id == task_id {
+        operation_state.current_task_id = String::new();
+        operation_state.operation_status = pb::OperationStatus::COMPLETE;
+    }
 }
 
-pub fn set_cancelled_status(operation_state_arc: &Arc<Mutex<OperationState>>) {
-    set_operation_handler_status(
-        operation_state_arc,
-        pb::WorkerStatus::AVAILABLE,
-        pb::OperationStatus::CANCELLED,
-    );
+pub fn set_failed_status(operation_state_arc: &Arc<Mutex<OperationState>>, task_id: &str) {
+    let mut operation_state = operation_state_arc.lock().unwrap();
+    if operation_state.current_task_id == task_id {
+        operation_state.current_task_id = String::new();
+        operation_state.operation_status = pb::OperationStatus::FAILED;
+    }
 }
 
 // Checks if the tasks is cancelled and handles this case. Returns true if the task was canceled.
@@ -82,22 +58,8 @@ pub fn check_task_cancelled(
     operation_state_arc: &Arc<Mutex<OperationState>>,
     task_id: &str,
 ) -> bool {
-    let cancelled = {
-        let operation_state = operation_state_arc.lock().unwrap();
-        operation_state.task_cancelled(task_id)
-    };
-    if cancelled {
-        set_cancelled_status(operation_state_arc);
-        println!("Succesfully cancelled task: {}", task_id);
-    }
-    cancelled
-}
-
-pub fn set_busy_status(operation_state_arc: &Arc<Mutex<OperationState>>) {
-    let mut operation_state = operation_state_arc.lock().unwrap();
-
-    operation_state.worker_status = pb::WorkerStatus::BUSY;
-    operation_state.operation_status = pb::OperationStatus::IN_PROGRESS;
+    let operation_state = operation_state_arc.lock().unwrap();
+    operation_state.task_cancelled(task_id)
 }
 
 pub fn failure_details_from_error(err: &Error) -> String {
@@ -135,7 +97,11 @@ impl OperationHandler {
     pub fn get_worker_status(&self) -> pb::WorkerStatus {
         let operation_state = self.operation_state.lock().unwrap();
 
-        operation_state.worker_status
+        if operation_state.current_task_id == "" {
+            return pb::WorkerStatus::AVAILABLE;
+        }
+
+        pb::WorkerStatus::BUSY
     }
 
     pub fn get_worker_operation_status(&self) -> pb::OperationStatus {
@@ -186,7 +152,10 @@ impl OperationHandler {
     pub fn cancel_task(&self, request: &pb::CancelTaskRequest) -> Result<()> {
         let mut operation_state = self.operation_state.lock().unwrap();
-        operation_state.last_cancelled_task_id = Some(request.task_id.clone());
+        if operation_state.current_task_id == request.task_id {
+            operation_state.current_task_id = String::new();
+            operation_state.operation_status = pb::OperationStatus::UNKNOWN;
+        }
 
         Ok(())
     }
diff --git a/worker/src/operations/reduce.rs b/worker/src/operations/reduce.rs
index 8f9d37dd..43eda8ba 100644
--- a/worker/src/operations/reduce.rs
+++ b/worker/src/operations/reduce.rs
@@ -108,9 +108,13 @@ fn run_reducer(
     Ok(reduce_results)
 }
 
-fn log_reduce_operation_err(err: Error, operation_state_arc: &Arc<Mutex<OperationState>>) {
+fn log_reduce_operation_err(
+    err: Error,
+    operation_state_arc: &Arc<Mutex<OperationState>>,
+    task_id: &str,
+) {
     output_error(&err.chain_err(|| "Error running reduce operation."));
-    operation_handler::set_failed_status(operation_state_arc);
+    operation_handler::set_failed_status(operation_state_arc, task_id);
 }
 
 fn send_reduce_result(
@@ -192,15 +196,20 @@ pub fn perform_reduce(
         reduce_request.reducer_file_path
     );
 
-    if operation_handler::get_worker_status(&resources.operation_state) == pb::WorkerStatus::BUSY {
-        warn!("Reduce operation requested while worker is busy");
-        return Err("Worker is busy.".into());
+    {
+        let mut state = resources.operation_state.lock().unwrap();
+        if state.current_task_id != "" {
+            warn!("Reduce operation requested while worker is busy");
+            return Err("Worker is busy.".into());
+        }
+
+        state.current_task_id = reduce_request.task_id.clone();
+        state.operation_status = pb::OperationStatus::IN_PROGRESS;
     }
-    operation_handler::set_busy_status(&resources.operation_state);
 
     let result = internal_perform_reduce(reduce_request, resources, output_uuid);
     if let Err(err) = result {
-        log_reduce_operation_err(err, &resources.operation_state);
+        log_reduce_operation_err(err, &resources.operation_state, &reduce_request.task_id);
        return Err("Error starting reduce operation.".into());
    }

@@ -239,7 +248,7 @@ fn handle_reduce_error(err: Error, resources: &OperationResources, task_id: &str
         error!("Error sending reduce failed: {}", err);
     }
 
-    log_reduce_operation_err(err, &resources.operation_state);
+    log_reduce_operation_err(err, &resources.operation_state, task_id);
 }
 
 fn handle_reduce_success(resources: &OperationResources, initial_cpu_time: u64, task_id: &str) {
@@ -252,12 +261,12 @@ fn handle_reduce_success(resources: &OperationResources, initial_cpu_time: u64,
 
     match result {
         Ok(_) => {
-            operation_handler::set_complete_status(&resources.operation_state);
+            operation_handler::set_complete_status(&resources.operation_state, task_id);
             info!("Reduce operation completed sucessfully.");
         }
         Err(err) => {
             error!("Error sending reduce result: {}", err);
-            operation_handler::set_failed_status(&resources.operation_state);
+            operation_handler::set_failed_status(&resources.operation_state, task_id);
         }
     }
 }
diff --git a/worker/src/operations/state.rs b/worker/src/operations/state.rs
index 7e85abc7..fc63fd06 100644
--- a/worker/src/operations/state.rs
+++ b/worker/src/operations/state.rs
@@ -4,7 +4,7 @@ use cerberus_proto::worker as pb;
 /// `OperationState` is a data only struct for holding the current state for the `OperationHandler`
 #[derive(Default)]
 pub struct OperationState {
-    pub worker_status: pb::WorkerStatus,
+    pub current_task_id: String,
     pub operation_status: pb::OperationStatus,
 
     // Initial CPU time of the current operation. This is used to calculate the total cpu time used
@@ -12,32 +12,23 @@ pub struct OperationState {
     pub initial_cpu_time: u64,
 
     pub waiting_map_operations: usize,
-    pub map_operation_failed: bool,
     pub intermediate_map_results: Vec,
-
-    pub last_cancelled_task_id: Option<String>,
 }
 
 impl OperationState {
     pub fn new() -> Self {
         OperationState {
-            worker_status: pb::WorkerStatus::AVAILABLE,
+            current_task_id: String::new(),
             operation_status: pb::OperationStatus::UNKNOWN,
             initial_cpu_time: 0,
             waiting_map_operations: 0,
-            map_operation_failed: false,
             intermediate_map_results: Vec::new(),
-
-            last_cancelled_task_id: None,
         }
     }
 
     pub fn task_cancelled(&self, task_id: &str) -> bool {
-        match self.last_cancelled_task_id.clone() {
-            Some(id) => task_id == id,
-            None => false,
-        }
+        self.current_task_id != task_id
     }
 }

From 5b976445a09338b6b2de3283598251c2b8992e43 Mon Sep 17 00:00:00 2001
From: Conor Griffin
Date: Wed, 4 Apr 2018 17:45:31 +0100
Subject: [PATCH 05/58] Split up long run function

---
 cli/src/commands/run.rs | 48 +++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 21 deletions(-)

diff --git a/cli/src/commands/run.rs b/cli/src/commands/run.rs
index c475d5a7..14f859d9 100644
--- a/cli/src/commands/run.rs
+++ b/cli/src/commands/run.rs
@@ -26,6 +26,32 @@ fn verify_valid_path(path_str: &str) -> Result {
     }
 }
 
+fn get_priority(matches: &ArgMatches) -> Result<u32> {
+    let priority_str = matches.value_of("priority").unwrap_or(DEFAULT_PRIORITY);
+    let priority: u32 = match priority_str.parse() {
+        Ok(val) => val,
+        Err(err) => {
+            return Err(
+                format!(
+                    "Error occurred while converting '{}' to a u32: {}",
+                    priority_str,
+                    err
+                ).into(),
+            );
+        }
+    };
+
+    if priority < 1 || priority > 10 {
+        return Err(
+            format!(
+                "Priority can only be between 1 and 10. {} is not in this range",
+                priority
+            ).into(),
+        );
+    }
+    Ok(priority)
+}
+
 pub fn run(client: &grpc_pb::MapReduceServiceClient, matches: &ArgMatches) -> Result<()> {
     let mut input = matches
         .value_of("input")
@@ -52,27 +78,7 @@ pub fn run(client: &grpc_pb::MapReduceServiceClient, matches: &ArgMatches) -> Re
         || "Invalid binary path.",
     )?;
 
-    let priority_str = matches.value_of("priority").unwrap_or(DEFAULT_PRIORITY);
-    let priority: u32 = match priority_str.parse() {
-        Ok(val) => val,
-        Err(err) => {
-            return Err(
-                format!(
-                    "Error occured while converting '{}' to a u32: {}",
-                    priority_str,
-                    err
-                ).into(),
-            );
-        }
-    };
-    if priority < 1 || priority > 10 {
-        return Err(
-            format!(
-                "Priority can only be between 1 and 10. {} is not in this range",
-                priority
-            ).into(),
-        );
-    }
+    let priority = get_priority(matches)?;
 
     let mut req = pb::MapReduceRequest::new();
     req.set_binary_path(binary.to_owned());

From 9f1942a74aa1fd759f2c8ce8160c99bee17a113c Mon Sep 17 00:00:00 2001
From: Darragh Griffin
Date: Wed, 4 Apr 2018 19:46:46 +0100
Subject: [PATCH 06/58] Update CLI documentation

Added priority option, cancel command, and distributed filesystem commands
to the documentation

---
 cli/doc/command_line_interface.md | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/cli/doc/command_line_interface.md b/cli/doc/command_line_interface.md
index daa7e4b6..fe47c374 100644
--- a/cli/doc/command_line_interface.md
+++ b/cli/doc/command_line_interface.md
@@ -1,7 +1,7 @@
 Command Line Interface
 ======================
 
-**Version: 0.1.0**
+**Version: 0.2.0**
 
 This document outlines the command line interface options.
 ---
@@ -17,7 +17,7 @@ The application has 1 global option.
 
 ### Commands
 
-There are 3 main Commands
+There are 4 main Commands
 
 * `run`: This command tells the master to perform a MapReduce job with the
   given binary and input directory. It has the following flags:
@@ -30,6 +30,10 @@ There are 3 main Commands
     placed. The location must also be accessible in the shared filesystem,
     otherwise an error will be raised. If the flag is not set, the output will
     match the following format: `$shared_location/$map_reduce_ID/output`.
+  * `--priority`: Optional value from 1-10 giving a priority for the scheduled
+    MapReduce job. The master will allocate the tasks for jobs with higher
+    priority first. The default priority is 3.
+
 Example:
 
 ```
@@ -74,3 +78,23 @@ $ cli status --job_id=mr13
 | Output            | /shared/super_secret |
 |====================================|
 ```
+
+* `cancel`: This command cancels a running MapReduce job.
+  It has 1 flag:
+  * `--id`: This specifies the MapReduce ID of the job to cancel.
+
+### Distributed Filesystem Commands
+
+There are 2 additional commands available when the cluster uses the
+distributed filesystem (DFS) for storage.
+
+* `upload`: This command uploads a file or directory to the cluster distributed
+  filesystem. It has the following flags:
+  * `--local_path`: Path of the file or directory on the local machine.
+  * `--remote_path`: Path of the file or directory on the cluster.
+    The local file path will be used if this is not provided.
+
+* `download`: This command downloads a file or directory from the cluster
+  distributed filesystem. It has the following flags:
+  * `--remote_path`: Path of the file or directory on the cluster.
+  * `--local_path`: Path of the location to store the downloaded file or directory.
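A brief usage sketch of the commands documented in this patch, in the same style as
the doc's existing `$ cli status --job_id=mr13` example (the job ID and paths below
are placeholder values, not output from these patches):

```
$ cli cancel --id=mr13
$ cli upload --local_path=/home/user/books --remote_path=/input/books
$ cli download --remote_path=/mr13/output --local_path=/home/user/results
```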
From fc18439d52f642e2497d6abe7a3a5c2f0a0aaf2b Mon Sep 17 00:00:00 2001
From: Conor Griffin
Date: Wed, 4 Apr 2018 20:45:57 +0100
Subject: [PATCH 07/58] Remove unnecessary cloning of potentially large data

---
 master/src/scheduling/scheduler.rs  | 51 ++++++++++++++++++++++-------
 master/src/scheduling/state.rs      |  8 ++---
 master/src/server/client_service.rs | 25 +++-----------
 worker/src/operations/map.rs        | 13 +-------
 4 files changed, 49 insertions(+), 48 deletions(-)

diff --git a/master/src/scheduling/scheduler.rs b/master/src/scheduling/scheduler.rs
index 7d8c1dbd..21b3f876 100644
--- a/master/src/scheduling/scheduler.rs
+++ b/master/src/scheduling/scheduler.rs
@@ -4,6 +4,7 @@ use std::sync::{Mutex, Arc};
 
 use serde_json;
 
+use cerberus_proto::mapreduce as pb;
 use common::{Task, TaskStatus, Job};
 use errors::*;
 use scheduling::state::{ScheduledJob, State};
@@ -42,17 +43,17 @@ impl Scheduler {
 
         let mut state = self.state.lock().unwrap();
 
-        let job = state.get_job(job_id).chain_err(
-            || "Error scheduling reduce tasks",
-        )?;
-
         let reduce_tasks = {
+            let job = state.get_job(job_id).chain_err(
+                || "Error scheduling reduce tasks",
+            )?;
+
             let map_tasks = state.get_map_tasks(job_id).chain_err(|| {
                 format!("Could not get map tasks for job {}", job_id)
             })?;
 
             self.task_processor
-                .create_reduce_tasks(&job, map_tasks)
+                .create_reduce_tasks(job, map_tasks)
                 .chain_err(|| {
                     format!("Could not create reduce tasks for job {}", job_id)
                 })?
@@ -204,17 +205,45 @@ impl Scheduler {
         self.worker_manager.get_available_workers()
     }
 
-    pub fn get_mapreduce_status(&self, mapreduce_id: &str) -> Result<Job> {
+    fn get_status_for_job(&self, job: &Job) -> pb::MapReduceReport {
+        let mut report = pb::MapReduceReport::new();
+        report.mapreduce_id = job.id.clone();
+        report.status = job.status;
+        if job.status == pb::Status::FAILED {
+            report.failure_details = job.status_details.clone().unwrap_or_else(
+                || "Unknown.".to_owned(),
+            );
+        }
+        report.scheduled_timestamp = job.time_requested.timestamp();
+        report.output_directory = job.output_directory.clone();
+        if let Some(time) = job.time_started {
+            report.started_timestamp = time.timestamp();
+        }
+        if let Some(time) = job.time_completed {
+            report.done_timestamp = time.timestamp();
+        }
+
+        report
+    }
+
+    pub fn get_mapreduce_status(&self, mapreduce_id: &str) -> Result<pb::MapReduceReport> {
         let state = self.state.lock().unwrap();
-        state.get_job(mapreduce_id).chain_err(
+        let job = state.get_job(mapreduce_id).chain_err(
             || "Error getting map reduce status.",
-        )
+        )?;
+        Ok(self.get_status_for_job(job))
     }
 
-    /// `get_mapreduce_client_status` returns a vector of `Job`s for a given client.
-    pub fn get_mapreduce_client_status(&self, client_id: &str) -> Vec<Job> {
+    /// `get_mapreduce_client_status` returns a vector of `MapReduceReport`s for a given client.
+    pub fn get_mapreduce_client_status(&self, client_id: &str) -> Vec<pb::MapReduceReport> {
         let state = self.state.lock().unwrap();
-        state.get_jobs(client_id)
+        let jobs = state.get_jobs(client_id);
+
+        let mut reports = Vec::new();
+        for job in jobs {
+            reports.push(self.get_status_for_job(job));
+        }
+        reports
     }
 
     pub fn get_most_recent_client_job_id(&self, client_id: &str) -> Result<String> {
diff --git a/master/src/scheduling/state.rs b/master/src/scheduling/state.rs
index fe6338c8..839ff721 100644
--- a/master/src/scheduling/state.rs
+++ b/master/src/scheduling/state.rs
@@ -129,9 +129,9 @@ impl State {
         Ok(())
     }
 
-    pub fn get_job(&self, job_id: &str) -> Result<Job> {
+    pub fn get_job(&self, job_id: &str) -> Result<&Job> {
         match self.scheduled_jobs.get(job_id) {
-            Some(scheduled_job) => Ok(scheduled_job.job.clone()),
+            Some(scheduled_job) => Ok(&scheduled_job.job),
             None => Err(format!("Job with ID {} is not found.", &job_id).into()),
         }
     }
@@ -145,11 +145,11 @@ impl State {
         jobs
     }
 
-    pub fn get_jobs(&self, client_id: &str) -> Vec<Job> {
+    pub fn get_jobs(&self, client_id: &str) -> Vec<&Job> {
         let mut jobs = Vec::new();
         for scheduled_job in self.scheduled_jobs.values() {
             if scheduled_job.job.client_id == client_id {
-                jobs.push(scheduled_job.job.clone());
+                jobs.push(&scheduled_job.job);
             }
         }
 
diff --git a/master/src/server/client_service.rs b/master/src/server/client_service.rs
index 592c763e..c0f0b731 100644
--- a/master/src/server/client_service.rs
+++ b/master/src/server/client_service.rs
@@ -72,41 +72,24 @@ impl grpc_pb::MapReduceService for ClientService {
         req: pb::MapReduceStatusRequest,
     ) -> SingleResponse<pb::MapReduceStatusResponse> {
         let mut response = pb::MapReduceStatusResponse::new();
-        let jobs: Vec<Job>;
+        let reports: Vec<pb::MapReduceReport>;
 
         if !req.client_id.is_empty() {
-            jobs = self.scheduler.get_mapreduce_client_status(&req.client_id);
+            reports = self.scheduler.get_mapreduce_client_status(&req.client_id);
         } else if !req.mapreduce_id.is_empty() {
             match self.scheduler.get_mapreduce_status(&req.mapreduce_id) {
                 Err(err) => {
                     output_error(&err.chain_err(|| "Error getting mapreduces status."));
                     return SingleResponse::err(Error::Other(JOB_RETRIEVAL_ERROR));
                 }
-                Ok(job) => jobs = vec![job],
+                Ok(report) => reports = vec![report],
             }
         } else {
             error!("Client requested job status without job id or client id.");
             return SingleResponse::err(Error::Other(MISSING_JOB_IDS));
         }
 
-        for job in jobs {
-            let mut report = pb::MapReduceReport::new();
-            report.mapreduce_id = job.id.clone();
-            report.status = job.status;
-            if job.status == pb::Status::FAILED {
-                report.failure_details = job.status_details.clone().unwrap_or_else(
-                    || "Unknown.".to_owned(),
-                );
-            }
-            report.scheduled_timestamp = job.time_requested.timestamp();
-            report.output_directory = job.output_directory.clone();
-            if let Some(time) = job.time_started {
-                report.started_timestamp = time.timestamp();
-            }
-            if let Some(time) = job.time_completed {
-                report.done_timestamp = time.timestamp();
-            }
-
+        for report in reports {
             response.reports.push(report);
         }
 
diff --git a/worker/src/operations/map.rs b/worker/src/operations/map.rs
index ea969127..33467071 100644
--- a/worker/src/operations/map.rs
+++ b/worker/src/operations/map.rs
@@ -119,22 +119,12 @@ pub fn perform_map(
         operation_state.initial_cpu_time = operation_handler::get_cpu_time();
     }
 
-    let input_files: Vec<String> = map_options
-        .get_input()
-        .get_input_locations()
-        .into_iter()
-        .map(|loc| loc.input_path.clone())
-        .collect();
-
     info!(
         "Performing map operation. mapper={} number of inputs={}",
        map_options.mapper_file_path,
         map_options.get_input().get_input_locations().len()
     );
 
-    debug!("Input files: {:?}", input_files);
-
     {
         let mut state = resources.operation_state.lock().unwrap();
         if state.current_task_id != "" {
@@ -172,10 +162,9 @@ fn combine_map_results(
             return Ok(());
         }
 
-        for map_result in operation_state.intermediate_map_results.clone() {
+        for map_result in operation_state.intermediate_map_results.drain(0..) {
             map_results_vec.push(map_result);
         }
-        operation_state.intermediate_map_results.clear();
     }
 
     for map_result in map_results_vec {

From 6a435fbe0443d425882588d04d56ad7d6e6925dd Mon Sep 17 00:00:00 2001
From: Darragh Griffin
Date: Wed, 4 Apr 2018 21:24:11 +0100
Subject: [PATCH 08/58] Clean up data abstraction layer initialization

---
 master/src/initialization/data_layer.rs | 77 ++++++++++++++-----------
 1 file changed, 43 insertions(+), 34 deletions(-)

diff --git a/master/src/initialization/data_layer.rs b/master/src/initialization/data_layer.rs
index d3e0178e..6fff8b44 100644
--- a/master/src/initialization/data_layer.rs
+++ b/master/src/initialization/data_layer.rs
@@ -16,52 +16,61 @@ const DEFAULT_S3_DIRECTORY: &str = "/tmp/cerberus/s3/";
 
 type AbstractionLayerArc = Arc<AbstractionLayer + Send + Sync>;
 
+fn initialize_dfs(
+    storage_location: &Option<&str>,
+) -> (AbstractionLayerArc, Arc<FileSystemManager>) {
+    let mut storage_dir = PathBuf::new();
+    storage_dir.push(storage_location.unwrap_or(DEFAULT_DFS_DIRECTORY));
+
+    let local_file_manager_arc = Arc::new(LocalFileManager::new(storage_dir));
+    let file_manager_arc = Arc::new(FileSystemManager::new());
+
+    let master_interface = Box::new(LocalFileSystemMasterInterface::new(
+        Arc::clone(&file_manager_arc),
+    ));
+
+    let dfs_abstraction_layer = Arc::new(DFSAbstractionLayer::new(
+        Arc::clone(&local_file_manager_arc),
+        master_interface,
+    ));
+
+    (dfs_abstraction_layer, file_manager_arc)
+}
+
+fn initialize_s3(storage_location: &Option<&str>, bucket: &str) -> Result<AbstractionLayerArc> {
+    let mut storage_dir = PathBuf::new();
+    storage_dir.push(storage_location.unwrap_or(DEFAULT_S3_DIRECTORY));
+
+    let local_file_manager_arc = Arc::new(LocalFileManager::new(storage_dir));
+
+    let s3_layer =
+        AmazonS3AbstractionLayer::new(bucket.into(), Arc::clone(&local_file_manager_arc))
+            .chain_err(|| "Unable to create AmazonS3 abstraction layer")?;
+
+    Ok(Arc::new(s3_layer))
+}
+
 pub fn get_data_abstraction_layer(
     matches: &ArgMatches,
     worker_info_receiver: Receiver,
 ) -> Result<(AbstractionLayerArc, Option<Arc<FileSystemManager>>)> {
     let data_abstraction_layer: Arc<AbstractionLayer + Send + Sync>;
-    let filesystem_manager: Option<Arc<FileSystemManager>>;
+    let mut filesystem_manager: Option<Arc<FileSystemManager>> = None;
 
-    let nfs_path = matches.value_of("nfs");
-    let dfs = matches.is_present("dfs");
-    let s3 = matches.value_of("s3");
     let storage_location = matches.value_of("storage-location");
 
-    if let Some(path) = nfs_path {
+    if let Some(path) = matches.value_of("nfs") {
         data_abstraction_layer = Arc::new(NFSAbstractionLayer::new(Path::new(path)));
-        filesystem_manager = None;
-    } else if dfs {
-        let mut storage_dir = PathBuf::new();
-        storage_dir.push(storage_location.unwrap_or(DEFAULT_DFS_DIRECTORY));
-
-        let local_file_manager_arc = Arc::new(LocalFileManager::new(storage_dir));
-        let file_manager_arc = Arc::new(FileSystemManager::new());
-
-        let master_interface = Box::new(LocalFileSystemMasterInterface::new(
-            Arc::clone(&file_manager_arc),
-        ));
-
-        data_abstraction_layer = Arc::new(DFSAbstractionLayer::new(
-            Arc::clone(&local_file_manager_arc),
-            master_interface,
-        ));
+    } else if matches.is_present("dfs") {
+        let (abstraction_layer, file_manager_arc) = initialize_dfs(&storage_location);
+        data_abstraction_layer = abstraction_layer;
         filesystem_manager = Some(file_manager_arc);
-    } else if let Some(bucket) = s3 {
-        let mut storage_dir = PathBuf::new();
-        storage_dir.push(storage_location.unwrap_or(DEFAULT_S3_DIRECTORY));
-
-        let local_file_manager_arc = Arc::new(LocalFileManager::new(storage_dir));
-
-        let s3_layer =
-            AmazonS3AbstractionLayer::new(bucket.into(), Arc::clone(&local_file_manager_arc))
-                .chain_err(|| "Unable to create AmazonS3 abstraction layer")?;
-        data_abstraction_layer = Arc::new(s3_layer);
-
-        filesystem_manager = None;
+    } else if let Some(bucket) = matches.value_of("s3") {
+        data_abstraction_layer = initialize_s3(&storage_location, bucket).chain_err(
+            || "Error initializing S3 abstraction layer",
+        )?;
     } else {
         data_abstraction_layer = Arc::new(NullAbstractionLayer::new());
-        filesystem_manager = None;
     }
 
     run_worker_info_upate_loop(&filesystem_manager, worker_info_receiver);

From 513642ab8002f254d863645c963748ae844ccb33 Mon Sep 17 00:00:00 2001
From: Conor Griffin
Date: Wed, 4 Apr 2018 21:26:37 +0100
Subject: [PATCH 09/58] Change printlns that should use logging

---
 master/src/dashboard/server.rs        | 2 +-
 master/src/scheduling/scheduler.rs    | 2 +-
 master/src/server/client_service.rs   | 2 +-
 master/src/worker_management/state.rs | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/master/src/dashboard/server.rs b/master/src/dashboard/server.rs
index 73ad20f0..7333cd28 100644
--- a/master/src/dashboard/server.rs
+++ b/master/src/dashboard/server.rs
@@ -160,7 +160,7 @@ impl ApiHandler {
             output_directory: output_path,
             validate_paths: true,
 
-            priority: priority,
+            priority,
         };
 
         let job = Job::new(job_options, &self.data_abstraction_layer_arc)
diff --git a/master/src/scheduling/scheduler.rs b/master/src/scheduling/scheduler.rs
index 7d8c1dbd..062d2a3f 100644
--- a/master/src/scheduling/scheduler.rs
+++ b/master/src/scheduling/scheduler.rs
@@ -190,7 +190,7 @@ impl Scheduler {
             .cancel_workers_tasks(workers)
             .chain_err(|| "Unable to cancel task on workers")?;
 
-        println!("Succesfully cancelled job {}", job_id);
+        info!("Successfully cancelled job {}", job_id);
 
         Ok(())
     }
diff --git a/master/src/server/client_service.rs b/master/src/server/client_service.rs
index 592c763e..ad6e6143 100644
--- a/master/src/server/client_service.rs
+++ b/master/src/server/client_service.rs
@@ -134,7 +134,7 @@ impl grpc_pb::MapReduceService for ClientService {
         };
         response.set_mapreduce_id(job_id.clone());
 
-        println!("Attempting to cancel MapReduce: {}", job_id);
+        info!("Attempting to cancel MapReduce: {}", job_id);
 
         let result = self.scheduler.cancel_job(job_id.as_ref());
         if let Err(err) = result {
             output_error(&err.chain_err(|| "Error cancelling MapReduce"));
diff --git a/master/src/worker_management/state.rs b/master/src/worker_management/state.rs
index 6c5b235a..92ea258b 100644
--- a/master/src/worker_management/state.rs
+++ b/master/src/worker_management/state.rs
@@ -506,7 +506,7 @@ impl State {
 
         let scheduled_task_id: String = match task_option {
             Some(priority_task) => {
-                println!(
+                info!(
                     "Popped off task {} with priority {}",
                     priority_task.id,
                     priority_task.priority

From 47d47a527c13f082240a299f0fbaccc2c632b7b3 Mon Sep 17 00:00:00 2001
From: Darragh Griffin
Date: Wed, 4 Apr 2018 23:10:52 +0100
Subject: [PATCH 10/58] Add submodule for worker GRPC communication

---
 worker/src/{ => communication}/master_interface.rs | 0
 worker/src/communication/mod.rs                    | 5 +++++
 worker/src/{ => communication}/worker_interface.rs | 0
worker/src/initialization/register_worker.rs | 2 +- worker/src/initialization/worker_resources.rs | 2 +- worker/src/main.rs | 5 ++--- worker/src/operations/map.rs | 2 +- worker/src/operations/operation_handler.rs | 2 +- worker/src/operations/reduce.rs | 3 +-- 9 files changed, 12 insertions(+), 9 deletions(-) rename worker/src/{ => communication}/master_interface.rs (100%) create mode 100644 worker/src/communication/mod.rs rename worker/src/{ => communication}/worker_interface.rs (100%) diff --git a/worker/src/master_interface.rs b/worker/src/communication/master_interface.rs similarity index 100% rename from worker/src/master_interface.rs rename to worker/src/communication/master_interface.rs diff --git a/worker/src/communication/mod.rs b/worker/src/communication/mod.rs new file mode 100644 index 00000000..1eb4239e --- /dev/null +++ b/worker/src/communication/mod.rs @@ -0,0 +1,5 @@ +mod master_interface; +mod worker_interface; + +pub use self::master_interface::MasterInterface; +pub use self::worker_interface::WorkerInterface; diff --git a/worker/src/worker_interface.rs b/worker/src/communication/worker_interface.rs similarity index 100% rename from worker/src/worker_interface.rs rename to worker/src/communication/worker_interface.rs diff --git a/worker/src/initialization/register_worker.rs b/worker/src/initialization/register_worker.rs index 43837e85..46ae6fb3 100644 --- a/worker/src/initialization/register_worker.rs +++ b/worker/src/initialization/register_worker.rs @@ -2,7 +2,7 @@ use std::{thread, time}; use std::net::SocketAddr; use errors::*; -use master_interface::MasterInterface; +use communication::MasterInterface; const WORKER_REGISTRATION_RETRIES: u16 = 5; const WORKER_REGISTRATION_RETRY_WAIT_DURATION_MS: u64 = 1000; diff --git a/worker/src/initialization/worker_resources.rs b/worker/src/initialization/worker_resources.rs index 3014a77f..d99a935d 100644 --- a/worker/src/initialization/worker_resources.rs +++ b/worker/src/initialization/worker_resources.rs @@ -7,7 +7,7 @@ use clap::ArgMatches; use errors::*; use initialization::{get_data_abstraction_layer, initialize_grpc_server, initialize_state_handler}; -use master_interface::MasterInterface; +use communication::MasterInterface; use operations::OperationHandler; use server::Server; use state::StateHandler; diff --git a/worker/src/main.rs b/worker/src/main.rs index 3e8cb052..c31ba92f 100644 --- a/worker/src/main.rs +++ b/worker/src/main.rs @@ -37,14 +37,13 @@ mod errors { } } +mod communication; mod initialization; mod main_loop; -mod master_interface; mod operations; +mod parser; mod server; mod state; -mod parser; -mod worker_interface; use std::net::SocketAddr; use std::str::FromStr; diff --git a/worker/src/operations/map.rs b/worker/src/operations/map.rs index ea969127..a52dcde4 100644 --- a/worker/src/operations/map.rs +++ b/worker/src/operations/map.rs @@ -12,7 +12,7 @@ use uuid::Uuid; use errors::*; use cerberus_proto::worker as pb; -use master_interface::MasterInterface; +use communication::MasterInterface; use super::combine; use super::io; use super::operation_handler; diff --git a/worker/src/operations/operation_handler.rs b/worker/src/operations/operation_handler.rs index ad7b5531..55e72cbb 100644 --- a/worker/src/operations/operation_handler.rs +++ b/worker/src/operations/operation_handler.rs @@ -10,7 +10,7 @@ use uuid::Uuid; use cerberus_proto::worker as pb; use errors::*; -use master_interface::MasterInterface; +use communication::MasterInterface; use util::data_layer::AbstractionLayer; use super::map; use 
super::reduce;
diff --git a/worker/src/operations/reduce.rs b/worker/src/operations/reduce.rs
index 43eda8ba..b76e2d07 100644
--- a/worker/src/operations/reduce.rs
+++ b/worker/src/operations/reduce.rs
@@ -8,14 +8,13 @@ use std::thread;
 use serde_json;
 
 use errors::*;
-use master_interface::MasterInterface;
+use communication::{MasterInterface, WorkerInterface};
 use super::io;
 use super::operation_handler;
 use super::operation_handler::OperationResources;
 use super::state::OperationState;
 use util::output_error;
 use util::data_layer::AbstractionLayer;
-use worker_interface::WorkerInterface;
 
 use cerberus_proto::worker as pb;

From 6ba8ca814139a1d0ba430854cf423b1eb6747a61 Mon Sep 17 00:00:00 2001
From: Conor Griffin
Date: Thu, 5 Apr 2018 14:32:17 +0100
Subject: [PATCH 11/58] Reassign a worker that reports status as available

---
 master/src/common/worker.rs           |  2 --
 master/src/worker_management/state.rs | 30 ++++++++++-----------------
 2 files changed, 11 insertions(+), 21 deletions(-)

diff --git a/master/src/common/worker.rs b/master/src/common/worker.rs
index 9e208aa5..0bda3a7b 100644
--- a/master/src/common/worker.rs
+++ b/master/src/common/worker.rs
@@ -36,7 +36,6 @@ pub struct Worker {
     pub status_last_updated: DateTime<Utc>,
 
     pub current_task_id: String,
-    pub last_cancelled_task_id: Option<String>,
     pub worker_id: String,
 
     pub task_assignments_failed: u16,
@@ -61,7 +60,6 @@ impl Worker {
             status_last_updated: Utc::now(),
 
             current_task_id: String::new(),
-            last_cancelled_task_id: None,
             worker_id,
 
             task_assignments_failed: 0,
diff --git a/master/src/worker_management/state.rs b/master/src/worker_management/state.rs
index 92ea258b..6ca0a32f 100644
--- a/master/src/worker_management/state.rs
+++ b/master/src/worker_management/state.rs
@@ -147,6 +147,17 @@ impl State {
         worker.operation_status = operation_status;
         worker.status_last_updated = Utc::now();
 
+        if worker_status == pb::WorkerStatus::AVAILABLE && !worker.current_task_id.is_empty() {
+            if let Some(assigned_task) = self.tasks.get_mut(&worker.current_task_id) {
+                self.priority_task_queue.push(PriorityTask::new(
+                    worker.current_task_id.clone(),
+                    REQUEUED_TASK_PRIORITY * assigned_task.job_priority,
+                ));
+                assigned_task.assigned_worker_id = String::new();
+            }
+            worker.current_task_id = String::new();
+        }
+
         Ok(())
     }
 
@@ -166,17 +177,6 @@ impl State {
             return Err("Task id does not match expected task id.".into());
         }
 
-        let worker = self.workers.get(&reduce_result.worker_id).chain_err(|| {
-            format!("Worker with ID {} not found.", reduce_result.worker_id)
-        })?;
-
-        if let Some(task_id) = worker.last_cancelled_task_id.clone() {
-            if task_id == reduce_result.task_id {
-                scheduled_task.status = TaskStatus::Cancelled;
-                return Ok(scheduled_task);
-            }
-        }
-
         scheduled_task.status = TaskStatus::Complete;
         scheduled_task.time_completed = Some(Utc::now());
         scheduled_task.cpu_time = reduce_result.get_cpu_time();
@@ -208,13 +208,6 @@ impl State {
             format!("Worker with ID {} not found.", map_result.worker_id)
         })?;
 
-        if let Some(task_id) = worker.last_cancelled_task_id.clone() {
-            if task_id == map_result.task_id {
-                scheduled_task.status = TaskStatus::Cancelled;
-                return Ok(scheduled_task);
-            }
-        }
-
         for (partition, output_file) in map_result.get_map_results() {
             scheduled_task.map_output_files.insert(
                 *partition,
@@ -529,7 +522,6 @@ impl State {
         })?;
 
         let previous_task_id = worker.current_task_id.clone();
-        worker.last_cancelled_task_id = Some(worker.current_task_id.clone());
         worker.current_task_id = String::new();
 
         self.tasks.remove(&previous_task_id);

From fd03b140c7300e6b74d24acc4b6d97653d1c8a9e Mon Sep 17 00:00:00 2001
From: Conor Griffin
Date: Thu, 5 Apr 2018 15:30:37 +0100
Subject: [PATCH 12/58] Change partition function to no longer use emitter

---
 libcerberus/examples/end-to-end.rs     | 37 ++++++--------
 libcerberus/examples/rating-by-year.rs | 23 +++------
 libcerberus/src/emitter.rs             | 14 ------
 libcerberus/src/lib.rs                 |  4 +-
 libcerberus/src/partition.rs           | 57 +++++++++------------
 libcerberus/src/runner.rs              | 24 +++++----
 libcerberus/src/serialise.rs           | 68 +-------------------------
 7 files changed, 64 insertions(+), 163 deletions(-)

diff --git a/libcerberus/examples/end-to-end.rs b/libcerberus/examples/end-to-end.rs
index 93101c2a..710ba08d 100644
--- a/libcerberus/examples/end-to-end.rs
+++ b/libcerberus/examples/end-to-end.rs
@@ -35,29 +35,22 @@ impl Reduce for TestReducer {
 struct TestPartitioner;
 impl Partition for TestPartitioner {
-    fn partition(&self, input: PartitionInputPairs, mut emitter: E) -> Result<()>
-    where
-        E: EmitPartitionedIntermediate,
-    {
-        for (key, value) in input.pairs {
-            let first_char = key.chars().nth(0).chain_err(
-                || "Cannot partition key of empty string.",
-            )?;
-            let partition = {
-                if first_char.is_lowercase() {
-                    if first_char > 'm' { 1 } else { 0 }
-                } else if first_char > 'M' {
-                    1
-                } else {
-                    0
-                }
-            };
+    fn partition(&self, input: PartitionInputKV) -> Result<u64> {
+        let key = input.key;
+        let first_char = key.chars().nth(0).chain_err(
+            || "Cannot partition key of empty string.",
+        )?;
+        let partition = {
+            if first_char.is_lowercase() {
+                if first_char > 'm' { 1 } else { 0 }
+            } else if first_char > 'M' {
+                1
+            } else {
+                0
+            }
+        };
 
-            emitter.emit(partition, key, value).chain_err(
-                || "Error partitioning map output.",
-            )?;
-        }
-        Ok(())
+        Ok(partition)
     }
 }
 
diff --git a/libcerberus/examples/rating-by-year.rs b/libcerberus/examples/rating-by-year.rs
index b17bc888..3e6f6896 100644
--- a/libcerberus/examples/rating-by-year.rs
+++ b/libcerberus/examples/rating-by-year.rs
@@ -63,21 +63,14 @@ impl Map for RatingByYearMapper {
 struct RatingByYearPartitioner;
 impl Partition for RatingByYearPartitioner {
-    fn partition(&self, input: PartitionInputPairs, mut emitter: E) -> Result<()>
-    where
-        E: EmitPartitionedIntermediate,
-    {
-        for (key, value) in input.pairs {
-            let year_str = key[(key.len() - 5)..(key.len() - 1)].to_owned();
-            let partition: u64 = year_str.parse().chain_err(|| {
-                format!("Error getting year from movie title {}, {}", key, year_str)
-            })?;
-
-            emitter.emit(partition, key, value).chain_err(
-                || "Error partitioning map output.",
-            )?;
-        }
-        Ok(())
+    fn partition(&self, input: PartitionInputKV) -> Result<u64> {
+        let key = input.key;
+        let year_str = key[(key.len() - 5)..(key.len() - 1)].to_owned();
+        let partition: u64 = year_str.parse().chain_err(|| {
+            format!("Error getting year from movie title {}, {}", key, year_str)
+        })?;
+
+        Ok(partition)
     }
 }
 
diff --git a/libcerberus/src/emitter.rs b/libcerberus/src/emitter.rs
index 6acfc625..dcf9024d 100644
--- a/libcerberus/src/emitter.rs
+++ b/libcerberus/src/emitter.rs
@@ -14,20 +14,6 @@ pub trait EmitIntermediate {
     fn emit(&mut self, key: K, value: V) -> Result<()>;
 }
 
-/// The `EmitPartitionedIntermediate` trait specifies structs which can send partitioned key-value
-/// pairs to an in-memory data structure.
-///
-/// `EmitPartitionedIntermediate` is intended for use by the `MapPartitioner` during the key
-/// partitioning phase, for emitting key value pairs in their coresponding partition. Since these
-/// in-memory data structures will eventually be serialised to disk, they must implement the
-/// `serde::Serialize` trait.
-pub trait EmitPartitionedIntermediate<K, V> {
-    /// Takes ownership of a key-value pair and stores it in a sink based on its partition.
-    ///
-    /// Returns an empty `Result` used for error handling.
-    fn emit(&mut self, partition: u64, key: K, value: V) -> Result<()>;
-}
-
 /// The `EmitFinal` trait specifies structs which can send values to an in-memory data structure.
 ///
 /// `EmitFinal` is intended for use in `Reduce` operations, for emitting an intermediate key-value
diff --git a/libcerberus/src/lib.rs b/libcerberus/src/lib.rs
index 22a7fc12..6145c18c 100644
--- a/libcerberus/src/lib.rs
+++ b/libcerberus/src/lib.rs
@@ -39,10 +39,10 @@ pub mod serialise;
 
 pub use combiner::Combine;
 pub use errors::*;
-pub use emitter::{EmitIntermediate, EmitPartitionedIntermediate, EmitFinal};
+pub use emitter::{EmitIntermediate, EmitFinal};
 pub use intermediate::IntermediateInputKV;
 pub use mapper::{Map, MapInputKV};
-pub use partition::{HashPartitioner, Partition, PartitionInputPairs};
+pub use partition::{HashPartitioner, Partition, PartitionInputKV};
 pub use reducer::Reduce;
 pub use registry::{UserImplRegistry, UserImplRegistryBuilder};
 pub use runner::*;
diff --git a/libcerberus/src/partition.rs b/libcerberus/src/partition.rs
index de145183..75ea7e7d 100644
--- a/libcerberus/src/partition.rs
+++ b/libcerberus/src/partition.rs
@@ -3,29 +3,30 @@ use std::hash::{Hash, Hasher};
 
 use serde::Serialize;
 
-use emitter::EmitPartitionedIntermediate;
 use errors::*;
 
-/// The `PartitionInputPairs` is a struct for passing input data to a `Partition`.
+/// The `PartitionInputKV` is a struct for passing input data to a `Partition`.
 ///
-/// `PartitionInputPairs` is a thin wrapper around a `Vec<(Key, Value)>`, used for creating a clearer API.
-/// It can be constructed normally or using `PartitionInputPairs::new()`.
+/// `PartitionInputKV` is a thin wrapper around a `(Key, Value)`,
+/// used for creating a clearer API.
+/// It can be constructed normally or using `PartitionInputKV::new()`.
-#[derive(Debug, Default, Deserialize, PartialEq)]
-pub struct PartitionInputPairs<K, V>
+#[derive(Debug, PartialEq)]
+pub struct PartitionInputKV<'a, K, V>
 where
-    K: Default + Serialize,
-    V: Default + Serialize,
+    K: Default + Serialize + 'a,
+    V: Default + Serialize + 'a,
 {
-    pub pairs: Vec<(K, V)>,
+    pub key: &'a K,
+    pub value: &'a V,
 }
 
-impl<K, V> PartitionInputPairs<K, V>
+impl<'a, K, V> PartitionInputKV<'a, K, V>
 where
-    K: Default + Serialize,
-    V: Default + Serialize,
+    K: Default + Serialize + 'a,
+    V: Default + Serialize + 'a,
 {
-    pub fn new(pairs: Vec<(K, V)>) -> Self {
-        PartitionInputPairs { pairs }
+    pub fn new(key: &'a K, value: &'a V) -> Self {
+        PartitionInputKV { key, value }
     }
 }
 
@@ -33,21 +34,17 @@ where
 ///
 /// # Arguments
 ///
-/// * `input` - A `Vec` containing the output pairs of a map operation.
-/// * `emitter` - A struct implementing the `EmitPartitionedIntermediate` trait, provided by the map runner.
+/// * `input` - A `PartitionInputKV` containing an output pair of a map operation.
 ///
 /// # Outputs
 ///
-/// An empty result used for returning an error. Outputs of the map operation are sent out through
-/// the `emitter`.
+/// A `Result` containing the output partition for the given key and value.
 pub trait Partition<K, V>
 where
     K: Default + Serialize,
     V: Default + Serialize,
 {
-    fn partition<E>(&self, input: PartitionInputPairs<K, V>, emitter: E) -> Result<()>
-    where
-        E: EmitPartitionedIntermediate<K, V>;
+    fn partition(&self, input: PartitionInputKV<K, V>) -> Result<u64>;
 }
 
 /// `HashPartitioner` implements the `Partition` for any Key that can be hashed.
@@ -72,18 +69,10 @@ where
     K: Default + Serialize + Hash,
     V: Default + Serialize,
 {
-    fn partition<E>(&self, input: PartitionInputPairs<K, V>, mut emitter: E) -> Result<()>
-    where
-        E: EmitPartitionedIntermediate<K, V>,
-    {
-        for (key, value) in input.pairs {
-            let hash: u64 = self.calculate_hash(&key);
-            let partition_count: u64 = self.partition_count;
-            let partition = hash % partition_count;
-            emitter.emit(partition, key, value).chain_err(
-                || "Error partitioning map output.",
-            )?;
-        }
-        Ok(())
+    fn partition(&self, input: PartitionInputKV<K, V>) -> Result<u64> {
+        let hash: u64 = self.calculate_hash(input.key);
+        let partition_count: u64 = self.partition_count;
+        let partition = hash % partition_count;
+        Ok(partition)
     }
 }
diff --git a/libcerberus/src/runner.rs b/libcerberus/src/runner.rs
index 68cfff06..576db743 100644
--- a/libcerberus/src/runner.rs
+++ b/libcerberus/src/runner.rs
@@ -14,11 +14,11 @@ use errors::*;
 use intermediate::IntermediateInputKV;
 use io::*;
 use mapper::Map;
-use partition::{Partition, PartitionInputPairs};
+use partition::{Partition, PartitionInputKV};
 use reducer::Reduce;
 use registry::UserImplRegistry;
-use serialise::{FinalOutputObject, FinalOutputObjectEmitter, IntermediateOutputObject,
-                IntermediateOutputObjectEmitter, VecEmitter};
+use serialise::{FinalOutputObject, FinalOutputObjectEmitter, IntermediateOutputObject, VecEmitter,
+                IntermediateOutputPair};
 use super::VERSION;
 
 /// `parse_command_line` uses `clap` to parse the command-line arguments passed to the payload.
@@ -115,12 +115,18 @@ where
 
     let mut output_object = IntermediateOutputObject::::default();
 
-    partitioner
-        .partition(
-            PartitionInputPairs::new(pairs_vec),
-            IntermediateOutputObjectEmitter::new(&mut output_object),
-        )
-        .chain_err(|| "Error partitioning map output")?;
+    for pair in pairs_vec.drain(0..) {
+        let partition = partitioner
+            .partition(PartitionInputKV::new(&pair.0, &pair.1))
+            .chain_err(|| "Error partitioning map output")?;
+        let output_array = output_object.partitions.entry(partition).or_insert_with(
+            Default::default,
+        );
+        output_array.push(IntermediateOutputPair {
+            key: pair.0,
+            value: pair.1,
+        });
+    }
 
     write_intermediate_output(&mut sink, &output_object)
         .chain_err(|| "Error writing map output to stdout.")?;
diff --git a/libcerberus/src/serialise.rs b/libcerberus/src/serialise.rs
index d9fdf4fe..d0f01987 100644
--- a/libcerberus/src/serialise.rs
+++ b/libcerberus/src/serialise.rs
@@ -2,7 +2,7 @@ use std::collections::HashMap;
 
 use serde::Serialize;
 
-use emitter::{EmitFinal, EmitPartitionedIntermediate};
+use emitter::EmitFinal;
 use errors::*;
 
 /// `IntermediateOutputPair` is a struct representing an intermediate key-value pair as outputted
@@ -68,48 +68,6 @@ where
     }
 }
 
-/// A struct implementing `EmitPartitionedIntermediate`
-/// which emits to an `IntermediateOutputObject`.
-pub struct IntermediateOutputObjectEmitter<'a, K, V>
-where
-    K: Default + Serialize + 'a,
-    V: Default + Serialize + 'a,
-{
-    sink: &'a mut IntermediateOutputObject<K, V>,
-}
-
-impl<'a, K, V> IntermediateOutputObjectEmitter<'a, K, V>
-where
-    K: Default + Serialize,
-    V: Default + Serialize,
-{
-    /// Constructs a new `IntermediateOutputObjectEmitter` with a mutable reference to a given
-    /// `IntermediateOutputObject`.
-    ///
-    /// # Arguments
-    ///
-    /// * `sink` - A mutable reference to the `IntermediateOutputObject`
-    ///            to receive the emitted values.
-    pub fn new(sink: &'a mut IntermediateOutputObject<K, V>) -> Self {
-        IntermediateOutputObjectEmitter { sink }
-    }
-}
-
-impl<'a, K, V> EmitPartitionedIntermediate<K, V> for IntermediateOutputObjectEmitter<'a, K, V>
-where
-    K: Default + Serialize,
-    V: Default + Serialize,
-{
-    fn emit(&mut self, partition: u64, key: K, value: V) -> Result<()> {
-        let output_array = self.sink.partitions.entry(partition).or_insert_with(Default::default);
-        output_array.push(IntermediateOutputPair {
-            key,
-            value,
-        });
-        Ok(())
-    }
-}
-
 /// A struct implementing `EmitFinal` which emits to a `FinalOutputObject`.
 pub struct FinalOutputObjectEmitter<'a, V: Default + Serialize + 'a> {
     sink: &'a mut FinalOutputObject<V>,
 }
@@ -194,30 +152,6 @@ mod tests {
         assert_eq!(expected_json_string, json_string);
     }
 
-    #[test]
-    fn intermediate_output_emitter_works() {
-        let mut output = IntermediateOutputObject::default();
-        let mut partitions = HashMap::new();
-        partitions.insert(
-            0,
-            vec![
-                IntermediateOutputPair {
-                    key: "foo",
-                    value: "bar",
-                },
-            ],
-        );
-
-        let expected_output = IntermediateOutputObject { partitions };
-
-        {
-            let mut emitter = IntermediateOutputObjectEmitter::new(&mut output);
-            emitter.emit(0, "foo", "bar").unwrap();
-        }
-
-        assert_eq!(expected_output, output);
-    }
-
     #[test]
     fn final_output_emitter_works() {
         let mut output = FinalOutputObject::default();

From e3302a07c2d6c2b5954662263def938746eb949d Mon Sep 17 00:00:00 2001
From: Conor Griffin
Date: Thu, 5 Apr 2018 15:48:03 +0100
Subject: [PATCH 13/58] Change test dashboard address

---
 tests/dfs_integration.sh       | 2 ++
 tests/distributed_grep_test.sh | 4 +++-
 tests/integration.sh           | 4 +++-
 tests/state_saving.sh          | 8 ++++++--
 4 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/tests/dfs_integration.sh b/tests/dfs_integration.sh
index 79cae0d9..11680ab3 100755
--- a/tests/dfs_integration.sh
+++ b/tests/dfs_integration.sh
@@ -17,6 +17,7 @@ mkdir -p dfs-integration-test/worker3-state
 echo "Launching Master."
 ./master --dfs --fresh --state-location="$PWD"/dfs-integration-test/master-state \
     --storage-location="$PWD"/dfs-integration-test/master-state \
+    --dashboard-address="127.0.0.1:3002" \
     --port=10011 > dfs-integration-test/logs/master.log 2>&1 &
 master_pid=$!
 
@@ -67,6 +68,7 @@ sleep 1
 
 echo "Relaunching Master."
 ./master --dfs --state-location="$PWD"/dfs-integration-test/master-state \
+    --dashboard-address="127.0.0.1:3002" \
     --storage-location="$PWD"/dfs-integration-test/master-state \
     --port=10011 > dfs-integration-test/logs/master.log 2>&1 &
 master_pid=$!
diff --git a/tests/distributed_grep_test.sh b/tests/distributed_grep_test.sh
index 82b60005..e737dde4 100755
--- a/tests/distributed_grep_test.sh
+++ b/tests/distributed_grep_test.sh
@@ -11,7 +11,9 @@ mkdir -p distributed-grep-test/output
 
 # Launch the master.
 echo "Launching Master."
-./master --fresh --nodump --port=10008 > distributed-grep-test/logs/master.log 2>&1 & +./master --fresh --nodump \ + --dashboard-address="127.0.0.1:3003" \ + --port=10008 > distributed-grep-test/logs/master.log 2>&1 & master_pid=$! local_ip="127.0.0.1" diff --git a/tests/integration.sh b/tests/integration.sh index c445b010..b7a866dc 100755 --- a/tests/integration.sh +++ b/tests/integration.sh @@ -11,7 +11,9 @@ mkdir -p integration-test/output # Launch the master. echo "Launching Master." -./master --fresh --nodump --port=10008 > integration-test/logs/master.log 2>&1 & +./master --fresh --nodump \ + --dashboard-address="127.0.0.1:3004" \ + --port=10008 > integration-test/logs/master.log 2>&1 & master_pid=$! local_ip="127.0.0.1" diff --git a/tests/state_saving.sh b/tests/state_saving.sh index 5d85f34b..777d5096 100755 --- a/tests/state_saving.sh +++ b/tests/state_saving.sh @@ -12,7 +12,9 @@ mkdir -p state-integration-test/state # Launch the master. echo "Launching Master." -./master --fresh --state-location="$PWD"/state-integration-test/state --port=10009 > state-integration-test/logs/master.log 2>&1 & +./master --fresh --state-location="$PWD"/state-integration-test/state \ + --dashboard-address="127.0.0.1:3005" \ + --port=10009 > state-integration-test/logs/master.log 2>&1 & master_pid=$! local_ip="127.0.0.1" @@ -44,7 +46,9 @@ $(kill -9 ${master_pid}); sleep 1 echo "Relaunching Master." -./master --state-location="$PWD"/state-integration-test/state --port=10009 > state-integration-test/logs/master2.log 2>&1 & +./master --state-location="$PWD"/state-integration-test/state \ + --dashboard-address="127.0.0.1:3005" \ + --port=10009 > state-integration-test/logs/master2.log 2>&1 & master_pid=$! sleep 1 From 514583f8f82fc22c1aac069985eb5c891ef92f62 Mon Sep 17 00:00:00 2001 From: Darragh Griffin Date: Thu, 5 Apr 2018 23:55:40 +0100 Subject: [PATCH 14/58] Fetch map reduce input files in parallel --- Cargo.lock | 1 + worker/Cargo.toml | 1 + .../intermediate_data_fetching.rs | 42 +++++++++++++++++++ worker/src/communication/mod.rs | 2 + worker/src/main.rs | 1 + worker/src/operations/reduce.rs | 18 ++++---- 6 files changed, 56 insertions(+), 9 deletions(-) create mode 100644 worker/src/communication/intermediate_data_fetching.rs diff --git a/Cargo.lock b/Cargo.lock index 38c4ae14..b6dbd368 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1841,6 +1841,7 @@ dependencies = [ "clap 2.26.2 (registry+https://github.com/rust-lang/crates.io-index)", "error-chain 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", "futures 0.1.18 (registry+https://github.com/rust-lang/crates.io-index)", + "futures-cpupool 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", "grpc 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.39 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/worker/Cargo.toml b/worker/Cargo.toml index ccd98cf2..335239bc 100644 --- a/worker/Cargo.toml +++ b/worker/Cargo.toml @@ -9,6 +9,7 @@ clap = "~2.26" protobuf = "1.4.1" error-chain = "0.11.0" futures = "0.1" +futures-cpupool = "0.1" grpc = "0.2.1" log = "0.3.8" libc = "0.2.33" diff --git a/worker/src/communication/intermediate_data_fetching.rs b/worker/src/communication/intermediate_data_fetching.rs new file mode 100644 index 00000000..aa552e32 --- /dev/null +++ b/worker/src/communication/intermediate_data_fetching.rs @@ -0,0 +1,42 @@ +use futures::Future; +use futures::future; +use futures_cpupool::CpuPool; + +use 
errors::*;
+use super::worker_interface::WorkerInterface;
+use operations::OperationResources;
+
+pub fn fetch_reduce_inputs(
+    input_files: Vec<String>,
+    output_uuid: String,
+    resources: OperationResources,
+    task_id: String,
+) -> Result<Vec<String>> {
+    let cpu_pool = CpuPool::new_num_cpus();
+    let mut input_futures = Vec::new();
+
+    for reduce_input_file in input_files {
+        let output_uuid = output_uuid.clone();
+        let resources = resources.clone();
+        let task_id = task_id.clone();
+
+        let input_future = cpu_pool.spawn_fn(move || {
+            let reduce_input_result =
+                WorkerInterface::get_data(reduce_input_file, &output_uuid, &resources, &task_id)
+                    .chain_err(|| "Couldn't read reduce input file");
+
+            match reduce_input_result {
+                Ok(input) => future::ok(input),
+                Err(err) => future::err(err),
+            }
+
+        });
+
+        input_futures.push(input_future);
+    }
+
+    let results_future = future::join_all(input_futures);
+    results_future.wait().chain_err(
+        || "Error running fetch reduce input futures",
+    )
+}
diff --git a/worker/src/communication/mod.rs b/worker/src/communication/mod.rs
index 1eb4239e..324e14cc 100644
--- a/worker/src/communication/mod.rs
+++ b/worker/src/communication/mod.rs
@@ -1,5 +1,7 @@
+mod intermediate_data_fetching;
 mod master_interface;
 mod worker_interface;
 
+pub use self::intermediate_data_fetching::fetch_reduce_inputs;
 pub use self::master_interface::MasterInterface;
 pub use self::worker_interface::WorkerInterface;
 
diff --git a/worker/src/main.rs b/worker/src/main.rs
index c31ba92f..a0c81bb2 100644
--- a/worker/src/main.rs
+++ b/worker/src/main.rs
@@ -7,6 +7,7 @@ extern crate clap;
 #[macro_use]
 extern crate error_chain;
 extern crate futures;
+extern crate futures_cpupool;
 extern crate grpc;
 extern crate libc;
 #[macro_use]
diff --git a/worker/src/operations/reduce.rs b/worker/src/operations/reduce.rs
index b76e2d07..2e115eac 100644
--- a/worker/src/operations/reduce.rs
+++ b/worker/src/operations/reduce.rs
@@ -8,7 +8,8 @@ use std::thread;
 use serde_json;
 
 use errors::*;
-use communication::{MasterInterface, WorkerInterface};
+use communication;
+use communication::MasterInterface;
 use super::io;
 use super::operation_handler;
 use super::operation_handler::OperationResources;
@@ -133,15 +134,14 @@ fn create_reduce_input(
 ) -> Result<HashMap<String, Vec<serde_json::Value>>> {
     let mut reduce_map: HashMap<String, Vec<serde_json::Value>> = HashMap::new();
 
-    for reduce_input_file in reduce_request.get_input_file_paths() {
-        // TODO: Run these operations in parallel as networks can be slow
-        let reduce_input = WorkerInterface::get_data(
-            reduce_input_file,
-            output_uuid,
-            resources,
-            &reduce_request.task_id,
-        ).chain_err(|| "Couldn't read reduce input file")?;
+    let reduce_inputs = communication::fetch_reduce_inputs(
+        reduce_request.get_input_file_paths().clone().to_vec(),
+        output_uuid.to_string(),
+        resources.clone(),
+        reduce_request.task_id.to_string(),
+    ).chain_err(|| "Error fetching reduce inputs")?;
 
+    for reduce_input in reduce_inputs {
         let parsed_value: serde_json::Value = serde_json::from_str(&reduce_input).chain_err(
             || "Error parsing reduce input",
         )?;
From 0c967b36b56db95cfdd716a22a7b4562582aa737 Mon Sep 17 00:00:00 2001
From: Darragh Griffin
Date: Fri, 6 Apr 2018 13:38:35 +0100
Subject: [PATCH 15/58] Change CPU pool size for intermediate data fetching

---
 worker/src/communication/intermediate_data_fetching.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/worker/src/communication/intermediate_data_fetching.rs b/worker/src/communication/intermediate_data_fetching.rs
index aa552e32..2cda2685 100644
--- 
a/worker/src/communication/intermediate_data_fetching.rs +++ b/worker/src/communication/intermediate_data_fetching.rs @@ -6,13 +6,15 @@ use errors::*; use super::worker_interface::WorkerInterface; use operations::OperationResources; +const INPUT_FETCHING_CPU_POOL_SIZE: usize = 20; + pub fn fetch_reduce_inputs( input_files: Vec, output_uuid: String, resources: OperationResources, task_id: String, ) -> Result> { - let cpu_pool = CpuPool::new_num_cpus(); + let cpu_pool = CpuPool::new(INPUT_FETCHING_CPU_POOL_SIZE); let mut input_futures = Vec::new(); for reduce_input_file in input_files { From a9b7d53f466becfec4f888d1760fcc9a4a439c21 Mon Sep 17 00:00:00 2001 From: Conor Griffin Date: Fri, 6 Apr 2018 15:59:39 +0100 Subject: [PATCH 16/58] Improve DFS support for large files --- cli/src/commands/download.rs | 34 +++++++++----- cli/src/commands/upload.rs | 34 ++++++++++---- .../distributed_file_layer.rs | 44 ++++++++++++++++--- .../local_file_manager.rs | 23 +++++++--- 4 files changed, 101 insertions(+), 34 deletions(-) diff --git a/cli/src/commands/download.rs b/cli/src/commands/download.rs index c930fd3e..29b2d8ff 100644 --- a/cli/src/commands/download.rs +++ b/cli/src/commands/download.rs @@ -1,3 +1,4 @@ +use std::cmp::min; use std::fs; use std::fs::File; use std::io::Write; @@ -13,6 +14,9 @@ use util::distributed_filesystem::{NetworkFileSystemMasterInterface, DFSAbstract LocalFileManager}; const DFS_FILE_DIRECTORY: &str = "/tmp/cerberus/dfs/"; +const MEGA_BYTE: u64 = 1000 * 1000; +const MAX_DOWNLOAD_SIZE: u64 = MEGA_BYTE * 32; + fn get_files_to_download( data_layer: &DFSAbstractionLayer, @@ -45,15 +49,6 @@ fn download_file( ) -> Result<()> { println!("Downloading file {} to {}", remote_path, local_path); - let remote_path = Path::new(remote_path); - let file_length = data_layer.get_file_length(remote_path).chain_err( - || "Error getting file length", - )?; - - let file_data = data_layer - .read_file_location(remote_path, 0 /* Start btye */, file_length) - .chain_err(|| "Error reading file")?; - let mut local_directory = Path::new(local_path).to_path_buf(); local_directory.pop(); fs::create_dir_all(local_directory).chain_err( @@ -64,12 +59,27 @@ fn download_file( format!("unable to create file {}", local_path) })?; - file.write_all(&file_data).chain_err(|| { - format!( + let remote_path = Path::new(remote_path); + + let mut start_byte = 0; + let file_length = data_layer.get_file_length(remote_path).chain_err( + || "Error getting file length", + )?; + + while start_byte < file_length { + let end_byte = min(file_length, start_byte + MAX_DOWNLOAD_SIZE); + let file_data = data_layer + .read_file_location(remote_path, start_byte, end_byte) + .chain_err(|| "Error reading file")?; + + file.write_all(&file_data).chain_err(|| { + format!( "unable to write content to {}", local_path, ) - })?; + })?; + start_byte = end_byte; + } Ok(()) } diff --git a/cli/src/commands/upload.rs b/cli/src/commands/upload.rs index 2029533d..e0272b8c 100644 --- a/cli/src/commands/upload.rs +++ b/cli/src/commands/upload.rs @@ -1,3 +1,4 @@ +use std::cmp::min; use std::fs; use std::fs::{DirEntry, File}; use std::io::prelude::Read; @@ -10,6 +11,9 @@ use clap::ArgMatches; use errors::*; use util::distributed_filesystem::{NetworkFileSystemMasterInterface, FileSystemMasterInterface}; +const MEGA_BYTE: u64 = 1000 * 1000; +const MAX_UPLOAD_SIZE: u64 = MEGA_BYTE * 32; + fn get_local_files(path: &Path) -> Result> { let mut files = Vec::new(); @@ -38,21 +42,33 @@ fn upload_local_file( remote_path: &str, ) -> Result<()> { 
println!("Uploading File {} to Cluster", local_path); - //TODO(conor): Improve this function to allow for files that can not be kept in memory. - let file = File::open(local_path).chain_err(|| { format!("unable to open file {}", local_path) })?; + let mut start_byte = 0; + let metadata = fs::metadata(local_path).chain_err( + || "Error getting metadata", + )?; + let file_length = metadata.len(); + let mut first = true; // Allow uploading empty files + let mut buf_reader = BufReader::new(file); - let mut data = Vec::new(); - buf_reader.read_to_end(&mut data).chain_err(|| { - format!("unable to read content of {}", local_path) - })?; + while start_byte < file_length || first { + first = false; + + let mut data = vec![0; min(MAX_UPLOAD_SIZE, file_length - start_byte) as usize]; + let bytes_read = buf_reader.read(&mut data).chain_err(|| { + format!("unable to read content of {}", local_path) + })?; + + master_interface + .upload_file_chunk(remote_path, start_byte, data) + .chain_err(|| "Error uploading file chunk.")?; + + start_byte += bytes_read as u64; + } - master_interface - .upload_file_chunk(remote_path, 0, data) - .chain_err(|| "Error uploading file chunk.")?; Ok(()) } diff --git a/util/src/distributed_filesystem/distributed_file_layer.rs b/util/src/distributed_filesystem/distributed_file_layer.rs index f1cda05a..271d0fb2 100644 --- a/util/src/distributed_filesystem/distributed_file_layer.rs +++ b/util/src/distributed_filesystem/distributed_file_layer.rs @@ -1,4 +1,7 @@ use std::cmp::{max, min}; +use std::fs::OpenOptions; +use std::io::Write; +use std::os::unix::fs::OpenOptionsExt; use std::path::{Path, PathBuf}; use std::sync::Arc; @@ -10,6 +13,8 @@ use distributed_filesystem::{LocalFileManager, FileSystemMasterInterface, use errors::*; const MAX_GET_DATA_RETRIES: usize = 3; +const MEGA_BYTE: u64 = 1000 * 1000; +const MAX_LOCAL_FILE_CHUNK: u64 = MEGA_BYTE * 32; pub struct DFSAbstractionLayer { local_file_manager: Arc, @@ -152,17 +157,44 @@ impl AbstractionLayer for DFSAbstractionLayer { return Ok(PathBuf::from(local_file_path)); } - // TODO(conor): Improve this function to work for files that can not fit in memory. 
+ let mut start_byte = 0; let file_length = self.get_file_length(path).chain_err( || "Error getting file length", )?; - let data = self.read_file_location(path, 0, file_length).chain_err( - || "Error getting local file", - )?; let local_file_path = self.local_file_manager - .write_local_file(&path.to_string_lossy(), &data) - .chain_err(|| "Error writing local file")?; + .get_new_local_file_path() + .chain_err(|| "Error getting local file path")?; + + let mut options = OpenOptions::new(); + options.read(true); + options.write(true); + options.truncate(true); + options.create(true); + options.mode(0o777); + + let mut file = options.open(local_file_path.to_owned()).chain_err({ + || format!("unable to create file {}", local_file_path) + })?; + + while start_byte < file_length { + let end_byte = min(file_length, start_byte + MAX_LOCAL_FILE_CHUNK); + let data = self.read_file_location(path, start_byte, file_length) + .chain_err(|| "Error getting local file")?; + + file.write_all(&data).chain_err(|| { + format!( + "unable to write content to {}", + local_file_path, + ) + })?; + start_byte = end_byte; + } + + self.local_file_manager.complete_local_file( + &path.to_string_lossy(), + &local_file_path, + ); Ok(PathBuf::from(local_file_path)) } diff --git a/util/src/distributed_filesystem/local_file_manager.rs b/util/src/distributed_filesystem/local_file_manager.rs index f7ae137c..b253bd0c 100644 --- a/util/src/distributed_filesystem/local_file_manager.rs +++ b/util/src/distributed_filesystem/local_file_manager.rs @@ -89,7 +89,7 @@ impl LocalFileManager { complete_file_map.get(file_path).map(|s| s.to_owned()) } - pub fn write_local_file(&self, file_path: &str, data: &[u8]) -> Result { + pub fn get_new_local_file_path(&self) -> Result { let mut storage_path = PathBuf::new(); storage_path.push(self.storage_directory.clone()); storage_path.push(COMPLETE_SUB_DIR); @@ -103,6 +103,19 @@ impl LocalFileManager { let file_name = Uuid::new_v4().to_string(); storage_path.push(file_name); + Ok(storage_path.to_string_lossy().to_string()) + } + + pub fn complete_local_file(&self, file_path: &str, local_file_path: &str) { + let mut complete_file_map = self.complete_file_map.write().unwrap(); + complete_file_map.insert(file_path.to_owned(), local_file_path.to_owned()); + } + + pub fn write_local_file(&self, file_path: &str, data: &[u8]) -> Result { + let storage_path = self.get_new_local_file_path().chain_err( + || "Error writing local file", + )?; + let mut options = OpenOptions::new(); options.read(true); options.write(true); @@ -115,13 +128,9 @@ impl LocalFileManager { )?; file.write_all(data).chain_err(|| "Unable to write data")?; - let mut complete_file_map = self.complete_file_map.write().unwrap(); - complete_file_map.insert( - file_path.to_owned(), - storage_path.to_string_lossy().to_string(), - ); + self.complete_local_file(file_path, &storage_path); - Ok(storage_path.to_string_lossy().to_string()) + Ok(storage_path) } /// `read_file_chunk` reads a single file chunk known to exist requested by another worker. 
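Patch 16 replaces three whole-file reads (download.rs, upload.rs, and get_local_file in the DFS layer) with the same bounded-chunk loop: walk the file 32 MB at a time, clamping the final chunk with min, so large inputs never have to fit in memory at once. A minimal standalone sketch of that loop, assuming only the Rust standard library (process_in_chunks and handle_chunk are illustrative names, not functions from this codebase):

    use std::cmp::min;
    use std::fs::File;
    use std::io::{Read, Result, Seek, SeekFrom};

    const MEGA_BYTE: u64 = 1000 * 1000;
    const MAX_CHUNK_SIZE: u64 = MEGA_BYTE * 32;

    /// Reads `path` in chunks of at most MAX_CHUNK_SIZE bytes and hands each
    /// chunk to `handle_chunk` together with its starting offset.
    fn process_in_chunks<F: FnMut(u64, &[u8])>(path: &str, mut handle_chunk: F) -> Result<()> {
        let mut file = File::open(path)?;
        let file_length = file.metadata()?.len();

        let mut start_byte = 0;
        while start_byte < file_length {
            let end_byte = min(file_length, start_byte + MAX_CHUNK_SIZE);
            let mut data = vec![0; (end_byte - start_byte) as usize];
            file.seek(SeekFrom::Start(start_byte))?;
            file.read_exact(&mut data)?;
            handle_chunk(start_byte, &data);
            start_byte = end_byte;
        }
        Ok(())
    }

Seeking and then filling the buffer with read_exact also sidesteps the short-read case that a single read call can hit on buffers this large.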
From 99240585736b3cf36de6884ecc94278468a16848 Mon Sep 17 00:00:00 2001 From: Darragh Griffin Date: Mon, 9 Apr 2018 20:07:13 +0100 Subject: [PATCH 17/58] Take data chunks into account when calculating file closeness --- master/src/worker_management/state.rs | 16 ++++++++-------- util/src/data_layer/abstraction_layer.rs | 16 +++++++++++----- util/src/data_layer/nfs_layer.rs | 8 +++++++- util/src/data_layer/null_layer.rs | 8 +++++++- util/src/data_layer/s3_layer.rs | 8 +++++++- .../distributed_file_layer.rs | 14 ++++++++++++-- 6 files changed, 52 insertions(+), 18 deletions(-) diff --git a/master/src/worker_management/state.rs b/master/src/worker_management/state.rs index 6ca0a32f..0238d049 100644 --- a/master/src/worker_management/state.rs +++ b/master/src/worker_management/state.rs @@ -401,19 +401,19 @@ impl State { } fn get_data_score(&self, map_request: &pb::PerformMapRequest, worker_id: &str) -> Result { - let mut input_paths: Vec = Vec::new(); - for input_location in map_request.get_input().get_input_locations() { - input_paths.push(input_location.get_input_path().to_string()); - } - let mut score: u64 = 0; - for input_path in input_paths { + for input_location in map_request.get_input().get_input_locations() { score += self.data_layer - .get_file_closeness(Path::new(&input_path), worker_id) + .get_data_closeness( + Path::new(&input_location.input_path), + input_location.start_byte, + input_location.end_byte, + worker_id, + ) .chain_err(|| { format!( "Could not get closeness for file {} and worker {}", - input_path, + input_location.input_path, worker_id ) })?; diff --git a/util/src/data_layer/abstraction_layer.rs b/util/src/data_layer/abstraction_layer.rs index 03ced84f..2d0af4d5 100644 --- a/util/src/data_layer/abstraction_layer.rs +++ b/util/src/data_layer/abstraction_layer.rs @@ -24,9 +24,15 @@ pub trait AbstractionLayer { fn create_dir_all(&self, path: &Path) -> Result<()>; - /// `get_file_closeness` returns how an integer indicating how easy it is for a given - /// worker to request a given file from the file system. If a file is on the local filesystem - /// of the worker then the closeness score should be high, if chunks of the file need to be - /// requested from other workers then the score should be low. - fn get_file_closeness(&self, path: &Path, worker_id: &str) -> Result; + /// `get_data_closeness` returns how an integer indicating how easy it is for a given + /// worker to request a given data chunk from the file system. + /// If a file is on the local filesystem of the worker then the closeness score should be high, + /// if chunks of the file need to be requested from other workers then the score should be low. 
+ fn get_data_closeness( + &self, + file_path: &Path, + chunk_start: u64, + chunk_end: u64, + worker_id: &str, + ) -> Result; } diff --git a/util/src/data_layer/nfs_layer.rs b/util/src/data_layer/nfs_layer.rs index b309767c..b57c0a39 100644 --- a/util/src/data_layer/nfs_layer.rs +++ b/util/src/data_layer/nfs_layer.rs @@ -131,7 +131,13 @@ impl AbstractionLayer for NFSAbstractionLayer { fs::create_dir_all(&absolute_path.as_path()).chain_err(|| "Unable to create directories") } - fn get_file_closeness(&self, _path: &Path, _worker_id: &str) -> Result { + fn get_data_closeness( + &self, + _path: &Path, + _chunk_start: u64, + _chunk_end: u64, + _worker_id: &str, + ) -> Result { // Each file is equally close on NFS Ok(1) } diff --git a/util/src/data_layer/null_layer.rs b/util/src/data_layer/null_layer.rs index 6eb4d586..2a9b6ccf 100644 --- a/util/src/data_layer/null_layer.rs +++ b/util/src/data_layer/null_layer.rs @@ -91,7 +91,13 @@ impl AbstractionLayer for NullAbstractionLayer { fs::create_dir_all(path).chain_err(|| "Unable to create directories") } - fn get_file_closeness(&self, _path: &Path, _worker_id: &str) -> Result { + fn get_data_closeness( + &self, + _path: &Path, + _chunk_start: u64, + _chunk_end: u64, + _worker_id: &str, + ) -> Result { Ok(1) } } diff --git a/util/src/data_layer/s3_layer.rs b/util/src/data_layer/s3_layer.rs index f6ef81a4..fa7d71da 100644 --- a/util/src/data_layer/s3_layer.rs +++ b/util/src/data_layer/s3_layer.rs @@ -312,7 +312,13 @@ impl AbstractionLayer for AmazonS3AbstractionLayer { } // For S3 all files can be taken to be equally close to each worker. - fn get_file_closeness(&self, _path: &Path, _worker_id: &str) -> Result { + fn get_data_closeness( + &self, + _path: &Path, + _chunk_start: u64, + _chunk_end: u64, + _worker_id: &str, + ) -> Result { Ok(1) } } diff --git a/util/src/distributed_filesystem/distributed_file_layer.rs b/util/src/distributed_filesystem/distributed_file_layer.rs index 271d0fb2..901fcd9c 100644 --- a/util/src/distributed_filesystem/distributed_file_layer.rs +++ b/util/src/distributed_filesystem/distributed_file_layer.rs @@ -258,7 +258,13 @@ impl AbstractionLayer for DFSAbstractionLayer { Ok(()) } - fn get_file_closeness(&self, path: &Path, worker_id: &str) -> Result { + fn get_data_closeness( + &self, + path: &Path, + chunk_start: u64, + chunk_end: u64, + worker_id: &str, + ) -> Result { let file_chunks = self.master_interface .get_file_chunks(&path.to_string_lossy()) .chain_err(|| "Could not get file locations")?; @@ -267,7 +273,11 @@ impl AbstractionLayer for DFSAbstractionLayer { for chunk in file_chunks { if chunk.workers.contains(&worker_id.to_string()) { - score += 1; + if (chunk.start_byte >= chunk_start && chunk.start_byte <= chunk_end) || + (chunk.end_byte >= chunk_start && chunk.end_byte <= chunk_end) + { + score += 1; + } } } From 69f4c48dfdfb0eb881dc17cdb0450e2c926d2102 Mon Sep 17 00:00:00 2001 From: Ryan Connell Date: Mon, 9 Apr 2018 08:54:46 +0100 Subject: [PATCH 18/58] Prevent cancelling of completed jobs --- cli/src/commands/cancel.rs | 12 ++++++++---- master/src/dashboard/server.rs | 9 +++++---- master/src/scheduling/scheduler.rs | 12 ++++++++---- master/src/scheduling/state.rs | 8 +++++--- master/src/server/client_service.rs | 13 ++++++++----- proto/mapreduce.proto | 1 + 6 files changed, 35 insertions(+), 20 deletions(-) diff --git a/cli/src/commands/cancel.rs b/cli/src/commands/cancel.rs index 033d7b9b..6626cdbc 100644 --- a/cli/src/commands/cancel.rs +++ b/cli/src/commands/cancel.rs @@ -22,9 +22,13 @@ pub fn 
cancel(client: &grpc_pb::MapReduceServiceClient, args: Option<&ArgMatches .chain_err(|| "Failed to cancel MapReduce")? .1; - println!( - "Succesfully cancelled MapReduce with ID: {}", - resp.mapreduce_id - ); + if resp.success { + println!( + "Succesfully cancelled MapReduce with ID: {}", + resp.mapreduce_id + ); + } else { + println!("Unable to cancel MapReduce with ID: {}", resp.mapreduce_id); + } Ok(()) } diff --git a/master/src/dashboard/server.rs b/master/src/dashboard/server.rs index 7333cd28..1ad9e758 100644 --- a/master/src/dashboard/server.rs +++ b/master/src/dashboard/server.rs @@ -111,13 +111,14 @@ impl ApiHandler { || "Could not get job_id in request", )?; - self.scheduler_arc.cancel_job(&job_id).chain_err(|| { + let success = self.scheduler_arc.cancel_job(&job_id).chain_err(|| { format!("Failed to cancel job with id {}", job_id) })?; - Ok(Response::with( - (iron::status::Ok, format!("{{ job_id: {} }}", job_id)), - )) + Ok(Response::with(( + iron::status::Ok, + format!("{{ job_id: {}, success: {} }}", job_id, success), + ))) } fn schedule_job(&self, req: &mut Request) -> Result { diff --git a/master/src/scheduling/scheduler.rs b/master/src/scheduling/scheduler.rs index 2fa2e10f..42e3398a 100644 --- a/master/src/scheduling/scheduler.rs +++ b/master/src/scheduling/scheduler.rs @@ -169,12 +169,16 @@ impl Scheduler { ) } - pub fn cancel_job(&self, job_id: &str) -> Result<()> { - { + pub fn cancel_job(&self, job_id: &str) -> Result { + let cancelled = { let mut state = self.state.lock().unwrap(); state.cancel_job(job_id).chain_err(|| { format!("Unable to cancel job with ID: {}", job_id) - })?; + })? + }; + if !cancelled { + info!("Unable to cancel job with ID {}", job_id); + return Ok(false); } let workers = self.worker_manager @@ -192,7 +196,7 @@ impl Scheduler { .chain_err(|| "Unable to cancel task on workers")?; info!("Succesfully cancelled job {}", job_id); - Ok(()) + Ok(cancelled) } pub fn get_job_queue_size(&self) -> u32 { diff --git a/master/src/scheduling/state.rs b/master/src/scheduling/state.rs index 839ff721..8b7bee20 100644 --- a/master/src/scheduling/state.rs +++ b/master/src/scheduling/state.rs @@ -117,16 +117,18 @@ impl State { Ok(()) } - pub fn cancel_job(&mut self, job_id: &str) -> Result<()> { + pub fn cancel_job(&mut self, job_id: &str) -> Result { let scheduled_job = match self.scheduled_jobs.get_mut(job_id) { Some(job) => job, None => return Err(format!("Job with ID {} was not found.", &job_id).into()), }; - if scheduled_job.job.status != pb::Status::FAILED { + if scheduled_job.job.status != pb::Status::FAILED && + scheduled_job.job.status != pb::Status::DONE + { scheduled_job.job.status = pb::Status::CANCELLED; } - Ok(()) + Ok(scheduled_job.job.status == pb::Status::CANCELLED) } pub fn get_job(&self, job_id: &str) -> Result<&Job> { diff --git a/master/src/server/client_service.rs b/master/src/server/client_service.rs index ef8538f5..da33c766 100644 --- a/master/src/server/client_service.rs +++ b/master/src/server/client_service.rs @@ -119,11 +119,14 @@ impl grpc_pb::MapReduceService for ClientService { info!("Attempting to cancel MapReduce: {}", job_id); let result = self.scheduler.cancel_job(job_id.as_ref()); - if let Err(err) = result { - output_error(&err.chain_err(|| "Error cancelling MapReduce")); - return SingleResponse::err(Error::Other(JOB_CANCEL_ERROR)); - } - + let cancelled = match result { + Ok(success) => success, + Err(err) => { + output_error(&err.chain_err(|| "Error cancelling MapReduce")); + return 
SingleResponse::err(Error::Other(JOB_CANCEL_ERROR)); + } + }; + response.success = cancelled; SingleResponse::completed(response) } diff --git a/proto/mapreduce.proto b/proto/mapreduce.proto index 55796d65..cbf98426 100644 --- a/proto/mapreduce.proto +++ b/proto/mapreduce.proto @@ -103,4 +103,5 @@ message MapReduceCancelRequest { message MapReduceCancelResponse { // ID of the canceled map reduce string mapreduce_id = 1; + bool success = 2; } From 3841fc3238de90629f8afdfe2558d36b3c5ff087 Mon Sep 17 00:00:00 2001 From: Ryan Connell Date: Mon, 9 Apr 2018 22:16:18 +0100 Subject: [PATCH 19/58] Mark all tasks as cancelled when a job is cancelled --- master/src/scheduling/state.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/master/src/scheduling/state.rs b/master/src/scheduling/state.rs index 8b7bee20..5c8bdb09 100644 --- a/master/src/scheduling/state.rs +++ b/master/src/scheduling/state.rs @@ -127,6 +127,11 @@ impl State { scheduled_job.job.status != pb::Status::DONE { scheduled_job.job.status = pb::Status::CANCELLED; + + // Cancel each of the tasks for this job. + for task in scheduled_job.tasks.values_mut() { + task.status = TaskStatus::Cancelled; + } } Ok(scheduled_job.job.status == pb::Status::CANCELLED) } From e53f2fe2f122c93aaa856d0c641690f8b3e121ac Mon Sep 17 00:00:00 2001 From: Darragh Griffin Date: Mon, 9 Apr 2018 22:56:56 +0100 Subject: [PATCH 20/58] Take into account number of bytes on worker in closeness calculation --- util/src/data_layer/abstraction_layer.rs | 2 +- .../distributed_file_layer.rs | 50 ++++++++----------- 2 files changed, 21 insertions(+), 31 deletions(-) diff --git a/util/src/data_layer/abstraction_layer.rs b/util/src/data_layer/abstraction_layer.rs index 2d0af4d5..0970af85 100644 --- a/util/src/data_layer/abstraction_layer.rs +++ b/util/src/data_layer/abstraction_layer.rs @@ -24,7 +24,7 @@ pub trait AbstractionLayer { fn create_dir_all(&self, path: &Path) -> Result<()>; - /// `get_data_closeness` returns how an integer indicating how easy it is for a given + /// `get_data_closeness` returns an integer indicating how easy it is for a given /// worker to request a given data chunk from the file system. /// If a file is on the local filesystem of the worker then the closeness score should be high, /// if chunks of the file need to be requested from other workers then the score should be low. 
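The DFS implementation of get_data_closeness in the next diff scores a worker by how many of the requested bytes it already stores, rather than by how many whole chunks it holds. A minimal sketch of that overlap arithmetic, using a simplified stand-in for the real proto-generated chunk metadata type:

    use std::cmp::{max, min};

    /// Illustrative stand-in for the proto-generated chunk metadata.
    struct FileChunk {
        start_byte: u64,
        end_byte: u64,
        workers: Vec<String>,
    }

    /// Sums, over every chunk stored on `worker_id`, the bytes that chunk
    /// shares with the requested half-open range [chunk_start, chunk_end).
    fn data_closeness(chunks: &[FileChunk], chunk_start: u64, chunk_end: u64, worker_id: &str) -> u64 {
        let mut score: u64 = 0;
        for chunk in chunks {
            if chunk.workers.contains(&worker_id.to_string()) {
                // Signed arithmetic so disjoint ranges produce a negative
                // overlap that is discarded instead of underflowing.
                let overlap = min(chunk_end, chunk.end_byte) as i64 -
                    max(chunk_start, chunk.start_byte) as i64;
                if overlap > 0 {
                    score += overlap as u64;
                }
            }
        }
        score
    }

A worker holding a chunk that merely touches the requested range scores nothing, while one holding the full range scores its entire length, which keeps the scheduler's comparison between candidate workers meaningful.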
diff --git a/util/src/distributed_filesystem/distributed_file_layer.rs b/util/src/distributed_filesystem/distributed_file_layer.rs index 901fcd9c..61d3f702 100644 --- a/util/src/distributed_filesystem/distributed_file_layer.rs +++ b/util/src/distributed_filesystem/distributed_file_layer.rs @@ -8,8 +8,8 @@ use std::sync::Arc; use rand::random; use data_layer::AbstractionLayer; -use distributed_filesystem::{LocalFileManager, FileSystemMasterInterface, - FileSystemWorkerInterface}; +use distributed_filesystem::{FileSystemMasterInterface, FileSystemWorkerInterface, + LocalFileManager}; use errors::*; const MAX_GET_DATA_RETRIES: usize = 3; @@ -48,7 +48,6 @@ impl DFSAbstractionLayer { ); } - let mut retries = MAX_GET_DATA_RETRIES; loop { let random_val = random::() % chunk.worker_address.len(); @@ -107,8 +106,8 @@ impl AbstractionLayer for DFSAbstractionLayer { let mut on_local_chunk = 0; let mut on_byte = start_byte; while on_byte < end_byte { - while on_local_chunk < local_file_chunks.len() && - local_file_chunks[on_local_chunk].start_byte < on_byte + while on_local_chunk < local_file_chunks.len() + && local_file_chunks[on_local_chunk].start_byte < on_byte { on_local_chunk += 1; } @@ -149,18 +148,15 @@ impl AbstractionLayer for DFSAbstractionLayer { fn get_local_file(&self, path: &Path) -> Result { debug!("Getting local file: {:?}", path); - if let Some(local_file_path) = - self.local_file_manager.get_local_file( - &path.to_string_lossy(), - ) + if let Some(local_file_path) = self.local_file_manager + .get_local_file(&path.to_string_lossy()) { return Ok(PathBuf::from(local_file_path)); } let mut start_byte = 0; - let file_length = self.get_file_length(path).chain_err( - || "Error getting file length", - )?; + let file_length = self.get_file_length(path) + .chain_err(|| "Error getting file length")?; let local_file_path = self.local_file_manager .get_new_local_file_path() @@ -173,28 +169,22 @@ impl AbstractionLayer for DFSAbstractionLayer { options.create(true); options.mode(0o777); - let mut file = options.open(local_file_path.to_owned()).chain_err({ - || format!("unable to create file {}", local_file_path) - })?; + let mut file = options + .open(local_file_path.to_owned()) + .chain_err({ || format!("unable to create file {}", local_file_path) })?; while start_byte < file_length { let end_byte = min(file_length, start_byte + MAX_LOCAL_FILE_CHUNK); let data = self.read_file_location(path, start_byte, file_length) .chain_err(|| "Error getting local file")?; - file.write_all(&data).chain_err(|| { - format!( - "unable to write content to {}", - local_file_path, - ) - })?; + file.write_all(&data) + .chain_err(|| format!("unable to write content to {}", local_file_path,))?; start_byte = end_byte; } - self.local_file_manager.complete_local_file( - &path.to_string_lossy(), - &local_file_path, - ); + self.local_file_manager + .complete_local_file(&path.to_string_lossy(), &local_file_path); Ok(PathBuf::from(local_file_path)) } @@ -269,14 +259,14 @@ impl AbstractionLayer for DFSAbstractionLayer { .get_file_chunks(&path.to_string_lossy()) .chain_err(|| "Could not get file locations")?; - let mut score = 0; + let mut score: u64 = 0; for chunk in file_chunks { if chunk.workers.contains(&worker_id.to_string()) { - if (chunk.start_byte >= chunk_start && chunk.start_byte <= chunk_end) || - (chunk.end_byte >= chunk_start && chunk.end_byte <= chunk_end) - { - score += 1; + let overlap: i64 = min(chunk_end, chunk.end_byte) as i64 + - max(chunk_start, chunk.start_byte) as i64; + if overlap > 0 { + score += overlap 
as u64; } } } From bf8191bbdab6033611e686ee9e3b4ebc31c07344 Mon Sep 17 00:00:00 2001 From: Ryan Connell Date: Wed, 11 Apr 2018 21:38:35 +0100 Subject: [PATCH 21/58] Add functionality for selecting map task size on a per job basis --- cli/src/commands/run.rs | 26 ++++++++++ cli/src/parser.rs | 8 +++ master/content/dashboard.js | 5 +- master/content/index.html | 3 ++ master/content/stylesheet.css | 4 +- master/src/common/job.rs | 15 ++++++ master/src/dashboard/server.rs | 69 +++++++++++++++++-------- master/src/scheduling/task_processor.rs | 33 +++++++----- proto/mapreduce.proto | 3 ++ 9 files changed, 130 insertions(+), 36 deletions(-) diff --git a/cli/src/commands/run.rs b/cli/src/commands/run.rs index 14f859d9..18e8a813 100644 --- a/cli/src/commands/run.rs +++ b/cli/src/commands/run.rs @@ -11,6 +11,7 @@ use errors::*; // Default priority applied to jobs. const DEFAULT_PRIORITY: &str = "3"; +const DEFAULT_MAP_SIZE: &str = "64"; fn verify_valid_path(path_str: &str) -> Result { let path = Path::new(path_str); @@ -52,6 +53,28 @@ fn get_priority(matches: &ArgMatches) -> Result { Ok(priority) } +fn get_map_size(matches: &ArgMatches) -> Result { + let map_size_str = matches.value_of("map_size").unwrap_or(DEFAULT_MAP_SIZE); + let map_size: u32 = match map_size_str.parse() { + Ok(val) => val, + Err(err) => { + return Err( + format!( + "Error occured while converting '{}' to a u32: {}", + map_size_str, + err + ).into(), + ); + } + }; + if map_size < 1 { + return Err( + "Map Size must be greater than or equal to 1 megabyte".into(), + ); + } + Ok(map_size) +} + pub fn run(client: &grpc_pb::MapReduceServiceClient, matches: &ArgMatches) -> Result<()> { let mut input = matches .value_of("input") @@ -74,6 +97,8 @@ pub fn run(client: &grpc_pb::MapReduceServiceClient, matches: &ArgMatches) -> Re .chain_err(|| "Binary cannot be empty")? 
.to_owned(); + let map_size: u32 = get_map_size(matches).chain_err(|| "Unable to get map size")?; + binary = verify_valid_path(&binary).chain_err( || "Invalid binary path.", )?; @@ -86,6 +111,7 @@ pub fn run(client: &grpc_pb::MapReduceServiceClient, matches: &ArgMatches) -> Re req.set_client_id(get_client_id()?); req.set_output_directory(output.to_owned()); req.set_priority(priority); + req.set_map_size(map_size); let res = client .perform_map_reduce(RequestOptions::new(), req) diff --git a/cli/src/parser.rs b/cli/src/parser.rs index 10d3eaab..6fad8fcb 100644 --- a/cli/src/parser.rs +++ b/cli/src/parser.rs @@ -49,6 +49,14 @@ pub fn parse_command_line<'a>() -> ArgMatches<'a> { .takes_value(true) .required(false), ) + .arg( + Arg::with_name("map_size") + .long("map_size") + .short("m") + .help("Set the size of Map tasks in megabytes") + .takes_value(true) + .required(false), + ) ) .subcommand( SubCommand::with_name("cancel") diff --git a/master/content/dashboard.js b/master/content/dashboard.js index 211c1ca1..0c6d76f7 100644 --- a/master/content/dashboard.js +++ b/master/content/dashboard.js @@ -151,6 +151,7 @@ function processScheduleMapReduceForm(e) { var inputPath = encodeURIComponent($("#input").val()); var outputPath = encodeURIComponent($("#output").val()); var priority = encodeURIComponent($("#priority").val()); + var map_size = encodeURIComponent($("#map_size").val()); var submitButton = $("#submit-job"); submitButton.attr("disabled", true); @@ -179,7 +180,9 @@ function processScheduleMapReduceForm(e) { "binary_path=" + binaryPath + "&input_path=" + inputPath + "&output_path=" + outputPath + - "&priority=" + priority; + "&priority=" + priority + + "&map_size=" + map_size; + $.ajax({ url: requestUrl, diff --git a/master/content/index.html b/master/content/index.html index c4b7b8c4..82318547 100644 --- a/master/content/index.html +++ b/master/content/index.html @@ -61,6 +61,9 @@

Cluster Dashboard
[index.html markup stripped during extraction; this hunk's three added lines, reduced to the bare + markers below, introduce the map_size input field that dashboard.js submits with the schedule-job form]
+ + + diff --git a/master/content/stylesheet.css b/master/content/stylesheet.css index 184cb36d..567ad25e 100644 --- a/master/content/stylesheet.css +++ b/master/content/stylesheet.css @@ -240,7 +240,7 @@ div.hidden-form { border-radius: 5px; border-width: 1px; box-shadow: 3px 4px 4px 0 rgba(0, 0, 0, .3); - height: 480px; + height: 540px; left: 50%; margin-left: -175px; min-height: 300px; @@ -264,4 +264,4 @@ a.exit-form { margin-left: 90%; position: absolute; text-decoration: None; -} \ No newline at end of file +} diff --git a/master/src/common/job.rs b/master/src/common/job.rs index a0ecceb0..edf8e4a4 100644 --- a/master/src/common/job.rs +++ b/master/src/common/job.rs @@ -11,6 +11,8 @@ use util::data_layer::AbstractionLayer; use util::state::StateHandling; use cerberus_proto::mapreduce as pb; +const MEGA_BYTE: u64 = 1000 * 1000; + /// `JobOptions` stores arguments used to construct a `Job`. #[derive(Default)] pub struct JobOptions { @@ -26,6 +28,8 @@ pub struct JobOptions { pub validate_paths: bool, /// Priority that should be applied to all tasks for the job. pub priority: u32, + /// Size of a Map task in megabytes. + pub map_size: u32, } impl From for JobOptions { @@ -41,6 +45,7 @@ impl From for JobOptions { }, validate_paths: true, priority: other.priority, + map_size: other.map_size, } } } @@ -71,6 +76,9 @@ pub struct Job { /// Total CPU time used by the job. pub cpu_time: u64, + + /// Size of a Map task in bytes + pub map_input_size: u64, } #[derive(Serialize, Deserialize)] @@ -125,6 +133,8 @@ impl Job { time_completed: None, cpu_time: 0, + + map_input_size: (options.map_size as u64) * MEGA_BYTE, }) } @@ -239,6 +249,9 @@ impl StateHandling for Job { priority: serde_json::from_value(data["priority"].clone()).chain_err( || "Unable to convert priority", )?, + map_size: serde_json::from_value(data["map_size"].clone()).chain_err( + || "Unable to convert map_size", + )?, }; let mut job = Job::new_no_validate(options).chain_err( @@ -267,6 +280,7 @@ impl StateHandling for Job { "output_directory": self.output_directory, "priority": self.priority, + "map_size": self.map_input_size / MEGA_BYTE, "status": self.get_serializable_status(), "status_details": self.status_details, @@ -372,6 +386,7 @@ mod tests { output_directory: Some("/tmp/output/".to_owned()), validate_paths: false, priority: 1, + map_size: 64, }).unwrap(); assert_eq!("/tmp/input/output/", job1.output_directory); diff --git a/master/src/dashboard/server.rs b/master/src/dashboard/server.rs index 1ad9e758..4000db4f 100644 --- a/master/src/dashboard/server.rs +++ b/master/src/dashboard/server.rs @@ -18,6 +18,7 @@ use util::output_error; // Default priority applied to scheduled jobs. 
const DEFAULT_PRIORITY: u32 = 3; +const DEFAULT_MAP_SIZE: u32 = 64; #[derive(Clone)] struct ApiHandler { @@ -121,47 +122,73 @@ impl ApiHandler { ))) } - fn schedule_job(&self, req: &mut Request) -> Result { - let binary_path = self.get_parameter(req, "binary_path").chain_err( - || "Failed to get binary_path", - )?; - let input_path = self.get_parameter(req, "input_path").chain_err( - || "Failed to get input_path", - )?; + fn get_output_path(&self, req: &mut Request) -> Option { let output_path = self.get_parameter(req, "output_path").unwrap_or_else( |_| "".to_string(), ); + if output_path.is_empty() { + None + } else { + Some(output_path) + } + } + + fn get_priority(&self, req: &mut Request) -> Result { let priority = self.get_parameter(req, "priority").unwrap_or_else( |_| "".to_string(), ); + if priority.is_empty() { + Ok(DEFAULT_PRIORITY) + } else { + priority.parse::().chain_err( + || "Invalid priority when scheduling job", + ) + } + } - let output_path = { - if output_path.is_empty() { - None - } else { - Some(output_path) - } - }; - - let priority = { - if priority.is_empty() { - DEFAULT_PRIORITY + fn get_map_size(&self, req: &mut Request) -> Result { + let map_size = self.get_parameter(req, "map_size").unwrap_or_else( + |_| "".to_string(), + ); + let map_size = { + if map_size.is_empty() { + DEFAULT_MAP_SIZE } else { - priority.parse::().chain_err( - || "Invalid priority when scheduling job", + map_size.parse::().chain_err( + || "Invalid map size when scheduling job", )? } }; + if map_size < 1 { + return Err("Map size must be greater than or equal to 1".into()); + } + Ok(map_size) + } + + fn schedule_job(&self, req: &mut Request) -> Result { + let binary_path = self.get_parameter(req, "binary_path").chain_err( + || "Failed to get binary_path", + )?; + let input_path = self.get_parameter(req, "input_path").chain_err( + || "Failed to get input_path", + )?; + let priority = self.get_priority(req).chain_err( + || "Failed to get priority", + )?; + let map_size = self.get_map_size(req).chain_err( + || "Failed to get map size", + )?; let job_options = JobOptions { client_id: req.remote_addr.to_string(), binary_path, input_directory: input_path, - output_directory: output_path, + output_directory: self.get_output_path(req), validate_paths: true, priority, + map_size, }; let job = Job::new(job_options, &self.data_abstraction_layer_arc) diff --git a/master/src/scheduling/task_processor.rs b/master/src/scheduling/task_processor.rs index 623e472e..6e0a6aeb 100644 --- a/master/src/scheduling/task_processor.rs +++ b/master/src/scheduling/task_processor.rs @@ -8,8 +8,6 @@ use common::{Job, Task}; use util::data_layer::AbstractionLayer; use errors::*; -const MEGA_BYTE: u64 = 1000 * 1000; -const MAP_INPUT_SIZE: u64 = MEGA_BYTE * 64; const CLOSEST_ENDLINE_STEP: u64 = 1000; const NEWLINE: u8 = 0x0A; @@ -71,7 +69,11 @@ impl TaskProcessorImpl { /// `read_input_file` reads a given input file and splits it into chunks for map tasks. /// If a file can fit into one map task, it will not be split. 
- fn read_input_file(&self, input_file_path: &PathBuf) -> Result> { + fn read_input_file( + &self, + input_file_path: &PathBuf, + map_input_size: u64, + ) -> Result> { let input_path_str = input_file_path.to_str().ok_or("Invalid input file path.")?; let mut input_locations = Vec::new(); @@ -81,9 +83,9 @@ impl TaskProcessorImpl { .get_file_length(input_file_path) .chain_err(|| "Error reading input file")?; - while end_byte - start_byte > MAP_INPUT_SIZE { + while end_byte - start_byte > map_input_size { let new_start_byte = - self.get_closest_endline(input_file_path, start_byte, start_byte + MAP_INPUT_SIZE) + self.get_closest_endline(input_file_path, start_byte, start_byte + map_input_size) .chain_err(|| "Error reading input file")?; let mut input_location = pb::InputLocation::new(); input_location.set_input_path(input_path_str.to_owned()); @@ -106,11 +108,15 @@ impl TaskProcessorImpl { } /// `get_map_task_infos` reads a directory and creates a set of `MapTaskFileInformations` - fn get_map_task_infos(&self, input_directory: &Path) -> Result> { + fn get_map_task_infos( + &self, + input_directory: &Path, + map_input_size: u64, + ) -> Result> { let mut map_task_infos = Vec::new(); let mut map_task_info = MapTaskInformation { - bytes_remaining: MAP_INPUT_SIZE, + bytes_remaining: map_input_size, input_locations: Vec::new(), }; @@ -128,7 +134,7 @@ impl TaskProcessorImpl { .is_file(path.as_path()) .chain_err(|| "Failed to check if path is a file")? { - let input_locations = self.read_input_file(&path).chain_err( + let input_locations = self.read_input_file(&path, map_input_size).chain_err( || "Error reading input file.", )?; @@ -138,7 +144,7 @@ impl TaskProcessorImpl { map_task_infos.push(map_task_info); map_task_info = MapTaskInformation { - bytes_remaining: MAP_INPUT_SIZE, + bytes_remaining: map_input_size, input_locations: Vec::new(), }; } @@ -149,7 +155,7 @@ impl TaskProcessorImpl { } } - if map_task_info.bytes_remaining != MAP_INPUT_SIZE { + if map_task_info.bytes_remaining != map_input_size { map_task_infos.push(map_task_info); } @@ -159,8 +165,9 @@ impl TaskProcessorImpl { impl TaskProcessor for TaskProcessorImpl { fn create_map_tasks(&self, job: &Job) -> Result> { - let map_task_infos = self.get_map_task_infos(Path::new(&job.input_directory)) - .chain_err(|| "Error creating map tasks")?; + let map_task_infos = + self.get_map_task_infos(Path::new(&job.input_directory), job.map_input_size) + .chain_err(|| "Error creating map tasks")?; // TODO(conor): Consider adding together any map tasks that can be combined here. @@ -247,6 +254,7 @@ mod tests { client_id: "test-client".to_owned(), binary_path: "/tmp/bin".to_owned(), input_directory: test_path.to_str().unwrap().to_owned(), + map_size: 64, ..Default::default() }, &data_abstraction_layer, @@ -306,6 +314,7 @@ mod tests { client_id: "test-client".to_owned(), binary_path: "/tmp/bin".to_owned(), input_directory: "/tmp/inputdir".to_owned(), + map_size: 64, ..Default::default() }, &data_abstraction_layer, diff --git a/proto/mapreduce.proto b/proto/mapreduce.proto index cbf98426..67893b80 100644 --- a/proto/mapreduce.proto +++ b/proto/mapreduce.proto @@ -38,6 +38,9 @@ message MapReduceRequest { // Priority of the MapReduce uint32 priority = 5; + + // Size in megabytes of Map tasks + uint32 map_size = 6; } // Response from the master about the map reduce. 
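Patch 21 threads the per-job map_size from the CLI and dashboard down to the task processor, where it becomes map_input_size in bytes. A minimal sketch of how that figure drives input splitting, leaving out the closest-endline adjustment the real read_input_file applies (split_input is an illustrative name, not part of the codebase):

    use std::cmp::min;

    const MEGA_BYTE: u64 = 1000 * 1000;

    /// Splits a file of `file_length` bytes into half-open (start, end) ranges
    /// of at most `map_size` megabytes, one range per map task input location.
    fn split_input(file_length: u64, map_size: u32) -> Vec<(u64, u64)> {
        let map_input_size = u64::from(map_size) * MEGA_BYTE;
        let mut ranges = Vec::new();

        let mut start_byte = 0;
        while start_byte < file_length {
            let end_byte = min(file_length, start_byte + map_input_size);
            ranges.push((start_byte, end_byte));
            start_byte = end_byte;
        }
        ranges
    }

With the default map_size of 64, a 150 MB file yields ranges of 64 MB, 64 MB and 22 MB; the real processor then nudges each boundary to the nearest newline so no record is split across map tasks.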
From 5b2bc681f0281baf96f4b183ec852b4ed15272af Mon Sep 17 00:00:00 2001 From: Conor Griffin Date: Thu, 12 Apr 2018 13:30:43 +0100 Subject: [PATCH 22/58] Don't make job scheduling wait on input splitting --- cli/src/commands/status.rs | 1 + master/src/common/job.rs | 15 ++++--- master/src/scheduling/scheduler.rs | 67 +++++++++++++++++++++-------- master/src/scheduling/state.rs | 34 ++++++++++++++- master/src/server/client_service.rs | 2 +- proto/mapreduce.proto | 11 ++--- 6 files changed, 97 insertions(+), 33 deletions(-) diff --git a/cli/src/commands/status.rs b/cli/src/commands/status.rs index 0e1a7358..a8eebda4 100644 --- a/cli/src/commands/status.rs +++ b/cli/src/commands/status.rs @@ -14,6 +14,7 @@ fn print_table(rep: &pb::MapReduceReport) { let status: String = match rep.get_status() { pb::Status::UNKNOWN => "UNKNOWN".to_owned(), + pb::Status::SPLITTING_INPUT => "SPLITTING_INPUT".to_owned(), pb::Status::DONE => { let time_taken = rep.get_done_timestamp() - rep.get_started_timestamp(); format!("DONE ({}s)", time_taken) diff --git a/master/src/common/job.rs b/master/src/common/job.rs index edf8e4a4..8678ffac 100644 --- a/master/src/common/job.rs +++ b/master/src/common/job.rs @@ -85,11 +85,12 @@ pub struct Job { #[allow(non_camel_case_types)] /// `SerializableJobStatus` is the Serializable counterpart to `mapreduce_proto::Status`. pub enum SerializableJobStatus { - DONE, - IN_PROGRESS, + UNKNOWN, + SPLITTING_INPUT, IN_QUEUE, + IN_PROGRESS, + DONE, FAILED, - UNKNOWN, CANCELLED, } @@ -119,7 +120,7 @@ impl Job { priority: options.priority, - status: pb::Status::IN_QUEUE, + status: pb::Status::SPLITTING_INPUT, status_details: None, map_tasks_completed: 0, @@ -214,6 +215,7 @@ impl Job { fn status_from_state(&self, state: &SerializableJobStatus) -> pb::Status { match *state { SerializableJobStatus::DONE => pb::Status::DONE, + SerializableJobStatus::SPLITTING_INPUT => pb::Status::SPLITTING_INPUT, SerializableJobStatus::IN_PROGRESS => pb::Status::IN_PROGRESS, SerializableJobStatus::IN_QUEUE => pb::Status::IN_QUEUE, SerializableJobStatus::FAILED => pb::Status::FAILED, @@ -225,6 +227,7 @@ impl Job { pub fn get_serializable_status(&self) -> SerializableJobStatus { match self.status { pb::Status::DONE => SerializableJobStatus::DONE, + pb::Status::SPLITTING_INPUT => SerializableJobStatus::SPLITTING_INPUT, pb::Status::IN_PROGRESS => SerializableJobStatus::IN_PROGRESS, pb::Status::IN_QUEUE => SerializableJobStatus::IN_QUEUE, pb::Status::FAILED => SerializableJobStatus::FAILED, @@ -366,8 +369,8 @@ mod tests { #[test] fn test_defaults() { let job = Job::new_no_validate(get_test_job_options()).unwrap(); - // Assert that the default status for a map reduce job is Queued. - assert_eq!(pb::Status::IN_QUEUE, job.status); + // Assert that the default status for a map reduce job is splitting input. + assert_eq!(pb::Status::SPLITTING_INPUT, job.status); // Assert that completed tasks starts at 0. assert_eq!(0, job.map_tasks_completed); assert_eq!(0, job.reduce_tasks_completed); diff --git a/master/src/scheduling/scheduler.rs b/master/src/scheduling/scheduler.rs index 42e3398a..09fafd1d 100644 --- a/master/src/scheduling/scheduler.rs +++ b/master/src/scheduling/scheduler.rs @@ -141,32 +141,56 @@ impl Scheduler { thread::spawn(move || { worker_manager.run_task(task); }); } - /// Schedule a [`Job`](common::Job) to be executed. - /// This function creates the map tasks before returning. 
- pub fn schedule_job(&self, mut job: Job) -> Result<()> { - let map_tasks_vec = self.task_processor.create_map_tasks(&job).chain_err( - || "Error creating map tasks for job.", - )?; + /// Splits the input for a job and schedules the map tasks in the background. + pub fn split_input(&self, job: Job) { + let state = Arc::clone(&self.state); + let worker_manager = Arc::clone(&self.worker_manager); + let task_processor = Arc::clone(&self.task_processor); - job.map_tasks_total = map_tasks_vec.len() as u32; + thread::spawn(move || { + info!("Splitting input for job with ID {}.", job.id); - let mut map_tasks: HashMap = HashMap::new(); - for task in map_tasks_vec { - self.schedule_task(task.clone()); - map_tasks.insert(task.id.to_owned(), task); - } + let map_tasks_vec = match task_processor.create_map_tasks(&job) { + Ok(tasks) => tasks, + Err(err) => { + output_error(&err.chain_err(|| "Error creating map tasks for job.")); + return; + } + }; - info!("Starting job with ID {}.", job.id); + let mut map_tasks: HashMap = HashMap::new(); + for task in map_tasks_vec { + map_tasks.insert(task.id.to_owned(), task.clone()); + let worker_manager = Arc::clone(&worker_manager); + thread::spawn(move || { worker_manager.run_task(task); }); + } + + info!("Starting job with ID {}.", job.id); + + let mut state = state.lock().unwrap(); + let result = state.input_splitting_complete(&job.id, map_tasks); + if let Err(err) = result { + output_error(&err.chain_err(|| "Error creating map tasks for job.")); + } + }); + } + /// Schedule a [`Job`](common::Job) to be executed. + pub fn schedule_job(&self, job: Job) -> Result<()> { let scheduled_job = ScheduledJob { - job, - tasks: map_tasks, + job: job.clone(), + tasks: Default::default(), }; - let mut state = self.state.lock().unwrap(); - state.add_job(scheduled_job).chain_err( - || "Error adding scheduled job to state store", - ) + { + let mut state = self.state.lock().unwrap(); + state.add_job(scheduled_job).chain_err( + || "Error adding scheduled job to state store", + )?; + } + + self.split_input(job); + Ok(()) } pub fn cancel_job(&self, job_id: &str) -> Result { @@ -313,6 +337,11 @@ impl SimpleStateHandling for Scheduler { } } + let splitting_jobs = state.get_splitting_jobs(); + for job in splitting_jobs { + self.split_input(job); + } + Ok(()) } } diff --git a/master/src/scheduling/state.rs b/master/src/scheduling/state.rs index 5c8bdb09..b9bcb0da 100644 --- a/master/src/scheduling/state.rs +++ b/master/src/scheduling/state.rs @@ -96,6 +96,35 @@ impl State { Ok(()) } + pub fn input_splitting_complete( + &mut self, + job_id: &str, + map_tasks: HashMap, + ) -> Result<()> { + let scheduled_job = match self.scheduled_jobs.get_mut(job_id) { + Some(scheduled_job) => scheduled_job, + None => return Err(format!("Job with ID {} is not found.", &job_id).into()), + }; + + if scheduled_job.job.status != pb::Status::CANCELLED { + scheduled_job.job.status = pb::Status::IN_QUEUE; + scheduled_job.job.map_tasks_total = map_tasks.len() as u32; + scheduled_job.tasks = map_tasks; + } + Ok(()) + } + + // Returns a vector of jobs in the SPLITTING_INPUT phase. 
+    pub fn get_splitting_jobs(&self) -> Vec<Job> {
+        let mut jobs = Vec::new();
+        for scheduled_job in self.scheduled_jobs.values() {
+            if scheduled_job.job.status == pb::Status::SPLITTING_INPUT {
+                jobs.push(scheduled_job.job.clone());
+            }
+        }
+        jobs
+    }
+
     pub fn update_job_started(&mut self, job_id: &str, time_started: DateTime<Utc>) -> Result<()> {
         let scheduled_job = match self.scheduled_jobs.get_mut(job_id) {
             Some(scheduled_job) => scheduled_job,
@@ -298,8 +327,9 @@ impl State {
     pub fn get_job_queue_size(&self) -> u32 {
         let mut job_count = 0;
         for scheduled_job in self.scheduled_jobs.values() {
-            if scheduled_job.job.status == pb::Status::IN_PROGRESS ||
-                scheduled_job.job.status == pb::Status::IN_QUEUE
+            let status = scheduled_job.job.status;
+            if status == pb::Status::IN_PROGRESS || status == pb::Status::IN_QUEUE ||
+                status == pb::Status::SPLITTING_INPUT
             {
                 job_count += 1;
             }
diff --git a/master/src/server/client_service.rs b/master/src/server/client_service.rs
index da33c766..3eced291 100644
--- a/master/src/server/client_service.rs
+++ b/master/src/server/client_service.rs
@@ -257,7 +257,7 @@ mod tests {
         let (_, mut item, _) = response.wait().unwrap();
         let status = item.reports.pop().unwrap().status;
 
-        assert_eq!(MapReduceStatus::IN_QUEUE, status)
+        assert_eq!(MapReduceStatus::SPLITTING_INPUT, status)
     }
 
     #[test]
diff --git a/proto/mapreduce.proto b/proto/mapreduce.proto
index 67893b80..a45d7bdf 100644
--- a/proto/mapreduce.proto
+++ b/proto/mapreduce.proto
@@ -66,11 +66,12 @@ message MapReduceStatusResponse {
 
 enum Status {
   UNKNOWN = 0;
-  DONE = 1;
-  IN_PROGRESS = 2;
-  IN_QUEUE = 3;
-  FAILED = 4;
-  CANCELLED = 5;
+  SPLITTING_INPUT = 1;
+  IN_QUEUE = 2;
+  IN_PROGRESS = 3;
+  DONE = 4;
+  FAILED = 5;
+  CANCELLED = 6;
 };
 
 message MapReduceReport {
From f2c5e3cfe16ad51e54dcc25b194a855fedfc537b Mon Sep 17 00:00:00 2001
From: Conor Griffin
Date: Thu, 12 Apr 2018 13:39:07 +0100
Subject: [PATCH 23/58] Fix clippy lints

---
 master/src/common/job.rs                            |  2 +-
 .../src/communication/intermediate_data_fetching.rs | 12 ++++++------
 worker/src/operations/reduce.rs                     |  8 ++++----
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/master/src/common/job.rs b/master/src/common/job.rs
index edf8e4a4..8ae1a0e0 100644
--- a/master/src/common/job.rs
+++ b/master/src/common/job.rs
@@ -134,7 +134,7 @@ impl Job {
 
             cpu_time: 0,
 
-            map_input_size: (options.map_size as u64) * MEGA_BYTE,
+            map_input_size: u64::from(options.map_size) * MEGA_BYTE,
         })
     }
 
diff --git a/worker/src/communication/intermediate_data_fetching.rs b/worker/src/communication/intermediate_data_fetching.rs
index 2cda2685..68856ad9 100644
--- a/worker/src/communication/intermediate_data_fetching.rs
+++ b/worker/src/communication/intermediate_data_fetching.rs
@@ -10,17 +10,17 @@ const INPUT_FETCHING_CPU_POOL_SIZE: usize = 20;
 
 pub fn fetch_reduce_inputs(
     input_files: Vec<String>,
-    output_uuid: String,
-    resources: OperationResources,
-    task_id: String,
+    output_uuid: &str,
+    resources: &OperationResources,
+    task_id: &str,
 ) -> Result<Vec<String>> {
     let cpu_pool = CpuPool::new(INPUT_FETCHING_CPU_POOL_SIZE);
     let mut input_futures = Vec::new();
 
     for reduce_input_file in input_files {
-        let output_uuid = output_uuid.clone();
-        let resources = resources.clone();
-        let task_id = task_id.clone();
+        let output_uuid = output_uuid.to_string();
+        let resources = resources.to_owned();
+        let task_id = task_id.to_string();
 
         let input_future = cpu_pool.spawn_fn(move || {
             let reduce_input_result =
diff --git a/worker/src/operations/reduce.rs b/worker/src/operations/reduce.rs
index 
2e115eac..8090de01 100644 --- a/worker/src/operations/reduce.rs +++ b/worker/src/operations/reduce.rs @@ -135,10 +135,10 @@ fn create_reduce_input( let mut reduce_map: HashMap> = HashMap::new(); let reduce_inputs = communication::fetch_reduce_inputs( - reduce_request.get_input_file_paths().clone().to_vec(), - output_uuid.to_string(), - resources.clone(), - reduce_request.task_id.to_string(), + reduce_request.get_input_file_paths().to_vec(), + output_uuid, + resources, + &reduce_request.task_id, ).chain_err(|| "Error fetching reduce inputs")?; for reduce_input in reduce_inputs { From 9e612dbe4c2bdb7fec498eb3408bb37a9073c616 Mon Sep 17 00:00:00 2001 From: Ryan Connell Date: Thu, 12 Apr 2018 22:33:02 +0100 Subject: [PATCH 24/58] Optimize S3 File Reading --- util/src/data_layer/s3_layer.rs | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/util/src/data_layer/s3_layer.rs b/util/src/data_layer/s3_layer.rs index fa7d71da..21b4508c 100644 --- a/util/src/data_layer/s3_layer.rs +++ b/util/src/data_layer/s3_layer.rs @@ -130,6 +130,11 @@ impl AbstractionLayer for AmazonS3AbstractionLayer { || "Unable to get abstracted path", )?; + + // This range includes the start/end byte. Since we don't want to include the end byte + // we subtract 1 here. + let range: String = format!("bytes={}-{}", start_byte, end_byte - 1); + let request = rusoto_s3::GetObjectRequest { bucket: self.bucket.clone(), if_match: None, @@ -138,7 +143,7 @@ impl AbstractionLayer for AmazonS3AbstractionLayer { if_unmodified_since: None, key: abstracted_path, part_number: None, - range: None, + range: Some(range), request_payer: None, response_cache_control: None, response_content_disposition: None, @@ -161,18 +166,11 @@ impl AbstractionLayer for AmazonS3AbstractionLayer { None => return Err("Object has no body".into()), }; - // TODO(rhino): Make this more efficient let result: Vec = streaming_body.concat2().wait().chain_err( || "Unable to get body of file", )?; - let mut bytes: Vec = vec![]; - let start = start_byte as usize; - let end = end_byte as usize; - - bytes.extend_from_slice(&result[start..end]); - - Ok(bytes) + Ok(result) } fn write_file(&self, path: &Path, data: &[u8]) -> Result<()> { From 84319f23001f9bf51579404b0edf4e0f15b85e13 Mon Sep 17 00:00:00 2001 From: Conor Griffin Date: Thu, 12 Apr 2018 23:17:50 +0100 Subject: [PATCH 25/58] Use CPU pool to process map inputs --- worker/src/operations/map.rs | 230 +++++++++++++++++---------------- worker/src/operations/state.rs | 6 - 2 files changed, 120 insertions(+), 116 deletions(-) diff --git a/worker/src/operations/map.rs b/worker/src/operations/map.rs index 884b2c5e..333dcc2b 100644 --- a/worker/src/operations/map.rs +++ b/worker/src/operations/map.rs @@ -7,6 +7,9 @@ use std::sync::{Arc, Mutex}; use std::thread; use bson; +use futures::Future; +use futures::future; +use futures_cpupool::{CpuPool, CpuFuture}; use serde_json; use uuid::Uuid; @@ -150,23 +153,14 @@ fn combine_map_results( initial_cpu_time: u64, output_dir: &str, task_id: &str, + map_results_vec: Vec, ) -> Result<()> { - // Map of partition to maps of key to value. - let mut partition_map: PartitionMap = HashMap::new(); - let mut map_results_vec = Vec::new(); - { - let mut operation_state = resources.operation_state.lock().unwrap(); - - // Task has been cancelled - if operation_state.current_task_id != task_id { - return Ok(()); - } - - for map_result in operation_state.intermediate_map_results.drain(0..) 
{ - map_results_vec.push(map_result); - } + if operation_handler::check_task_cancelled(&resources.operation_state, task_id) { + return Ok(()); } + // Map of partition to maps of key to value. + let mut partition_map: PartitionMap = HashMap::new(); for map_result in map_results_vec { parse_map_results(&map_result, &mut partition_map) .chain_err(|| "Error parsing map result")?; @@ -232,46 +226,95 @@ fn process_map_operation_error( log_map_operation_err(err, &resources.operation_state, task_id); } -fn process_map_result( - result: Result, +fn run_map_input( + input_location: &pb::InputLocation, + map_options: &pb::PerformMapRequest, resources: &OperationResources, - initial_cpu_time: u64, - output_dir: &str, - task_id: &str, -) { - // If we have cancelled the current task then we should avoid processing the map results. - if operation_handler::check_task_cancelled(&resources.operation_state, task_id) { - return; +) -> Result { + if operation_handler::check_task_cancelled(&resources.operation_state, &map_options.task_id) { + return Ok(String::new()); } - match result { - Ok(map_result) => { - let finished = { - let mut operation_state = resources.operation_state.lock().unwrap(); + info!( + "Running map task for {} ({} - > {})", + input_location.input_path, + input_location.start_byte, + input_location.end_byte + ); - if operation_state.current_task_id != task_id { - // The map operation has failed, no need to continue. - return; - } + let map_input_value = io::read_location(&resources.data_abstraction_layer, input_location) + .chain_err(|| "unable to open input file")?; + + let absolute_path = resources + .data_abstraction_layer + .get_local_file(Path::new(map_options.get_mapper_file_path())) + .chain_err(|| "Unable to get absolute path")?; + + let child = Command::new(absolute_path) + .arg("map") + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .chain_err(|| "Failed to start map operation process.")?; + + let map_input = MapInput { + key: input_location.get_input_path().to_owned(), + value: map_input_value, + }; - operation_state.intermediate_map_results.push(map_result); - operation_state.waiting_map_operations -= 1; - operation_state.waiting_map_operations == 0 - }; + let serialized_map_input = bson::to_bson(&map_input).chain_err( + || "Could not serialize map input to bson.", + )?; - if finished { - info!("Map operations finished"); - let result = combine_map_results(resources, initial_cpu_time, output_dir, task_id); + let map_input_document; + if let bson::Bson::Document(document) = serialized_map_input { + map_input_document = document; + } else { + return Err("Could not convert map input to bson::Document.".into()); + } - if let Err(err) = result { - process_map_operation_error(err, resources, initial_cpu_time, task_id); - } + map_operation_thread_impl(&map_input_document, child) +} + +fn handle_map_results( + map_options: &pb::PerformMapRequest, + initial_cpu_time: u64, + resources: &OperationResources, + output_dir_uuid: &str, + map_result_futures: Vec>, +) { + let mut output_path = PathBuf::new(); + output_path.push(WORKER_OUTPUT_DIRECTORY); + output_path.push(output_dir_uuid); + output_path.push("map"); + let output_path_str: String = (*output_path.to_string_lossy()).to_owned(); + + let results_future = future::join_all(map_result_futures); + let resources = resources.to_owned(); + let map_options = map_options.to_owned(); + + thread::spawn(move || { + let map_results = results_future.wait(); + if let Ok(map_output) = map_results { + let 
combine_result = combine_map_results( + &resources, + initial_cpu_time, + &output_path_str, + &map_options.task_id, + map_output, + ); + + if let Err(err) = combine_result { + process_map_operation_error( + err, + &resources, + initial_cpu_time, + &map_options.task_id, + ); } } - Err(err) => { - process_map_operation_error(err, resources, initial_cpu_time, task_id); - } - } + }); } // Internal implementation for performing a map task. @@ -293,79 +336,46 @@ fn internal_perform_map( let initial_cpu_time; { - let mut operation_state = resources.operation_state.lock().unwrap(); - operation_state.waiting_map_operations = input_locations.len(); - operation_state.intermediate_map_results.clear(); - + let operation_state = resources.operation_state.lock().unwrap(); initial_cpu_time = operation_state.initial_cpu_time; } + let cpu_pool = CpuPool::new_num_cpus(); + let mut map_result_futures = Vec::new(); for input_location in input_locations { - // Make sure the job hasn't been cancelled before continuing. - if operation_handler::check_task_cancelled( - &resources.operation_state, - &map_options.task_id, - ) - { - return Ok(()); - } - - info!( - "Running map task for {} ({} - > {})", - input_location.input_path, - input_location.start_byte, - input_location.end_byte - ); - - let map_input_value = io::read_location(&resources.data_abstraction_layer, input_location) - .chain_err(|| "unable to open input file")?; - - let absolute_path = resources - .data_abstraction_layer - .get_local_file(Path::new(map_options.get_mapper_file_path())) - .chain_err(|| "Unable to get absolute path")?; - let child = Command::new(absolute_path) - .arg("map") - .stdin(Stdio::piped()) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .spawn() - .chain_err(|| "Failed to start map operation process.")?; - - let map_input = MapInput { - key: input_location.get_input_path().to_owned(), - value: map_input_value, - }; - - let serialized_map_input = bson::to_bson(&map_input).chain_err( - || "Could not serialize map input to bson.", - )?; - - let map_input_document; - if let bson::Bson::Document(document) = serialized_map_input { - map_input_document = document; - } else { - return Err("Could not convert map input to bson::Document.".into()); - } - - let output_path_str: String = (*output_path.to_string_lossy()).to_owned(); - let task_id = map_options.task_id.clone(); - - let resources = resources.clone(); - - thread::spawn(move || { - let result = map_operation_thread_impl(&map_input_document, child); + let resources = resources.to_owned(); + let map_options = map_options.to_owned(); + let input_location = input_location.to_owned(); + + let map_result_future = cpu_pool.spawn_fn(move || { + let map_result = run_map_input(&input_location, &map_options, &resources); + + match map_result { + Ok(map_output) => future::ok(map_output), + Err(err) => { + process_map_operation_error( + err, + &resources, + initial_cpu_time, + &map_options.task_id, + ); + future::err::("Running map input failed".into()) + } + } - process_map_result( - result, - &resources, - initial_cpu_time, - &output_path_str, - &task_id, - ); }); + + map_result_futures.push(map_result_future); } + handle_map_results( + map_options, + initial_cpu_time, + resources, + output_dir_uuid, + map_result_futures, + ); + Ok(()) } diff --git a/worker/src/operations/state.rs b/worker/src/operations/state.rs index fc63fd06..bc1e85d3 100644 --- a/worker/src/operations/state.rs +++ b/worker/src/operations/state.rs @@ -10,9 +10,6 @@ pub struct OperationState { // Initial CPU time of 
the current operation. This is used to calculate the total cpu time used
    // for an operation.
    pub initial_cpu_time: u64,
-
-    pub waiting_map_operations: usize,
-    pub intermediate_map_results: Vec<String>,
 }

 impl OperationState {
@@ -22,9 +19,6 @@ impl OperationState {
             operation_status: pb::OperationStatus::UNKNOWN,

             initial_cpu_time: 0,
-
-            waiting_map_operations: 0,
-            intermediate_map_results: Vec::new(),
         }
     }

From e44eaf8d708425299ebef7b8b45d623b9fe889ba Mon Sep 17 00:00:00 2001
From: Conor Griffin
Date: Fri, 13 Apr 2018 00:21:34 +0100
Subject: [PATCH 26/58] Fix task being reassigned incorrectly

---
 master/src/common/worker.rs           |  2 ++
 master/src/worker_management/state.rs | 17 ++++++++++++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/master/src/common/worker.rs b/master/src/common/worker.rs
index 0bda3a7b..d494409d 100644
--- a/master/src/common/worker.rs
+++ b/master/src/common/worker.rs
@@ -35,6 +35,7 @@ pub struct Worker {
     pub operation_status: pb::OperationStatus,

     pub status_last_updated: DateTime<Utc>,
+    pub task_last_updated: DateTime<Utc>,
     pub current_task_id: String,

     pub worker_id: String,
@@ -59,6 +60,7 @@ impl Worker {
             operation_status: pb::OperationStatus::UNKNOWN,

             status_last_updated: Utc::now(),
+            task_last_updated: Utc::now(),
             current_task_id: String::new(),

             worker_id,

diff --git a/master/src/worker_management/state.rs b/master/src/worker_management/state.rs
index 0238d049..8c46e50b 100644
--- a/master/src/worker_management/state.rs
+++ b/master/src/worker_management/state.rs
@@ -23,6 +23,10 @@ const REQUEUED_TASK_PRIORITY: u32 = 15;
 // Max tasks to consider from the top of the task queue when trying to find the best task to assign.
 const MAX_TASKS_TO_CONSIDER: u32 = 5;

+// If a worker reports being available more than this many seconds after being assigned a task,
+// assume that the master's view of the worker is out of sync and requeue the task.
+const TIME_REASSIGN_REPORTING_AVAILABLE_S: i64 = 10;
+
 pub struct State {
     // A map of worker id to worker.
     workers: HashMap<String, Worker>,
@@ -147,7 +151,17 @@ impl State {
         worker.operation_status = operation_status;
         worker.status_last_updated = Utc::now();

-        if worker_status == pb::WorkerStatus::AVAILABLE && !worker.current_task_id.is_empty() {
+        // If a worker has not recently been assigned a task and is reporting available,
+        // we assume that the master's view of the worker's current task is out of date and
+        // requeue that task.
+        // If the worker has only recently been assigned a task, it may not yet be reporting the
+        // most up to date status.
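+        // Illustrative timeline (added note, not part of the original patch): a task
+        // is assigned at t=0s; an AVAILABLE report at t=3s is treated as a stale status
+        // and ignored, while an AVAILABLE report at t=12s exceeds the 10 second
+        // threshold and causes the task below to be pushed back onto the queue.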
+ let time_since_task_assigned = Utc::now().timestamp() - + worker.task_last_updated.timestamp(); + if time_since_task_assigned > TIME_REASSIGN_REPORTING_AVAILABLE_S && + worker_status == pb::WorkerStatus::AVAILABLE && + !worker.current_task_id.is_empty() + { if let Some(assigned_task) = self.tasks.get_mut(&worker.current_task_id) { self.priority_task_queue.push(PriorityTask::new( worker.current_task_id.clone(), @@ -395,6 +409,7 @@ impl State { let worker = self.workers.get_mut(worker_id).chain_err(|| { format!("Worker with ID {} not found.", worker_id) })?; + worker.task_last_updated = Utc::now(); worker.current_task_id = task_id.to_owned(); Ok(assigned_task) From 648a5f3449ba3c7f8076af8ff55ffb142d3a322d Mon Sep 17 00:00:00 2001 From: Darragh Griffin Date: Sat, 14 Apr 2018 14:12:48 +0100 Subject: [PATCH 27/58] Format code with new rustfmt version Formatted with rustfmt 0.4.1 --- cli/src/commands/cancel.rs | 2 +- cli/src/commands/download.rs | 72 +++---- cli/src/commands/mod.rs | 2 +- cli/src/commands/run.rs | 48 ++--- cli/src/commands/status.rs | 12 +- cli/src/commands/upload.rs | 51 ++--- cli/src/common/client_id.rs | 22 +- cli/src/main.rs | 2 +- cli/src/parser.rs | 2 +- cli/tests/end-to-end.rs | 1 - libcerberus/examples/distributed-grep.rs | 14 +- libcerberus/examples/end-to-end.rs | 17 +- libcerberus/examples/rating-aggregator.rs | 54 +++-- libcerberus/examples/rating-by-genre.rs | 25 +-- libcerberus/examples/rating-by-year.rs | 16 +- libcerberus/examples/word-counter.rs | 22 +- libcerberus/src/combiner.rs | 2 +- libcerberus/src/io.rs | 48 ++--- libcerberus/src/lib.rs | 4 +- libcerberus/src/reducer.rs | 7 +- libcerberus/src/registry.rs | 32 +-- libcerberus/src/runner.rs | 64 +++--- libcerberus/src/serialise.rs | 20 +- master/src/common/job.rs | 69 +++--- master/src/common/mod.rs | 2 +- master/src/common/task.rs | 113 +++++----- master/src/common/worker.rs | 25 +-- master/src/dashboard/server.rs | 110 +++++----- master/src/initialization/dashboard_server.rs | 8 +- master/src/initialization/data_layer.rs | 23 +- master/src/initialization/grpc_server.rs | 4 +- master/src/initialization/master_resources.rs | 10 +- master/src/initialization/state_handler.rs | 14 +- master/src/main.rs | 4 +- master/src/main_loop.rs | 15 +- master/src/parser.rs | 12 +- master/src/scheduling/mod.rs | 2 +- master/src/scheduling/scheduler.rs | 110 +++++----- master/src/scheduling/state.rs | 74 +++---- master/src/scheduling/task_processor.rs | 48 ++--- master/src/server/client_service.rs | 17 +- master/src/server/filesystem_service.rs | 12 +- master/src/server/mod.rs | 16 +- master/src/server/worker_service.rs | 4 +- master/src/state/handler.rs | 49 ++--- .../worker_communication/worker_interface.rs | 7 +- master/src/worker_management/mod.rs | 4 +- master/src/worker_management/state.rs | 203 ++++++++---------- .../src/worker_management/worker_manager.rs | 118 +++++----- util/src/data_layer/nfs_layer.rs | 61 +++--- util/src/data_layer/null_layer.rs | 28 +-- util/src/data_layer/s3_layer.rs | 108 +++++----- .../filesystem_manager.rs | 35 ++- .../filesystem_master_interface.rs | 14 +- .../filesystem_worker_interface.rs | 16 +- .../local_file_manager.rs | 69 +++--- util/src/distributed_filesystem/mod.rs | 8 +- util/src/lib.rs | 4 +- util/src/state/mod.rs | 2 +- .../intermediate_data_fetching.rs | 11 +- worker/src/communication/worker_interface.rs | 32 ++- worker/src/initialization/data_layer.rs | 14 +- worker/src/initialization/grpc_server.rs | 2 +- worker/src/initialization/register_worker.rs | 4 +- 
worker/src/initialization/state_handler.rs | 12 +- worker/src/initialization/worker_resources.rs | 12 +- worker/src/main.rs | 5 +- worker/src/main_loop.rs | 9 +- worker/src/operations/combine.rs | 54 ++--- worker/src/operations/io.rs | 19 +- worker/src/operations/map.rs | 62 +++--- worker/src/operations/mod.rs | 2 +- worker/src/operations/operation_handler.rs | 14 +- worker/src/operations/reduce.rs | 57 +++-- worker/src/operations/state.rs | 1 - worker/src/parser.rs | 8 +- worker/src/server/filesystem_service.rs | 12 +- .../src/server/intermediate_data_service.rs | 2 +- worker/src/server/master_service.rs | 2 +- worker/src/server/mod.rs | 42 ++-- worker/src/state/handler.rs | 31 ++- 81 files changed, 1006 insertions(+), 1358 deletions(-) diff --git a/cli/src/commands/cancel.rs b/cli/src/commands/cancel.rs index 6626cdbc..2f42acf0 100644 --- a/cli/src/commands/cancel.rs +++ b/cli/src/commands/cancel.rs @@ -1,10 +1,10 @@ use clap::ArgMatches; use grpc::RequestOptions; -use common::get_client_id; use cerberus_proto::mapreduce as pb; use cerberus_proto::mapreduce_grpc as grpc_pb; use cerberus_proto::mapreduce_grpc::MapReduceService; +use common::get_client_id; use errors::*; pub fn cancel(client: &grpc_pb::MapReduceServiceClient, args: Option<&ArgMatches>) -> Result<()> { diff --git a/cli/src/commands/download.rs b/cli/src/commands/download.rs index 29b2d8ff..00c1b517 100644 --- a/cli/src/commands/download.rs +++ b/cli/src/commands/download.rs @@ -3,36 +3,35 @@ use std::fs; use std::fs::File; use std::io::Write; use std::net::SocketAddr; -use std::sync::Arc; use std::path::{Path, PathBuf}; +use std::sync::Arc; use clap::ArgMatches; use errors::*; use util::data_layer::AbstractionLayer; -use util::distributed_filesystem::{NetworkFileSystemMasterInterface, DFSAbstractionLayer, - LocalFileManager}; +use util::distributed_filesystem::{DFSAbstractionLayer, LocalFileManager, + NetworkFileSystemMasterInterface}; const DFS_FILE_DIRECTORY: &str = "/tmp/cerberus/dfs/"; const MEGA_BYTE: u64 = 1000 * 1000; const MAX_DOWNLOAD_SIZE: u64 = MEGA_BYTE * 32; - fn get_files_to_download( data_layer: &DFSAbstractionLayer, remote_path: &str, ) -> Result> { let mut files = Vec::new(); - if data_layer.is_file(Path::new(remote_path)).chain_err( - || "Error checking is file", - )? + if data_layer + .is_file(Path::new(remote_path)) + .chain_err(|| "Error checking is file")? 
{ files.push(remote_path.to_string()); } else { - let dir_entries = data_layer.read_dir(Path::new(remote_path)).chain_err( - || "Error reading directory", - )?; + let dir_entries = data_layer + .read_dir(Path::new(remote_path)) + .chain_err(|| "Error reading directory")?; for entry in dir_entries { files.push(entry.to_string_lossy().to_string()); @@ -51,20 +50,17 @@ fn download_file( let mut local_directory = Path::new(local_path).to_path_buf(); local_directory.pop(); - fs::create_dir_all(local_directory).chain_err( - || "Error creating new local directory", - )?; + fs::create_dir_all(local_directory).chain_err(|| "Error creating new local directory")?; - let mut file = File::create(local_path).chain_err(|| { - format!("unable to create file {}", local_path) - })?; + let mut file = + File::create(local_path).chain_err(|| format!("unable to create file {}", local_path))?; let remote_path = Path::new(remote_path); let mut start_byte = 0; - let file_length = data_layer.get_file_length(remote_path).chain_err( - || "Error getting file length", - )?; + let file_length = data_layer + .get_file_length(remote_path) + .chain_err(|| "Error getting file length")?; while start_byte < file_length { let end_byte = min(file_length, start_byte + MAX_DOWNLOAD_SIZE); @@ -72,12 +68,8 @@ fn download_file( .read_file_location(remote_path, start_byte, end_byte) .chain_err(|| "Error reading file")?; - file.write_all(&file_data).chain_err(|| { - format!( - "unable to write content to {}", - local_path, - ) - })?; + file.write_all(&file_data) + .chain_err(|| format!("unable to write content to {}", local_path,))?; start_byte = end_byte; } @@ -88,9 +80,9 @@ fn get_download_file_path(local_path_str: &str, remote_path: &str) -> Result Result Result<()> { println!("Downloading File(s) from Cluster..."); - let remote_path = args.value_of("remote_path").chain_err( - || "Remote path can not be empty", - )?; + let remote_path = args.value_of("remote_path") + .chain_err(|| "Remote path can not be empty")?; - let local_path = args.value_of("local_path").chain_err( - || "Local path can not be empty", - )?; + let local_path = args.value_of("local_path") + .chain_err(|| "Local path can not be empty")?; - let master_interface = - NetworkFileSystemMasterInterface::new(*master_addr) - .chain_err(|| "Error creating distributed filesystem master interface")?; + let master_interface = NetworkFileSystemMasterInterface::new(*master_addr) + .chain_err(|| "Error creating distributed filesystem master interface")?; let mut path_buf = PathBuf::new(); path_buf.push(DFS_FILE_DIRECTORY); let local_file_manager = Arc::new(LocalFileManager::new(path_buf)); let data_layer = DFSAbstractionLayer::new(local_file_manager, Box::new(master_interface)); - let files = get_files_to_download(&data_layer, remote_path).chain_err( - || "Error getting files to download", - )?; + let files = get_files_to_download(&data_layer, remote_path) + .chain_err(|| "Error getting files to download")?; if files.is_empty() { return Err("No files found to download".into()); } else if files.len() > 1 && Path::new(local_path).is_file() { - return Err( - "Local Path must be directory to download multiple files".into(), - ); + return Err("Local Path must be directory to download multiple files".into()); } for remote_file_path in files { diff --git a/cli/src/commands/mod.rs b/cli/src/commands/mod.rs index dcebe6e1..942cfd3f 100644 --- a/cli/src/commands/mod.rs +++ b/cli/src/commands/mod.rs @@ -8,6 +8,6 @@ pub mod upload; pub use self::cancel::cancel; pub use 
self::cluster_status::cluster_status;
 pub use self::download::download;
-pub use self::status::status;
 pub use self::run::run;
+pub use self::status::status;
 pub use self::upload::upload;

diff --git a/cli/src/commands/run.rs b/cli/src/commands/run.rs
index 18e8a813..ca2ece07 100644
--- a/cli/src/commands/run.rs
+++ b/cli/src/commands/run.rs
@@ -32,23 +32,18 @@ fn get_priority(matches: &ArgMatches) -> Result<u32> {
     let priority: u32 = match priority_str.parse() {
         Ok(val) => val,
         Err(err) => {
-            return Err(
-                format!(
-                    "Error occurred while converting '{}' to a u32: {}",
-                    priority_str,
-                    err
-                ).into(),
-            );
+            return Err(format!(
+                "Error occurred while converting '{}' to a u32: {}",
+                priority_str, err
+            ).into());
         }
     };
     if priority < 1 || priority > 10 {
-        return Err(
-            format!(
-                "Priority can only be between 1 and 10. {} is not in this range",
-                priority
-            ).into(),
-        );
+        return Err(format!(
+            "Priority can only be between 1 and 10. {} is not in this range",
+            priority
+        ).into());
     }
     Ok(priority)
 }
@@ -58,19 +53,14 @@ fn get_map_size(matches: &ArgMatches) -> Result<u32> {
     let map_size: u32 = match map_size_str.parse() {
         Ok(val) => val,
         Err(err) => {
-            return Err(
-                format!(
-                    "Error occurred while converting '{}' to a u32: {}",
-                    map_size_str,
-                    err
-                ).into(),
-            );
+            return Err(format!(
+                "Error occurred while converting '{}' to a u32: {}",
+                map_size_str, err
+            ).into());
         }
     };
     if map_size < 1 {
-        return Err(
-            "Map Size must be greater than or equal to 1 megabyte".into(),
-        );
+        return Err("Map Size must be greater than or equal to 1 megabyte".into());
     }
     Ok(map_size)
 }
@@ -81,15 +71,11 @@ pub fn run(client: &grpc_pb::MapReduceServiceClient, matches: &ArgMatches) -> Re
         .chain_err(|| "Input directory cannot be empty")?
         .to_owned();

-    input = verify_valid_path(&input).chain_err(
-        || "Invalid input path.",
-    )?;
+    input = verify_valid_path(&input).chain_err(|| "Invalid input path.")?;

     let output = matches.value_of("output").unwrap_or("");
     if !output.is_empty() {
-        verify_valid_path(output).chain_err(
-            || "Invalid output path",
-        )?;
+        verify_valid_path(output).chain_err(|| "Invalid output path")?;
     }

     let mut binary = matches
@@ -99,9 +85,7 @@ pub fn run(client: &grpc_pb::MapReduceServiceClient, matches: &ArgMatches) -> Re

     let map_size: u32 = get_map_size(matches).chain_err(|| "Unable to get map size")?;

-    binary = verify_valid_path(&binary).chain_err(
-        || "Invalid binary path.",
-    )?;
+    binary = verify_valid_path(&binary).chain_err(|| "Invalid binary path.")?;

     let priority = get_priority(matches)?;

diff --git a/cli/src/commands/status.rs b/cli/src/commands/status.rs
index a8eebda4..eca9d3bf 100644
--- a/cli/src/commands/status.rs
+++ b/cli/src/commands/status.rs
@@ -2,10 +2,10 @@ use chrono::Local;
 use clap::ArgMatches;
 use grpc::RequestOptions;

-use common::get_client_id;
 use cerberus_proto::mapreduce as pb;
 use cerberus_proto::mapreduce_grpc as grpc_pb;
 use cerberus_proto::mapreduce_grpc::MapReduceService;
+use common::get_client_id;
 use errors::*;

 fn print_table(rep: &pb::MapReduceReport) {
@@ -19,12 +19,10 @@ fn print_table(rep: &pb::MapReduceReport) {
             let time_taken = rep.get_done_timestamp() - rep.get_started_timestamp();
             format!("DONE ({}s)", time_taken)
         }
-        pb::Status::IN_PROGRESS => {
-            format!(
-                "IN_PROGRESS ({})",
-                get_time_offset(rep.get_started_timestamp())
-            )
-        }
+        pb::Status::IN_PROGRESS => format!(
+            "IN_PROGRESS ({})",
+            get_time_offset(rep.get_started_timestamp())
+        ),
         pb::Status::IN_QUEUE => format!("IN_QUEUE ({})", rep.get_queue_length()),
         pb::Status::FAILED =>
format!("FAILED\n{}", rep.get_failure_details()).to_owned(), pb::Status::CANCELLED => "CANCELLED".to_owned(), diff --git a/cli/src/commands/upload.rs b/cli/src/commands/upload.rs index e0272b8c..adcfc49b 100644 --- a/cli/src/commands/upload.rs +++ b/cli/src/commands/upload.rs @@ -9,7 +9,7 @@ use std::path::Path; use clap::ArgMatches; use errors::*; -use util::distributed_filesystem::{NetworkFileSystemMasterInterface, FileSystemMasterInterface}; +use util::distributed_filesystem::{FileSystemMasterInterface, NetworkFileSystemMasterInterface}; const MEGA_BYTE: u64 = 1000 * 1000; const MAX_UPLOAD_SIZE: u64 = MEGA_BYTE * 32; @@ -18,9 +18,7 @@ fn get_local_files(path: &Path) -> Result> { let mut files = Vec::new(); if path.is_dir() { - let entries = fs::read_dir(path).chain_err( - || "Unable to read local file directroy", - )?; + let entries = fs::read_dir(path).chain_err(|| "Unable to read local file directroy")?; for entry in entries { let entry: DirEntry = entry.chain_err(|| "Error reading input directory")?; @@ -42,14 +40,10 @@ fn upload_local_file( remote_path: &str, ) -> Result<()> { println!("Uploading File {} to Cluster", local_path); - let file = File::open(local_path).chain_err(|| { - format!("unable to open file {}", local_path) - })?; + let file = File::open(local_path).chain_err(|| format!("unable to open file {}", local_path))?; let mut start_byte = 0; - let metadata = fs::metadata(local_path).chain_err( - || "Error getting metadata", - )?; + let metadata = fs::metadata(local_path).chain_err(|| "Error getting metadata")?; let file_length = metadata.len(); let mut first = true; // Allow uploading empty files @@ -58,9 +52,9 @@ fn upload_local_file( first = false; let mut data = vec![0; min(MAX_UPLOAD_SIZE, file_length - start_byte) as usize]; - let bytes_read = buf_reader.read(&mut data).chain_err(|| { - format!("unable to read content of {}", local_path) - })?; + let bytes_read = buf_reader + .read(&mut data) + .chain_err(|| format!("unable to read content of {}", local_path))?; master_interface .upload_file_chunk(remote_path, start_byte, data) @@ -69,7 +63,6 @@ fn upload_local_file( start_byte += bytes_read as u64; } - Ok(()) } @@ -84,9 +77,9 @@ fn get_upload_file_path(local_path: &str, remote_path: Option<&str>) -> Result) -> Result Result<()> { println!("Uploading File(s) to Cluster..."); - let local_path = args.value_of("local_path").chain_err( - || "Local path can not be empty", - )?; + let local_path = args.value_of("local_path") + .chain_err(|| "Local path can not be empty")?; let remote_path = args.value_of("remote_path"); - let local_files = get_local_files(Path::new(local_path)).chain_err( - || "Error getting files to uplaod to cluster", - )?; + let local_files = get_local_files(Path::new(local_path)) + .chain_err(|| "Error getting files to uplaod to cluster")?; if local_files.is_empty() { - return Err( - "No local file found to upload. Is the directory empty?".into(), - ); + return Err("No local file found to upload. 
Is the directory empty?".into()); } else if let Some(remote_path_str) = remote_path { if local_files.len() > 1 { let remote_path = Path::new(remote_path_str); @@ -120,14 +109,12 @@ pub fn upload(master_addr: &SocketAddr, args: &ArgMatches) -> Result<()> { } } - let master_interface = - NetworkFileSystemMasterInterface::new(*master_addr) - .chain_err(|| "Error creating distributed filesystem master interface")?; + let master_interface = NetworkFileSystemMasterInterface::new(*master_addr) + .chain_err(|| "Error creating distributed filesystem master interface")?; for local_path in local_files { - let remote_path_str = get_upload_file_path(&local_path, remote_path).chain_err( - || "Error getting file path to upload", - )?; + let remote_path_str = get_upload_file_path(&local_path, remote_path) + .chain_err(|| "Error getting file path to upload")?; upload_local_file(&master_interface, &local_path, &remote_path_str) .chain_err(|| "Error uploading local file")?; diff --git a/cli/src/common/client_id.rs b/cli/src/common/client_id.rs index 0b7893e6..7f3bfda9 100644 --- a/cli/src/common/client_id.rs +++ b/cli/src/common/client_id.rs @@ -14,18 +14,13 @@ const CLIENT_ID_FILE: &str = "cerberus"; fn create_new_client_id(dir: &str, file_path: &str) -> Result { // Create new client id as we do not have one saved. - fs::create_dir_all(dir).chain_err( - || "Error creating new client id.", - )?; + fs::create_dir_all(dir).chain_err(|| "Error creating new client id.")?; let client_id = Uuid::new_v4().to_string(); - let mut file = fs::File::create(file_path).chain_err( - || "Error creating new client id.", - )?; + let mut file = fs::File::create(file_path).chain_err(|| "Error creating new client id.")?; - file.write_all(client_id.as_bytes()).chain_err( - || "Error creating new client id.", - )?; + file.write_all(client_id.as_bytes()) + .chain_err(|| "Error creating new client id.")?; Ok(client_id) } @@ -43,14 +38,11 @@ pub fn get_client_id() -> Result { let file_path = path_buf.to_str().chain_err(|| "Error getting client id.")?; if fs::metadata(file_path).is_ok() { - let mut file = fs::File::open(file_path).chain_err( - || "Error getting client id.", - )?; + let mut file = fs::File::open(file_path).chain_err(|| "Error getting client id.")?; let mut client_id = String::new(); - file.read_to_string(&mut client_id).chain_err( - || "Error getting client id.", - )?; + file.read_to_string(&mut client_id) + .chain_err(|| "Error getting client id.")?; return Ok(client_id); } diff --git a/cli/src/main.rs b/cli/src/main.rs index 6ce36474..2d1f765b 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -6,8 +6,8 @@ extern crate error_chain; extern crate grpc; #[macro_use] extern crate prettytable; -extern crate uuid; extern crate util; +extern crate uuid; extern crate cerberus_proto; diff --git a/cli/src/parser.rs b/cli/src/parser.rs index 6fad8fcb..907b6aca 100644 --- a/cli/src/parser.rs +++ b/cli/src/parser.rs @@ -1,4 +1,4 @@ -use clap::{App, SubCommand, Arg, ArgMatches}; +use clap::{App, Arg, ArgMatches, SubCommand}; pub fn parse_command_line<'a>() -> ArgMatches<'a> { App::new("cli") diff --git a/cli/tests/end-to-end.rs b/cli/tests/end-to-end.rs index e032a2bc..768998ba 100644 --- a/cli/tests/end-to-end.rs +++ b/cli/tests/end-to-end.rs @@ -1,6 +1,5 @@ /// This is a set of tests which is ran against the CLI. It spawns a test server to which the CLI /// connects to. 
- use std::env; use std::path::PathBuf; use std::process::Command; diff --git a/libcerberus/examples/distributed-grep.rs b/libcerberus/examples/distributed-grep.rs index 594d1236..2ba4e9ef 100644 --- a/libcerberus/examples/distributed-grep.rs +++ b/libcerberus/examples/distributed-grep.rs @@ -31,16 +31,14 @@ impl Map for GrepMapper { where E: EmitIntermediate, { - let regex = Regex::new(REGEX).chain_err( - || "Error creating regex object.", - )?; + let regex = Regex::new(REGEX).chain_err(|| "Error creating regex object.")?; for line in input.value.lines() { if regex.is_match(line) { let word = get_longest_word(line); - emitter.emit(word.len(), line.to_owned()).chain_err( - || "Error emitting map key-value pair.", - )?; + emitter + .emit(word.len(), line.to_owned()) + .chain_err(|| "Error emitting map key-value pair.")?; } } Ok(()) @@ -61,9 +59,7 @@ impl Reduce for GrepReducer { } fn run() -> Result<()> { - env_logger::init().chain_err( - || "Failed to initialise logging.", - )?; + env_logger::init().chain_err(|| "Failed to initialise logging.")?; let grep_mapper = GrepMapper; let grep_reducer = GrepReducer; diff --git a/libcerberus/examples/end-to-end.rs b/libcerberus/examples/end-to-end.rs index 710ba08d..5b92c772 100644 --- a/libcerberus/examples/end-to-end.rs +++ b/libcerberus/examples/end-to-end.rs @@ -25,10 +25,7 @@ impl Reduce for TestReducer { where E: EmitFinal, { - emitter.emit(input.values.iter().fold( - String::new(), - |acc, x| acc + x, - ))?; + emitter.emit(input.values.iter().fold(String::new(), |acc, x| acc + x))?; Ok(()) } } @@ -37,12 +34,16 @@ struct TestPartitioner; impl Partition for TestPartitioner { fn partition(&self, input: PartitionInputKV) -> Result { let key = input.key; - let first_char = key.chars().nth(0).chain_err( - || "Cannot partition key of empty string.", - )?; + let first_char = key.chars() + .nth(0) + .chain_err(|| "Cannot partition key of empty string.")?; let partition = { if first_char.is_lowercase() { - if first_char > 'm' { 1 } else { 0 } + if first_char > 'm' { + 1 + } else { + 0 + } } else if first_char > 'M' { 1 } else { diff --git a/libcerberus/examples/rating-aggregator.rs b/libcerberus/examples/rating-aggregator.rs index bf7324ad..a0577ffe 100644 --- a/libcerberus/examples/rating-aggregator.rs +++ b/libcerberus/examples/rating-aggregator.rs @@ -45,24 +45,24 @@ impl Map for RatingAggregatorMapper { let movie_title = format!("T:{}", info[1]); let movie_genre = format!("G:{}", info[2]); - emitter.emit(movie_id, movie_title).chain_err( - || "Error emitting map key-value pair.", - )?; - emitter.emit(movie_id, movie_genre).chain_err( - || "Error emitting map key-value pair.", - )?; + emitter + .emit(movie_id, movie_title) + .chain_err(|| "Error emitting map key-value pair.")?; + emitter + .emit(movie_id, movie_genre) + .chain_err(|| "Error emitting map key-value pair.")?; } else { // Rating info let movie_id: u32 = info[1].parse().chain_err(|| "Error parsing movie id")?; let rating: f64 = info[2].parse().chain_err(|| "Error parsing movie rating")?; - emitter.emit(movie_id, rating.to_string()).chain_err( - || "Error emitting map key-value pair.", - )?; + emitter + .emit(movie_id, rating.to_string()) + .chain_err(|| "Error emitting map key-value pair.")?; - emitter.emit(movie_id, "C:1".to_string()).chain_err( - || "Error emitting map key-value pair.", - )?; + emitter + .emit(movie_id, "C:1".to_string()) + .chain_err(|| "Error emitting map key-value pair.")?; } } Ok(()) @@ -119,22 +119,21 @@ impl Combine for RatingAggregatorCombiner { { let 
combine_result = do_rating_combine(input)?; if !combine_result.title.is_empty() { - emitter.emit(combine_result.title).chain_err( - || "Error emitting value.", - )?; - + emitter + .emit(combine_result.title) + .chain_err(|| "Error emitting value.")?; } if !combine_result.genres.is_empty() { - emitter.emit(combine_result.genres).chain_err( - || "Error emitting value.", - )?; + emitter + .emit(combine_result.genres) + .chain_err(|| "Error emitting value.")?; } if combine_result.rating_count > 0 { - emitter.emit(combine_result.rating.to_string()).chain_err( - || "Error emitting value.", - )?; + emitter + .emit(combine_result.rating.to_string()) + .chain_err(|| "Error emitting value.")?; emitter .emit(format!("C:{}", combine_result.rating_count.to_string())) @@ -166,20 +165,17 @@ impl Reduce for RatingAggregatorReducer { combine_result.rating_count ); - emitter.emit(output_str).chain_err( - || "Error emitting value.", - )?; + emitter + .emit(output_str) + .chain_err(|| "Error emitting value.")?; } Ok(()) } } - fn run() -> Result<()> { - env_logger::init().chain_err( - || "Failed to initialise logging.", - )?; + env_logger::init().chain_err(|| "Failed to initialise logging.")?; let ra_mapper = RatingAggregatorMapper; let ra_reducer = RatingAggregatorReducer; diff --git a/libcerberus/examples/rating-by-genre.rs b/libcerberus/examples/rating-by-genre.rs index ac2489fd..e662b23a 100644 --- a/libcerberus/examples/rating-by-genre.rs +++ b/libcerberus/examples/rating-by-genre.rs @@ -57,9 +57,9 @@ impl Map for RatingByGenreMapper { let movie_genres = movie_genre_str.split('|'); let rating: f64 = info[2].parse().chain_err(|| "Error parsing movie rating")?; - let rating_count: u64 = info[3].parse().chain_err( - || "Error parsing movie rating count", - )?; + let rating_count: u64 = info[3] + .parse() + .chain_err(|| "Error parsing movie rating count")?; if rating_count > MIN_RATING_COUNT { let rating_pair = format!("\"{}\",{}", movie_title, rating); @@ -111,12 +111,11 @@ impl Combine for RatingByGenreCombiner { let rating_pair = format!( "\"{}\",{}", - combine_result.best_title, - combine_result.best_rating + combine_result.best_title, combine_result.best_rating ); - emitter.emit(rating_pair).chain_err( - || "Error emitting value", - )?; + emitter + .emit(rating_pair) + .chain_err(|| "Error emitting value")?; Ok(()) } @@ -130,9 +129,9 @@ impl Reduce for RatingByGenreReducer { { let combine_result = do_genre_combine(input)?; - emitter.emit(combine_result.best_title).chain_err( - || "Error emitting value", - )?; + emitter + .emit(combine_result.best_title) + .chain_err(|| "Error emitting value")?; emitter .emit(combine_result.best_rating.to_string()) .chain_err(|| "Error emitting value")?; @@ -142,9 +141,7 @@ impl Reduce for RatingByGenreReducer { } fn run() -> Result<()> { - env_logger::init().chain_err( - || "Failed to initialise logging.", - )?; + env_logger::init().chain_err(|| "Failed to initialise logging.")?; let rbg_mapper = RatingByGenreMapper; let rbg_reducer = RatingByGenreReducer; diff --git a/libcerberus/examples/rating-by-year.rs b/libcerberus/examples/rating-by-year.rs index 3e6f6896..36b6a10e 100644 --- a/libcerberus/examples/rating-by-year.rs +++ b/libcerberus/examples/rating-by-year.rs @@ -53,9 +53,9 @@ impl Map for RatingByYearMapper { let movie_title = info[0][1..(info[0].len() - 1)].trim().to_owned(); let rating: f64 = info[2].parse().chain_err(|| "Error parsing movie rating")?; - emitter.emit(movie_title, rating).chain_err( - || "Error emitting map key-value pair.", - )?; + emitter + 
.emit(movie_title, rating) + .chain_err(|| "Error emitting map key-value pair.")?; } Ok(()) } @@ -66,9 +66,9 @@ impl Partition for RatingByYearPartitioner { fn partition(&self, input: PartitionInputKV) -> Result { let key = input.key; let year_str = key[(key.len() - 5)..(key.len() - 1)].to_owned(); - let partition: u64 = year_str.parse().chain_err(|| { - format!("Error getting year from movie title {}, {}", key, year_str) - })?; + let partition: u64 = year_str + .parse() + .chain_err(|| format!("Error getting year from movie title {}, {}", key, year_str))?; Ok(partition) } @@ -89,9 +89,7 @@ impl Reduce for RatingByYearReducer { } fn run() -> Result<()> { - env_logger::init().chain_err( - || "Failed to initialise logging.", - )?; + env_logger::init().chain_err(|| "Failed to initialise logging.")?; let rby_mapper = RatingByYearMapper; let rby_reducer = RatingByYearReducer; diff --git a/libcerberus/examples/word-counter.rs b/libcerberus/examples/word-counter.rs index c2e22231..e48f441a 100644 --- a/libcerberus/examples/word-counter.rs +++ b/libcerberus/examples/word-counter.rs @@ -17,9 +17,9 @@ impl Map for WordCountMapper { { for token in input.value.split(char::is_whitespace) { if !token.is_empty() { - emitter.emit(token.to_owned(), 1).chain_err( - || "Error emitting map key-value pair.", - )?; + emitter + .emit(token.to_owned(), 1) + .chain_err(|| "Error emitting map key-value pair.")?; } } Ok(()) @@ -36,9 +36,9 @@ impl Reduce for WordCountReducer { for val in input.values { total += val; } - emitter.emit(total).chain_err(|| { - format!("Error emitting value {:?}.", total) - })?; + emitter + .emit(total) + .chain_err(|| format!("Error emitting value {:?}.", total))?; Ok(()) } } @@ -53,18 +53,16 @@ impl Combine for WordCountCombiner { for val in input.values { total += val; } - emitter.emit(total).chain_err(|| { - format!("Error emitting value {:?}.", total) - })?; + emitter + .emit(total) + .chain_err(|| format!("Error emitting value {:?}.", total))?; Ok(()) } } fn run() -> Result<()> { - env_logger::init().chain_err( - || "Failed to initialise logging.", - )?; + env_logger::init().chain_err(|| "Failed to initialise logging.")?; let wc_mapper = WordCountMapper; let wc_reducer = WordCountReducer; diff --git a/libcerberus/src/combiner.rs b/libcerberus/src/combiner.rs index 836ad001..c5a0c900 100644 --- a/libcerberus/src/combiner.rs +++ b/libcerberus/src/combiner.rs @@ -1,5 +1,5 @@ -use serde::Serialize; use serde::de::DeserializeOwned; +use serde::Serialize; use emitter::EmitFinal; use errors::*; diff --git a/libcerberus/src/io.rs b/libcerberus/src/io.rs index cfb978a1..7639b737 100644 --- a/libcerberus/src/io.rs +++ b/libcerberus/src/io.rs @@ -1,9 +1,9 @@ use bson; use errors::*; -use mapper::MapInputKV; use intermediate::IntermediateInputKV; -use serde::Serialize; +use mapper::MapInputKV; use serde::de::DeserializeOwned; +use serde::Serialize; use serde_json; use serialise::{FinalOutputObject, IntermediateOutputObject}; use std::io::{Read, Write}; @@ -13,9 +13,8 @@ use std::io::{Read, Write}; /// It attempts to parse the string from the input source as BSON and returns an `errors::Error` if /// the attempt fails. 
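///
/// An illustrative call sketch (added for clarity, not part of the original source;
/// `bson_bytes` is a hypothetical buffer holding an encoded `MapInputKV` document):
///
///     let mut source = std::io::Cursor::new(bson_bytes);
///     let input_kv = read_map_input(&mut source)?;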
pub fn read_map_input(source: &mut R) -> Result { - let bson_document = bson::decode_document(source).chain_err( - || "Error parsing input BSON from source.", - )?; + let bson_document = + bson::decode_document(source).chain_err(|| "Error parsing input BSON from source.")?; let map_input = bson::from_bson(bson::Bson::Document(bson_document)) .chain_err(|| "Error parsing input BSON as MapInputKV.")?; @@ -34,22 +33,20 @@ where V: Default + Serialize + DeserializeOwned, { let mut input_string = String::new(); - let bytes_read = source.read_to_string(&mut input_string).chain_err( - || "Error reading from source.", - )?; + let bytes_read = source + .read_to_string(&mut input_string) + .chain_err(|| "Error reading from source.")?; if bytes_read == 0 { warn!("bytes_read is 0"); } - let value: serde_json::Value = serde_json::from_str(input_string.as_str()).chain_err( - || "Error parsing input JSON to Value.", - )?; + let value: serde_json::Value = serde_json::from_str(input_string.as_str()) + .chain_err(|| "Error parsing input JSON to Value.")?; let mut result = Vec::new(); if let serde_json::Value::Array(pairs) = value { for kv_pair in pairs { - let kv_pair = serde_json::from_value(kv_pair).chain_err( - || "Error parsing value to IntermediateInputKV", - )?; + let kv_pair = serde_json::from_value(kv_pair) + .chain_err(|| "Error parsing value to IntermediateInputKV")?; result.push(kv_pair); } } else { @@ -65,9 +62,7 @@ where W: Write, V: Default + Serialize, { - serde_json::to_writer(sink, &output).chain_err( - || "Error writing to sink.", - )?; + serde_json::to_writer(sink, &output).chain_err(|| "Error writing to sink.")?; Ok(()) } @@ -81,9 +76,7 @@ where K: Default + Serialize, V: Default + Serialize, { - serde_json::to_writer(sink, &output).chain_err( - || "Error writing to sink.", - )?; + serde_json::to_writer(sink, &output).chain_err(|| "Error writing to sink.")?; Ok(()) } @@ -93,19 +86,16 @@ where W: Write, V: Default + Serialize, { - - serde_json::to_writer(sink, &output).chain_err( - || "Error writing to sink.", - )?; + serde_json::to_writer(sink, &output).chain_err(|| "Error writing to sink.")?; Ok(()) } #[cfg(test)] mod tests { + use super::*; use serialise::IntermediateOutputPair; use std::collections::HashMap; use std::io::Cursor; - use super::*; #[test] fn read_valid_map_input_kv() { @@ -160,8 +150,8 @@ mod tests { let test_string = ""; let mut cursor = Cursor::new(test_string); - let _: IntermediateInputKV = read_intermediate_input(&mut cursor).unwrap() - [0]; + let _: IntermediateInputKV = + read_intermediate_input(&mut cursor).unwrap()[0]; } #[test] @@ -198,7 +188,9 @@ mod tests { #[test] fn write_final_output_object() { - let test_object = vec![FinalOutputObject { values: vec!["barbaz", "bazbar"] }]; + let test_object = vec![FinalOutputObject { + values: vec!["barbaz", "bazbar"], + }]; let expected_json_string = r#"[{"values":["barbaz","bazbar"]}]"#; let output_vector: Vec = Vec::new(); let mut cursor = Cursor::new(output_vector); diff --git a/libcerberus/src/lib.rs b/libcerberus/src/lib.rs index 6145c18c..fcc56e8b 100644 --- a/libcerberus/src/lib.rs +++ b/libcerberus/src/lib.rs @@ -28,8 +28,8 @@ mod errors { pub mod combiner; pub mod emitter; -pub mod io; pub mod intermediate; +pub mod io; pub mod mapper; pub mod partition; pub mod reducer; @@ -38,8 +38,8 @@ pub mod runner; pub mod serialise; pub use combiner::Combine; +pub use emitter::{EmitFinal, EmitIntermediate}; pub use errors::*; -pub use emitter::{EmitIntermediate, EmitFinal}; pub use intermediate::IntermediateInputKV; pub 
use mapper::{Map, MapInputKV}; pub use partition::{HashPartitioner, Partition, PartitionInputKV}; diff --git a/libcerberus/src/reducer.rs b/libcerberus/src/reducer.rs index eec9c439..4761fe8c 100644 --- a/libcerberus/src/reducer.rs +++ b/libcerberus/src/reducer.rs @@ -1,5 +1,5 @@ -use serde::Serialize; use serde::de::DeserializeOwned; +use serde::Serialize; use emitter::EmitFinal; use errors::*; @@ -43,10 +43,7 @@ mod tests { where E: EmitFinal, { - emitter.emit(input.values.iter().fold( - String::new(), - |acc, x| acc + x, - ))?; + emitter.emit(input.values.iter().fold(String::new(), |acc, x| acc + x))?; Ok(()) } } diff --git a/libcerberus/src/registry.rs b/libcerberus/src/registry.rs index 4cd81a05..3cbf66df 100644 --- a/libcerberus/src/registry.rs +++ b/libcerberus/src/registry.rs @@ -1,5 +1,5 @@ -use serde::Serialize; use serde::de::DeserializeOwned; +use serde::Serialize; use combiner::Combine; use emitter::EmitFinal; @@ -42,12 +42,9 @@ where impl<'a, M, R, P, C> Default for UserImplRegistryBuilder<'a, M, R, P, C> where M: Map + 'a, - R: Reduce - + 'a, - P: Partition - + 'a, - C: Combine - + 'a, + R: Reduce + 'a, + P: Partition + 'a, + C: Combine + 'a, { fn default() -> UserImplRegistryBuilder<'a, M, R, P, C> { UserImplRegistryBuilder { @@ -94,15 +91,12 @@ where } pub fn build(&self) -> Result> { - let mapper = self.mapper.chain_err( - || "Error building UserImplRegistry: No Mapper provided", - )?; - let reducer = self.reducer.chain_err( - || "Error building UserImplRegistry: No Reducer provided", - )?; - let partitioner = self.partitioner.chain_err( - || "Error building UserImplRegistry: No Partitioner provided", - )?; + let mapper = self.mapper + .chain_err(|| "Error building UserImplRegistry: No Mapper provided")?; + let reducer = self.reducer + .chain_err(|| "Error building UserImplRegistry: No Reducer provided")?; + let partitioner = self.partitioner + .chain_err(|| "Error building UserImplRegistry: No Partitioner provided")?; Ok(UserImplRegistry { mapper, @@ -134,11 +128,7 @@ impl<'a, M, R, P> UserImplRegistryBuilder<'a, M, R, P, NullCombiner> where M: Map + 'a, R: Reduce + 'a, - P: Partition< - M::Key, - M::Value, - > - + 'a, + P: Partition + 'a, { pub fn new_no_combiner() -> UserImplRegistryBuilder<'a, M, R, P, NullCombiner> { Default::default() diff --git a/libcerberus/src/runner.rs b/libcerberus/src/runner.rs index 576db743..d393a7b9 100644 --- a/libcerberus/src/runner.rs +++ b/libcerberus/src/runner.rs @@ -3,11 +3,12 @@ use std::io::{stdin, stdout}; use chrono::prelude::*; use clap::{App, ArgMatches, SubCommand}; -use serde::Serialize; use serde::de::DeserializeOwned; +use serde::Serialize; use serde_json; use uuid::Uuid; +use super::VERSION; use combiner::Combine; use emitter::IntermediateVecEmitter; use errors::*; @@ -17,9 +18,8 @@ use mapper::Map; use partition::{Partition, PartitionInputKV}; use reducer::Reduce; use registry::UserImplRegistry; -use serialise::{FinalOutputObject, FinalOutputObjectEmitter, IntermediateOutputObject, VecEmitter, - IntermediateOutputPair}; -use super::VERSION; +use serialise::{FinalOutputObject, FinalOutputObjectEmitter, IntermediateOutputObject, + IntermediateOutputPair, VecEmitter}; /// `parse_command_line` uses `clap` to parse the command-line arguments passed to the payload. 
/// @@ -59,15 +59,13 @@ where Ok(()) } Some("reduce") => { - run_reduce(registry.reducer).chain_err( - || "Error running reduce", - )?; + run_reduce(registry.reducer).chain_err(|| "Error running reduce")?; Ok(()) } Some("combine") => { - let combiner = registry.combiner.chain_err( - || "Attempt to run combine command when combiner is not implemented", - )?; + let combiner = registry + .combiner + .chain_err(|| "Attempt to run combine command when combiner is not implemented")?; run_combine(combiner).chain_err(|| "Error running combine")?; Ok(()) } @@ -95,9 +93,7 @@ where { let mut source = stdin(); let mut sink = stdout(); - let input_kv = read_map_input(&mut source).chain_err( - || "Error getting input to map.", - )?; + let input_kv = read_map_input(&mut source).chain_err(|| "Error getting input to map.")?; let mut pairs_vec: Vec<(M::Key, M::Value)> = Vec::new(); @@ -106,9 +102,8 @@ where .chain_err(|| "Error running map operation.")?; if let Some(combiner) = combiner_option { - let new_pairs_vec = run_internal_combine(combiner, &mut pairs_vec).chain_err( - || "Error running combine on map results", - )?; + let new_pairs_vec = run_internal_combine(combiner, &mut pairs_vec) + .chain_err(|| "Error running combine on map results")?; pairs_vec = new_pairs_vec; } @@ -119,9 +114,10 @@ where let partition = partitioner .partition(PartitionInputKV::new(&pair.0, &pair.1)) .chain_err(|| "Error partitioning map output")?; - let output_array = output_object.partitions.entry(partition).or_insert_with( - Default::default, - ); + let output_array = output_object + .partitions + .entry(partition) + .or_insert_with(Default::default); output_array.push(IntermediateOutputPair { key: pair.0, value: pair.1, @@ -140,9 +136,8 @@ where { let mut source = stdin(); let mut sink = stdout(); - let input_kvs = read_intermediate_input(&mut source).chain_err( - || "Error getting input to reduce.", - )?; + let input_kvs = + read_intermediate_input(&mut source).chain_err(|| "Error getting input to reduce.")?; let mut output_objects = Vec::new(); for input_kv in input_kvs { @@ -153,9 +148,8 @@ where output_objects.push(output_object); } - write_reduce_output(&mut sink, &output_objects).chain_err( - || "Error writing reduce output to stdout.", - )?; + write_reduce_output(&mut sink, &output_objects) + .chain_err(|| "Error writing reduce output to stdout.")?; Ok(()) } @@ -167,9 +161,8 @@ where { let mut source = stdin(); let mut sink = stdout(); - let input_kvs = read_intermediate_input(&mut source).chain_err( - || "Error getting input to combine.", - )?; + let input_kvs = + read_intermediate_input(&mut source).chain_err(|| "Error getting input to combine.")?; let mut output_objects = Vec::new(); @@ -208,14 +201,12 @@ where let mut results = Vec::new(); for (key_str, mut values) in kv_map.drain() { - let key_json: serde_json::Value = serde_json::from_str(&key_str).chain_err( - || "Error parsing combine key.", - )?; + let key_json: serde_json::Value = + serde_json::from_str(&key_str).chain_err(|| "Error parsing combine key.")?; // Retrieve the original key from the serialized version. 
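        // (Illustrative note, not in the original source: a mapper key of 42u32 is held
        // in `kv_map` as the JSON string "42"; deserializing it here recovers the typed
        // key 42u32 for the combine input.)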
- let key = serde_json::from_value(key_json.clone()).chain_err( - || "Error converting combine key string to key type", - )?; + let key = serde_json::from_value(key_json.clone()) + .chain_err(|| "Error converting combine key string to key type")?; if values.len() > 1 { let input_kv = IntermediateInputKV { key, values }; @@ -227,9 +218,8 @@ where .chain_err(|| "Error running combine operation.")?; for value in result_values { - let key = serde_json::from_value(key_json.clone()).chain_err( - || "Error converting combine key string to key type", - )?; + let key = serde_json::from_value(key_json.clone()) + .chain_err(|| "Error converting combine key string to key type")?; results.push((key, value)); } diff --git a/libcerberus/src/serialise.rs b/libcerberus/src/serialise.rs index d0f01987..9df5cb45 100644 --- a/libcerberus/src/serialise.rs +++ b/libcerberus/src/serialise.rs @@ -95,9 +95,9 @@ impl<'a, V: Default + Serialize> EmitFinal for FinalOutputObjectEmitter<'a, V #[cfg(test)] mod tests { + use super::*; use serde_json; use std::collections::HashSet; - use super::*; // Test that the JSON serialisation of IntermediateOutputObject matches the libcerberus JSON // API. @@ -119,12 +119,10 @@ mod tests { ); partitions.insert( 1, - vec![ - IntermediateOutputPair { - key: "foo_intermediate2", - value: "bar", - }, - ], + vec![IntermediateOutputPair { + key: "foo_intermediate2", + value: "bar", + }], ); let output = IntermediateOutputObject { partitions }; @@ -144,7 +142,9 @@ mod tests { // Test that the JSON serialisation of FinalOutputObject matches the libcerberus JSON API. #[test] fn final_output_object_json_format() { - let output = FinalOutputObject { values: vec!["barbaz", "bazbar"] }; + let output = FinalOutputObject { + values: vec!["barbaz", "bazbar"], + }; let expected_json_string = r#"{"values":["barbaz","bazbar"]}"#; let json_string = serde_json::to_string(&output).unwrap(); @@ -155,7 +155,9 @@ mod tests { #[test] fn final_output_emitter_works() { let mut output = FinalOutputObject::default(); - let expected_output = FinalOutputObject { values: vec!["foo", "bar"] }; + let expected_output = FinalOutputObject { + values: vec!["foo", "bar"], + }; { let mut emitter = FinalOutputObjectEmitter::new(&mut output); diff --git a/master/src/common/job.rs b/master/src/common/job.rs index f76e1d93..0c72435d 100644 --- a/master/src/common/job.rs +++ b/master/src/common/job.rs @@ -1,15 +1,15 @@ use errors::*; -use std::sync::Arc; -use std::path::{PathBuf, Path}; +use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; +use std::sync::Arc; use chrono::prelude::*; use serde_json; use uuid::Uuid; +use cerberus_proto::mapreduce as pb; use util::data_layer::AbstractionLayer; use util::state::StateHandling; -use cerberus_proto::mapreduce as pb; const MEGA_BYTE: u64 = 1000 * 1000; @@ -144,14 +144,11 @@ impl Job { data_abstraction_layer: &Arc, ) -> Result { let validate_paths = options.validate_paths; - let job = Job::new_no_validate(options).chain_err( - || "Unable to create job", - )?; + let job = Job::new_no_validate(options).chain_err(|| "Unable to create job")?; if validate_paths { - job.validate_input(data_abstraction_layer).chain_err( - || "Error validating input", - )?; + job.validate_input(data_abstraction_layer) + .chain_err(|| "Error validating input")?; } Ok(job) } @@ -162,19 +159,17 @@ impl Job { ) -> Result<()> { // Validate the existence of the input directory and the binary file. 
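        // (Clarifying note: both checks below go through the data abstraction layer
        // rather than std::fs, so validation behaves the same whether the job's paths
        // live on NFS, S3, or the distributed filesystem.)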
let input_path = Path::new(&self.input_directory); - let is_dir = data_abstraction_layer.is_dir(input_path).chain_err( - || "Error checking if path is a directory", - )?; + let is_dir = data_abstraction_layer + .is_dir(input_path) + .chain_err(|| "Error checking if path is a directory")?; if !is_dir { - return Err( - format!("Input directory does not exist: {:?}", input_path).into(), - ); + return Err(format!("Input directory does not exist: {:?}", input_path).into()); } let binary_path = Path::new(&self.binary_path); - let is_file = data_abstraction_layer.is_file(binary_path).chain_err( - || "Error checking if path is a file", - )?; + let is_file = data_abstraction_layer + .is_file(binary_path) + .chain_err(|| "Error checking if path is a file")?; if !is_file { return Err(format!("Binary does not exist: {:?}", binary_path).into()); } @@ -198,12 +193,11 @@ impl Job { .stderr(Stdio::piped()) .spawn() .chain_err(|| "Unable to run sanity-check on binary")?; - let output = child.wait_with_output().chain_err( - || "Error waiting for output from binary", - )?; - let output_str = String::from_utf8(output.stdout).chain_err( - || "Unable to read output from binary", - )?; + let output = child + .wait_with_output() + .chain_err(|| "Error waiting for output from binary")?; + let output_str = + String::from_utf8(output.stdout).chain_err(|| "Unable to read output from binary")?; if output_str.contains("sanity located") { Ok(()) @@ -249,17 +243,14 @@ impl StateHandling for Job { output_directory: serde_json::from_value(data["output_directory"].clone()) .chain_err(|| "Unable to convert output dir")?, validate_paths: false, - priority: serde_json::from_value(data["priority"].clone()).chain_err( - || "Unable to convert priority", - )?, - map_size: serde_json::from_value(data["map_size"].clone()).chain_err( - || "Unable to convert map_size", - )?, + priority: serde_json::from_value(data["priority"].clone()) + .chain_err(|| "Unable to convert priority")?, + map_size: serde_json::from_value(data["map_size"].clone()) + .chain_err(|| "Unable to convert map_size")?, }; - let mut job = Job::new_no_validate(options).chain_err( - || "Unable to create map reduce job", - )?; + let mut job = + Job::new_no_validate(options).chain_err(|| "Unable to create map reduce job")?; job.load_state(data).chain_err(|| "Unable to load state")?; @@ -301,14 +292,10 @@ impl StateHandling for Job { } fn load_state(&mut self, data: serde_json::Value) -> Result<()> { - self.id = serde_json::from_value(data["id"].clone()).chain_err( - || "Unable to convert id", - )?; - - let status: SerializableJobStatus = - serde_json::from_value(data["status"].clone()).chain_err( - || "Unable to convert mapreduce status", - )?; + self.id = serde_json::from_value(data["id"].clone()).chain_err(|| "Unable to convert id")?; + + let status: SerializableJobStatus = serde_json::from_value(data["status"].clone()) + .chain_err(|| "Unable to convert mapreduce status")?; self.status = self.status_from_state(&status); self.status_details = serde_json::from_value(data["status_details"].clone()) .chain_err(|| "Unable to convert status_details.")?; diff --git a/master/src/common/mod.rs b/master/src/common/mod.rs index c98ee55c..69478b8c 100644 --- a/master/src/common/mod.rs +++ b/master/src/common/mod.rs @@ -11,8 +11,8 @@ pub mod worker; pub use self::job::Job; pub use self::job::JobOptions; +pub use self::task::PriorityTask; pub use self::task::Task; pub use self::task::TaskStatus; pub use self::task::TaskType; -pub use self::task::PriorityTask; pub use 
self::worker::Worker; diff --git a/master/src/common/task.rs b/master/src/common/task.rs index 02d8b33b..1b967d96 100644 --- a/master/src/common/task.rs +++ b/master/src/common/task.rs @@ -1,8 +1,8 @@ -use std::collections::HashMap; use std::cmp::Ordering; +use std::collections::HashMap; -use protobuf::repeated::RepeatedField; use chrono::prelude::*; +use protobuf::repeated::RepeatedField; use serde_json; use uuid::Uuid; @@ -124,9 +124,8 @@ impl Task { let request_data = data["request"].clone(); // Create a basic Map task. - let id: String = serde_json::from_value(data["job_id"].clone()).chain_err( - || "Unable to convert job_id", - )?; + let id: String = serde_json::from_value(data["job_id"].clone()) + .chain_err(|| "Unable to convert job_id")?; let binary_path: String = serde_json::from_value(request_data["binary_path"].clone()) .chain_err(|| "Unable to convert binary_path")?; let input_locations: Vec = serde_json::from_value( @@ -151,9 +150,8 @@ impl Task { let mut task = Task::new_map_task(id, binary_path, input_locations_pb, job_priority); // Update the state. - task.load_state(data).chain_err( - || "Unable to load Task from state", - )?; + task.load_state(data) + .chain_err(|| "Unable to load Task from state")?; Ok(task) } @@ -211,9 +209,8 @@ impl Task { let input_files: Vec = serde_json::from_value(request_data["input_files"].clone()) .chain_err(|| "Unable to convert input_files")?; - let id: String = serde_json::from_value(data["job_id"].clone()).chain_err( - || "Unable to convert job_id", - )?; + let id: String = serde_json::from_value(data["job_id"].clone()) + .chain_err(|| "Unable to convert job_id")?; let binary_path: String = serde_json::from_value(request_data["binary_path"].clone()) .chain_err(|| "Unable to convert binary_path")?; let output_dir: String = serde_json::from_value(request_data["output_directory"].clone()) @@ -230,9 +227,8 @@ impl Task { job_priority, ); - task.load_state(data).chain_err( - || "Unable to load Task from state", - )?; + task.load_state(data) + .chain_err(|| "Unable to load Task from state")?; Ok(task) } @@ -251,43 +247,35 @@ impl StateHandling for Task { fn dump_state(&self) -> Result { let request = match self.task_type { - TaskType::Map => { - match self.map_request { - Some(ref req) => { - let input_locations: Vec = req.get_input() - .get_input_locations() - .into_iter() - .map(|loc| { - InputLocation { - input_path: loc.input_path.clone(), - start_byte: loc.start_byte, - end_byte: loc.end_byte, - } - }) - .collect(); - - json!({ + TaskType::Map => match self.map_request { + Some(ref req) => { + let input_locations: Vec = req.get_input() + .get_input_locations() + .into_iter() + .map(|loc| InputLocation { + input_path: loc.input_path.clone(), + start_byte: loc.start_byte, + end_byte: loc.end_byte, + }) + .collect(); + + json!({ "input_locations": input_locations, "binary_path":req.mapper_file_path, }) - } - None => return Err("Unable to serialize map_request".into()), } - } + None => return Err("Unable to serialize map_request".into()), + }, - TaskType::Reduce => { - match self.reduce_request { - Some(ref req) => { - json!({ + TaskType::Reduce => match self.reduce_request { + Some(ref req) => json!({ "input_partition": req.partition, "input_files": req.input_file_paths.clone().into_vec(), "binary_path": req.reducer_file_path, "output_directory": req.output_directory, - }) - } - None => return Err("Unable to serialize reduce_request".into()), - } - } + }), + None => return Err("Unable to serialize reduce_request".into()), + }, }; let 
time_started = match self.time_started { @@ -316,22 +304,20 @@ impl StateHandling for Task { } fn load_state(&mut self, data: serde_json::Value) -> Result<()> { - self.id = serde_json::from_value(data["id"].clone()).chain_err( - || "Unable to convert id", - )?; + self.id = serde_json::from_value(data["id"].clone()).chain_err(|| "Unable to convert id")?; match self.task_type { TaskType::Map => { - let mut map_request = self.map_request.clone().chain_err( - || "Map Request should exist", - )?; + let mut map_request = self.map_request + .clone() + .chain_err(|| "Map Request should exist")?; map_request.task_id = self.id.clone(); self.map_request = Some(map_request); } TaskType::Reduce => { - let mut reduce_request = self.reduce_request.clone().chain_err( - || "Reduce Request should exist", - )?; + let mut reduce_request = self.reduce_request + .clone() + .chain_err(|| "Reduce Request should exist")?; reduce_request.task_id = self.id.clone(); self.reduce_request = Some(reduce_request); } @@ -341,9 +327,8 @@ impl StateHandling for Task { .chain_err(|| "Unable to convert map_output_files")?; self.assigned_worker_id = serde_json::from_value(data["assigned_worker_id"].clone()) .chain_err(|| "Unable to convert assigned_worker_id")?; - self.status = serde_json::from_value(data["status"].clone()).chain_err( - || "Unable to convert status", - )?; + self.status = serde_json::from_value(data["status"].clone()) + .chain_err(|| "Unable to convert status")?; self.failure_count = serde_json::from_value(data["failure_count"].clone()) .chain_err(|| "Unable to convert failure_count")?; self.failure_details = serde_json::from_value(data["failure_details"].clone()) @@ -445,8 +430,10 @@ mod tests { let map_request = map_task.map_request.unwrap(); assert_eq!("/tmp/bin", map_request.get_mapper_file_path()); - assert_eq!("/tmp/input/file1", map_request.get_input().get_input_locations()[0] - .input_path); + assert_eq!( + "/tmp/input/file1", + map_request.get_input().get_input_locations()[0].input_path + ); assert!(map_task.reduce_request.is_none()); } @@ -479,16 +466,14 @@ mod tests { let mut map_task = Task::new_map_task("map-1", "/tmp/bin", vec![input_location], 1); - map_task.map_output_files.insert( - 0, - "output_file_1".to_owned(), - ); + map_task + .map_output_files + .insert(0, "output_file_1".to_owned()); assert_eq!("output_file_1", &map_task.map_output_files[&0]); - map_task.map_output_files.insert( - 1, - "output_file_2".to_owned(), - ); + map_task + .map_output_files + .insert(1, "output_file_2".to_owned()); assert_eq!("output_file_1", &map_task.map_output_files[&0]); assert_eq!("output_file_2", &map_task.map_output_files[&1]); } diff --git a/master/src/common/worker.rs b/master/src/common/worker.rs index d494409d..723e732d 100644 --- a/master/src/common/worker.rs +++ b/master/src/common/worker.rs @@ -2,8 +2,8 @@ use std::net::SocketAddr; use std::str::FromStr; use chrono::prelude::*; -use uuid::Uuid; use serde_json; +use uuid::Uuid; use cerberus_proto::worker as pb; use errors::*; @@ -52,9 +52,8 @@ impl Worker { } Ok(Worker { - address: SocketAddr::from_str(&address).chain_err( - || "Invalid address when creating worker", - )?, + address: SocketAddr::from_str(&address) + .chain_err(|| "Invalid address when creating worker")?, status: pb::WorkerStatus::AVAILABLE, operation_status: pb::OperationStatus::UNKNOWN, @@ -106,9 +105,8 @@ impl Worker { impl StateHandling for Worker { fn new_from_json(data: serde_json::Value) -> Result { // Convert address from a serde_json::Value to a String. 
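        // (Hypothetical example for illustration: a state entry such as
        // {"address": "127.0.0.1:3006"} yields the String "127.0.0.1:3006", which
        // Worker::new then parses with SocketAddr::from_str.)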
- let address: String = serde_json::from_value(data["address"].clone()).chain_err( - || "Unable to create String from serde_json::Value", - )?; + let address: String = serde_json::from_value(data["address"].clone()) + .chain_err(|| "Unable to create String from serde_json::Value")?; // Create the worker with the above address. let worker_result = Worker::new(address, String::new()); @@ -118,9 +116,9 @@ }; // Update worker state to match the given state. - worker.load_state(data).chain_err( - || "Unable to recreate worker from previous state", - )?; + worker + .load_state(data) + .chain_err(|| "Unable to recreate worker from previous state")?; Ok(worker) } @@ -154,15 +152,14 @@ impl StateHandling for Worker { self.worker_id = serde_json::from_value(data["worker_id"].clone()) .chain_err(|| "Unable to convert worker_id")?; - self.task_assignments_failed = serde_json::from_value( - data["task_assignments_failed"].clone(), - ).chain_err(|| "Unable to convert task_assignments_failed")?; + self.task_assignments_failed = + serde_json::from_value(data["task_assignments_failed"].clone()) + .chain_err(|| "Unable to convert task_assignments_failed")?; Ok(()) } } - #[cfg(test)] mod tests { use super::*; diff --git a/master/src/dashboard/server.rs b/master/src/dashboard/server.rs index 4000db4f..fc64d3fc 100644 --- a/master/src/dashboard/server.rs +++ b/master/src/dashboard/server.rs @@ -12,9 +12,9 @@ use urlencoded::UrlEncodedQuery; use common::{Job, JobOptions}; use errors::*; use scheduling::Scheduler; -use worker_management::WorkerManager; use util::data_layer::AbstractionLayer; use util::output_error; +use worker_management::WorkerManager; // Default priority applied to scheduled jobs. const DEFAULT_PRIORITY: u32 = 3; @@ -48,9 +48,8 @@ impl ApiHandler { fn get_parameter(&self, req: &mut Request, param_name: &str) -> Result<String> { match req.get_ref::<UrlEncodedQuery>() { Ok(hashmap) => { - let param = self.get_query_param(param_name, hashmap).chain_err(|| { - format!("Failed to get param with name {} ", param_name) - })?; + let param = self.get_query_param(param_name, hashmap) + .chain_err(|| format!("Failed to get param with name {} ", param_name))?; Ok(param) } @@ -61,16 +60,12 @@ impl ApiHandler { fn get_router_url_component(&self, req: &Request, component_name: &str) -> Result<String> { let value: String = { match req.extensions.get::<Router>() { - Some(router_extension) => { - match router_extension.find(component_name) { - Some(value) => value.to_string(), - None => { - return Err( - format!("No router value with name {}", component_name).into(), - ); - } + Some(router_extension) => match router_extension.find(component_name) { + Some(value) => value.to_string(), + None => { + return Err(format!("No router value with name {}", component_name).into()); } - } + }, None => { return Err("Failed to get Router for request".into()); } @@ -82,39 +77,38 @@ impl ApiHandler { /// Returns information about the `Tasks` which are currently in progress. fn tasks(&self, _req: &mut Request) -> Result<Response> { - let tasks_info = self.worker_manager_arc.get_tasks_info().chain_err( - || "Failed to get tasks info", - )?; + let tasks_info = self.worker_manager_arc + .get_tasks_info() + .chain_err(|| "Failed to get tasks info")?; Ok(Response::with((iron::status::Ok, tasks_info.to_string()))) } /// Returns information about the `Workers` currently registered with the cluster. 
fn workers(&self, _req: &mut Request) -> Result<Response> { - let workers_info = self.worker_manager_arc.get_workers_info().chain_err( - || "Failed to get workers info", - )?; + let workers_info = self.worker_manager_arc + .get_workers_info() + .chain_err(|| "Failed to get workers info")?; Ok(Response::with((iron::status::Ok, workers_info.to_string()))) } /// Returns information about the `Jobs` currently running on the cluster. fn jobs(&self, _req: &mut Request) -> Result<Response> { - let jobs_info = self.scheduler_arc.get_jobs_info().chain_err( - || "Failed to get jobs info", - )?; + let jobs_info = self.scheduler_arc + .get_jobs_info() + .chain_err(|| "Failed to get jobs info")?; Ok(Response::with((iron::status::Ok, jobs_info.to_string()))) } fn cancel_job(&self, req: &mut Request) -> Result<Response> { - let job_id = self.get_parameter(req, "job_id").chain_err( - || "Could not get job_id in request", - )?; + let job_id = self.get_parameter(req, "job_id") + .chain_err(|| "Could not get job_id in request")?; - let success = self.scheduler_arc.cancel_job(&job_id).chain_err(|| { - format!("Failed to cancel job with id {}", job_id) - })?; + let success = self.scheduler_arc + .cancel_job(&job_id) + .chain_err(|| format!("Failed to cancel job with id {}", job_id))?; Ok(Response::with(( iron::status::Ok, @@ -123,9 +117,8 @@ impl ApiHandler { } fn get_output_path(&self, req: &mut Request) -> Option<String> { - let output_path = self.get_parameter(req, "output_path").unwrap_or_else( - |_| "".to_string(), - ); + let output_path = self.get_parameter(req, "output_path") + .unwrap_or_else(|_| "".to_string()); if output_path.is_empty() { None } else { @@ -134,29 +127,27 @@ impl ApiHandler { } fn get_priority(&self, req: &mut Request) -> Result<u32> { - let priority = self.get_parameter(req, "priority").unwrap_or_else( - |_| "".to_string(), - ); + let priority = self.get_parameter(req, "priority") + .unwrap_or_else(|_| "".to_string()); if priority.is_empty() { Ok(DEFAULT_PRIORITY) } else { - priority.parse::<u32>().chain_err( - || "Invalid priority when scheduling job", - ) + priority + .parse::<u32>() + .chain_err(|| "Invalid priority when scheduling job") } } fn get_map_size(&self, req: &mut Request) -> Result<u32> { - let map_size = self.get_parameter(req, "map_size").unwrap_or_else( - |_| "".to_string(), - ); + let map_size = self.get_parameter(req, "map_size") + .unwrap_or_else(|_| "".to_string()); let map_size = { if map_size.is_empty() { DEFAULT_MAP_SIZE } else { - map_size.parse::<u32>().chain_err( - || "Invalid map size when scheduling job", - )? + map_size + .parse::<u32>() + .chain_err(|| "Invalid map size when scheduling job")? 
} }; if map_size < 1 { @@ -166,18 +157,14 @@ impl ApiHandler { } fn schedule_job(&self, req: &mut Request) -> Result<Response> { - let binary_path = self.get_parameter(req, "binary_path").chain_err( - || "Failed to get binary_path", - )?; - let input_path = self.get_parameter(req, "input_path").chain_err( - || "Failed to get input_path", - )?; - let priority = self.get_priority(req).chain_err( - || "Failed to get priority", - )?; - let map_size = self.get_map_size(req).chain_err( - || "Failed to get map size", - )?; + let binary_path = self.get_parameter(req, "binary_path") + .chain_err(|| "Failed to get binary_path")?; + let input_path = self.get_parameter(req, "input_path") + .chain_err(|| "Failed to get input_path")?; + let priority = self.get_priority(req) + .chain_err(|| "Failed to get priority")?; + let map_size = self.get_map_size(req) + .chain_err(|| "Failed to get map size")?; let job_options = JobOptions { client_id: req.remote_addr.to_string(), @@ -194,9 +181,9 @@ impl ApiHandler { let job = Job::new(job_options, &self.data_abstraction_layer_arc) .chain_err(|| "Error creating new job")?; - self.scheduler_arc.schedule_job(job).chain_err( - || "Error scheduling job", - )?; + self.scheduler_arc + .schedule_job(job) + .chain_err(|| "Error scheduling job")?; Ok(Response::with((iron::status::Ok, "{{ success: true }}"))) } @@ -236,7 +223,6 @@ impl iron::Handler for ApiHandler { Err(IronError::new(chained_err, iron::status::BadRequest)) } } - } } @@ -269,9 +255,9 @@ impl DashboardServer { .mount("/", Static::new(Path::new("content/"))); Ok(DashboardServer { - iron_server: Iron::new(mount).http(serving_addr).chain_err( - || "Failed to start cluster dashboard server.", - )?, + iron_server: Iron::new(mount) + .http(serving_addr) + .chain_err(|| "Failed to start cluster dashboard server.")?, }) } } diff --git a/master/src/initialization/dashboard_server.rs b/master/src/initialization/dashboard_server.rs index 2c02a5d6..9a1163f8 100644 --- a/master/src/initialization/dashboard_server.rs +++ b/master/src/initialization/dashboard_server.rs @@ -5,8 +5,8 @@ use clap::ArgMatches; use dashboard::DashboardServer; use errors::*; use scheduling::Scheduler; -use worker_management::WorkerManager; use util::data_layer::AbstractionLayer; +use worker_management::WorkerManager; const DEFAULT_DASHBOARD_ADDRESS: &str = "127.0.0.1:3000"; @@ -16,9 +16,9 @@ pub fn initialize_dashboard_server( scheduler: &Arc<Scheduler>, data_layer: &Arc<AbstractionLayer>, ) -> Result<DashboardServer> { - let dashboard_address = matches.value_of("dashboard-address").unwrap_or( - DEFAULT_DASHBOARD_ADDRESS, - ); + let dashboard_address = matches + .value_of("dashboard-address") + .unwrap_or(DEFAULT_DASHBOARD_ADDRESS); let dashboard = DashboardServer::new( dashboard_address, diff --git a/master/src/initialization/data_layer.rs b/master/src/initialization/data_layer.rs index 6fff8b44..2f3655e9 100644 --- a/master/src/initialization/data_layer.rs +++ b/master/src/initialization/data_layer.rs @@ -1,15 +1,15 @@ use std::path::{Path, PathBuf}; -use std::sync::Arc; use std::sync::mpsc::Receiver; +use std::sync::Arc; use clap::ArgMatches; use errors::*; -use util::data_layer::{AbstractionLayer, AmazonS3AbstractionLayer, NullAbstractionLayer, - NFSAbstractionLayer}; -use util::distributed_filesystem::{LocalFileManager, DFSAbstractionLayer, - LocalFileSystemMasterInterface, FileSystemManager, - run_worker_info_upate_loop, WorkerInfoUpdate}; +use util::data_layer::{AbstractionLayer, AmazonS3AbstractionLayer, NFSAbstractionLayer, + NullAbstractionLayer}; +use 
util::distributed_filesystem::{run_worker_info_upate_loop, DFSAbstractionLayer, + FileSystemManager, LocalFileManager, + LocalFileSystemMasterInterface, WorkerInfoUpdate}; const DEFAULT_DFS_DIRECTORY: &str = "/tmp/cerberus/dfs/"; const DEFAULT_S3_DIRECTORY: &str = "/tmp/cerberus/s3/"; @@ -25,9 +25,9 @@ fn initialize_dfs( let local_file_manager_arc = Arc::new(LocalFileManager::new(storage_dir)); let file_manager_arc = Arc::new(FileSystemManager::new()); - let master_interface = Box::new(LocalFileSystemMasterInterface::new( - Arc::clone(&file_manager_arc), - )); + let master_interface = Box::new(LocalFileSystemMasterInterface::new(Arc::clone( + &file_manager_arc, + ))); let dfs_abstraction_layer = Arc::new(DFSAbstractionLayer::new( Arc::clone(&local_file_manager_arc), @@ -66,9 +66,8 @@ pub fn get_data_abstraction_layer( data_abstraction_layer = abstraction_layer; filesystem_manager = Some(file_manager_arc); } else if let Some(bucket) = matches.value_of("s3") { - data_abstraction_layer = initialize_s3(&storage_location, bucket).chain_err( - || "Error initializing S3 abstraction layer", - )?; + data_abstraction_layer = initialize_s3(&storage_location, bucket) + .chain_err(|| "Error initializing S3 abstraction layer")?; } else { data_abstraction_layer = Arc::new(NullAbstractionLayer::new()); } diff --git a/master/src/initialization/grpc_server.rs b/master/src/initialization/grpc_server.rs index 83057338..db77b54c 100644 --- a/master/src/initialization/grpc_server.rs +++ b/master/src/initialization/grpc_server.rs @@ -1,11 +1,11 @@ -use std::sync::Arc; use std::str::FromStr; +use std::sync::Arc; use clap::ArgMatches; use errors::*; use scheduling::Scheduler; -use server::{Server, ClientService, FileSystemService, WorkerService}; +use server::{ClientService, FileSystemService, Server, WorkerService}; use util::data_layer::AbstractionLayer; use util::distributed_filesystem::FileSystemManager; use worker_management::WorkerManager; diff --git a/master/src/initialization/master_resources.rs b/master/src/initialization/master_resources.rs index c89045af..1a47df6c 100644 --- a/master/src/initialization/master_resources.rs +++ b/master/src/initialization/master_resources.rs @@ -1,5 +1,5 @@ -use std::sync::Arc; use std::sync::mpsc::channel; +use std::sync::Arc; use clap::ArgMatches; @@ -7,7 +7,7 @@ use dashboard::DashboardServer; use errors::*; use initialization; use initialization::{initialize_dashboard_server, initialize_grpc_server, initialize_state_handler}; -use scheduling::{TaskProcessorImpl, Scheduler}; +use scheduling::{Scheduler, TaskProcessorImpl}; use server::Server; use state::StateHandler; use worker_communication::WorkerInterfaceImpl; @@ -35,9 +35,9 @@ impl MasterResources { worker_info_sender, )); - let task_processor = Arc::new(TaskProcessorImpl::new( - Arc::clone(&data_abstraction_layer_arc), - )); + let task_processor = Arc::new(TaskProcessorImpl::new(Arc::clone( + &data_abstraction_layer_arc, + ))); let scheduler = Arc::new(Scheduler::new(Arc::clone(&worker_manager), task_processor)); Ok(MasterResources { diff --git a/master/src/initialization/state_handler.rs b/master/src/initialization/state_handler.rs index 0e35c355..c5da6f9f 100644 --- a/master/src/initialization/state_handler.rs +++ b/master/src/initialization/state_handler.rs @@ -1,5 +1,5 @@ -use std::sync::Arc; use std::path::Path; +use std::sync::Arc; use clap::ArgMatches; @@ -21,9 +21,9 @@ pub fn initialize_state_handler( let fresh = matches.is_present("fresh"); - let dump_dir = matches.value_of("state-location").unwrap_or( - 
DEFAULT_DUMP_DIR, - ); + let dump_dir = matches + .value_of("state-location") + .unwrap_or(DEFAULT_DUMP_DIR); let state_handler = StateHandler::new( Arc::clone(scheduler), @@ -36,9 +36,9 @@ pub fn initialize_state_handler( // If our state dump file exists and we aren't running a fresh copy of master we // should load from state. if !fresh && Path::new(&format!("{}/master.dump", dump_dir)).exists() { - state_handler.load_state().chain_err( - || "Unable to load state from file", - )?; + state_handler + .load_state() + .chain_err(|| "Unable to load state from file")?; } Ok(state_handler) diff --git a/master/src/main.rs b/master/src/main.rs index f696646b..1d5259f4 100644 --- a/master/src/main.rs +++ b/master/src/main.rs @@ -60,9 +60,7 @@ fn run() -> Result<()> { init_logger().chain_err(|| "Failed to initialise logging.")?; let matches = parser::parse_command_line(); - let resources = MasterResources::new(&matches).chain_err( - || "Error initilizing master", - )?; + let resources = MasterResources::new(&matches).chain_err(|| "Error initilizing master")?; // Startup worker management loops run_task_assigment_loop(Arc::clone(&resources.worker_manager)); diff --git a/master/src/main_loop.rs b/master/src/main_loop.rs index b370c982..a9550f9b 100644 --- a/master/src/main_loop.rs +++ b/master/src/main_loop.rs @@ -12,18 +12,21 @@ pub fn run_main_loop(mut resources: MasterResources) -> Result<()> { thread::sleep(time::Duration::from_millis(MAIN_LOOP_SLEEP_MS)); if !resources.grpc_server.is_alive() { - resources.dashboard_server.iron_server.close().chain_err( - || "Failed to close dashboard server", - )?; + resources + .dashboard_server + .iron_server + .close() + .chain_err(|| "Failed to close dashboard server")?; return Err("GRPC server unexpectedly died".into()); } if resources.state_handler.get_should_dump_state() { iterations_since_state_dump += 1; if iterations_since_state_dump * MAIN_LOOP_SLEEP_MS >= DUMP_LOOP_MS { - resources.state_handler.dump_state().chain_err( - || "Unable to dump state", - )?; + resources + .state_handler + .dump_state() + .chain_err(|| "Unable to dump state")?; iterations_since_state_dump = 0 } } diff --git a/master/src/parser.rs b/master/src/parser.rs index b034f976..0a1b4829 100644 --- a/master/src/parser.rs +++ b/master/src/parser.rs @@ -4,16 +4,12 @@ pub fn parse_command_line<'a>() -> ArgMatches<'a> { App::new("master") .version(crate_version!()) .author("Cerberus Authors ") - .about( - "Responsible for scheduling MapReduce jobs and managing workers", - ) + .about("Responsible for scheduling MapReduce jobs and managing workers") .arg( Arg::with_name("port") .long("port") .short("p") - .help( - "Port which the master will use to communicate with the client and workers", - ) + .help("Port which the master will use to communicate with the client and workers") .takes_value(true) .required(false), ) @@ -58,9 +54,7 @@ pub fn parse_command_line<'a>() -> ArgMatches<'a> { .arg( Arg::with_name("dfs") .long("dfs") - .help( - "Makes the master run using the distributed file system for data access.", - ) + .help("Makes the master run using the distributed file system for data access.") .takes_value(false) .required(false), ) diff --git a/master/src/scheduling/mod.rs b/master/src/scheduling/mod.rs index 2350fc12..135f0471 100644 --- a/master/src/scheduling/mod.rs +++ b/master/src/scheduling/mod.rs @@ -1,5 +1,5 @@ -pub use self::scheduler::Scheduler; pub use self::scheduler::run_task_update_loop; +pub use self::scheduler::Scheduler; pub use self::task_processor::TaskProcessor; pub 
use self::task_processor::TaskProcessorImpl; diff --git a/master/src/scheduling/scheduler.rs b/master/src/scheduling/scheduler.rs index 09fafd1d..a7876f21 100644 --- a/master/src/scheduling/scheduler.rs +++ b/master/src/scheduling/scheduler.rs @@ -1,16 +1,16 @@ -use std::thread; use std::collections::HashMap; -use std::sync::{Mutex, Arc}; +use std::sync::{Arc, Mutex}; +use std::thread; use serde_json; use cerberus_proto::mapreduce as pb; -use common::{Task, TaskStatus, Job}; +use common::{Job, Task, TaskStatus}; use errors::*; use scheduling::state::{ScheduledJob, State}; use scheduling::task_processor::TaskProcessor; -use util::state::{StateHandling, SimpleStateHandling}; use util::output_error; +use util::state::{SimpleStateHandling, StateHandling}; use worker_management::WorkerManager; /// The `Scheduler` is responsible for the managing of `Job`s and `Task`s. @@ -44,25 +44,23 @@ impl Scheduler { let mut state = self.state.lock().unwrap(); let reduce_tasks = { - let job = state.get_job(job_id).chain_err( - || "Error scheduling reduce tasks", - )?; + let job = state + .get_job(job_id) + .chain_err(|| "Error scheduling reduce tasks")?; - let map_tasks = state.get_map_tasks(job_id).chain_err(|| { - format!("Could not get map tasks for job {}", job_id) - })?; + let map_tasks = state + .get_map_tasks(job_id) + .chain_err(|| format!("Could not get map tasks for job {}", job_id))?; self.task_processor .create_reduce_tasks(job, map_tasks) - .chain_err(|| { - format!("Could not create reduce tasks for job {}", job_id) - })? + .chain_err(|| format!("Could not create reduce tasks for job {}", job_id))? }; if reduce_tasks.is_empty() { - state.set_job_completed(job_id).chain_err(|| { - format!("Could not set job with id {} completed", job_id) - })?; + state + .set_job_completed(job_id) + .chain_err(|| format!("Could not set job with id {} completed", job_id))?; } else { state .add_tasks_for_job(job_id, reduce_tasks.clone()) @@ -79,34 +77,30 @@ impl Scheduler { fn process_completed_task(&self, task: &Task) -> Result<()> { info!( "Processing completed task {} with status {:?}", - task.id, - task.status + task.id, task.status ); let reduce_tasks_required = { let mut state = self.state.lock().unwrap(); - state.add_completed_task(task.clone()).chain_err( - || "Error processing completed task result", - )?; - - state.reduce_tasks_required(&task.job_id).chain_err( - || "Error processing completed task result", - )? + state + .add_completed_task(task.clone()) + .chain_err(|| "Error processing completed task result")?; + state + .reduce_tasks_required(&task.job_id) + .chain_err(|| "Error processing completed task result")? 
}; if task.status != TaskStatus::Complete { - self.cancel_job(&task.job_id).chain_err(|| { - format!("Unable to cancel job with ID {}", &task.job_id) - })?; + self.cancel_job(&task.job_id) + .chain_err(|| format!("Unable to cancel job with ID {}", &task.job_id))?; return Ok(()); } if reduce_tasks_required { - self.schedule_reduce_tasks(&task.job_id).chain_err(|| { - format!("Could not schedule reduce tasks for job {}", task.job_id) - })?; + self.schedule_reduce_tasks(&task.job_id) + .chain_err(|| format!("Could not schedule reduce tasks for job {}", task.job_id))?; } Ok(()) } @@ -116,15 +110,14 @@ state .update_job_started( &task.job_id, - task.time_started.chain_err( - || "Time started expected to exist.", - )?, + task.time_started + .chain_err(|| "Time started expected to exist.")?, ) .chain_err(|| "Error updating job start time.")?; - state.update_task(task.to_owned()).chain_err( - || "Error updating task info.", - ) + state + .update_task(task.to_owned()) + .chain_err(|| "Error updating task info.") } pub fn process_updated_task(&self, task: &Task) -> Result<()> { @@ -138,7 +131,9 @@ /// Schedule a [`Task`](common::Task) to be executed. fn schedule_task(&self, task: Task) { let worker_manager = Arc::clone(&self.worker_manager); - thread::spawn(move || { worker_manager.run_task(task); }); + thread::spawn(move || { + worker_manager.run_task(task); + }); } /// Splits the input for a job and schedules the map tasks in the background. @@ -162,7 +157,9 @@ for task in map_tasks_vec { map_tasks.insert(task.id.to_owned(), task.clone()); let worker_manager = Arc::clone(&worker_manager); - thread::spawn(move || { worker_manager.run_task(task); }); + thread::spawn(move || { + worker_manager.run_task(task); + }); } info!("Starting job with ID {}.", job.id); @@ -184,9 +181,9 @@ { let mut state = self.state.lock().unwrap(); - state.add_job(scheduled_job).chain_err( - || "Error adding scheduled job to state store", - )?; + state + .add_job(scheduled_job) + .chain_err(|| "Error adding scheduled job to state store")?; } self.split_input(job); @@ -196,9 +193,9 @@ pub fn cancel_job(&self, job_id: &str) -> Result<bool> { let cancelled = { let mut state = self.state.lock().unwrap(); - state.cancel_job(job_id).chain_err(|| { - format!("Unable to cancel job with ID: {}", job_id) - })? + state + .cancel_job(job_id) + .chain_err(|| format!("Unable to cancel job with ID: {}", job_id))? 
}; if !cancelled { info!("Unable to cancel job with ID {}", job_id); @@ -207,9 +204,7 @@ let workers = self.worker_manager .get_workers_running_job(job_id) - .chain_err(|| { - format!("Unable to get list of workers running job {}", job_id) - })?; + .chain_err(|| format!("Unable to get list of workers running job {}", job_id))?; self.worker_manager .remove_queued_tasks_for_job(job_id) @@ -238,9 +233,9 @@ report.mapreduce_id = job.id.clone(); report.status = job.status; if job.status == pb::Status::FAILED { - report.failure_details = job.status_details.clone().unwrap_or_else( - || "Unknown.".to_owned(), - ); + report.failure_details = job.status_details + .clone() + .unwrap_or_else(|| "Unknown.".to_owned()); } report.scheduled_timestamp = job.time_requested.timestamp(); report.output_directory = job.output_directory.clone(); @@ -256,9 +251,9 @@ pub fn get_mapreduce_status(&self, mapreduce_id: &str) -> Result<pb::MapReduceReport> { let state = self.state.lock().unwrap(); - let job = state.get_job(mapreduce_id).chain_err( - || "Error getting map reduce status.", - )?; + let job = state + .get_job(mapreduce_id) + .chain_err(|| "Error getting map reduce status.")?; Ok(self.get_status_for_job(job)) } @@ -326,9 +321,9 @@ impl SimpleStateHandling for Scheduler { fn load_state(&self, data: serde_json::Value) -> Result<()> { let mut state = self.state.lock().unwrap(); - state.load_state(data).chain_err( - || "Error loading scheduler state.", - )?; + state + .load_state(data) + .chain_err(|| "Error loading scheduler state.")?; let in_progress_tasks = state.get_in_progress_tasks(); for task in in_progress_tasks { @@ -346,7 +341,6 @@ impl SimpleStateHandling for Scheduler { } } - pub fn run_task_update_loop(scheduler: Arc<Scheduler>, worker_manager: &Arc<WorkerManager>) { let receiver = worker_manager.get_update_receiver(); thread::spawn(move || loop { diff --git a/master/src/scheduling/state.rs b/master/src/scheduling/state.rs index b9bcb0da..5922c4c5 100644 --- a/master/src/scheduling/state.rs +++ b/master/src/scheduling/state.rs @@ -4,7 +4,7 @@ use chrono::prelude::*; use serde_json; use cerberus_proto::mapreduce as pb; -use common::{Task, TaskType, TaskStatus, Job}; +use common::{Job, Task, TaskStatus, TaskType}; use errors::*; use util::state::StateHandling; @@ -17,17 +17,15 @@ pub struct ScheduledJob { impl ScheduledJob { fn process_json(data: &serde_json::Value) -> Result<(Job, HashMap<String, Task>)> { - let job = Job::new_from_json(data["job"].clone()).chain_err( - || "Unable to create map reduce job from json.", - )?; + let job = Job::new_from_json(data["job"].clone()) + .chain_err(|| "Unable to create map reduce job from json.")?; let mut tasks = HashMap::new(); if let serde_json::Value::Array(ref tasks_array) = data["tasks"] { for task in tasks_array { - let task = Task::new_from_json(task.clone()).chain_err( - || "Unable to create map reduce task from json.", - )?; + let task = Task::new_from_json(task.clone()) + .chain_err(|| "Unable to create map reduce task from json.")?; tasks.insert(task.id.to_owned(), task); } } @@ -46,9 +44,8 @@ impl StateHandling for ScheduledJob { fn dump_state(&self) -> Result<serde_json::Value> { let mut tasks_json: Vec<serde_json::Value> = Vec::new(); for task in self.tasks.values() { - tasks_json.push(task.dump_state().chain_err( - || "Error dumping scheduled job state.", - )?); + tasks_json.push(task.dump_state() + .chain_err(|| "Error dumping scheduled job state.")?); } Ok(json!({ @@ -81,18 +78,14 @@ impl State { pub fn add_job(&mut self, scheduled_job: ScheduledJob) -> Result<()> { if 
self.scheduled_jobs.contains_key(&scheduled_job.job.id) { - return Err( - format!( - "Job with ID {} is already scheduled.", - &scheduled_job.job.id - ).into(), - ); + return Err(format!( + "Job with ID {} is already scheduled.", + &scheduled_job.job.id + ).into()); } - self.scheduled_jobs.insert( - scheduled_job.job.id.to_owned(), - scheduled_job, - ); + self.scheduled_jobs + .insert(scheduled_job.job.id.to_owned(), scheduled_job); Ok(()) } @@ -152,8 +145,8 @@ None => return Err(format!("Job with ID {} was not found.", &job_id).into()), }; - if scheduled_job.job.status != pb::Status::FAILED && - scheduled_job.job.status != pb::Status::DONE + if scheduled_job.job.status != pb::Status::FAILED + && scheduled_job.job.status != pb::Status::DONE { scheduled_job.job.status = pb::Status::CANCELLED; @@ -230,9 +223,7 @@ for task in tasks { if scheduled_job.tasks.contains_key(&task.id) { - return Err( - format!("Task with ID {} is already scheduled.", &task.id).into(), - ); + return Err(format!("Task with ID {} is already scheduled.", &task.id).into()); } match task.task_type { @@ -252,9 +243,7 @@ }; if !scheduled_job.tasks.contains_key(&task.id) { - return Err( - format!("Task with ID {} is does not exist.", &task.id).into(), - ); + return Err(format!("Task with ID {} is does not exist.", &task.id).into()); } scheduled_job.tasks.insert(task.id.to_owned(), task); @@ -279,9 +268,8 @@ pub fn add_completed_task(&mut self, task: Task) -> Result<()> { self.update_job_started( &task.job_id, - task.time_started.chain_err( - || "Time started is expected to exist.", - )?, + task.time_started + .chain_err(|| "Time started is expected to exist.")?, ).chain_err(|| "Error adding completed task.")?; let scheduled_job = match self.scheduled_jobs.get_mut(&task.job_id) { @@ -304,8 +292,8 @@ } TaskType::Reduce => { scheduled_job.job.reduce_tasks_completed += 1; - if scheduled_job.job.reduce_tasks_completed == - scheduled_job.job.reduce_tasks_total + if scheduled_job.job.reduce_tasks_completed + == scheduled_job.job.reduce_tasks_total { scheduled_job.job.status = pb::Status::DONE; scheduled_job.job.time_completed = Some(Utc::now()); @@ -328,8 +316,8 @@ let mut job_count = 0; for scheduled_job in self.scheduled_jobs.values() { let status = scheduled_job.job.status; - if status == pb::Status::IN_PROGRESS || status == pb::Status::IN_QUEUE || - status == pb::Status::SPLITTING_INPUT + if status == pb::Status::IN_PROGRESS || status == pb::Status::IN_QUEUE + || status == pb::Status::SPLITTING_INPUT { job_count += 1; } @@ -358,9 +346,8 @@ fn dump_state(&self) -> Result<serde_json::Value> { let mut jobs_json: Vec<serde_json::Value> = Vec::new(); for job in self.scheduled_jobs.values() { - jobs_json.push(job.dump_state().chain_err( - || "Unable to dump ScheduledJob state", - )?); + jobs_json.push(job.dump_state() + .chain_err(|| "Unable to dump ScheduledJob state")?); } Ok(json!({ @@ -371,13 +358,10 @@ fn load_state(&mut self, data: serde_json::Value) -> Result<()> { if let serde_json::Value::Array(ref jobs_array) = data["scheduled_jobs"] { for job in jobs_array { - let scheduled_job = ScheduledJob::new_from_json(job.clone()).chain_err( - || "Unable to create ScheduledJob from json.", - )?; - self.scheduled_jobs.insert( - scheduled_job.job.id.to_owned(), - scheduled_job, - ); + let scheduled_job = ScheduledJob::new_from_json(job.clone()) + .chain_err(|| "Unable to create ScheduledJob from json.")?; + self.scheduled_jobs + 
.insert(scheduled_job.job.id.to_owned(), scheduled_job); } } else { return Err("Error processing scheduled_jobs array.".into()); diff --git a/master/src/scheduling/task_processor.rs b/master/src/scheduling/task_processor.rs index 6e0a6aeb..392acc42 100644 --- a/master/src/scheduling/task_processor.rs +++ b/master/src/scheduling/task_processor.rs @@ -1,12 +1,12 @@ -use std::collections::HashMap; use std::cmp::max; +use std::collections::HashMap; use std::path::{Path, PathBuf}; use std::sync::Arc; use cerberus_proto::worker as pb; use common::{Job, Task}; -use util::data_layer::AbstractionLayer; use errors::*; +use util::data_layer::AbstractionLayer; const CLOSEST_ENDLINE_STEP: u64 = 1000; const NEWLINE: u8 = 0x0A; @@ -34,7 +34,9 @@ pub struct TaskProcessorImpl { impl TaskProcessorImpl { pub fn new(data_abstraction_layer: Arc) -> Self { - TaskProcessorImpl { data_abstraction_layer } + TaskProcessorImpl { + data_abstraction_layer, + } } /// `get_closest_endline` files the endline closest to the end of a given range for input file @@ -134,9 +136,8 @@ impl TaskProcessorImpl { .is_file(path.as_path()) .chain_err(|| "Failed to check if path is a file")? { - let input_locations = self.read_input_file(&path, map_input_size).chain_err( - || "Error reading input file.", - )?; + let input_locations = self.read_input_file(&path, map_input_size) + .chain_err(|| "Error reading input file.")?; for input_location in input_locations { let bytes_to_read = input_location.end_byte - input_location.start_byte; @@ -219,12 +220,12 @@ impl TaskProcessor for TaskProcessorImpl { #[cfg(test)] mod tests { use super::*; - use std::path::Path; - use std::io::{Read, Write}; + use common::{JobOptions, TaskType}; use std::collections::HashSet; use std::fs; use std::fs::File; - use common::{JobOptions, TaskType}; + use std::io::{Read, Write}; + use std::path::Path; use util::data_layer::NullAbstractionLayer; #[test] @@ -289,12 +290,8 @@ mod tests { // Either input file order is fine. 
let mut good_inputs = HashSet::new(); - good_inputs.insert( - "this is the first test file\nthis is the second test file".to_owned(), - ); - good_inputs.insert( - "this is the second test file\nthis is the first test file".to_owned(), - ); + good_inputs.insert("this is the first test file\nthis is the second test file".to_owned()); + good_inputs.insert("this is the second test file\nthis is the first test file".to_owned()); println!("{}", map_input.clone()); @@ -327,20 +324,17 @@ mod tests { let mut map_task1 = Task::new_map_task("map-1", "/tmp/bin", vec![input_location.clone()], 1); - map_task1.map_output_files.insert( - 0, - "/tmp/output/1".to_owned(), - ); - map_task1.map_output_files.insert( - 1, - "/tmp/output/2".to_owned(), - ); + map_task1 + .map_output_files + .insert(0, "/tmp/output/1".to_owned()); + map_task1 + .map_output_files + .insert(1, "/tmp/output/2".to_owned()); let mut map_task2 = Task::new_map_task("map-2", "/tmp/bin", vec![input_location], 1); - map_task2.map_output_files.insert( - 0, - "/tmp/output/3".to_owned(), - ); + map_task2 + .map_output_files + .insert(0, "/tmp/output/3".to_owned()); let map_tasks: Vec<&Task> = vec![&map_task1, &map_task2]; let mut reduce_tasks: Vec = diff --git a/master/src/server/client_service.rs b/master/src/server/client_service.rs index 3eced291..973403ff 100644 --- a/master/src/server/client_service.rs +++ b/master/src/server/client_service.rs @@ -1,11 +1,11 @@ use std::sync::Arc; -use grpc::{SingleResponse, Error, RequestOptions}; +use grpc::{Error, RequestOptions, SingleResponse}; use common::{Job, JobOptions}; use scheduling::Scheduler; -use util::output_error; use util::data_layer::AbstractionLayer; +use util::output_error; use cerberus_proto::mapreduce as pb; use cerberus_proto::mapreduce_grpc as grpc_pb; @@ -143,21 +143,20 @@ impl grpc_pb::MapReduceService for ClientService { } } - #[cfg(test)] mod tests { use super::*; - use std::sync::mpsc::channel; - use std::thread; + use cerberus_proto::mapreduce::Status as MapReduceStatus; + use cerberus_proto::mapreduce_grpc::MapReduceService; + use cerberus_proto::worker as wpb; use common::{Job, Task, Worker}; use errors::*; use scheduling::TaskProcessor; - use cerberus_proto::worker as wpb; - use cerberus_proto::mapreduce::Status as MapReduceStatus; - use cerberus_proto::mapreduce_grpc::MapReduceService; + use std::sync::mpsc::channel; + use std::thread; use util::data_layer::NullAbstractionLayer; - use worker_management::WorkerManager; use worker_communication::WorkerInterface; + use worker_management::WorkerManager; struct NullTaskProcessor; diff --git a/master/src/server/filesystem_service.rs b/master/src/server/filesystem_service.rs index 80d107fe..e286f4f4 100644 --- a/master/src/server/filesystem_service.rs +++ b/master/src/server/filesystem_service.rs @@ -1,13 +1,14 @@ use std::sync::Arc; -use grpc::{SingleResponse, Error, RequestOptions}; +use grpc::{Error, RequestOptions, SingleResponse}; use cerberus_proto::filesystem as pb; use cerberus_proto::filesystem_grpc as grpc_pb; use util::distributed_filesystem::FileSystemManager; use util::output_error; -const NOT_DISTRIBUTED_FILESYSTEM: &str = "Master is not running in distributed filesytem configuration"; +const NOT_DISTRIBUTED_FILESYSTEM: &str = + "Master is not running in distributed filesytem configuration"; const UPLOAD_FILE_ERROR: &str = "Error processing upload file request"; const FILE_LOCATION_ERROR: &str = "Error processing file location request"; @@ -39,11 +40,8 @@ impl grpc_pb::FileSystemMasterService for 
FileSystemService { } }; - if let Err(err) = filesystem_manager.upload_file_chunk( - &req.file_path, - req.start_byte, - &req.data, - ) + if let Err(err) = + filesystem_manager.upload_file_chunk(&req.file_path, req.start_byte, &req.data) { output_error(&err.chain_err(|| "Error processing upload file request.")); return SingleResponse::err(Error::Other(UPLOAD_FILE_ERROR)); diff --git a/master/src/server/mod.rs b/master/src/server/mod.rs index 3af910b0..5a917852 100644 --- a/master/src/server/mod.rs +++ b/master/src/server/mod.rs @@ -26,9 +26,9 @@ impl Server { ) -> Result { let mut server_builder = grpc::ServerBuilder::new_plain(); server_builder.http.set_port(port); - server_builder.http.set_cpu_pool_threads( - GRPC_THREAD_POOL_SIZE, - ); + server_builder + .http + .set_cpu_pool_threads(GRPC_THREAD_POOL_SIZE); // Register the MapReduceService server_builder.add_service(mapreduce_grpc::MapReduceServiceServer::new_service_def( @@ -42,15 +42,13 @@ impl Server { // Register the FileSystemService server_builder.add_service( - filesystem_grpc::FileSystemMasterServiceServer::new_service_def( - file_system_service, - ), + filesystem_grpc::FileSystemMasterServiceServer::new_service_def(file_system_service), ); Ok(Server { - server: server_builder.build().chain_err( - || "Error building grpc server", - )?, + server: server_builder + .build() + .chain_err(|| "Error building grpc server")?, }) } diff --git a/master/src/server/worker_service.rs b/master/src/server/worker_service.rs index e87929d7..8427d737 100644 --- a/master/src/server/worker_service.rs +++ b/master/src/server/worker_service.rs @@ -1,12 +1,12 @@ use std::sync::Arc; -use grpc::{RequestOptions, SingleResponse, Error}; +use grpc::{Error, RequestOptions, SingleResponse}; use cerberus_proto::worker as pb; use cerberus_proto::worker_grpc as grpc_pb; use common::Worker; -use worker_management::WorkerManager; use util; +use worker_management::WorkerManager; pub struct WorkerService { worker_manager: Arc, diff --git a/master/src/state/handler.rs b/master/src/state/handler.rs index c9bdb47c..368e7e8e 100644 --- a/master/src/state/handler.rs +++ b/master/src/state/handler.rs @@ -1,7 +1,7 @@ -use std::sync::Arc; -use std::fs::File; use std::fs; +use std::fs::File; use std::io::{Read, Write}; +use std::sync::Arc; use serde_json; use serde_json::Value as json; @@ -29,9 +29,7 @@ impl StateHandler { dir: &str, ) -> Result { if should_dump_state { - fs::create_dir_all(dir).chain_err(|| { - format!("Unable to create dir: {}", dir) - })?; + fs::create_dir_all(dir).chain_err(|| format!("Unable to create dir: {}", dir))?; } Ok(StateHandler { @@ -46,22 +44,20 @@ impl StateHandler { pub fn dump_state(&self) -> Result<()> { // Get Scheduler state as JSON. - let scheduler_json = self.scheduler.dump_state().chain_err( - || "Unable to dump Scheduler state", - )?; + let scheduler_json = self.scheduler + .dump_state() + .chain_err(|| "Unable to dump Scheduler state")?; // Get WorkerManager state as JSON. - let worker_manager_json = self.worker_manager.dump_state().chain_err( - || "Unable to dump WorkerManager state", - )?; + let worker_manager_json = self.worker_manager + .dump_state() + .chain_err(|| "Unable to dump WorkerManager state")?; // Get the filesystem manager state as JSON. let filesystem_manager_json = match self.filesystem_manager { - Some(ref filesystem_manager) => { - filesystem_manager.dump_state().chain_err( - || "Unable to dump FileSystemManager state", - )? 
- } + Some(ref filesystem_manager) => filesystem_manager + .dump_state() + .chain_err(|| "Unable to dump FileSystemManager state")?, None => json!(null), }; @@ -74,9 +70,8 @@ impl StateHandler { // Write the state to file. let mut file = File::create(format!("{}/temp.dump", self.dump_dir)) .chain_err(|| "Unable to create file")?; - file.write_all(json.to_string().as_bytes()).chain_err( - || "Unable to write data", - )?; + file.write_all(json.to_string().as_bytes()) + .chain_err(|| "Unable to write data")?; fs::rename( format!("{}/temp.dump", self.dump_dir), @@ -91,13 +86,11 @@ impl StateHandler { let mut file = File::open(format!("{}/master.dump", self.dump_dir)) .chain_err(|| "Unable to open file")?; let mut data = String::new(); - file.read_to_string(&mut data).chain_err( - || "Unable to read from state file", - )?; + file.read_to_string(&mut data) + .chain_err(|| "Unable to read from state file")?; - let json: serde_json::Value = serde_json::from_str(&data).chain_err( - || "Unable to parse string as JSON", - )?; + let json: serde_json::Value = + serde_json::from_str(&data).chain_err(|| "Unable to parse string as JSON")?; // Worker manager state needs to be reset first so that the scheduler knows what tasks it // doesn't need to reschedule. @@ -117,9 +110,9 @@ impl StateHandler { return Err("Unable to retrieve Scheduler state from JSON".into()); } - self.scheduler.load_state(scheduler_json).chain_err( - || "Error reloading scheduler state", - )?; + self.scheduler + .load_state(scheduler_json) + .chain_err(|| "Error reloading scheduler state")?; // Reset file system manager state from json. let filesystem_manager_json = json["filesystem_manager"].clone(); diff --git a/master/src/worker_communication/worker_interface.rs b/master/src/worker_communication/worker_interface.rs index 38b0d5d8..7544141b 100644 --- a/master/src/worker_communication/worker_interface.rs +++ b/master/src/worker_communication/worker_interface.rs @@ -3,8 +3,8 @@ use std::sync::RwLock; use grpc::RequestOptions; -use errors::*; use common::Worker; +use errors::*; use cerberus_proto::worker as pb; use cerberus_proto::worker_grpc as grpc_pb; @@ -26,7 +26,6 @@ pub struct WorkerInterfaceImpl { clients: RwLock>, } - /// `WorkerInterfaceImpl` is used to schedule `MapReduce` operations on the workers. 
impl WorkerInterfaceImpl { pub fn new() -> Self { @@ -39,9 +38,7 @@ impl WorkerInterface for WorkerInterfaceImpl { let mut clients = self.clients.write().unwrap(); if clients.get(&worker.worker_id).is_some() { - return Err( - format!("client already exists for worker {}", &worker.worker_id).into(), - ); + return Err(format!("client already exists for worker {}", &worker.worker_id).into()); } info!( diff --git a/master/src/worker_management/mod.rs b/master/src/worker_management/mod.rs index 0f69d247..bd6c3315 100644 --- a/master/src/worker_management/mod.rs +++ b/master/src/worker_management/mod.rs @@ -1,6 +1,6 @@ -pub use self::worker_manager::WorkerManager; pub use self::worker_manager::run_health_check_loop; pub use self::worker_manager::run_task_assigment_loop; +pub use self::worker_manager::WorkerManager; -mod worker_manager; mod state; +mod worker_manager; diff --git a/master/src/worker_management/state.rs b/master/src/worker_management/state.rs index 8c46e50b..450e442a 100644 --- a/master/src/worker_management/state.rs +++ b/master/src/worker_management/state.rs @@ -1,5 +1,5 @@ -use std::collections::HashMap; use std::collections::BinaryHeap; +use std::collections::HashMap; use std::path::Path; use std::sync::Arc; @@ -114,9 +114,7 @@ impl State { // Sort workers by most recent health checks, to avoid repeatedly trying to assign work to a // worker which is not responding. - workers.sort_by(|a, b| { - a.status_last_updated.cmp(&b.status_last_updated).reverse() - }); + workers.sort_by(|a, b| a.status_last_updated.cmp(&b.status_last_updated).reverse()); workers .into_iter() @@ -126,12 +124,10 @@ impl State { pub fn add_worker(&mut self, worker: Worker) -> Result<()> { if self.workers.contains_key(&worker.worker_id) { - return Err( - format!( - "Worker with ID {} is already registered.", - &worker.worker_id - ).into(), - ); + return Err(format!( + "Worker with ID {} is already registered.", + &worker.worker_id + ).into()); } self.workers.insert(worker.worker_id.clone(), worker); Ok(()) @@ -143,9 +139,9 @@ impl State { worker_status: pb::WorkerStatus, operation_status: pb::OperationStatus, ) -> Result<()> { - let worker = self.workers.get_mut(worker_id).chain_err(|| { - format!("Worker with ID {} not found.", worker_id) - })?; + let worker = self.workers + .get_mut(worker_id) + .chain_err(|| format!("Worker with ID {} not found.", worker_id))?; worker.status = worker_status; worker.operation_status = operation_status; @@ -156,11 +152,11 @@ impl State { // reassign the worker. // If the worker has recently been assigned a task, it may not be reporting the most up to // date status. 
- let time_since_task_assigned = Utc::now().timestamp() - - worker.task_last_updated.timestamp(); - if time_since_task_assigned > TIME_REASSIGN_REPORTING_AVAILABLE_S && - worker_status == pb::WorkerStatus::AVAILABLE && - !worker.current_task_id.is_empty() + let time_since_task_assigned = + Utc::now().timestamp() - worker.task_last_updated.timestamp(); + if time_since_task_assigned > TIME_REASSIGN_REPORTING_AVAILABLE_S + && worker_status == pb::WorkerStatus::AVAILABLE + && !worker.current_task_id.is_empty() { if let Some(assigned_task) = self.tasks.get_mut(&worker.current_task_id) { self.priority_task_queue.push(PriorityTask::new( @@ -218,28 +214,21 @@ impl State { return Err("Task id does not match expected task id.".into()); } - let worker = self.workers.get_mut(&map_result.worker_id).chain_err(|| { - format!("Worker with ID {} not found.", map_result.worker_id) - })?; + let worker = self.workers + .get_mut(&map_result.worker_id) + .chain_err(|| format!("Worker with ID {} not found.", map_result.worker_id))?; for (partition, output_file) in map_result.get_map_results() { - scheduled_task.map_output_files.insert( - *partition, - format!( - "{}{}", - worker.address, - output_file - ), - ); + scheduled_task + .map_output_files + .insert(*partition, format!("{}{}", worker.address, output_file)); } scheduled_task.status = TaskStatus::Complete; scheduled_task.time_completed = Some(Utc::now()); scheduled_task.cpu_time = map_result.get_cpu_time(); - self.completed_tasks.insert( - scheduled_task.id.clone(), - scheduled_task.clone(), - ); + self.completed_tasks + .insert(scheduled_task.id.clone(), scheduled_task.clone()); return Ok(scheduled_task); } @@ -257,17 +246,16 @@ impl State { task_id: &str, failure_details: &str, ) -> Result<(Task)> { - let worker = self.workers.get_mut(worker_id).chain_err(|| { - format!("Worker with ID {} not found.", worker_id) - })?; + let worker = self.workers + .get_mut(worker_id) + .chain_err(|| format!("Worker with ID {} not found.", worker_id))?; worker.current_task_id = String::new(); let mut assigned_task = self.tasks.remove(task_id).chain_err(|| { format!( "Task with ID {} not found, Worker ID: {}.", - task_id, - worker_id + task_id, worker_id ) })?; @@ -312,9 +300,9 @@ impl State { worker_id: &str, result: pb::ResultStatus, ) -> Result<()> { - let worker = self.workers.get_mut(worker_id).chain_err(|| { - format!("Worker with ID {} not found.", worker_id) - })?; + let worker = self.workers + .get_mut(worker_id) + .chain_err(|| format!("Worker with ID {} not found.", worker_id))?; worker.status = pb::WorkerStatus::AVAILABLE; if result == pb::ResultStatus::SUCCESS { @@ -329,9 +317,9 @@ impl State { } pub fn set_worker_operation_cancelled(&mut self, worker_id: &str) -> Result<()> { - let worker = self.workers.get_mut(worker_id).chain_err(|| { - format!("Worker with ID {} not found.", worker_id) - })?; + let worker = self.workers + .get_mut(worker_id) + .chain_err(|| format!("Worker with ID {} not found.", worker_id))?; worker.status = pb::WorkerStatus::AVAILABLE; worker.operation_status = pb::OperationStatus::CANCELLED; @@ -343,12 +331,11 @@ impl State { pub fn remove_worker(&mut self, worker_id: &str) -> Result<()> { if let Some(worker) = self.workers.remove(worker_id) { - // If this worker is a assigned a task, requeue the task. 
if !worker.current_task_id.is_empty() { - let assigned_task = self.tasks.get(&worker.current_task_id).chain_err( - || "Unable to get worker task", - )?; + let assigned_task = self.tasks + .get(&worker.current_task_id) + .chain_err(|| "Unable to get worker task")?; self.priority_task_queue.push(PriorityTask::new( worker.current_task_id, REQUEUED_TASK_PRIORITY * assigned_task.job_priority, @@ -371,13 +358,13 @@ impl State { // Unassign a task assigned to a worker and put the task back in the queue. pub fn unassign_worker(&mut self, worker_id: &str) -> Result<()> { - let worker = self.workers.get_mut(worker_id).chain_err(|| { - format!("Worker with ID {} not found.", worker_id) - })?; + let worker = self.workers + .get_mut(worker_id) + .chain_err(|| format!("Worker with ID {} not found.", worker_id))?; - let assigned_task = self.tasks.get(&worker.current_task_id).chain_err( - || "Unable to get worker task", - )?; + let assigned_task = self.tasks + .get(&worker.current_task_id) + .chain_err(|| "Unable to get worker task")?; if !worker.current_task_id.is_empty() { self.priority_task_queue.push(PriorityTask::new( worker.current_task_id.clone(), @@ -406,9 +393,9 @@ impl State { } }; - let worker = self.workers.get_mut(worker_id).chain_err(|| { - format!("Worker with ID {} not found.", worker_id) - })?; + let worker = self.workers + .get_mut(worker_id) + .chain_err(|| format!("Worker with ID {} not found.", worker_id))?; worker.task_last_updated = Utc::now(); worker.current_task_id = task_id.to_owned(); @@ -428,8 +415,7 @@ impl State { .chain_err(|| { format!( "Could not get closeness for file {} and worker {}", - input_location.input_path, - worker_id + input_location.input_path, worker_id ) })?; } @@ -444,15 +430,12 @@ impl State { }; let data_score = match task.map_request { - Some(ref map_request) => { - self.get_data_score(map_request, worker_id).chain_err(|| { - format!( - "Could not get data closeness score for task_id {} and worker_id {}", - task.id, - worker_id - ) - })? - } + Some(ref map_request) => self.get_data_score(map_request, worker_id).chain_err(|| { + format!( + "Could not get data closeness score for task_id {} and worker_id {}", + task.id, worker_id + ) + })?, None => 0, }; @@ -481,9 +464,8 @@ impl State { } } - let data_score = self.get_data_score_if_map(&task.id, worker_id).chain_err( - || "Unable to get data score", - )?; + let data_score = self.get_data_score_if_map(&task.id, worker_id) + .chain_err(|| "Unable to get data score")?; if data_score > best_data_score || best_task.is_none() { best_data_score = data_score; @@ -508,16 +490,14 @@ impl State { // Tries to assign a worker the next task in the queue. // Returns the task if one exists. pub fn try_assign_worker_task(&mut self, worker_id: &str) -> Result<(Option)> { - let task_option = self.get_best_task_for_worker(worker_id).chain_err( - || "Could not get task for worker", - )?; + let task_option = self.get_best_task_for_worker(worker_id) + .chain_err(|| "Could not get task for worker")?; let scheduled_task_id: String = match task_option { Some(priority_task) => { info!( "Popped off task {} with priority {}", - priority_task.id, - priority_task.priority + priority_task.id, priority_task.priority ); priority_task.id } @@ -532,9 +512,9 @@ impl State { // Clears the workers current_task_id and returns the previous value. 
pub fn cancel_task_for_worker(&mut self, worker_id: &str) -> Result<String> { - let worker = self.workers.get_mut(worker_id).chain_err(|| { - format!("Worker with ID {} not found.", worker_id) - })?; + let worker = self.workers + .get_mut(worker_id) + .chain_err(|| format!("Worker with ID {} not found.", worker_id))?; let previous_task_id = worker.current_task_id.clone(); worker.current_task_id = String::new(); @@ -554,9 +534,9 @@ pub fn increment_failed_task_assignments(&mut self, worker_id: &str) -> Result<()> { let should_remove_worker = { - let worker = self.workers.get_mut(worker_id).chain_err(|| { - format!("Worker with ID {} not found.", worker_id) - })?; + let worker = self.workers + .get_mut(worker_id) + .chain_err(|| format!("Worker with ID {} not found.", worker_id))?; worker.task_assignments_failed += 1; worker.task_assignments_failed == MAX_TASK_ASSIGNMENT_FAILURE @@ -575,9 +555,9 @@ } pub fn reset_failed_task_assignments(&mut self, worker_id: &str) -> Result<()> { - let worker = self.workers.get_mut(worker_id).chain_err(|| { - format!("Worker with ID {} not found.", worker_id) - })?; + let worker = self.workers + .get_mut(worker_id) + .chain_err(|| format!("Worker with ID {} not found.", worker_id))?; worker.task_assignments_failed = 0; Ok(()) @@ -590,40 +570,34 @@ .clone(); let queued_tasks = self.get_in_progress_tasks_for_job(&map_task.job_id) - .chain_err(|| { - format!("Unable to get queued tasks for job {}", map_task.job_id) - })?; + .chain_err(|| format!("Unable to get queued tasks for job {}", map_task.job_id))?; let mut remove_tasks: Vec<String> = Vec::new(); for queued_task_id in queued_tasks.clone() { - let queued_task = self.tasks.get(&queued_task_id).chain_err(|| { - format!("Unable to get task {} from task queue", queued_task_id) - })?; + let queued_task = self.tasks + .get(&queued_task_id) + .chain_err(|| format!("Unable to get task {} from task queue", queued_task_id))?; if queued_task.task_type != TaskType::Reduce { continue; } - let from_map_task = - self.reduce_from_map_task(&map_task, queued_task) - .chain_err(|| "Unable to determine if reduce stems from map task")?; + let from_map_task = self.reduce_from_map_task(&map_task, queued_task) + .chain_err(|| "Unable to determine if reduce stems from map task")?; if from_map_task { remove_tasks.push(queued_task.id.clone()); } } - self.remove_tasks_from_queue(&remove_tasks).chain_err( - || "Unable to remove tasks from queue", - )?; + self.remove_tasks_from_queue(&remove_tasks) + .chain_err(|| "Unable to remove tasks from queue")?; // Reschedule the map task let mut new_map_task = map_task.clone(); new_map_task.reset_map_task(); - self.tasks.insert( - new_map_task.id.clone(), - new_map_task.clone(), - ); + self.tasks + .insert(new_map_task.id.clone(), new_map_task.clone()); self.priority_task_queue.push(PriorityTask::new( task_id.to_owned(), new_map_task.job_priority * FAILED_TASK_PRIORITY, @@ -642,7 +616,6 @@ reschedule_task = task.id.clone(); break; } - } } @@ -670,12 +643,10 @@ pub fn reduce_from_map_task(&self, map_task: &Task, reduce_task: &Task) -> Result<bool> { match reduce_task.reduce_request { Some(ref req) => Ok(map_task.map_output_files.contains_key(&req.partition)), - None => Err( - format!( - "Unabale to get reduce request for task with ID {}", - map_task.id - ).into(), - ), + None => Err(format!( + "Unabale to get reduce request for task with ID {}", + map_task.id + ).into()), } } @@ -705,9 +676,9 @@ impl StateHandling for State { fn dump_state(&self) -> 
Result<serde_json::Value> { let mut workers_json: Vec<serde_json::Value> = Vec::new(); for worker in self.workers.values() { - workers_json.push(worker.dump_state().chain_err( - || "Error dumping worker state.", - )?); + workers_json.push(worker + .dump_state() + .chain_err(|| "Error dumping worker state.")?); } let mut tasks_json: Vec<serde_json::Value> = Vec::new(); @@ -725,9 +696,8 @@ fn load_state(&mut self, data: serde_json::Value) -> Result<()> { if let serde_json::Value::Array(ref workers_array) = data["workers"] { for worker in workers_array { - let worker = Worker::new_from_json(worker.clone()).chain_err( - || "Unable to create ScheduledJob from json.", - )?; + let worker = Worker::new_from_json(worker.clone()) + .chain_err(|| "Unable to create ScheduledJob from json.")?; self.workers.insert(worker.worker_id.to_owned(), worker); } } else { @@ -739,9 +709,8 @@ if let serde_json::Value::Array(ref tasks_array) = data["tasks"] { for task in tasks_array { - let task = Task::new_from_json(task.clone()).chain_err( - || "Unable to create Task from json.", - )?; + let task = Task::new_from_json(task.clone()) + .chain_err(|| "Unable to create Task from json.")?; debug!("Loaded task from state:\n {:?}", task); self.tasks.insert(task.id.to_owned(), task); } diff --git a/master/src/worker_management/worker_manager.rs b/master/src/worker_management/worker_manager.rs index e65f0e53..e9a90564 100644 --- a/master/src/worker_management/worker_manager.rs +++ b/master/src/worker_management/worker_manager.rs @@ -1,6 +1,6 @@ -use std::{thread, time}; -use std::sync::{Arc, Mutex}; use std::sync::mpsc::{channel, Receiver, Sender}; +use std::sync::{Arc, Mutex}; +use std::{thread, time}; use chrono::prelude::*; use futures::future; @@ -14,7 +14,7 @@ use errors::*; use util::data_layer::AbstractionLayer; use util::distributed_filesystem::{WorkerInfoUpdate, WorkerInfoUpdateType}; use util::output_error; -use util::state::{StateHandling, SimpleStateHandling}; +use util::state::{SimpleStateHandling, StateHandling}; use worker_communication::WorkerInterface; use worker_management::state::State; @@ -56,9 +56,9 @@ impl WorkerManager { fn send_worker_info_update(&self, info_update: WorkerInfoUpdate) -> Result<()> { let worker_info_sender = self.worker_info_sender.lock().unwrap(); - worker_info_sender.send(info_update).chain_err( - || "Error sending worker info update.", - )?; + worker_info_sender + .send(info_update) + .chain_err(|| "Error sending worker info update.")?; Ok(()) } @@ -76,16 +76,16 @@ impl WorkerManager { pub fn register_worker(&self, worker: Worker) -> Result<()> { let mut state = self.state.lock().unwrap(); - self.worker_interface.add_client(&worker).chain_err( - || "Error registering worker.", - )?; + self.worker_interface + .add_client(&worker) + .chain_err(|| "Error registering worker.")?; let worker_id = worker.worker_id.clone(); let worker_address = Some(worker.address); - state.add_worker(worker).chain_err( - || "Error registering worker.", - )?; + state + .add_worker(worker) + .chain_err(|| "Error registering worker.")?; self.send_worker_info_update(WorkerInfoUpdate::new( WorkerInfoUpdateType::Available, @@ -108,21 +108,20 @@ impl WorkerManager { pub fn process_reduce_task_result(&self, reduce_result: &pb::ReduceResult) -> Result<()> { let mut state = self.state.lock().unwrap(); - let task = state.process_reduce_task_result(reduce_result).chain_err( - || "Error processing reduce result.", - )?; + let task = state + .process_reduce_task_result(reduce_result) + .chain_err(|| "Error 
processing reduce result.")?; info!( "Got result for reduce task {} from {}", - task.id, - reduce_result.worker_id + task.id, reduce_result.worker_id ); if task.status == TaskStatus::Complete || task.status == TaskStatus::Failed { let task_update_sender = self.task_update_sender.lock().unwrap(); - task_update_sender.send(task).chain_err( - || "Error processing reduce result.", - )?; + task_update_sender + .send(task) + .chain_err(|| "Error processing reduce result.")?; } state @@ -136,21 +135,20 @@ impl WorkerManager { pub fn process_map_task_result(&self, map_result: &pb::MapResult) -> Result<()> { let mut state = self.state.lock().unwrap(); - let task = state.process_map_task_result(map_result).chain_err( - || "Error processing map result.", - )?; + let task = state + .process_map_task_result(map_result) + .chain_err(|| "Error processing map result.")?; info!( "Got result for map task {} from {}", - task.id, - map_result.worker_id + task.id, map_result.worker_id ); if task.status == TaskStatus::Complete || task.status == TaskStatus::Failed { let task_update_sender = self.task_update_sender.lock().unwrap(); - task_update_sender.send(task).chain_err( - || "Error processing map result.", - )?; + task_update_sender + .send(task) + .chain_err(|| "Error processing map result.")?; } state @@ -179,9 +177,9 @@ impl WorkerManager { for worker_id in workers { // Clear the task from the worker so that we can ignore it's result. - let task_id = state.cancel_task_for_worker(&worker_id).chain_err(|| { - format!("Error cancelling task on worker: {}", worker_id) - })?; + let task_id = state + .cancel_task_for_worker(&worker_id) + .chain_err(|| format!("Error cancelling task on worker: {}", worker_id))?; // Create a request to cancel the task the worker is currently running. 
let mut cancel_request = pb::CancelTaskRequest::new(); @@ -235,12 +233,12 @@ impl WorkerManager { let state = self.state.lock().unwrap(); let workers = state.get_workers(); for worker in workers { - let time_since_worker_updated = Utc::now().timestamp() - - worker.status_last_updated.timestamp(); + let time_since_worker_updated = + Utc::now().timestamp() - worker.status_last_updated.timestamp(); if time_since_worker_updated >= TIME_BEFORE_WORKER_TERMINATION_S { workers_to_remove.push(worker.worker_id.to_owned()); - } else if time_since_worker_updated >= TIME_BEFORE_WORKER_TASK_REASSIGNMENT_S && - worker.current_task_id != "" + } else if time_since_worker_updated >= TIME_BEFORE_WORKER_TASK_REASSIGNMENT_S + && worker.current_task_id != "" { workers_to_reassign.push(worker.worker_id.to_owned()); } @@ -344,9 +342,9 @@ impl WorkerManager { pub fn handle_task_assignment_failure(&self, worker_id: &str) -> Result<()> { let mut state = self.state.lock().unwrap(); - state.unassign_worker(worker_id).chain_err(|| { - format!("Failed to unassign task from worker {}", worker_id) - })?; + state + .unassign_worker(worker_id) + .chain_err(|| format!("Failed to unassign task from worker {}", worker_id))?; state .increment_failed_task_assignments(worker_id) .chain_err(|| "Error when incrementing task assignment failures") @@ -354,9 +352,9 @@ impl WorkerManager { pub fn handle_task_assignment_success(&self, worker_id: &str) -> Result<()> { let mut state = self.state.lock().unwrap(); - state.reset_failed_task_assignments(worker_id).chain_err( - || "Error when recording task assignment success", - ) + state + .reset_failed_task_assignments(worker_id) + .chain_err(|| "Error when recording task assignment success") } pub fn run_task(&self, task: Task) { @@ -429,13 +427,13 @@ impl WorkerManager { info!("Removing worker {} from list of active workers.", worker_id); // Remove worker interface. 
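The health check above applies two thresholds to each worker's last status update: a quieter worker first has its current task reassigned, and only after a longer silence is it removed outright. A minimal free-standing sketch of that classification follows; the threshold values and the Worker struct here are stand-ins, not the project's own definitions.

    // Sketch only: the constants' values are assumed, and Worker is a
    // stand-in for the struct defined in the master's state module.
    extern crate chrono;

    use chrono::prelude::*;

    const TIME_BEFORE_WORKER_TASK_REASSIGNMENT_S: i64 = 60; // assumed value
    const TIME_BEFORE_WORKER_TERMINATION_S: i64 = 120; // assumed value

    struct Worker {
        current_task_id: String,
        status_last_updated: DateTime<Utc>,
    }

    enum HealthAction {
        Healthy,
        ReassignTask,
        Remove,
    }

    fn classify(worker: &Worker) -> HealthAction {
        let quiet_s = Utc::now().timestamp() - worker.status_last_updated.timestamp();
        if quiet_s >= TIME_BEFORE_WORKER_TERMINATION_S {
            HealthAction::Remove
        } else if quiet_s >= TIME_BEFORE_WORKER_TASK_REASSIGNMENT_S
            && worker.current_task_id != ""
        {
            HealthAction::ReassignTask
        } else {
            HealthAction::Healthy
        }
    }

Checking the termination threshold before the reassignment threshold matters: a worker past both limits should be removed, not handed more work.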
- self.worker_interface.remove_client(worker_id).chain_err( - || "Error removing worker client", - )?; + self.worker_interface + .remove_client(worker_id) + .chain_err(|| "Error removing worker client")?; - state.remove_worker(worker_id).chain_err( - || "Error removing worker from state", - )?; + state + .remove_worker(worker_id) + .chain_err(|| "Error removing worker from state")?; self.send_worker_info_update(WorkerInfoUpdate::new( WorkerInfoUpdateType::Unavailable, @@ -449,9 +447,7 @@ impl WorkerManager { pub fn handle_worker_report(&self, request: &pb::ReportWorkerRequest) -> Result<()> { info!( "Worker on '{}' failed to provide map output data to '{}' for task with ID {}", - request.report_address, - request.worker_id, - request.task_id, + request.report_address, request.worker_id, request.task_id, ); let mut state = self.state.lock().unwrap(); @@ -467,9 +463,9 @@ impl SimpleStateHandling for WorkerManager { fn load_state(&self, data: serde_json::Value) -> Result<()> { let mut state = self.state.lock().unwrap(); - state.load_state(data).chain_err( - || "Error creating worker manager from state.", - )?; + state + .load_state(data) + .chain_err(|| "Error creating worker manager from state.")?; let mut workers_to_remove = Vec::new(); { @@ -484,17 +480,17 @@ impl SimpleStateHandling for WorkerManager { )).chain_err(|| "Error sending worker info update")?; if let Err(e) = add_client_result { - output_error(&e.chain_err( - || format!("Unable to reconnect to worker {}", worker.worker_id), - )); + output_error(&e.chain_err(|| { + format!("Unable to reconnect to worker {}", worker.worker_id) + })); workers_to_remove.push(worker.worker_id.to_owned()); } } } for worker_id in workers_to_remove { - state.remove_worker(&worker_id).chain_err( - || "Error creating worker manager from state.", - )?; + state + .remove_worker(&worker_id) + .chain_err(|| "Error creating worker manager from state.")?; self.send_worker_info_update(WorkerInfoUpdate::new( WorkerInfoUpdateType::Unavailable, @@ -524,9 +520,7 @@ fn handle_task_assignment_result( ) { let worker_id = assignment_result.worker_id.clone(); if let Err(err) = assignment_result.result { - output_error(&err.chain_err( - || format!("Error assigning task to worker {}", worker_id), - )); + output_error(&err.chain_err(|| format!("Error assigning task to worker {}", worker_id))); let result = worker_manager.handle_task_assignment_failure(&worker_id); if let Err(err) = result { output_error(&err.chain_err(|| "Error handling task assignment failure.")); diff --git a/util/src/data_layer/nfs_layer.rs b/util/src/data_layer/nfs_layer.rs index b57c0a39..571de881 100644 --- a/util/src/data_layer/nfs_layer.rs +++ b/util/src/data_layer/nfs_layer.rs @@ -1,6 +1,6 @@ -use std::fs::{File, DirEntry}; use std::fs; -use std::io::{Read, Write, Seek, SeekFrom}; +use std::fs::{DirEntry, File}; +use std::io::{Read, Seek, SeekFrom, Write}; use std::path::{Path, PathBuf}; use errors::*; @@ -13,14 +13,15 @@ pub struct NFSAbstractionLayer { impl NFSAbstractionLayer { pub fn new(nfs_path: &Path) -> Self { - NFSAbstractionLayer { nfs_path: PathBuf::from(nfs_path) } + NFSAbstractionLayer { + nfs_path: PathBuf::from(nfs_path), + } } fn abstracted_path(&self, path: &Path) -> Result { if path.starts_with(self.nfs_path.clone()) { - let abstracted_path = path.strip_prefix(self.nfs_path.as_path()).chain_err( - || "Unable to strip prefix from path", - )?; + let abstracted_path = path.strip_prefix(self.nfs_path.as_path()) + .chain_err(|| "Unable to strip prefix from path")?; return 
Ok(PathBuf::from(abstracted_path)); } Ok(PathBuf::from(path)) @@ -29,13 +30,11 @@ impl NFSAbstractionLayer { fn absolute_path(&self, path: &Path) -> Result { debug!( "Attempting to get absolute path: {:?}, {:?}", - self.nfs_path, - path + self.nfs_path, path ); - let relative_path = path.strip_prefix("/").chain_err( - || "Error occured stripping prefix", - )?; + let relative_path = path.strip_prefix("/") + .chain_err(|| "Error occurred stripping prefix")?; Ok(self.nfs_path.join(relative_path)) } @@ -51,9 +50,7 @@ impl AbstractionLayer for NFSAbstractionLayer { debug!("Getting file length: {:?}", path); let file_path = self.absolute_path(path).chain_err(|| "Unable to get path")?; - let metadata = fs::metadata(file_path).chain_err( - || "Error getting metadata", - )?; + let metadata = fs::metadata(file_path).chain_err(|| "Error getting metadata")?; Ok(metadata.len()) } @@ -62,14 +59,12 @@ impl AbstractionLayer for NFSAbstractionLayer { debug!("Reading file: {:?}", path); let mut file = self.open_file(path)?; - file.seek(SeekFrom::Start(start_byte)).chain_err(|| { - format!("Error reading file {:?}", path) - })?; + file.seek(SeekFrom::Start(start_byte)) + .chain_err(|| format!("Error reading file {:?}", path))?; let mut bytes = vec![0; (end_byte - start_byte) as usize]; - file.read_exact(&mut bytes).chain_err(|| { - format!("Error reading file {:?}", path) - })?; + file.read_exact(&mut bytes) + .chain_err(|| format!("Error reading file {:?}", path))?; Ok(bytes) } @@ -77,16 +72,13 @@ impl AbstractionLayer for NFSAbstractionLayer { fn write_file(&self, path: &Path, data: &[u8]) -> Result<()> { let file_path = self.absolute_path(path).chain_err(|| "Unable to get path")?; debug!("Writing file: {}", file_path.to_string_lossy()); - let mut file = File::create(file_path.clone()).chain_err(|| { - format!("unable to create file {:?}", file_path) - })?; + let mut file = File::create(file_path.clone()) + .chain_err(|| format!("unable to create file {:?}", file_path))?; - file.write_all(data).chain_err(|| { - format!("unable to write content to {:?}", file_path) - }) + file.write_all(data) + .chain_err(|| format!("unable to write content to {:?}", file_path)) } - fn get_local_file(&self, path: &Path) -> Result { self.absolute_path(path) } @@ -94,15 +86,13 @@ impl AbstractionLayer for NFSAbstractionLayer { fn read_dir(&self, path: &Path) -> Result> { let absolute_path = self.absolute_path(path).chain_err(|| "Unable to get path")?; debug!("Reading from {:?}", absolute_path); - let entries = fs::read_dir(absolute_path.as_path()).chain_err( - || "Unable to read input directroy", - )?; + let entries = + fs::read_dir(absolute_path.as_path()).chain_err(|| "Unable to read input directory")?; let mut abstracted_entries: Vec = vec![]; for entry in entries { let entry: DirEntry = entry.chain_err(|| "Error reading input directory")?; - let abstracted_path = self.abstracted_path(&entry.path()).chain_err( - || "Unable to get abstracted path", - )?; + let abstracted_path = self.abstracted_path(&entry.path()) + .chain_err(|| "Unable to get abstracted path")?; abstracted_entries.push(Path::new("/").join(abstracted_path)) } Ok(abstracted_entries) @@ -124,9 +114,8 @@ impl AbstractionLayer for NFSAbstractionLayer { } fn create_dir_all(&self, path: &Path) -> Result<()> { - let absolute_path = self.absolute_path(path).chain_err( - || "Unable to get absolute_path", - )?; + let absolute_path = self.absolute_path(path) + .chain_err(|| "Unable to get absolute_path")?; debug!("Creating directory: {:?}", absolute_path); 
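The two path helpers above are near-inverses: absolute_path re-roots an abstracted path such as /input/chunk_0 under the NFS mount, while abstracted_path strips the mount prefix back off (the leading slash is re-added by read_dir). A condensed round-trip sketch using only std::path; the mount point and file names are illustrative:

    use std::path::{Path, PathBuf};

    // Mirror of absolute_path: drop the leading "/" and join onto the mount.
    fn absolute_path(nfs_path: &Path, path: &Path) -> PathBuf {
        let relative = path.strip_prefix("/").unwrap_or(path);
        nfs_path.join(relative)
    }

    // Mirror of abstracted_path, with the leading "/" folded in here rather
    // than at the read_dir call site.
    fn abstracted_path(nfs_path: &Path, path: &Path) -> PathBuf {
        match path.strip_prefix(nfs_path) {
            Ok(stripped) => Path::new("/").join(stripped),
            Err(_) => PathBuf::from(path),
        }
    }

    fn main() {
        let mount = Path::new("/mnt/nfs");
        let abs = absolute_path(mount, Path::new("/input/chunk_0"));
        assert_eq!(abs, Path::new("/mnt/nfs/input/chunk_0"));
        assert_eq!(abstracted_path(mount, &abs), Path::new("/input/chunk_0"));
    }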
fs::create_dir_all(&absolute_path.as_path()).chain_err(|| "Unable to create directories") } diff --git a/util/src/data_layer/null_layer.rs b/util/src/data_layer/null_layer.rs index 2a9b6ccf..7f7414c3 100644 --- a/util/src/data_layer/null_layer.rs +++ b/util/src/data_layer/null_layer.rs @@ -1,6 +1,6 @@ -use std::fs::{File, DirEntry}; use std::fs; -use std::io::{Read, Write, Seek, SeekFrom}; +use std::fs::{DirEntry, File}; +use std::io::{Read, Seek, SeekFrom, Write}; use std::path::{Path, PathBuf}; use errors::*; @@ -35,27 +35,23 @@ impl AbstractionLayer for NullAbstractionLayer { debug!("Reading file: {:?}", path); let mut file = self.open_file(path)?; - file.seek(SeekFrom::Start(start_byte)).chain_err(|| { - format!("Error reading file {:?}", path) - })?; + file.seek(SeekFrom::Start(start_byte)) + .chain_err(|| format!("Error reading file {:?}", path))?; let mut bytes = vec![0; (end_byte - start_byte) as usize]; - file.read_exact(&mut bytes).chain_err(|| { - format!("Error reading file {:?}", path) - })?; + file.read_exact(&mut bytes) + .chain_err(|| format!("Error reading file {:?}", path))?; Ok(bytes) } fn write_file(&self, path: &Path, data: &[u8]) -> Result<()> { debug!("Writing file: {}", path.to_string_lossy()); - let mut file = File::create(&path).chain_err(|| { - format!("unable to create file {:?}", path) - })?; + let mut file = + File::create(&path).chain_err(|| format!("unable to create file {:?}", path))?; - file.write_all(data).chain_err(|| { - format!("unable to write content to {:?}", path) - }) + file.write_all(data) + .chain_err(|| format!("unable to write content to {:?}", path)) } fn get_local_file(&self, path: &Path) -> Result { @@ -63,9 +59,7 @@ impl AbstractionLayer for NullAbstractionLayer { } fn read_dir(&self, path: &Path) -> Result> { - let entries = fs::read_dir(path).chain_err( - || "Unable to read input directroy", - )?; + let entries = fs::read_dir(path).chain_err(|| "Unable to read input directory")?; let mut pathbufs: Vec = vec![]; for entry in entries { let entry: DirEntry = entry.chain_err(|| "Error reading input directory")?; diff --git a/util/src/data_layer/s3_layer.rs b/util/src/data_layer/s3_layer.rs index 21b4508c..7855c0bf 100644 --- a/util/src/data_layer/s3_layer.rs +++ b/util/src/data_layer/s3_layer.rs @@ -1,14 +1,14 @@ +use futures::{Future, Stream}; use std::path::{Path, PathBuf}; use std::sync::Arc; -use futures::{Future, Stream}; use rusoto_core::Region; -use rusoto_s3::{S3Client, S3}; use rusoto_s3; +use rusoto_s3::{S3, S3Client}; -use errors::*; -use distributed_filesystem::LocalFileManager; use data_layer::abstraction_layer::AbstractionLayer; +use distributed_filesystem::LocalFileManager; +use errors::*; const S3_REGION: Region = Region::EuWest1; @@ -28,9 +28,8 @@ impl AmazonS3AbstractionLayer { local_file_manager, }; - let exists = s3.bucket_exists().chain_err( - || "Unable to check if bucket exists", - )?; + let exists = s3.bucket_exists() + .chain_err(|| "Unable to check if bucket exists")?; if !exists { return Err(format!("Bucket '{}' does not exist.", s3.bucket).into()); } @@ -41,25 +40,21 @@ impl AmazonS3AbstractionLayer { fn abstracted_path(&self, path: &Path) -> Result { let stripped_path = { if path.starts_with("/") { - path.strip_prefix("/").chain_err( - || "Unable to strip prefix from path", - )? + path.strip_prefix("/") + .chain_err(|| "Unable to strip prefix from path")? 
} else { path } }; match stripped_path.to_str() { Some(string) => Ok(string.to_owned()), - None => Err( - format!("Unable to convert path '{:?}' to a String", path).into(), - ), + None => Err(format!("Unable to convert path '{:?}' to a String", path).into()), } } pub fn file_metadata(&self, path: &Path) -> Result { - let abstracted_path = self.abstracted_path(path).chain_err( - || "Unable to convert PathBuf to a String", - )?; + let abstracted_path = self.abstracted_path(path) + .chain_err(|| "Unable to convert PathBuf to a String")?; let request = rusoto_s3::HeadObjectRequest { bucket: self.bucket.clone(), @@ -88,9 +83,10 @@ impl AmazonS3AbstractionLayer { } pub fn bucket_exists(&self) -> Result { - let result = self.client.list_buckets().sync().chain_err( - || "Unable to retrieve bucket list", - )?; + let result = self.client + .list_buckets() + .sync() + .chain_err(|| "Unable to retrieve bucket list")?; match result.buckets { Some(buckets) => { @@ -112,24 +108,19 @@ impl AmazonS3AbstractionLayer { impl AbstractionLayer for AmazonS3AbstractionLayer { fn get_file_length(&self, path: &Path) -> Result { - let metadata = self.file_metadata(path).chain_err( - || "Unable to get metadata for file", - )?; + let metadata = self.file_metadata(path) + .chain_err(|| "Unable to get metadata for file")?; if let Some(size) = metadata.content_length { return Ok(size as u64); } - Err( - format!("Unable to get content length of file: {:?}", path).into(), - ) + Err(format!("Unable to get content length of file: {:?}", path).into()) } fn read_file_location(&self, path: &Path, start_byte: u64, end_byte: u64) -> Result> { - let abstracted_path = self.abstracted_path(path).chain_err( - || "Unable to get abstracted path", - )?; - + let abstracted_path = self.abstracted_path(path) + .chain_err(|| "Unable to get abstracted path")?; // This range includes the start/end byte. Since we don't want to include the end byte // we subtract 1 here. @@ -157,26 +148,27 @@ impl AbstractionLayer for AmazonS3AbstractionLayer { version_id: None, }; - let response = self.client.get_object(&request).sync().chain_err( - || "Unable to get object", - )?; + let response = self.client + .get_object(&request) + .sync() + .chain_err(|| "Unable to get object")?; let streaming_body = match response.body { Some(body) => body, None => return Err("Object has no body".into()), }; - let result: Vec = streaming_body.concat2().wait().chain_err( - || "Unable to get body of file", - )?; + let result: Vec = streaming_body + .concat2() + .wait() + .chain_err(|| "Unable to get body of file")?; Ok(result) } fn write_file(&self, path: &Path, data: &[u8]) -> Result<()> { - let abstracted_path = self.abstracted_path(path).chain_err( - || "Unable to get abstracted path", - )?; + let abstracted_path = self.abstracted_path(path) + .chain_err(|| "Unable to get abstracted path")?; let request = rusoto_s3::PutObjectRequest { bucket: self.bucket.clone(), @@ -208,30 +200,27 @@ impl AbstractionLayer for AmazonS3AbstractionLayer { website_redirect_location: None, }; - self.client.put_object(&request).wait().chain_err( - || "Unable to put object into bucket", - )?; + self.client + .put_object(&request) + .wait() + .chain_err(|| "Unable to put object into bucket")?; Ok(()) } fn get_local_file(&self, path: &Path) -> Result { // Return the path to the local file if we have it. 
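As the comment inside read_file_location notes, an S3 Range header is inclusive at both ends while the abstraction layer treats end_byte as exclusive, hence the subtraction of one. A tiny hypothetical helper (not part of the codebase) makes the conversion explicit:

    // end_byte is exclusive in the AbstractionLayer API; HTTP byte ranges
    // are inclusive, so the last byte requested is end_byte - 1.
    fn s3_range_header(start_byte: u64, end_byte: u64) -> String {
        format!("bytes={}-{}", start_byte, end_byte - 1)
    }

    fn main() {
        // The first 1024 bytes of an object are bytes 0 through 1023.
        assert_eq!(s3_range_header(0, 1024), "bytes=0-1023");
    }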
- if let Some(local_file_path) = - self.local_file_manager.get_local_file( - &path.to_string_lossy(), - ) + if let Some(local_file_path) = self.local_file_manager + .get_local_file(&path.to_string_lossy()) { return Ok(PathBuf::from(local_file_path)); } // Otherwise download the file and return it's path. info!("Downloading remote file: {:?}", path); - let file_length = self.get_file_length(path).chain_err( - || "Error getting file length", - )?; - let data = self.read_file_location(path, 0, file_length).chain_err( - || "Error reading remote file", - )?; + let file_length = self.get_file_length(path) + .chain_err(|| "Error getting file length")?; + let data = self.read_file_location(path, 0, file_length) + .chain_err(|| "Error reading remote file")?; let local_file_path = self.local_file_manager .write_local_file(&path.to_string_lossy(), &data) @@ -242,9 +231,8 @@ impl AbstractionLayer for AmazonS3AbstractionLayer { } fn read_dir(&self, path: &Path) -> Result> { - let mut abstracted_path = self.abstracted_path(path).chain_err( - || "Unable to convert Path to a String", - )?; + let mut abstracted_path = self.abstracted_path(path) + .chain_err(|| "Unable to convert Path to a String")?; abstracted_path = format!("{}/", abstracted_path); let request = rusoto_s3::ListObjectsV2Request { bucket: self.bucket.clone(), @@ -259,9 +247,10 @@ impl AbstractionLayer for AmazonS3AbstractionLayer { }; let mut files: Vec = vec![]; - let response = self.client.list_objects_v2(&request).sync().chain_err( - || "Unable to get list of objects from bucket", - )?; + let response = self.client + .list_objects_v2(&request) + .sync() + .chain_err(|| "Unable to get list of objects from bucket")?; if let Some(contents) = response.contents { for object in contents { if let Some(file_name) = object.key { @@ -294,9 +283,8 @@ impl AbstractionLayer for AmazonS3AbstractionLayer { // S3 automatically creates folders if they don't exist, so we can just return true if the path // isn't a file. 
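get_local_file above is a cache-or-fetch: the LocalFileManager is consulted first, and only on a miss is the whole object downloaded and recorded locally. A condensed sketch with closures standing in for the manager and the S3 calls; the paths and sizes are illustrative only:

    // `cached` and `write_local` stand in for the LocalFileManager calls;
    // `file_length` and `read_range` stand in for the S3 requests.
    fn get_local_file(
        cached: impl Fn(&str) -> Option<String>,
        file_length: impl Fn(&str) -> u64,
        read_range: impl Fn(&str, u64, u64) -> Vec<u8>,
        write_local: impl Fn(&str, &[u8]) -> String,
        path: &str,
    ) -> String {
        if let Some(local_path) = cached(path) {
            return local_path; // cache hit: the file was downloaded before
        }
        let length = file_length(path);
        let data = read_range(path, 0, length); // fetch the whole object
        write_local(path, &data) // cache it and return where it landed
    }

    fn main() {
        let local = get_local_file(
            |_| None,
            |_| 4096,
            |_, start, end| vec![0u8; (end - start) as usize],
            |_, _| "/tmp/cerberus/dfs/cached-file".to_owned(),
            "/input/file_0",
        );
        assert_eq!(local, "/tmp/cerberus/dfs/cached-file");
    }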
fn is_dir(&self, path: &Path) -> Result<(bool)> { - let is_file = self.is_file(path).chain_err( - || "Unable to check if path is a file", - )?; + let is_file = self.is_file(path) + .chain_err(|| "Unable to check if path is a file")?; Ok(!is_file) } diff --git a/util/src/distributed_filesystem/filesystem_manager.rs b/util/src/distributed_filesystem/filesystem_manager.rs index 19c0dae1..97a2d5b0 100644 --- a/util/src/distributed_filesystem/filesystem_manager.rs +++ b/util/src/distributed_filesystem/filesystem_manager.rs @@ -1,17 +1,17 @@ -use std::collections::HashMap; use std::cmp::max; +use std::collections::HashMap; use std::net::SocketAddr; use std::path::PathBuf; -use std::sync::{Arc, RwLock}; use std::sync::mpsc::Receiver; +use std::sync::{Arc, RwLock}; use std::thread; use protobuf::repeated::RepeatedField; use rand::random; use cerberus_proto::filesystem as pb; -use errors::*; use distributed_filesystem::{FileChunk, FileSystemWorkerInterface}; +use errors::*; use logging::output_error; use serde_json; use state::SimpleStateHandling; @@ -98,12 +98,12 @@ impl FileSystemManager { fn update_file_info(&self, file_path: &str, file_chunk: FileChunk) { let mut file_info_map = self.file_info_map.write().unwrap(); - let file_info = file_info_map.entry(file_path.to_owned()).or_insert( - FileInfo { + let file_info = file_info_map + .entry(file_path.to_owned()) + .or_insert(FileInfo { length: 0, chunks: Vec::new(), - }, - ); + }); file_info.length = max(file_info.length, file_chunk.end_byte); file_info.chunks.push(file_chunk); @@ -161,8 +161,8 @@ impl FileSystemManager { let distribution_level = self.get_distribution_level(active_workers.len()); let mut distribution_count = 0; let mut distribution_failures = 0; - while distribution_count < distribution_level && - distribution_failures < MAX_DISTRIBUTION_FAILURES + while distribution_count < distribution_level + && distribution_failures < MAX_DISTRIBUTION_FAILURES { let worker = self.get_random_worker(&active_workers); let worker_addr = active_workers @@ -186,9 +186,9 @@ impl FileSystemManager { used_workers.push(worker); } Err(err) => { - output_error(&err.chain_err( - || format!("Error storing file on worker {}", worker), - )); + output_error(&err.chain_err(|| { + format!("Error storing file on worker {}", worker) + })); distribution_failures += 1; if distribution_count + active_workers.len() > distribution_level { active_workers.remove(&worker); @@ -251,8 +251,8 @@ impl FileSystemManager { } else { for chunk in &file_info.chunks { // Check if chunk is in the range. 
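The range check that follows this comment is a standard half-open interval intersection: a chunk [c_start, c_end) overlaps a request [r_start, r_end) exactly when each interval starts before the other ends. The same predicate in isolation, with a few boundary cases:

    // Half-open intervals: end points are exclusive, so intervals that
    // merely touch do not overlap.
    fn overlaps(c_start: u64, c_end: u64, r_start: u64, r_end: u64) -> bool {
        (r_start >= c_start && r_start < c_end) || (c_start >= r_start && c_start < r_end)
    }

    fn main() {
        assert!(overlaps(0, 100, 50, 150)); // partial overlap
        assert!(overlaps(0, 100, 25, 75)); // request inside chunk
        assert!(!overlaps(0, 100, 100, 200)); // touching at 100: no overlap
    }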
- if (start_byte >= chunk.start_byte && start_byte < chunk.end_byte) || - (chunk.start_byte >= start_byte && chunk.start_byte < end_byte) + if (start_byte >= chunk.start_byte && start_byte < chunk.end_byte) + || (chunk.start_byte >= start_byte && chunk.start_byte < end_byte) { let pb_chunk = self.convert_file_chunk(chunk, &active_workers); response.chunks.push(pb_chunk); @@ -300,7 +300,6 @@ impl FileSystemManager { response.set_children(RepeatedField::from_vec(dir_info.children.clone())); return response; } - } let mut response = pb::FileInfoResponse::new(); @@ -311,9 +310,9 @@ impl FileSystemManager { pub fn process_worker_info_update(&self, worker_info_update: &WorkerInfoUpdate) -> Result<()> { let mut worker_map = self.active_workers.write().unwrap(); if worker_info_update.update_type == WorkerInfoUpdateType::Available { - let address = worker_info_update.address.chain_err( - || "No address when adding available worker", - )?; + let address = worker_info_update + .address + .chain_err(|| "No address when adding available worker")?; worker_map.insert(worker_info_update.worker_id.clone(), address); } else { diff --git a/util/src/distributed_filesystem/filesystem_master_interface.rs b/util/src/distributed_filesystem/filesystem_master_interface.rs index 84064f51..8b13d2dc 100644 --- a/util/src/distributed_filesystem/filesystem_master_interface.rs +++ b/util/src/distributed_filesystem/filesystem_master_interface.rs @@ -140,11 +140,8 @@ impl LocalFileSystemMasterInterface { impl FileSystemMasterInterface for LocalFileSystemMasterInterface { fn upload_file_chunk(&self, file_path: &str, start_byte: u64, data: Vec) -> Result<()> { - self.filesystem_manager.upload_file_chunk( - file_path, - start_byte, - &data, - ) + self.filesystem_manager + .upload_file_chunk(file_path, start_byte, &data) } // Gets the locations for the entire file if end_byte is 0 @@ -154,11 +151,8 @@ impl FileSystemMasterInterface for LocalFileSystemMasterInterface { start_byte: u64, end_byte: u64, ) -> Result { - self.filesystem_manager.get_file_location( - file_path, - start_byte, - end_byte, - ) + self.filesystem_manager + .get_file_location(file_path, start_byte, end_byte) } fn get_file_chunks(&self, file_path: &str) -> Result> { diff --git a/util/src/distributed_filesystem/filesystem_worker_interface.rs b/util/src/distributed_filesystem/filesystem_worker_interface.rs index 5c97e7c6..3e159708 100644 --- a/util/src/distributed_filesystem/filesystem_worker_interface.rs +++ b/util/src/distributed_filesystem/filesystem_worker_interface.rs @@ -1,4 +1,3 @@ - use std::collections::HashMap; use std::net::SocketAddr; use std::str::FromStr; @@ -33,9 +32,8 @@ impl FileSystemWorkerInterface { } } - let worker_socket_addr = SocketAddr::from_str(worker_addr).chain_err( - || "Invalid worker address", - )?; + let worker_socket_addr = + SocketAddr::from_str(worker_addr).chain_err(|| "Invalid worker address")?; let client = grpc_pb::FileSystemWorkerServiceClient::new_plain( &worker_socket_addr.ip().to_string(), @@ -56,9 +54,8 @@ impl FileSystemWorkerInterface { start_byte: u64, data: Vec, ) -> Result<()> { - self.create_client_if_required(worker_addr).chain_err( - || "Error creating client", - )?; + self.create_client_if_required(worker_addr) + .chain_err(|| "Error creating client")?; let mut request = pb::StoreFileRequest::new(); request.set_file_path(file_path.to_owned()); @@ -86,9 +83,8 @@ impl FileSystemWorkerInterface { start_byte: u64, end_byte: u64, ) -> Result> { - self.create_client_if_required(worker_addr).chain_err( - || "Error 
creating client", - )?; + self.create_client_if_required(worker_addr) + .chain_err(|| "Error creating client")?; let mut request = pb::ReadFileRequest::new(); request.set_file_path(file_path.to_owned()); diff --git a/util/src/distributed_filesystem/local_file_manager.rs b/util/src/distributed_filesystem/local_file_manager.rs index b253bd0c..b72037fb 100644 --- a/util/src/distributed_filesystem/local_file_manager.rs +++ b/util/src/distributed_filesystem/local_file_manager.rs @@ -1,10 +1,10 @@ -use std::collections::HashMap; use std::cmp::{max, min}; +use std::collections::HashMap; use std::fs::{DirBuilder, File, OpenOptions}; -use std::io::{Read, Write, Seek, SeekFrom}; -use std::sync::RwLock; -use std::path::PathBuf; +use std::io::{Read, Seek, SeekFrom, Write}; use std::os::unix::fs::{DirBuilderExt, OpenOptionsExt}; +use std::path::PathBuf; +use std::sync::RwLock; use uuid::Uuid; @@ -49,9 +49,9 @@ impl LocalFileManager { let mut dir_builder = DirBuilder::new(); let dir_builder = dir_builder.recursive(true).mode(0o777); - dir_builder.create(&storage_path).chain_err( - || "Failed to create storage directory", - )?; + dir_builder + .create(&storage_path) + .chain_err(|| "Failed to create storage directory")?; let file_name = Uuid::new_v4().to_string(); storage_path.push(file_name); @@ -64,15 +64,13 @@ impl LocalFileManager { storage_path.to_string_lossy() ); - let mut file = File::create(storage_path.clone()).chain_err( - || "Unable to create file", - )?; + let mut file = File::create(storage_path.clone()).chain_err(|| "Unable to create file")?; file.write_all(data).chain_err(|| "Unable to write data")?; let mut local_file_map = self.local_file_map.write().unwrap(); - let chunks = local_file_map.entry(file_path.to_owned()).or_insert_with( - Vec::new, - ); + let chunks = local_file_map + .entry(file_path.to_owned()) + .or_insert_with(Vec::new); let file_chunk = FileChunk { local_file_path: storage_path, @@ -96,9 +94,9 @@ impl LocalFileManager { let mut dir_builder = DirBuilder::new(); let dir_builder = dir_builder.recursive(true).mode(0o777); - dir_builder.create(&storage_path).chain_err( - || "Failed to create storage directory", - )?; + dir_builder + .create(&storage_path) + .chain_err(|| "Failed to create storage directory")?; let file_name = Uuid::new_v4().to_string(); storage_path.push(file_name); @@ -112,9 +110,8 @@ impl LocalFileManager { } pub fn write_local_file(&self, file_path: &str, data: &[u8]) -> Result { - let storage_path = self.get_new_local_file_path().chain_err( - || "Error writing local file", - )?; + let storage_path = self.get_new_local_file_path() + .chain_err(|| "Error writing local file")?; let mut options = OpenOptions::new(); options.read(true); @@ -123,9 +120,9 @@ impl LocalFileManager { options.create(true); options.mode(0o777); - let mut file = options.open(storage_path.clone()).chain_err( - || "Unable to create file", - )?; + let mut file = options + .open(storage_path.clone()) + .chain_err(|| "Unable to create file")?; file.write_all(data).chain_err(|| "Unable to write data")?; self.complete_local_file(file_path, &storage_path); @@ -143,11 +140,7 @@ impl LocalFileManager { let local_file_map = self.local_file_map.read().unwrap(); let stored_chunks = match local_file_map.get(file_path) { Some(stored_chunks) => stored_chunks, - None => { - return Err( - format!("No stored file chunks found for {}", file_path).into(), - ) - } + None => return Err(format!("No stored file chunks found for {}", file_path).into()), }; let mut file_chunk = None; @@ -162,24 +155,18 @@ 
impl LocalFileManager { let bytes_to_read = end_byte - start_byte; let mut bytes = vec![0u8; bytes_to_read as usize]; - let mut file = File::open(chunk.local_file_path.clone()).chain_err(|| { - format!("Error opening file chunk {:?}", chunk.local_file_path) - })?; + let mut file = File::open(chunk.local_file_path.clone()) + .chain_err(|| format!("Error opening file chunk {:?}", chunk.local_file_path))?; file.seek(SeekFrom::Start(start_byte - chunk.start_byte)) - .chain_err(|| { - format!("Error reading file chunk {:?}", chunk.local_file_path) - })?; + .chain_err(|| format!("Error reading file chunk {:?}", chunk.local_file_path))?; - file.read_exact(&mut bytes).chain_err(|| { - format!("Error reading file chunk {:?}", chunk.local_file_path) - })?; + file.read_exact(&mut bytes) + .chain_err(|| format!("Error reading file chunk {:?}", chunk.local_file_path))?; return Ok(bytes); } - Err( - format!("Stored file chunk not found for {}", file_path).into(), - ) + Err(format!("Stored file chunk not found for {}", file_path).into()) } /// `read_local_file` reads the portion of the requested file that is stored localy and @@ -265,9 +252,7 @@ impl SimpleStateHandling for LocalFileManager { let complete_file_map = self.complete_file_map.read().unwrap(); let mut complete_file_vec: Vec = Vec::new(); for (remote_path, local_path) in complete_file_map.iter() { - complete_file_vec.push( - json!({"remote_path": remote_path, "local_path": local_path}), - ); + complete_file_vec.push(json!({"remote_path": remote_path, "local_path": local_path})); } Ok(json!({ diff --git a/util/src/distributed_filesystem/mod.rs b/util/src/distributed_filesystem/mod.rs index c8266ca7..3e9573d3 100644 --- a/util/src/distributed_filesystem/mod.rs +++ b/util/src/distributed_filesystem/mod.rs @@ -5,13 +5,13 @@ mod filesystem_worker_interface; mod local_file_manager; pub use self::distributed_file_layer::DFSAbstractionLayer; +pub use self::filesystem_manager::run_worker_info_upate_loop; +pub use self::filesystem_manager::FileSystemManager; pub use self::filesystem_manager::WorkerInfoUpdate; pub use self::filesystem_manager::WorkerInfoUpdateType; -pub use self::filesystem_manager::FileSystemManager; -pub use self::filesystem_manager::run_worker_info_upate_loop; pub use self::filesystem_master_interface::FileChunk; -pub use self::filesystem_worker_interface::FileSystemWorkerInterface; pub use self::filesystem_master_interface::FileSystemMasterInterface; -pub use self::filesystem_master_interface::NetworkFileSystemMasterInterface; pub use self::filesystem_master_interface::LocalFileSystemMasterInterface; +pub use self::filesystem_master_interface::NetworkFileSystemMasterInterface; +pub use self::filesystem_worker_interface::FileSystemWorkerInterface; pub use self::local_file_manager::LocalFileManager; diff --git a/util/src/lib.rs b/util/src/lib.rs index 4936d4c7..8481d52d 100644 --- a/util/src/lib.rs +++ b/util/src/lib.rs @@ -13,10 +13,10 @@ extern crate serde_derive; extern crate serde; #[macro_use] extern crate serde_json; -extern crate uuid; +extern crate futures; extern crate rusoto_core; extern crate rusoto_s3; -extern crate futures; +extern crate uuid; pub mod errors { error_chain!{} diff --git a/util/src/state/mod.rs b/util/src/state/mod.rs index 60e930dd..81cbaa9a 100644 --- a/util/src/state/mod.rs +++ b/util/src/state/mod.rs @@ -1,4 +1,4 @@ mod traits; -pub use self::traits::StateHandling; pub use self::traits::SimpleStateHandling; +pub use self::traits::StateHandling; diff --git 
a/worker/src/communication/intermediate_data_fetching.rs b/worker/src/communication/intermediate_data_fetching.rs index 68856ad9..cf60acf5 100644 --- a/worker/src/communication/intermediate_data_fetching.rs +++ b/worker/src/communication/intermediate_data_fetching.rs @@ -1,9 +1,9 @@ -use futures::Future; use futures::future; +use futures::Future; use futures_cpupool::CpuPool; -use errors::*; use super::worker_interface::WorkerInterface; +use errors::*; use operations::OperationResources; const INPUT_FETCHING_CPU_POOL_SIZE: usize = 20; @@ -31,14 +31,13 @@ pub fn fetch_reduce_inputs( Ok(input) => future::ok(input), Err(err) => future::err(err), } - }); input_futures.push(input_future); } let results_future = future::join_all(input_futures); - results_future.wait().chain_err( - || "Error running fetch reduce input futures", - ) + results_future + .wait() + .chain_err(|| "Error running fetch reduce input futures") } diff --git a/worker/src/communication/worker_interface.rs b/worker/src/communication/worker_interface.rs index 279f2573..062f3a68 100644 --- a/worker/src/communication/worker_interface.rs +++ b/worker/src/communication/worker_interface.rs @@ -1,6 +1,6 @@ use std::net::SocketAddr; -use std::str::FromStr; use std::path::Path; +use std::str::FromStr; use grpc::RequestOptions; use std::error::Error; @@ -8,10 +8,10 @@ use std::error::Error; use errors::*; use operations::io; -use operations::OperationResources; use cerberus_proto::worker as pb; use cerberus_proto::worker_grpc as grpc_pb; -use cerberus_proto::worker_grpc::IntermediateDataService; // For pub functions only +use cerberus_proto::worker_grpc::IntermediateDataService; +use operations::OperationResources; // For pub functions only const INTERMEDIATE_DATA_RETRIES: u8 = 3; @@ -28,9 +28,8 @@ impl WorkerInterface { ) -> Result { let path_str = path.as_ref().to_string_lossy(); let split_path: Vec<&str> = path_str.splitn(2, '/').collect(); - let worker_addr = SocketAddr::from_str(split_path[0]).chain_err( - || "Unable to parse worker address", - )?; + let worker_addr = + SocketAddr::from_str(split_path[0]).chain_err(|| "Unable to parse worker address")?; let file = format!("/{}", split_path[1]); info!("getting {} from {}", &file, worker_addr); @@ -45,9 +44,8 @@ impl WorkerInterface { let res = WorkerInterface::request_data(worker_addr, req, resources, task_id) .chain_err(|| format!("Failed to get {} from {}", file, worker_addr))?; - String::from_utf8(res.get_data().to_vec()).chain_err( - || "Unable to convert returned data to string", - ) + String::from_utf8(res.get_data().to_vec()) + .chain_err(|| "Unable to convert returned data to string") } pub fn request_data( @@ -61,16 +59,13 @@ impl WorkerInterface { &worker_addr.ip().to_string(), worker_addr.port(), Default::default(), - ).chain_err(|| { - format!("Error building client for worker {}", worker_addr) - })?; + ).chain_err(|| format!("Error building client for worker {}", worker_addr))?; for i in 0..INTERMEDIATE_DATA_RETRIES { let response = client .get_intermediate_data(RequestOptions::new(), req.clone()) .wait(); - if let Ok(res) = response { return Ok(res.1); } @@ -86,7 +81,6 @@ impl WorkerInterface { ); } }; - } // At this point we have failed to contact the worker multiple times and should report this @@ -101,11 +95,9 @@ impl WorkerInterface { ) .chain_err(|| "Unable to report worker")?; - Err( - format!( - "Unable to get intermediate data after {} attempts", - INTERMEDIATE_DATA_RETRIES - ).into(), - ) + Err(format!( + "Unable to get intermediate data after {} attempts", + 
INTERMEDIATE_DATA_RETRIES + ).into()) } } diff --git a/worker/src/initialization/data_layer.rs b/worker/src/initialization/data_layer.rs index 1eb23c7a..d75078d7 100644 --- a/worker/src/initialization/data_layer.rs +++ b/worker/src/initialization/data_layer.rs @@ -1,13 +1,13 @@ use std::net::SocketAddr; -use std::sync::Arc; use std::path::{Path, PathBuf}; +use std::sync::Arc; use clap::ArgMatches; use errors::*; -use util::data_layer::{AbstractionLayer, AmazonS3AbstractionLayer, NullAbstractionLayer, - NFSAbstractionLayer}; -use util::distributed_filesystem::{LocalFileManager, DFSAbstractionLayer, +use util::data_layer::{AbstractionLayer, AmazonS3AbstractionLayer, NFSAbstractionLayer, + NullAbstractionLayer}; +use util::distributed_filesystem::{DFSAbstractionLayer, LocalFileManager, NetworkFileSystemMasterInterface}; const DEFAULT_DFS_DIRECTORY: &str = "/tmp/cerberus/dfs/"; @@ -35,10 +35,8 @@ pub fn get_data_abstraction_layer( let local_file_manager_arc = Arc::new(LocalFileManager::new(storage_dir)); - let master_interface = Box::new( - NetworkFileSystemMasterInterface::new(master_addr) - .chain_err(|| "Error creating filesystem master interface.")?, - ); + let master_interface = Box::new(NetworkFileSystemMasterInterface::new(master_addr) + .chain_err(|| "Error creating filesystem master interface.")?); data_abstraction_layer = Arc::new(DFSAbstractionLayer::new( Arc::clone(&local_file_manager_arc), diff --git a/worker/src/initialization/grpc_server.rs b/worker/src/initialization/grpc_server.rs index 0f50283b..7cfa23ef 100644 --- a/worker/src/initialization/grpc_server.rs +++ b/worker/src/initialization/grpc_server.rs @@ -5,7 +5,7 @@ use clap::ArgMatches; use errors::*; use operations::OperationHandler; -use server::{Server, ScheduleOperationService, IntermediateDataService, FileSystemService}; +use server::{FileSystemService, IntermediateDataService, ScheduleOperationService, Server}; use util::distributed_filesystem::LocalFileManager; // Setting the port to 0 means a random available port will be selected diff --git a/worker/src/initialization/register_worker.rs b/worker/src/initialization/register_worker.rs index 46ae6fb3..424f8d90 100644 --- a/worker/src/initialization/register_worker.rs +++ b/worker/src/initialization/register_worker.rs @@ -1,8 +1,8 @@ -use std::{thread, time}; use std::net::SocketAddr; +use std::{thread, time}; -use errors::*; use communication::MasterInterface; +use errors::*; const WORKER_REGISTRATION_RETRIES: u16 = 5; const WORKER_REGISTRATION_RETRY_WAIT_DURATION_MS: u64 = 1000; diff --git a/worker/src/initialization/state_handler.rs b/worker/src/initialization/state_handler.rs index d289ce9e..8a79c5fb 100644 --- a/worker/src/initialization/state_handler.rs +++ b/worker/src/initialization/state_handler.rs @@ -16,17 +16,17 @@ pub fn initialize_state_handler( let fresh = matches.is_present("fresh"); let should_dump_state = !matches.is_present("nodump"); - let dump_dir = matches.value_of("state-location").unwrap_or( - DEFAULT_DUMP_DIR, - ); + let dump_dir = matches + .value_of("state-location") + .unwrap_or(DEFAULT_DUMP_DIR); let mut state_handler = StateHandler::new(local_file_manager, should_dump_state, dump_dir) .chain_err(|| "Unable to create StateHandler")?; if !fresh && Path::new(&format!("{}/worker.dump", dump_dir)).exists() { - state_handler.load_state().chain_err( - || "Error loading state", - )?; + state_handler + .load_state() + .chain_err(|| "Error loading state")?; } Ok(state_handler) diff --git a/worker/src/initialization/worker_resources.rs 
b/worker/src/initialization/worker_resources.rs index d99a935d..6c3ee913 100644 --- a/worker/src/initialization/worker_resources.rs +++ b/worker/src/initialization/worker_resources.rs @@ -1,13 +1,12 @@ - use std::net::SocketAddr; use std::str::FromStr; use std::sync::Arc; use clap::ArgMatches; +use communication::MasterInterface; use errors::*; use initialization::{get_data_abstraction_layer, initialize_grpc_server, initialize_state_handler}; -use communication::MasterInterface; use operations::OperationHandler; use server::Server; use state::StateHandler; @@ -27,14 +26,11 @@ impl WorkerResources { matches.value_of("master").unwrap_or(DEFAULT_MASTER_ADDR), ).chain_err(|| "Error parsing master address")?; - let master_interface = Arc::new(MasterInterface::new(master_addr).chain_err( - || "Error creating master interface.", - )?); + let master_interface = Arc::new(MasterInterface::new(master_addr).chain_err(|| "Error creating master interface.")?); let (data_abstraction_layer, local_file_manager) = - get_data_abstraction_layer(master_addr, matches).chain_err( - || "Error creating data abstraction layer.", - )?; + get_data_abstraction_layer(master_addr, matches) + .chain_err(|| "Error creating data abstraction layer.")?; let operation_handler = Arc::new(OperationHandler::new( Arc::clone(&master_interface), diff --git a/worker/src/main.rs b/worker/src/main.rs index a0c81bb2..140ba5c2 100644 --- a/worker/src/main.rs +++ b/worker/src/main.rs @@ -61,9 +61,8 @@ fn run() -> Result<()> { let matches = parser::parse_command_line(); - let mut resources = WorkerResources::new(&matches).chain_err( - || "Error initializing worker resources", - )?; + let mut resources = + WorkerResources::new(&matches).chain_err(|| "Error initializing worker resources")?; let local_ip_addr = matches.value_of("ip").unwrap_or(DEFAULT_WORKER_IP); let local_addr = SocketAddr::from_str(&format!( diff --git a/worker/src/main_loop.rs b/worker/src/main_loop.rs index 1af451f6..fb6cd205 100644 --- a/worker/src/main_loop.rs +++ b/worker/src/main_loop.rs @@ -1,5 +1,5 @@ -use std::{thread, time}; use std::net::SocketAddr; +use std::{thread, time}; use errors::*; use initialization::{register_worker, WorkerResources}; @@ -45,9 +45,10 @@ pub fn run_main_loop(mut resources: WorkerResources, local_addr: SocketAddr) -> if resources.state_handler.get_should_dump_state() { iterations_since_state_dump += 1; if iterations_since_state_dump * MAIN_LOOP_SLEEP_MS >= DUMP_LOOP_MS { - resources.state_handler.dump_state().chain_err( - || "Unable to dump state", - )?; + resources + .state_handler + .dump_state() + .chain_err(|| "Unable to dump state")?; iterations_since_state_dump = 0 } } diff --git a/worker/src/operations/combine.rs b/worker/src/operations/combine.rs index 214d8038..e9c5666a 100644 --- a/worker/src/operations/combine.rs +++ b/worker/src/operations/combine.rs @@ -4,8 +4,8 @@ use std::process::{Command, Stdio}; use serde_json; -use errors::*; use super::operation_handler::{OperationResources, PartitionMap}; +use errors::*; #[derive(Serialize)] struct CombineInput { @@ -33,9 +33,8 @@ fn do_combine_operation( resources: &OperationResources, combine_input: &[CombineInput], ) -> Result { - let combine_input_str = serde_json::to_string(&combine_input).chain_err( - || "Error seralizing combine operation input.", - )?; + let combine_input_str = serde_json::to_string(&combine_input) + .chain_err(|| "Error seralizing combine operation input.")?; let absolute_binary_path = resources .data_abstraction_layer @@ -50,34 +49,29 @@ fn 
do_combine_operation( .chain_err(|| "Failed to start combine operation process.")?; if let Some(stdin) = child.stdin.as_mut() { - stdin.write_all(combine_input_str.as_bytes()).chain_err( - || "Error writing to payload stdin.", - )?; + stdin + .write_all(combine_input_str.as_bytes()) + .chain_err(|| "Error writing to payload stdin.")?; } else { return Err("Error accessing stdin of payload binary.".into()); } - let output = child.wait_with_output().chain_err( - || "Error waiting for payload result.", - )?; + let output = child + .wait_with_output() + .chain_err(|| "Error waiting for payload result.")?; - let output_str = String::from_utf8(output.stdout).chain_err( - || "Error accessing payload output.", - )?; + let output_str = + String::from_utf8(output.stdout).chain_err(|| "Error accessing payload output.")?; - let stderr_str = String::from_utf8(output.stderr).chain_err( - || "Error accessing payload output.", - )?; + let stderr_str = + String::from_utf8(output.stderr).chain_err(|| "Error accessing payload output.")?; if !stderr_str.is_empty() { - return Err( - format!("MapReduce binary failed with stderr:\n {}", stderr_str).into(), - ); + return Err(format!("MapReduce binary failed with stderr:\n {}", stderr_str).into()); } - let combine_results: serde_json::Value = serde_json::from_str(&output_str).chain_err( - || "Error parsing combine results.", - )?; + let combine_results: serde_json::Value = + serde_json::from_str(&output_str).chain_err(|| "Error parsing combine results.")?; Ok(combine_results) } @@ -105,16 +99,15 @@ fn run_combine(resources: &OperationResources, partition_map: &mut PartitionMap) continue; } - let results = do_combine_operation(resources, &combine_inputs).chain_err( - || "Failed to run combine operation.", - )?; + let results = do_combine_operation(resources, &combine_inputs) + .chain_err(|| "Failed to run combine operation.")?; // Use results of combine operations. 
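do_combine_operation above uses the same pipe-through-child pattern as the map and reduce runners: serialize the input, write it to the payload binary's stdin, treat any stderr output as fatal, and parse stdout as JSON. A self-contained sketch of just the process plumbing, with `cat` standing in for the payload binary and plain String errors in place of error-chain:

    use std::io::Write;
    use std::process::{Command, Stdio};

    fn pipe_through(binary: &str, input: &str) -> Result<String, String> {
        let mut child = Command::new(binary)
            .stdin(Stdio::piped())
            .stdout(Stdio::piped())
            .stderr(Stdio::piped())
            .spawn()
            .map_err(|e| format!("failed to start payload: {}", e))?;

        child
            .stdin
            .as_mut()
            .ok_or("error accessing stdin of payload binary")?
            .write_all(input.as_bytes())
            .map_err(|e| e.to_string())?;

        let output = child.wait_with_output().map_err(|e| e.to_string())?;
        let stderr = String::from_utf8_lossy(&output.stderr);
        if !stderr.is_empty() {
            // Mirrors the convention above: any stderr output is an error.
            return Err(format!("payload failed with stderr:\n{}", stderr));
        }
        String::from_utf8(output.stdout).map_err(|e| e.to_string())
    }

    fn main() {
        let out = pipe_through("cat", r#"[{"key":"k","values":[1,2]}]"#).unwrap();
        println!("{}", out);
    }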
if let serde_json::Value::Array(results) = results { for (i, result) in results.iter().enumerate() { - let values = kv_map.get_mut(&combine_keys[i]).chain_err( - || "Error running combine", - )?; + let values = kv_map + .get_mut(&combine_keys[i]) + .chain_err(|| "Error running combine")?; values.clear(); @@ -139,9 +132,8 @@ pub fn optional_run_combine( resources: &OperationResources, partition_map: &mut PartitionMap, ) -> Result<()> { - let has_combine = check_has_combine(resources).chain_err( - || "Error running has-combine command.", - )?; + let has_combine = + check_has_combine(resources).chain_err(|| "Error running has-combine command.")?; if has_combine { return run_combine(resources, partition_map); diff --git a/worker/src/operations/io.rs b/worker/src/operations/io.rs index 2a4946cf..174e7aca 100644 --- a/worker/src/operations/io.rs +++ b/worker/src/operations/io.rs @@ -24,18 +24,16 @@ pub fn read_location( .read_file_location(path, input_location.start_byte, input_location.end_byte) .chain_err(|| "Error reading file location.")?; - let value = str::from_utf8(&buffer).chain_err(|| { - format!("Invalid string in file {}", input_location.get_input_path()) - })?; + let value = str::from_utf8(&buffer) + .chain_err(|| format!("Invalid string in file {}", input_location.get_input_path()))?; Ok(value.to_owned()) } pub fn read_local>(path: P) -> Result { debug!("Attempting to read local file: {:?}", path.as_ref()); - let file = File::open(&path).chain_err(|| { - format!("unable to open file {}", path.as_ref().to_string_lossy()) - })?; + let file = File::open(&path) + .chain_err(|| format!("unable to open file {}", path.as_ref().to_string_lossy()))?; let mut buf_reader = BufReader::new(file); let mut value = String::new(); @@ -57,9 +55,7 @@ pub fn write>( ) -> Result<()> { data_abstraction_layer_arc .write_file(path.as_ref(), data) - .chain_err(|| { - format!("Unable to write file {}", path.as_ref().to_string_lossy()) - })?; + .chain_err(|| format!("Unable to write file {}", path.as_ref().to_string_lossy()))?; Ok(()) } @@ -67,9 +63,8 @@ pub fn write>( #[cfg_attr(test, mockable)] pub fn write_local>(path: P, data: &[u8]) -> Result<()> { debug!("Attempting to write to local file: {:?}", path.as_ref()); - let mut file = File::create(&path).chain_err(|| { - format!("unable to create file {}", path.as_ref().to_string_lossy()) - })?; + let mut file = File::create(&path) + .chain_err(|| format!("unable to create file {}", path.as_ref().to_string_lossy()))?; file.write_all(data).chain_err(|| { format!( "unable to write content to {}", diff --git a/worker/src/operations/map.rs b/worker/src/operations/map.rs index 333dcc2b..27e8984f 100644 --- a/worker/src/operations/map.rs +++ b/worker/src/operations/map.rs @@ -7,20 +7,20 @@ use std::sync::{Arc, Mutex}; use std::thread; use bson; -use futures::Future; use futures::future; -use futures_cpupool::{CpuPool, CpuFuture}; +use futures::Future; +use futures_cpupool::{CpuFuture, CpuPool}; use serde_json; use uuid::Uuid; -use errors::*; -use cerberus_proto::worker as pb; -use communication::MasterInterface; use super::combine; use super::io; use super::operation_handler; use super::operation_handler::{OperationResources, PartitionMap}; use super::state::OperationState; +use cerberus_proto::worker as pb; +use communication::MasterInterface; +use errors::*; use util::output_error; const WORKER_OUTPUT_DIRECTORY: &str = "/tmp/cerberus/"; @@ -52,9 +52,8 @@ fn send_map_result( } fn parse_map_results(map_result_string: &str, partition_map: &mut PartitionMap) -> 
Result<()> { - let parse_value: serde_json::Value = serde_json::from_str(map_result_string).chain_err( - || "Error parsing map response.", - )?; + let parse_value: serde_json::Value = + serde_json::from_str(map_result_string).chain_err(|| "Error parsing map response.")?; let partition_map_object = match parse_value["partitions"].as_object() { None => return Err("Error parsing partition map.".into()), @@ -62,9 +61,10 @@ fn parse_map_results(map_result_string: &str, partition_map: &mut PartitionMap) }; for (partition_str, pairs) in partition_map_object.iter() { - let partition: u64 = partition_str.to_owned().parse().chain_err( - || "Error parsing map response.", - )?; + let partition: u64 = partition_str + .to_owned() + .parse() + .chain_err(|| "Error parsing map response.")?; let partition_hashmap = partition_map.entry(partition).or_insert_with(HashMap::new); if let serde_json::Value::Array(ref pairs) = *pairs { for pair in pairs { @@ -84,29 +84,25 @@ fn map_operation_thread_impl(map_input_value: &bson::Document, mut child: Child) .chain_err(|| "Could not encode map_input as BSON.")?; if let Some(stdin) = child.stdin.as_mut() { - stdin.write_all(&input_buf[..]).chain_err( - || "Error writing to payload stdin.", - )?; + stdin + .write_all(&input_buf[..]) + .chain_err(|| "Error writing to payload stdin.")?; } else { return Err("Error accessing stdin of payload binary.".into()); } - let output = child.wait_with_output().chain_err( - || "Error waiting for payload result.", - )?; + let output = child + .wait_with_output() + .chain_err(|| "Error waiting for payload result.")?; - let output_str = String::from_utf8(output.stdout).chain_err( - || "Error accessing payload output.", - )?; + let output_str = + String::from_utf8(output.stdout).chain_err(|| "Error accessing payload output.")?; - let stderr_str = String::from_utf8(output.stderr).chain_err( - || "Error accessing payload output.", - )?; + let stderr_str = + String::from_utf8(output.stderr).chain_err(|| "Error accessing payload output.")?; if !stderr_str.is_empty() { - return Err( - format!("MapReduce binary failed with stderr:\n {}", stderr_str).into(), - ); + return Err(format!("MapReduce binary failed with stderr:\n {}", stderr_str).into()); } Ok(output_str) @@ -237,9 +233,7 @@ fn run_map_input( info!( "Running map task for {} ({} - > {})", - input_location.input_path, - input_location.start_byte, - input_location.end_byte + input_location.input_path, input_location.start_byte, input_location.end_byte ); let map_input_value = io::read_location(&resources.data_abstraction_layer, input_location) @@ -263,9 +257,8 @@ fn run_map_input( value: map_input_value, }; - let serialized_map_input = bson::to_bson(&map_input).chain_err( - || "Could not serialize map input to bson.", - )?; + let serialized_map_input = + bson::to_bson(&map_input).chain_err(|| "Could not serialize map input to bson.")?; let map_input_document; if let bson::Bson::Document(document) = serialized_map_input { @@ -328,9 +321,7 @@ fn internal_perform_map( output_path.push(output_dir_uuid); output_path.push("map"); - fs::create_dir_all(&output_path).chain_err( - || "Failed to create output directory", - )?; + fs::create_dir_all(&output_path).chain_err(|| "Failed to create output directory")?; let input_locations = map_options.get_input().get_input_locations(); let initial_cpu_time; @@ -362,7 +353,6 @@ fn internal_perform_map( future::err::("Running map input failed".into()) } } - }); map_result_futures.push(map_result_future); diff --git a/worker/src/operations/mod.rs 
b/worker/src/operations/mod.rs index 6f44033b..072c6c74 100644 --- a/worker/src/operations/mod.rs +++ b/worker/src/operations/mod.rs @@ -1,9 +1,9 @@ mod combine; pub mod io; mod map; +pub mod operation_handler; mod reduce; mod state; -pub mod operation_handler; pub use self::operation_handler::OperationHandler; pub use self::operation_handler::OperationResources; diff --git a/worker/src/operations/operation_handler.rs b/worker/src/operations/operation_handler.rs index 55e72cbb..c296591b 100644 --- a/worker/src/operations/operation_handler.rs +++ b/worker/src/operations/operation_handler.rs @@ -8,13 +8,13 @@ use procinfo::pid::stat_self; use serde_json; use uuid::Uuid; -use cerberus_proto::worker as pb; -use errors::*; -use communication::MasterInterface; -use util::data_layer::AbstractionLayer; use super::map; use super::reduce; use super::state::OperationState; +use cerberus_proto::worker as pb; +use communication::MasterInterface; +use errors::*; +use util::data_layer::AbstractionLayer; pub type PartitionMap = HashMap>>; @@ -164,9 +164,7 @@ impl OperationHandler { let worker_status = self.get_worker_status(); let operation_status = self.get_worker_operation_status(); - self.master_interface.update_worker_status( - worker_status, - operation_status, - ) + self.master_interface + .update_worker_status(worker_status, operation_status) } } diff --git a/worker/src/operations/reduce.rs b/worker/src/operations/reduce.rs index 8090de01..3599d89f 100644 --- a/worker/src/operations/reduce.rs +++ b/worker/src/operations/reduce.rs @@ -7,15 +7,15 @@ use std::thread; use serde_json; -use errors::*; -use communication; -use communication::MasterInterface; use super::io; use super::operation_handler; use super::operation_handler::OperationResources; use super::state::OperationState; -use util::output_error; +use communication; +use communication::MasterInterface; +use errors::*; use util::data_layer::AbstractionLayer; +use util::output_error; use cerberus_proto::worker as pb; @@ -62,27 +62,22 @@ fn run_reducer( return Err("Error accessing stdin of payload binary.".into()); } - let output = child.wait_with_output().chain_err( - || "Error waiting for payload result.", - )?; + let output = child + .wait_with_output() + .chain_err(|| "Error waiting for payload result.")?; - let output_str = String::from_utf8(output.stdout).chain_err( - || "Error accessing payload output.", - )?; + let output_str = + String::from_utf8(output.stdout).chain_err(|| "Error accessing payload output.")?; - let stderr_str = String::from_utf8(output.stderr).chain_err( - || "Error accessing payload output.", - )?; + let stderr_str = + String::from_utf8(output.stderr).chain_err(|| "Error accessing payload output.")?; if !stderr_str.is_empty() { - return Err( - format!("MapReduce binary failed with stderr:\n {}", stderr_str).into(), - ); + return Err(format!("MapReduce binary failed with stderr:\n {}", stderr_str).into()); } - let reduce_output: serde_json::Value = serde_json::from_str(&output_str).chain_err( - || "Error parsing reduce results.", - )?; + let reduce_output: serde_json::Value = + serde_json::from_str(&output_str).chain_err(|| "Error parsing reduce results.")?; let mut reduce_results = Vec::new(); if let serde_json::Value::Array(ref reduce_outputs) = reduce_output { @@ -142,13 +137,12 @@ fn create_reduce_input( ).chain_err(|| "Error fetching reduce inputs")?; for reduce_input in reduce_inputs { - let parsed_value: serde_json::Value = serde_json::from_str(&reduce_input).chain_err( - || "Error parsing reduce input", - )?; + 
let parsed_value: serde_json::Value = + serde_json::from_str(&reduce_input).chain_err(|| "Error parsing reduce input")?; - let parsed_object = parsed_value.as_object().chain_err( - || "Error parsing reduce input", - )?; + let parsed_object = parsed_value + .as_object() + .chain_err(|| "Error parsing reduce input")?; for (key, values) in parsed_object.iter() { let key = key.to_string(); @@ -165,9 +159,8 @@ fn create_reduce_input( let mut reduce_operations: Vec = Vec::new(); for (intermediate_key, reduce_array) in reduce_map { - let key_value: serde_json::Value = serde_json::from_str(&intermediate_key).chain_err( - || "Error parsing intermediate_key", - )?; + let key_value: serde_json::Value = + serde_json::from_str(&intermediate_key).chain_err(|| "Error parsing intermediate_key")?; let reduce_operation = ReduceInput { key: key_value, @@ -279,9 +272,8 @@ fn write_reduce_output( .create_dir_all(Path::new(&reduce_options.output_directory)) .chain_err(|| "Failed to create output directory")?; - let reduce_results_pretty: String = serde_json::to_string_pretty(&reduce_results).chain_err( - || "Error prettifying reduce results", - )?; + let reduce_results_pretty: String = serde_json::to_string_pretty(&reduce_results) + .chain_err(|| "Error prettifying reduce results")?; let mut file_path = PathBuf::new(); file_path.push(reduce_options.output_directory.clone()); @@ -311,8 +303,7 @@ fn run_reduce( if operation_handler::check_task_cancelled( &resources.operation_state, &reduce_options.task_id, - ) - { + ) { return; } diff --git a/worker/src/operations/state.rs b/worker/src/operations/state.rs index bc1e85d3..2b289f2f 100644 --- a/worker/src/operations/state.rs +++ b/worker/src/operations/state.rs @@ -1,4 +1,3 @@ - use cerberus_proto::worker as pb; /// `OperationState` is a data only struct for holding the current state for the `OperationHandler` diff --git a/worker/src/parser.rs b/worker/src/parser.rs index c28e648e..2c5c1a5f 100644 --- a/worker/src/parser.rs +++ b/worker/src/parser.rs @@ -25,9 +25,7 @@ pub fn parse_command_line<'a>() -> ArgMatches<'a> { Arg::with_name("ip") .long("ip") .short("i") - .help( - "Set the IP address that can be used to communicate with this worker", - ) + .help("Set the IP address that can be used to communicate with this worker") .takes_value(true) .required(false), ) @@ -42,9 +40,7 @@ pub fn parse_command_line<'a>() -> ArgMatches<'a> { .arg( Arg::with_name("dfs") .long("dfs") - .help( - "Makes the worker run using the distributed file system for data access.", - ) + .help("Makes the worker run using the distributed file system for data access.") .takes_value(false) .required(false), ) diff --git a/worker/src/server/filesystem_service.rs b/worker/src/server/filesystem_service.rs index 54008105..19016345 100644 --- a/worker/src/server/filesystem_service.rs +++ b/worker/src/server/filesystem_service.rs @@ -1,13 +1,14 @@ use std::sync::Arc; -use grpc::{SingleResponse, Error, RequestOptions}; +use grpc::{Error, RequestOptions, SingleResponse}; use cerberus_proto::filesystem as pb; use cerberus_proto::filesystem_grpc as grpc_pb; use util::distributed_filesystem::LocalFileManager; use util::output_error; -const NOT_DISTRIBUTED_FILESYSTEM: &str = "Worker is not running in distributed filesytem configuration"; +const NOT_DISTRIBUTED_FILESYSTEM: &str = + "Worker is not running in distributed filesytem configuration"; const STORE_FILE_ERROR: &str = "Error processing store file request"; const READ_FILE_ERROR: &str = "Error processing read file request"; @@ -39,11 +40,8 @@ impl 
grpc_pb::FileSystemWorkerService for FileSystemService { } }; - if let Err(err) = local_file_manager.store_file_chunk( - &req.file_path, - req.start_byte, - &req.data, - ) + if let Err(err) = + local_file_manager.store_file_chunk(&req.file_path, req.start_byte, &req.data) { output_error(&err.chain_err(|| "Error processing store file request.")); return SingleResponse::err(Error::Other(STORE_FILE_ERROR)); diff --git a/worker/src/server/intermediate_data_service.rs b/worker/src/server/intermediate_data_service.rs index fc988262..0775bb80 100644 --- a/worker/src/server/intermediate_data_service.rs +++ b/worker/src/server/intermediate_data_service.rs @@ -2,9 +2,9 @@ use grpc::{Error, RequestOptions, SingleResponse}; use operations::io; -use util; use cerberus_proto::worker as pb; use cerberus_proto::worker_grpc as grpc_pb; +use util; const DATA_NOT_AVAILABLE: &str = "Data not available"; diff --git a/worker/src/server/master_service.rs b/worker/src/server/master_service.rs index df65a813..9e1c05ca 100644 --- a/worker/src/server/master_service.rs +++ b/worker/src/server/master_service.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use futures::Future; -use grpc::{RequestOptions, SingleResponse, Error}; +use grpc::{Error, RequestOptions, SingleResponse}; use cerberus_proto::worker as pb; use cerberus_proto::worker_grpc as grpc_pb; diff --git a/worker/src/server/mod.rs b/worker/src/server/mod.rs index 1a4c0005..6f482a2a 100644 --- a/worker/src/server/mod.rs +++ b/worker/src/server/mod.rs @@ -1,21 +1,21 @@ +/// `filesystem_service` is responsible for handling requests related to the distributed file +/// system. +mod filesystem_service; /// `intermediate_data_service` is responsible for handing traffic coming from other workers /// requesting intermediate data created by the map task. mod intermediate_data_service; /// `master_service` is responsible for handing data incoming from the master. mod master_service; -/// `filesystem_service` is responsible for handling requests related to the distributed file -/// system. 
-mod filesystem_service; +pub use self::filesystem_service::FileSystemService; pub use self::intermediate_data_service::IntermediateDataService; pub use self::master_service::ScheduleOperationService; -pub use self::filesystem_service::FileSystemService; -use std::net::SocketAddr; -use grpc; -use cerberus_proto::worker_grpc; use cerberus_proto::filesystem_grpc; +use cerberus_proto::worker_grpc; use errors::*; +use grpc; +use std::net::SocketAddr; const GRPC_THREAD_POOL_SIZE: usize = 10; @@ -32,34 +32,28 @@ impl Server { ) -> Result { let mut server_builder = grpc::ServerBuilder::new_plain(); server_builder.http.set_port(port); - server_builder.http.set_cpu_pool_threads( - GRPC_THREAD_POOL_SIZE, - ); + server_builder + .http + .set_cpu_pool_threads(GRPC_THREAD_POOL_SIZE); // Register the ScheduleOperationService server_builder.add_service( - worker_grpc::ScheduleOperationServiceServer::new_service_def( - scheduler_service, - ), + worker_grpc::ScheduleOperationServiceServer::new_service_def(scheduler_service), ); // Register IntermediateDataService - server_builder.add_service( - worker_grpc::IntermediateDataServiceServer::new_service_def( - interm_data_service, - ), - ); + server_builder.add_service(worker_grpc::IntermediateDataServiceServer::new_service_def( + interm_data_service, + )); // Register FileSystemService server_builder.add_service( - filesystem_grpc::FileSystemWorkerServiceServer::new_service_def( - filesystem_service, - ), + filesystem_grpc::FileSystemWorkerServiceServer::new_service_def(filesystem_service), ); Ok(Server { - server: server_builder.build().chain_err( - || "Error building gRPC server", - )?, + server: server_builder + .build() + .chain_err(|| "Error building gRPC server")?, }) } diff --git a/worker/src/state/handler.rs b/worker/src/state/handler.rs index b410ed1c..8127f40f 100644 --- a/worker/src/state/handler.rs +++ b/worker/src/state/handler.rs @@ -1,7 +1,7 @@ -use std::sync::Arc; -use std::fs::File; use std::fs; +use std::fs::File; use std::io::{Read, Write}; +use std::sync::Arc; use serde_json; use serde_json::Value as json; @@ -25,9 +25,7 @@ impl StateHandler { dir: &str, ) -> Result { if should_dump_state { - fs::create_dir_all(dir).chain_err(|| { - format!("Unable to create dir: {}", dir) - })?; + fs::create_dir_all(dir).chain_err(|| format!("Unable to create dir: {}", dir))?; } Ok(StateHandler { @@ -54,11 +52,9 @@ impl StateHandler { pub fn dump_state(&self) -> Result<()> { // Get the filesystem manager state as JSON. let local_file_manager_json = match self.local_file_manager { - Some(ref local_file_manager) => { - local_file_manager.dump_state().chain_err( - || "Unable to dump LocalFileManager state", - )? - } + Some(ref local_file_manager) => local_file_manager + .dump_state() + .chain_err(|| "Unable to dump LocalFileManager state")?, None => json!(null), }; @@ -70,9 +66,8 @@ impl StateHandler { // Write the state to file. 
let mut file = File::create(format!("{}/workertemp.dump", self.dump_dir)) .chain_err(|| "Unable to create file")?; - file.write_all(json.to_string().as_bytes()).chain_err( - || "Unable to write data", - )?; + file.write_all(json.to_string().as_bytes()) + .chain_err(|| "Unable to write data")?; fs::rename( format!("{}/workertemp.dump", self.dump_dir), @@ -87,13 +82,11 @@ impl StateHandler { let mut file = File::open(format!("{}/worker.dump", self.dump_dir)) .chain_err(|| "Unable to open file")?; let mut data = String::new(); - file.read_to_string(&mut data).chain_err( - || "Unable to read from state file", - )?; + file.read_to_string(&mut data) + .chain_err(|| "Unable to read from state file")?; - let json: serde_json::Value = serde_json::from_str(&data).chain_err( - || "Unable to parse string as JSON", - )?; + let json: serde_json::Value = + serde_json::from_str(&data).chain_err(|| "Unable to parse string as JSON")?; // Reset the local file manager state from json. let local_file_manager_json = json["local_file_manager"].clone(); From 26b6da1e0bfd59f0f59f2a89c95621fbf5da4a8f Mon Sep 17 00:00:00 2001 From: Darragh Griffin Date: Fri, 13 Apr 2018 22:02:59 +0100 Subject: [PATCH 28/58] Overschedule slow tasks that might be stalled --- master/src/common/job.rs | 12 ++ master/src/common/task.rs | 28 +++++ master/src/main.rs | 10 +- master/src/scheduling/mod.rs | 10 +- master/src/scheduling/scheduler.rs | 105 +++++++++++++++++- master/src/scheduling/state.rs | 26 ++++- master/src/worker_management/state.rs | 18 ++- .../src/worker_management/worker_manager.rs | 5 + 8 files changed, 192 insertions(+), 22 deletions(-) diff --git a/master/src/common/job.rs b/master/src/common/job.rs index 0c72435d..e648d888 100644 --- a/master/src/common/job.rs +++ b/master/src/common/job.rs @@ -66,9 +66,11 @@ pub struct Job { pub map_tasks_completed: u32, pub map_tasks_total: u32, + pub map_tasks_seconds_taken: u32, pub reduce_tasks_completed: u32, pub reduce_tasks_total: u32, + pub reduce_tasks_seconds_taken: u32, pub time_requested: DateTime, pub time_started: Option>, @@ -125,9 +127,11 @@ impl Job { map_tasks_completed: 0, map_tasks_total: 0, + map_tasks_seconds_taken: 0, reduce_tasks_completed: 0, reduce_tasks_total: 0, + reduce_tasks_seconds_taken: 0, time_requested: Utc::now(), time_started: None, @@ -281,9 +285,11 @@ impl StateHandling for Job { "map_tasks_completed": self.map_tasks_completed, "map_tasks_total": self.map_tasks_total, + "map_tasks_seconds_taken": self.map_tasks_seconds_taken, "reduce_tasks_completed": self.reduce_tasks_completed, "reduce_tasks_total": self.reduce_tasks_total, + "reduce_tasks_seconds_taken": self.reduce_tasks_seconds_taken, "time_requested": self.time_requested.timestamp(), "time_started": time_started, @@ -304,12 +310,18 @@ impl StateHandling for Job { .chain_err(|| "Unable to convert map_tasks_complete")?; self.map_tasks_total = serde_json::from_value(data["map_tasks_total"].clone()) .chain_err(|| "Unable to convert map_tasks_total")?; + self.map_tasks_seconds_taken = + serde_json::from_value(data["map_tasks_seconds_taken"].clone()) + .chain_err(|| "Unable to convert map_tasks_seconds_taken")?; self.reduce_tasks_completed = serde_json::from_value( data["reduce_tasks_completed"].clone(), ).chain_err(|| "Unable to convert reduce_tasks_complete")?; self.reduce_tasks_total = serde_json::from_value(data["reduce_tasks_total"].clone()) .chain_err(|| "Unable to convert reduce_tasks_total")?; + self.reduce_tasks_seconds_taken = + 
serde_json::from_value(data["reduce_tasks_seconds_taken"].clone()) + .chain_err(|| "Unable to convert reduce_tasks_seconds_taken")?; let time_requested: i64 = serde_json::from_value(data["time_requested"].clone()) .chain_err(|| "Unable to convert time_requested")?; diff --git a/master/src/common/task.rs b/master/src/common/task.rs index 1b967d96..2d7d2e15 100644 --- a/master/src/common/task.rs +++ b/master/src/common/task.rs @@ -64,6 +64,9 @@ pub struct Task { pub time_started: Option>, pub time_completed: Option>, + + // Used by the scheduler to manage which tasks have been requeued + pub requeued: bool, } impl Task { @@ -106,6 +109,8 @@ impl Task { time_started: None, time_completed: None, + + requeued: false, } } @@ -118,6 +123,7 @@ impl Task { self.time_started = None; self.time_completed = None; self.has_completed_before = true; + self.requeued = false; } fn new_map_task_from_json(data: serde_json::Value) -> Result { @@ -198,6 +204,8 @@ impl Task { time_started: None, time_completed: None, + + requeued: false, } } @@ -232,6 +240,22 @@ impl Task { Ok(task) } + + pub fn get_seconds_running(&self) -> Result { + let time_started = self.time_started + .chain_err(|| "Time started is expected to exist.")?; + + let time_now = Utc::now(); + + let seconds_running: u32 = (time_now.timestamp() - time_started.timestamp()) as u32; + + // Round up to at least one second + if seconds_running == 0 { + Ok(1) + } else { + Ok(seconds_running) + } + } } impl StateHandling for Task { @@ -300,6 +324,7 @@ impl StateHandling for Task { "time_started": time_started, "time_completed": time_completed, "job_priority": self.job_priority, + "requeued": self.requeued, })) } @@ -354,6 +379,9 @@ impl StateHandling for Task { )), }; + self.requeued = serde_json::from_value(data["requeued"].clone()) + .chain_err(|| "Unable to convert requeued")?; + Ok(()) } } diff --git a/master/src/main.rs b/master/src/main.rs index 1d5259f4..9d17cc4b 100644 --- a/master/src/main.rs +++ b/master/src/main.rs @@ -51,9 +51,7 @@ use std::sync::Arc; use errors::*; use initialization::MasterResources; -use scheduling::run_task_update_loop; use util::init_logger; -use worker_management::{run_health_check_loop, run_task_assigment_loop}; fn run() -> Result<()> { println!("Cerberus Master!"); @@ -63,13 +61,13 @@ fn run() -> Result<()> { let resources = MasterResources::new(&matches).chain_err(|| "Error initilizing master")?; // Startup worker management loops - run_task_assigment_loop(Arc::clone(&resources.worker_manager)); - run_health_check_loop(Arc::clone(&resources.worker_manager)); + worker_management::run_task_assigment_loop(Arc::clone(&resources.worker_manager)); + worker_management::run_health_check_loop(Arc::clone(&resources.worker_manager)); // Startup scheduler loop - run_task_update_loop( + scheduling::run_scheduler_loop( Arc::clone(&resources.scheduler), - &Arc::clone(&resources.worker_manager), + Arc::clone(&resources.worker_manager), ); main_loop::run_main_loop(resources) diff --git a/master/src/scheduling/mod.rs b/master/src/scheduling/mod.rs index 135f0471..88807497 100644 --- a/master/src/scheduling/mod.rs +++ b/master/src/scheduling/mod.rs @@ -1,8 +1,8 @@ -pub use self::scheduler::run_task_update_loop; -pub use self::scheduler::Scheduler; -pub use self::task_processor::TaskProcessor; -pub use self::task_processor::TaskProcessorImpl; - mod scheduler; mod state; mod task_processor; + +pub use self::scheduler::run_scheduler_loop; +pub use self::scheduler::Scheduler; +pub use self::task_processor::TaskProcessor; +pub use 
self::task_processor::TaskProcessorImpl; diff --git a/master/src/scheduling/scheduler.rs b/master/src/scheduling/scheduler.rs index a7876f21..0cb90ab9 100644 --- a/master/src/scheduling/scheduler.rs +++ b/master/src/scheduling/scheduler.rs @@ -1,11 +1,11 @@ use std::collections::HashMap; use std::sync::{Arc, Mutex}; -use std::thread; +use std::{thread, time}; use serde_json; use cerberus_proto::mapreduce as pb; -use common::{Job, Task, TaskStatus}; +use common::{Job, Task, TaskStatus, TaskType}; use errors::*; use scheduling::state::{ScheduledJob, State}; use scheduling::task_processor::TaskProcessor; @@ -13,6 +13,10 @@ use util::output_error; use util::state::{SimpleStateHandling, StateHandling}; use worker_management::WorkerManager; +const SCHEDULER_LOOP_SLEEP_MS: u64 = 2000; +const SLOW_TASK_COMPLETION_MINIMUM_PERCENTAGE: u32 = 60; +const SLOW_TASK_COMPLETION_MULTIPLIER: u32 = 3; + /// The `Scheduler` is responsible for the managing of `Job`s and `Task`s. /// /// It delegates work to several child modules, and to the @@ -311,6 +315,79 @@ impl Scheduler { Ok(json!(results_vec)) } + + fn get_slow_tasks( + &self, + scheduled_job: &mut ScheduledJob, + task_type: TaskType, + ) -> Result> { + let job = &mut scheduled_job.job; + + let average_completion_time = { + match task_type { + TaskType::Map => job.map_tasks_seconds_taken / job.map_tasks_completed, + TaskType::Reduce => job.reduce_tasks_seconds_taken / job.reduce_tasks_completed, + } + }; + + let mut slow_tasks = Vec::new(); + + for task in scheduled_job.tasks.values_mut() { + if task.task_type == task_type { + if task.status == TaskStatus::InProgress && !task.requeued { + let seconds_running = task.get_seconds_running() + .chain_err(|| "Error getting seconds since task started running")?; + + if seconds_running > (average_completion_time * SLOW_TASK_COMPLETION_MULTIPLIER) + { + slow_tasks.push(task.id.clone()); + task.requeued = true; + } + } + } + } + + Ok(slow_tasks) + } + + // Returns a vector of task_ids for tasks that have been running for more than + // SLOW_TASK_COMPLETION_MULTIPLIER times the average time for tasks of that job. 
+    pub fn get_slow_running_tasks(&self) -> Result<Vec<String>> {
+        let mut state = self.state.lock().unwrap();
+        let mut scheduled_jobs = state.get_in_progress_jobs_mut();
+
+        let mut slow_tasks = Vec::new();
+
+        for mut job in scheduled_jobs.iter_mut() {
+            if job.job.map_tasks_completed != job.job.map_tasks_total {
+                let completed_percentage =
+                    (job.job.map_tasks_completed * 100) / job.job.map_tasks_total;
+
+                if completed_percentage > SLOW_TASK_COMPLETION_MINIMUM_PERCENTAGE {
+                    let slow_map_tasks = self.get_slow_tasks(&mut job, TaskType::Map).chain_err(
+                        || format!("Error getting slow map tasks for job {}", job.job.id),
+                    )?;
+                    slow_tasks.extend(slow_map_tasks);
+                }
+            } else if job.job.reduce_tasks_total != 0
+                && job.job.reduce_tasks_completed != job.job.reduce_tasks_total
+            {
+                let completed_percentage =
+                    (job.job.reduce_tasks_completed * 100) / job.job.reduce_tasks_total;
+
+                if completed_percentage > SLOW_TASK_COMPLETION_MINIMUM_PERCENTAGE {
+                    let slow_reduce_tasks = self.get_slow_tasks(&mut job, TaskType::Reduce)
+                        .chain_err(|| {
+                            format!("Error getting slow reduce tasks for job {}", job.job.id)
+                        })?;
+
+                    slow_tasks.extend(slow_reduce_tasks);
+                }
+            }
+        }
+
+        Ok(slow_tasks)
+    }
 }
 
 impl SimpleStateHandling for Scheduler {
@@ -341,7 +418,7 @@
     }
 }
 
-pub fn run_task_update_loop(scheduler: Arc<Scheduler>, worker_manager: &Arc<WorkerManager>) {
+fn task_update_loop(scheduler: Arc<Scheduler>, worker_manager: &Arc<WorkerManager>) {
     let receiver = worker_manager.get_update_receiver();
     thread::spawn(move || loop {
         let receiver = receiver.lock().unwrap();
@@ -356,3 +433,25 @@
 }
+
+pub fn run_scheduler_loop(scheduler: Arc<Scheduler>, worker_manager: Arc<WorkerManager>) {
+    // First start the task_update_loop
+    task_update_loop(Arc::clone(&scheduler), &worker_manager);
+
+    thread::spawn(move || loop {
+        thread::sleep(time::Duration::from_millis(SCHEDULER_LOOP_SLEEP_MS));
+
+        let slow_tasks_result = scheduler.get_slow_running_tasks();
+        match slow_tasks_result {
+            Ok(slow_tasks) => {
+                for task_id in slow_tasks.iter() {
+                    let requeue_result = worker_manager.requeue_slow_task(task_id);
+                    if let Err(err) = requeue_result {
+                        output_error(&err.chain_err(|| "Error requeuing slow task"));
+                    }
+                }
+            }
+            Err(err) => output_error(&err.chain_err(|| "Error getting slow running tasks")),
+        }
+    });
+}
diff --git a/master/src/scheduling/state.rs b/master/src/scheduling/state.rs
index 5922c4c5..ce74ee7b 100644
--- a/master/src/scheduling/state.rs
+++ b/master/src/scheduling/state.rs
@@ -174,6 +174,17 @@
         jobs
     }
 
+    pub fn get_in_progress_jobs_mut(&mut self) -> Vec<&mut ScheduledJob> {
+        let mut jobs = Vec::new();
+        for scheduled_job in self.scheduled_jobs.values_mut() {
+            if scheduled_job.job.status == pb::Status::IN_PROGRESS {
+                jobs.push(scheduled_job);
+            }
+        }
+
+        jobs
+    }
+
     pub fn get_jobs(&self, client_id: &str) -> Vec<&Job> {
         let mut jobs = Vec::new();
         for scheduled_job in self.scheduled_jobs.values() {
@@ -266,11 +277,14 @@ impl State {
     // Adds the information for a completed task and updates the job.
pub fn add_completed_task(&mut self, task: Task) -> Result<()> { - self.update_job_started( - &task.job_id, - task.time_started - .chain_err(|| "Time started is expected to exist.")?, - ).chain_err(|| "Error adding completed task.")?; + let time_started = task.time_started + .chain_err(|| "Time started is expected to exist.")?; + + let task_running_seconds = task.get_seconds_running() + .chain_err(|| "Error getting seconds since task started running")?; + + self.update_job_started(&task.job_id, time_started) + .chain_err(|| "Error adding completed task.")?; let scheduled_job = match self.scheduled_jobs.get_mut(&task.job_id) { Some(scheduled_job) => scheduled_job, @@ -288,6 +302,7 @@ impl State { TaskType::Map => { if !task.has_completed_before { scheduled_job.job.map_tasks_completed += 1; + scheduled_job.job.map_tasks_seconds_taken += task_running_seconds; } } TaskType::Reduce => { @@ -297,6 +312,7 @@ impl State { { scheduled_job.job.status = pb::Status::DONE; scheduled_job.job.time_completed = Some(Utc::now()); + scheduled_job.job.reduce_tasks_seconds_taken += task_running_seconds; } } } diff --git a/master/src/worker_management/state.rs b/master/src/worker_management/state.rs index 450e442a..c58ad861 100644 --- a/master/src/worker_management/state.rs +++ b/master/src/worker_management/state.rs @@ -356,6 +356,19 @@ impl State { )); } + pub fn requeue_slow_task(&mut self, task_id: &str) -> Result<()> { + let task = self.tasks + .get(task_id) + .chain_err(|| format!("No task found with id {}", task_id))?; + + self.priority_task_queue.push(PriorityTask::new( + task_id.clone().to_string(), + REQUEUED_TASK_PRIORITY * task.job_priority, + )); + + Ok(()) + } + // Unassign a task assigned to a worker and put the task back in the queue. pub fn unassign_worker(&mut self, worker_id: &str) -> Result<()> { let worker = self.workers @@ -382,9 +395,8 @@ impl State { let assigned_task = { if let Some(scheduled_task) = self.tasks.get_mut(task_id) { scheduled_task.status = TaskStatus::InProgress; - if scheduled_task.time_started == None { - scheduled_task.time_started = Some(Utc::now()); - } + scheduled_task.time_started = Some(Utc::now()); + scheduled_task.requeued = false; scheduled_task.assigned_worker_id = worker_id.to_owned(); scheduled_task.clone() diff --git a/master/src/worker_management/worker_manager.rs b/master/src/worker_management/worker_manager.rs index e9a90564..8ef4706c 100644 --- a/master/src/worker_management/worker_manager.rs +++ b/master/src/worker_management/worker_manager.rs @@ -377,6 +377,11 @@ impl WorkerManager { state.get_workers_running_job(job_id) } + pub fn requeue_slow_task(&self, task_id: &str) -> Result<()> { + let mut state = self.state.lock().unwrap(); + state.requeue_slow_task(task_id) + } + pub fn get_workers_info(&self) -> Result { let state = self.state.lock().unwrap(); From bab11795a3c8dd4a9f343a987a4f2f58c2dbe7be Mon Sep 17 00:00:00 2001 From: Darragh Griffin Date: Sat, 14 Apr 2018 21:14:39 +0100 Subject: [PATCH 29/58] Create LICENSE --- LICENSE | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..e08ba73d --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Cerberus Authors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, 
publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. From d01ccfacd74b1409fb69e41a38f53d12910e5872 Mon Sep 17 00:00:00 2001 From: Conor Griffin Date: Sat, 14 Apr 2018 00:15:52 +0100 Subject: [PATCH 30/58] Demo using google ngrams --- libcerberus/examples/distributed-grep.rs | 3 +- libcerberus/examples/end-to-end.rs | 3 +- libcerberus/examples/ngrams.rs | 148 ++++++++++++++++++++++ libcerberus/examples/rating-aggregator.rs | 3 +- libcerberus/examples/rating-by-genre.rs | 3 +- libcerberus/examples/rating-by-year.rs | 3 +- libcerberus/examples/sprint-seven-demo.md | 20 +++ libcerberus/examples/word-counter.rs | 3 +- libcerberus/src/reducer.rs | 6 +- libcerberus/src/runner.rs | 2 +- 10 files changed, 185 insertions(+), 9 deletions(-) create mode 100644 libcerberus/examples/ngrams.rs create mode 100644 libcerberus/examples/sprint-seven-demo.md diff --git a/libcerberus/examples/distributed-grep.rs b/libcerberus/examples/distributed-grep.rs index 2ba4e9ef..5c225f6e 100644 --- a/libcerberus/examples/distributed-grep.rs +++ b/libcerberus/examples/distributed-grep.rs @@ -47,9 +47,10 @@ impl Map for GrepMapper { struct GrepReducer; impl Reduce for GrepReducer { + type Output = String; fn reduce(&self, input: IntermediateInputKV, mut emitter: E) -> Result<()> where - E: EmitFinal, + E: EmitFinal, { for val in input.values { emitter.emit(val).chain_err(|| "Error emitting value.")?; diff --git a/libcerberus/examples/end-to-end.rs b/libcerberus/examples/end-to-end.rs index 5b92c772..c0c6115b 100644 --- a/libcerberus/examples/end-to-end.rs +++ b/libcerberus/examples/end-to-end.rs @@ -21,9 +21,10 @@ impl Map for TestMapper { struct TestReducer; impl Reduce for TestReducer { + type Output = String; fn reduce(&self, input: IntermediateInputKV, mut emitter: E) -> Result<()> where - E: EmitFinal, + E: EmitFinal, { emitter.emit(input.values.iter().fold(String::new(), |acc, x| acc + x))?; Ok(()) diff --git a/libcerberus/examples/ngrams.rs b/libcerberus/examples/ngrams.rs new file mode 100644 index 00000000..51ff54a6 --- /dev/null +++ b/libcerberus/examples/ngrams.rs @@ -0,0 +1,148 @@ +extern crate cerberus; +extern crate env_logger; +#[macro_use] +extern crate error_chain; + +use std::collections::HashSet; + +use cerberus::*; + +const MAP_OUTPUT_PARTITIONS: u64 = 200; + +struct NGramsMapper; +impl Map for NGramsMapper { + type Key = String; + type Value = String; + fn map(&self, input: MapInputKV, mut emitter: E) -> Result<()> + where + E: EmitIntermediate, + { + for line in input.value.lines() { + let info: Vec<&str> = line.split_whitespace().collect(); + + let word1: String = info[0].to_string(); + let word2: String = info[1].to_string(); + + let year: u32 = info[2].parse().chain_err(|| "Error parsing year")?; + + /* + let match_count: u32 = info[3].parse().chain_err(|| "Error parsing year")?; + let 
volumne_count: u32 = info[4].parse().chain_err(|| "Error parsing year")?; + */ + + // Only care about the modern era + if year >= 2000 { + emitter + .emit(word2, word1) + .chain_err(|| "Error emitting map key-value pair.")?; + } + } + Ok(()) + } +} + +fn get_index(c: char) -> u64 { + // Characters (0 -> 9, a -> z) can be converted to a digit. + // 36 different possible values. + match c.to_digit(36) { + Some(d) => u64::from(d), + // Not a character that can be converted to a digit. + None => 37, + } +} + +fn get_place_in_range(x: u64, max_x: u64, start_range: u64, end_range: u64) -> u64 { + if x >= max_x { + return end_range; + } + let range = end_range - start_range; + let new_value = ((x * range) / max_x) + start_range; + return new_value; +} + +struct NGramsPartitioner; +impl Partition for NGramsPartitioner { + fn partition(&self, input: PartitionInputKV) -> Result { + let mut chars = input.key.chars(); + + let first_index = match chars.next() { + Some(c) => get_index(c), + None => 0, + }; + + let second_index = match chars.next() { + Some(c) => get_index(c), + None => 0, + }; + + let start_first_char = get_place_in_range(first_index, 37, 0, MAP_OUTPUT_PARTITIONS); + let end_first_char = get_place_in_range(first_index + 1, 37, 0, MAP_OUTPUT_PARTITIONS); + let partition = get_place_in_range(second_index, 37, start_first_char, end_first_char); + + Ok(partition) + } +} + +struct NGramsCombiner; +impl Combine for NGramsCombiner { + fn combine(&self, input: IntermediateInputKV, mut emitter: E) -> Result<()> + where + E: EmitFinal, + { + let mut words: HashSet = HashSet::new(); + for word in input.values { + words.insert(word); + } + + for word in words { + emitter.emit(word).chain_err(|| "Error emitting value.")?; + } + + Ok(()) + } +} + +struct NGramsReducer; +impl Reduce for NGramsReducer { + type Output = u32; + fn reduce(&self, input: IntermediateInputKV, mut emitter: E) -> Result<()> + where + E: EmitFinal, + { + let mut words: HashSet = HashSet::new(); + for word in input.values { + words.insert(word); + } + + emitter + .emit(words.len() as u32) + .chain_err(|| "Error emitting value.")?; + + Ok(()) + } +} + +fn run() -> Result<()> { + env_logger::init().chain_err(|| "Failed to initialise logging.")?; + + let ng_mapper = NGramsMapper; + let ng_reducer = NGramsReducer; + let ng_partitioner = NGramsPartitioner; + let ng_combiner = NGramsCombiner; + + let matches = cerberus::parse_command_line(); + + let registry = UserImplRegistryBuilder::new() + .mapper(&ng_mapper) + .reducer(&ng_reducer) + .partitioner(&ng_partitioner) + .combiner(&ng_combiner) + .build() + .chain_err(|| "Error building UserImplRegistry.")?; + + cerberus::run(&matches, ®istry) +} + +// Macro to generate a quick error_chain main function. 
+// https://github.com/rust-lang-nursery/error-chain/blob/master/examples/quickstart.rs
+quick_main!(run);
diff --git a/libcerberus/examples/rating-aggregator.rs b/libcerberus/examples/rating-aggregator.rs
index a0577ffe..1860cd6d 100644
--- a/libcerberus/examples/rating-aggregator.rs
+++ b/libcerberus/examples/rating-aggregator.rs
@@ -146,9 +146,10 @@ impl Combine for RatingAggregatorCombiner {
 struct RatingAggregatorReducer;
 impl Reduce<String, String> for RatingAggregatorReducer {
+    type Output = String;
     fn reduce<E>(&self, input: IntermediateInputKV<String, String>, mut emitter: E) -> Result<()>
     where
-        E: EmitFinal<String>,
+        E: EmitFinal<Self::Output>,
     {
         let mut combine_result = do_rating_combine(input)?;
diff --git a/libcerberus/examples/rating-by-genre.rs b/libcerberus/examples/rating-by-genre.rs
index e662b23a..8721d7b5 100644
--- a/libcerberus/examples/rating-by-genre.rs
+++ b/libcerberus/examples/rating-by-genre.rs
@@ -123,9 +123,10 @@ impl Combine for RatingByGenreCombiner {
 struct RatingByGenreReducer;
 impl Reduce<String, String> for RatingByGenreReducer {
+    type Output = String;
     fn reduce<E>(&self, input: IntermediateInputKV<String, String>, mut emitter: E) -> Result<()>
     where
-        E: EmitFinal<String>,
+        E: EmitFinal<Self::Output>,
     {
         let combine_result = do_genre_combine(input)?;
diff --git a/libcerberus/examples/rating-by-year.rs b/libcerberus/examples/rating-by-year.rs
index 36b6a10e..6a9576d2 100644
--- a/libcerberus/examples/rating-by-year.rs
+++ b/libcerberus/examples/rating-by-year.rs
@@ -76,9 +76,10 @@ impl Partition for RatingByYearPartitioner {
 struct RatingByYearReducer;
 impl Reduce<String, f64> for RatingByYearReducer {
+    type Output = f64;
     fn reduce<E>(&self, input: IntermediateInputKV<String, f64>, mut emitter: E) -> Result<()>
     where
-        E: EmitFinal<f64>,
+        E: EmitFinal<Self::Output>,
     {
         for val in input.values {
             emitter.emit(val).chain_err(|| "Error emitting value.")?;
diff --git a/libcerberus/examples/sprint-seven-demo.md b/libcerberus/examples/sprint-seven-demo.md
new file mode 100644
index 00000000..70cdd999
--- /dev/null
+++ b/libcerberus/examples/sprint-seven-demo.md
@@ -0,0 +1,20 @@
+# Sprint 7 Demo Payload
+
+The sprint 7 demo involves processing the Google Ngrams data for 2-grams.
+The MapReduce produces a count of how many unique words precede each
+word in the dataset. It only takes into account data from the year 2000
+onwards.
+
+### Map
+
+The map phase processes lines in a tab-separated values format.
+For each pair of ngrams, the first word is emitted as the value for the second word.
+
+### Partition
+
+We use a custom partitioning function to alphabetize the output.
+
+### Reduce
+
+The reduce phase counts the number of unique values for each key and outputs
+this count.
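
As an aside, the alphabetizing partitioner this demo relies on is easy to exercise
in isolation. The sketch below reproduces the bucketing arithmetic from ngrams.rs
(get_index, get_place_in_range and MAP_OUTPUT_PARTITIONS mirror the example above;
the keys in main are made-up inputs, not part of the demo data):

const MAP_OUTPUT_PARTITIONS: u64 = 200;

// Characters 0-9 and a-z convert to 0..36; anything else gets bucket 37.
fn get_index(c: char) -> u64 {
    match c.to_digit(36) {
        Some(d) => u64::from(d),
        None => 37,
    }
}

// Linearly maps x in [0, max_x) onto [start_range, end_range).
fn get_place_in_range(x: u64, max_x: u64, start_range: u64, end_range: u64) -> u64 {
    if x >= max_x {
        return end_range;
    }
    ((x * (end_range - start_range)) / max_x) + start_range
}

fn main() {
    for key in &["apple", "banana", "zebra"] {
        let mut chars = key.chars();
        let first = chars.next().map_or(0, get_index);
        let second = chars.next().map_or(0, get_index);

        // The first character selects a sub-range of the 200 partitions and
        // the second character a slot inside it, so partitions come out
        // roughly alphabetized by key.
        let start = get_place_in_range(first, 37, 0, MAP_OUTPUT_PARTITIONS);
        let end = get_place_in_range(first + 1, 37, 0, MAP_OUTPUT_PARTITIONS);
        let partition = get_place_in_range(second, 37, start, end);
        println!("{} -> partition {}", key, partition);
    }
}

Because neighbouring keys land in neighbouring partitions, concatenating the
output files in partition order yields a roughly sorted result.
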
diff --git a/libcerberus/examples/word-counter.rs b/libcerberus/examples/word-counter.rs
index e48f441a..6abf4cef 100644
--- a/libcerberus/examples/word-counter.rs
+++ b/libcerberus/examples/word-counter.rs
@@ -28,9 +28,10 @@ impl Map for WordCountMapper {
 struct WordCountReducer;
 impl Reduce<String, u64> for WordCountReducer {
+    type Output = u64;
     fn reduce<E>(&self, input: IntermediateInputKV<String, u64>, mut emitter: E) -> Result<()>
     where
-        E: EmitFinal<u64>,
+        E: EmitFinal<Self::Output>,
     {
         let mut total: u64 = 0;
         for val in input.values {
diff --git a/libcerberus/src/reducer.rs b/libcerberus/src/reducer.rs
index 4761fe8c..5e198c6b 100644
--- a/libcerberus/src/reducer.rs
+++ b/libcerberus/src/reducer.rs
@@ -23,9 +23,10 @@ where
     K: Default + Serialize + DeserializeOwned,
     V: Default + Serialize + DeserializeOwned,
 {
+    type Output: Default + Serialize + DeserializeOwned;
     fn reduce<E>(&self, input: IntermediateInputKV<K, V>, emitter: E) -> Result<()>
     where
-        E: EmitFinal<V>;
+        E: EmitFinal<Self::Output>;
 }
 
 #[cfg(test)]
@@ -35,13 +36,14 @@ mod tests {
 
     struct TestReducer;
     impl Reduce<String, String> for TestReducer {
+        type Output = String;
         fn reduce<E>(
             &self,
             input: IntermediateInputKV<String, String>,
             mut emitter: E,
         ) -> Result<()>
         where
-            E: EmitFinal<String>,
+            E: EmitFinal<Self::Output>,
         {
             emitter.emit(input.values.iter().fold(String::new(), |acc, x| acc + x))?;
             Ok(())
diff --git a/libcerberus/src/runner.rs b/libcerberus/src/runner.rs
index d393a7b9..6dbc7313 100644
--- a/libcerberus/src/runner.rs
+++ b/libcerberus/src/runner.rs
@@ -141,7 +141,7 @@ where
     let mut output_objects = Vec::new();
     for input_kv in input_kvs {
-        let mut output_object = FinalOutputObject::<V>::default();
+        let mut output_object = FinalOutputObject::<R::Output>::default();
         reducer
             .reduce(input_kv, FinalOutputObjectEmitter::new(&mut output_object))
             .chain_err(|| "Error running reduce operation.")?;

From c1b53ddbaf86b45aa858445104bf24c2dfc5099e Mon Sep 17 00:00:00 2001
From: Conor Griffin
Date: Sun, 15 Apr 2018 18:34:13 +0100
Subject: [PATCH 31/58] Remove assumption that tasks will always exist in worker manager

---
 libcerberus/examples/ngrams.rs | 5 +-
 master/src/common/task.rs | 2 +-
 master/src/scheduling/scheduler.rs | 23 ++--
 master/src/worker_management/state.rs | 117 ++++++++----------
 .../src/worker_management/worker_manager.rs | 51 +++++---
 util/src/data_layer/abstraction_layer.rs | 2 +-
 util/src/data_layer/nfs_layer.rs | 4 +-
 util/src/data_layer/null_layer.rs | 4 +-
 util/src/data_layer/s3_layer.rs | 4 +-
 .../distributed_file_layer.rs | 17 ++-
 10 files changed, 116 insertions(+), 113 deletions(-)

diff --git a/libcerberus/examples/ngrams.rs b/libcerberus/examples/ngrams.rs
index 51ff54a6..b075ec50 100644
--- a/libcerberus/examples/ngrams.rs
+++ b/libcerberus/examples/ngrams.rs
@@ -42,7 +42,7 @@ impl Map for NGramsMapper {
 }
 
 fn get_index(c: char) -> u64 {
-    // Characters (0 -> 9, a -> z) can be converted to a digit. 
+    // Characters (0 -> 9, a -> z) can be converted to a digit.
     // 36 different possible values.
match c.to_digit(36) { Some(d) => u64::from(d), @@ -56,8 +56,7 @@ fn get_place_in_range(x: u64, max_x: u64, start_range: u64, end_range: u64) -> u return end_range; } let range = end_range - start_range; - let new_value = ((x * range) / max_x) + start_range; - return new_value; + ((x * range) / max_x) + start_range } struct NGramsPartitioner; diff --git a/master/src/common/task.rs b/master/src/common/task.rs index 2d7d2e15..d0b56030 100644 --- a/master/src/common/task.rs +++ b/master/src/common/task.rs @@ -20,7 +20,7 @@ pub enum TaskStatus { Cancelled, } -#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)] +#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)] pub enum TaskType { Map, Reduce, diff --git a/master/src/scheduling/scheduler.rs b/master/src/scheduling/scheduler.rs index 0cb90ab9..d9459103 100644 --- a/master/src/scheduling/scheduler.rs +++ b/master/src/scheduling/scheduler.rs @@ -333,16 +333,15 @@ impl Scheduler { let mut slow_tasks = Vec::new(); for task in scheduled_job.tasks.values_mut() { - if task.task_type == task_type { - if task.status == TaskStatus::InProgress && !task.requeued { - let seconds_running = task.get_seconds_running() - .chain_err(|| "Error getting seconds since task started running")?; - - if seconds_running > (average_completion_time * SLOW_TASK_COMPLETION_MULTIPLIER) - { - slow_tasks.push(task.id.clone()); - task.requeued = true; - } + if task.task_type == task_type && task.status == TaskStatus::InProgress + && !task.requeued + { + let seconds_running = task.get_seconds_running() + .chain_err(|| "Error getting seconds since task started running")?; + + if seconds_running > (average_completion_time * SLOW_TASK_COMPLETION_MULTIPLIER) { + slow_tasks.push(task.id.clone()); + task.requeued = true; } } } @@ -358,7 +357,7 @@ impl Scheduler { let mut slow_tasks = Vec::new(); - for mut job in scheduled_jobs.iter_mut() { + for mut job in &mut scheduled_jobs { if job.job.map_tasks_completed != job.job.map_tasks_total { let completed_percentage = (job.job.map_tasks_completed * 100) / job.job.map_tasks_total; @@ -444,7 +443,7 @@ pub fn run_scheduler_loop(scheduler: Arc, worker_manager: Arc { - for task_id in slow_tasks.iter() { + for task_id in &slow_tasks { let requeue_result = worker_manager.requeue_slow_task(task_id); if let Err(err) = requeue_result { output_error(&err.chain_err(|| "Error requeuing slow task")); diff --git a/master/src/worker_management/state.rs b/master/src/worker_management/state.rs index c58ad861..5b051751 100644 --- a/master/src/worker_management/state.rs +++ b/master/src/worker_management/state.rs @@ -171,17 +171,15 @@ impl State { Ok(()) } - pub fn process_reduce_task_result(&mut self, reduce_result: &pb::ReduceResult) -> Result { + pub fn process_reduce_task_result( + &mut self, + reduce_result: &pb::ReduceResult, + ) -> Result> { if reduce_result.status == pb::ResultStatus::SUCCESS { - let mut scheduled_task = self.tasks.remove(reduce_result.get_task_id()).chain_err( - || { - format!( - "Task with ID {} not found, Worker ID: {}.", - reduce_result.get_task_id(), - reduce_result.get_worker_id() - ) - }, - )?; + let mut scheduled_task = match self.tasks.remove(reduce_result.get_task_id()) { + Some(task) => task, + None => return Ok(None), + }; if scheduled_task.id != reduce_result.task_id { return Err("Task id does not match expected task id.".into()); @@ -190,7 +188,7 @@ impl State { scheduled_task.status = TaskStatus::Complete; scheduled_task.time_completed = Some(Utc::now()); scheduled_task.cpu_time = 
reduce_result.get_cpu_time(); - return Ok(scheduled_task); + return Ok(Some(scheduled_task)); } self.task_failed( @@ -200,15 +198,12 @@ impl State { ).chain_err(|| "Error marking task as failed") } - pub fn process_map_task_result(&mut self, map_result: &pb::MapResult) -> Result { + pub fn process_map_task_result(&mut self, map_result: &pb::MapResult) -> Result> { if map_result.status == pb::ResultStatus::SUCCESS { - let mut scheduled_task = self.tasks.remove(map_result.get_task_id()).chain_err(|| { - format!( - "Task with ID {} not found, Worker ID: {}.", - map_result.get_task_id(), - map_result.get_worker_id() - ) - })?; + let mut scheduled_task = match self.tasks.remove(map_result.get_task_id()) { + Some(task) => task, + None => return Ok(None), + }; if scheduled_task.id != map_result.task_id { return Err("Task id does not match expected task id.".into()); @@ -230,7 +225,7 @@ impl State { self.completed_tasks .insert(scheduled_task.id.clone(), scheduled_task.clone()); - return Ok(scheduled_task); + return Ok(Some(scheduled_task)); } self.task_failed( @@ -245,19 +240,17 @@ impl State { worker_id: &str, task_id: &str, failure_details: &str, - ) -> Result<(Task)> { + ) -> Result> { let worker = self.workers .get_mut(worker_id) .chain_err(|| format!("Worker with ID {} not found.", worker_id))?; worker.current_task_id = String::new(); - let mut assigned_task = self.tasks.remove(task_id).chain_err(|| { - format!( - "Task with ID {} not found, Worker ID: {}.", - task_id, worker_id - ) - })?; + let mut assigned_task = match self.tasks.remove(task_id) { + Some(task) => task, + None => return Ok(None), + }; if failure_details != "" { assigned_task.failure_details = Some(failure_details.to_owned()); @@ -276,7 +269,7 @@ impl State { )); } - Ok(assigned_task) + Ok(Some(assigned_task)) } pub fn get_workers_running_job(&self, job_id: &str) -> Result> { @@ -362,7 +355,7 @@ impl State { .chain_err(|| format!("No task found with id {}", task_id))?; self.priority_task_queue.push(PriorityTask::new( - task_id.clone().to_string(), + task_id.to_string(), REQUEUED_TASK_PRIORITY * task.job_priority, )); @@ -391,7 +384,7 @@ impl State { } // Assigns a given task_id to a worker - fn assign_worker_task(&mut self, worker_id: &str, task_id: &str) -> Result<(Task)> { + fn assign_worker_task(&mut self, worker_id: &str, task_id: &str) -> Result { let assigned_task = { if let Some(scheduled_task) = self.tasks.get_mut(task_id) { scheduled_task.status = TaskStatus::InProgress; @@ -414,52 +407,47 @@ impl State { Ok(assigned_task) } - fn get_data_score(&self, map_request: &pb::PerformMapRequest, worker_id: &str) -> Result { + fn get_data_score(&self, map_request: &pb::PerformMapRequest, worker_id: &str) -> u64 { let mut score: u64 = 0; for input_location in map_request.get_input().get_input_locations() { - score += self.data_layer - .get_data_closeness( - Path::new(&input_location.input_path), - input_location.start_byte, - input_location.end_byte, - worker_id, - ) - .chain_err(|| { - format!( - "Could not get closeness for file {} and worker {}", - input_location.input_path, worker_id - ) - })?; + score += self.data_layer.get_data_closeness( + Path::new(&input_location.input_path), + input_location.start_byte, + input_location.end_byte, + worker_id, + ); } - Ok(score) + score } - fn get_data_score_if_map(&self, task_id: &str, worker_id: &str) -> Result { + fn get_data_score_if_map(&self, task_id: &str, worker_id: &str) -> u64 { let task = match self.tasks.get(task_id) { Some(task) => task, - None => return 
Err(format!("Task with ID {} not found.", task_id).into()), + // This should never happen, as we already check if the tasks exists before here. + None => { + warn!( + "Unable to get data score. Task not found with id {}", + task_id + ); + return 0; + } }; - let data_score = match task.map_request { - Some(ref map_request) => self.get_data_score(map_request, worker_id).chain_err(|| { - format!( - "Could not get data closeness score for task_id {} and worker_id {}", - task.id, worker_id - ) - })?, + match task.map_request { + Some(ref map_request) => self.get_data_score(map_request, worker_id), None => 0, - }; - - Ok(data_score) + } } - fn get_best_task_for_worker(&mut self, worker_id: &str) -> Result> { + fn get_best_task_for_worker(&mut self, worker_id: &str) -> Option { let mut tasks_to_consider = Vec::new(); for _ in 0..MAX_TASKS_TO_CONSIDER { if let Some(task) = self.priority_task_queue.pop() { - tasks_to_consider.push(task); + if self.tasks.contains_key(&task.id) { + tasks_to_consider.push(task); + } } else { break; } @@ -476,9 +464,7 @@ impl State { } } - let data_score = self.get_data_score_if_map(&task.id, worker_id) - .chain_err(|| "Unable to get data score")?; - + let data_score = self.get_data_score_if_map(&task.id, worker_id); if data_score > best_data_score || best_task.is_none() { best_data_score = data_score; best_task = Some(task.clone()); @@ -496,14 +482,13 @@ impl State { } } - Ok(best_task) + best_task } // Tries to assign a worker the next task in the queue. // Returns the task if one exists. - pub fn try_assign_worker_task(&mut self, worker_id: &str) -> Result<(Option)> { - let task_option = self.get_best_task_for_worker(worker_id) - .chain_err(|| "Could not get task for worker")?; + pub fn try_assign_worker_task(&mut self, worker_id: &str) -> Result> { + let task_option = self.get_best_task_for_worker(worker_id); let scheduled_task_id: String = match task_option { Some(priority_task) => { diff --git a/master/src/worker_management/worker_manager.rs b/master/src/worker_management/worker_manager.rs index 8ef4706c..aa0d9208 100644 --- a/master/src/worker_management/worker_manager.rs +++ b/master/src/worker_management/worker_manager.rs @@ -108,21 +108,14 @@ impl WorkerManager { pub fn process_reduce_task_result(&self, reduce_result: &pb::ReduceResult) -> Result<()> { let mut state = self.state.lock().unwrap(); - let task = state - .process_reduce_task_result(reduce_result) - .chain_err(|| "Error processing reduce result.")?; - info!( "Got result for reduce task {} from {}", - task.id, reduce_result.worker_id + reduce_result.task_id, reduce_result.worker_id ); - if task.status == TaskStatus::Complete || task.status == TaskStatus::Failed { - let task_update_sender = self.task_update_sender.lock().unwrap(); - task_update_sender - .send(task) - .chain_err(|| "Error processing reduce result.")?; - } + let task_option = state + .process_reduce_task_result(reduce_result) + .chain_err(|| "Error processing reduce result.")?; state .set_worker_operation_completed( @@ -130,20 +123,42 @@ impl WorkerManager { reduce_result.get_status(), ) .chain_err(|| "Error processing reduce result.")?; + + let task = match task_option { + Some(task) => task, + None => return Ok(()), + }; + + if task.status == TaskStatus::Complete || task.status == TaskStatus::Failed { + let task_update_sender = self.task_update_sender.lock().unwrap(); + task_update_sender + .send(task) + .chain_err(|| "Error processing reduce result.")?; + } + Ok(()) } pub fn process_map_task_result(&self, map_result: &pb::MapResult) 
-> Result<()> { let mut state = self.state.lock().unwrap(); - let task = state - .process_map_task_result(map_result) - .chain_err(|| "Error processing map result.")?; - info!( "Got result for map task {} from {}", - task.id, map_result.worker_id + map_result.task_id, map_result.worker_id ); + let task_option = state + .process_map_task_result(map_result) + .chain_err(|| "Error processing map result.")?; + + state + .set_worker_operation_completed(map_result.get_worker_id(), map_result.get_status()) + .chain_err(|| "Error processing map result.")?; + + let task = match task_option { + Some(task) => task, + None => return Ok(()), + }; + if task.status == TaskStatus::Complete || task.status == TaskStatus::Failed { let task_update_sender = self.task_update_sender.lock().unwrap(); task_update_sender @@ -151,10 +166,6 @@ impl WorkerManager { .chain_err(|| "Error processing map result.")?; } - state - .set_worker_operation_completed(map_result.get_worker_id(), map_result.get_status()) - .chain_err(|| "Error processing map result.")?; - Ok(()) } diff --git a/util/src/data_layer/abstraction_layer.rs b/util/src/data_layer/abstraction_layer.rs index 0970af85..12e924a3 100644 --- a/util/src/data_layer/abstraction_layer.rs +++ b/util/src/data_layer/abstraction_layer.rs @@ -34,5 +34,5 @@ pub trait AbstractionLayer { chunk_start: u64, chunk_end: u64, worker_id: &str, - ) -> Result; + ) -> u64; } diff --git a/util/src/data_layer/nfs_layer.rs b/util/src/data_layer/nfs_layer.rs index 571de881..cc6ccd4a 100644 --- a/util/src/data_layer/nfs_layer.rs +++ b/util/src/data_layer/nfs_layer.rs @@ -126,8 +126,8 @@ impl AbstractionLayer for NFSAbstractionLayer { _chunk_start: u64, _chunk_end: u64, _worker_id: &str, - ) -> Result { + ) -> u64 { // Each file is equally close on NFS - Ok(1) + 1 } } diff --git a/util/src/data_layer/null_layer.rs b/util/src/data_layer/null_layer.rs index 7f7414c3..ba9cd0ee 100644 --- a/util/src/data_layer/null_layer.rs +++ b/util/src/data_layer/null_layer.rs @@ -91,7 +91,7 @@ impl AbstractionLayer for NullAbstractionLayer { _chunk_start: u64, _chunk_end: u64, _worker_id: &str, - ) -> Result { - Ok(1) + ) -> u64 { + 1 } } diff --git a/util/src/data_layer/s3_layer.rs b/util/src/data_layer/s3_layer.rs index 7855c0bf..246a5f33 100644 --- a/util/src/data_layer/s3_layer.rs +++ b/util/src/data_layer/s3_layer.rs @@ -304,7 +304,7 @@ impl AbstractionLayer for AmazonS3AbstractionLayer { _chunk_start: u64, _chunk_end: u64, _worker_id: &str, - ) -> Result { - Ok(1) + ) -> u64 { + 1 } } diff --git a/util/src/distributed_filesystem/distributed_file_layer.rs b/util/src/distributed_filesystem/distributed_file_layer.rs index 61d3f702..b2a1828e 100644 --- a/util/src/distributed_filesystem/distributed_file_layer.rs +++ b/util/src/distributed_filesystem/distributed_file_layer.rs @@ -11,6 +11,7 @@ use data_layer::AbstractionLayer; use distributed_filesystem::{FileSystemMasterInterface, FileSystemWorkerInterface, LocalFileManager}; use errors::*; +use logging::output_error; const MAX_GET_DATA_RETRIES: usize = 3; const MEGA_BYTE: u64 = 1000 * 1000; @@ -254,10 +255,18 @@ impl AbstractionLayer for DFSAbstractionLayer { chunk_start: u64, chunk_end: u64, worker_id: &str, - ) -> Result { - let file_chunks = self.master_interface + ) -> u64 { + let file_chunks = match self.master_interface .get_file_chunks(&path.to_string_lossy()) - .chain_err(|| "Could not get file locations")?; + { + Ok(chunks) => chunks, + Err(err) => { + output_error(&err.chain_err(|| { + format!("Error getting data closeness score for {:?}", 
path) + })); + return 0; + } + }; let mut score: u64 = 0; @@ -271,6 +280,6 @@ impl AbstractionLayer for DFSAbstractionLayer { } } - Ok(score) + score } } From d70acc35f237e1a956655ec9dadcbf8ce9ca640c Mon Sep 17 00:00:00 2001 From: Ryan Connell Date: Sun, 15 Apr 2018 21:24:03 +0100 Subject: [PATCH 32/58] Fix dependencies preventing us from building on MacOS --- worker/Cargo.toml | 4 +++- worker/src/main.rs | 1 + worker/src/operations/operation_handler.rs | 8 ++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/worker/Cargo.toml b/worker/Cargo.toml index 335239bc..4aa9330e 100644 --- a/worker/Cargo.toml +++ b/worker/Cargo.toml @@ -13,7 +13,6 @@ futures-cpupool = "0.1" grpc = "0.2.1" log = "0.3.8" libc = "0.2.33" -procinfo = "0.4.2" tls-api = "0.1.10" serde = "1.0" serde_derive = "1.0" @@ -22,5 +21,8 @@ util = { path = "../util" } uuid = { version = "0.5", features = ["v4"] } cerberus-proto = { path = "../proto" } +[target.'cfg(target_os = "linux")'.dependencies] +procinfo = "0.4.2" + [dev-dependencies] mocktopus = "=0.1.1" diff --git a/worker/src/main.rs b/worker/src/main.rs index 140ba5c2..a28755e9 100644 --- a/worker/src/main.rs +++ b/worker/src/main.rs @@ -12,6 +12,7 @@ extern crate grpc; extern crate libc; #[macro_use] extern crate log; +#[cfg(target_os = "linux")] extern crate procinfo; extern crate protobuf; extern crate serde; diff --git a/worker/src/operations/operation_handler.rs b/worker/src/operations/operation_handler.rs index c296591b..59eb569d 100644 --- a/worker/src/operations/operation_handler.rs +++ b/worker/src/operations/operation_handler.rs @@ -3,7 +3,9 @@ use std::sync::{Arc, Mutex}; use futures::future; use futures::prelude::*; +#[cfg(target_os = "linux")] use libc::_SC_CLK_TCK; +#[cfg(target_os = "linux")] use procinfo::pid::stat_self; use serde_json; use uuid::Uuid; @@ -72,6 +74,7 @@ pub fn failure_details_from_error(err: &Error) -> String { failure_details } +#[cfg(target_os = "linux")] pub fn get_cpu_time() -> u64 { // We can panic in this case. This is beyond our control and would mostly be caused by a very // critical error. 
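
The cfg attributes added here pair a Linux-only implementation with a portable
fallback, so the same symbol exists on every target and the worker builds on
MacOS. A rough, dependency-free sketch of that pattern, assuming only the
standard library (it parses /proc/self/stat by hand rather than through the
procinfo crate; the function name is illustrative and the field offsets follow
proc(5)):

// Compiled only on Linux: CPU time in clock ticks, read straight
// from /proc/self/stat (utime is field 14, stime is field 15).
#[cfg(target_os = "linux")]
fn cpu_time_ticks() -> u64 {
    let stat = std::fs::read_to_string("/proc/self/stat").unwrap_or_default();
    // The command name can contain spaces, so skip past the closing ')'.
    let fields: Vec<&str> = stat
        .rsplit(')')
        .next()
        .unwrap_or("")
        .split_whitespace()
        .collect();
    let utime: u64 = fields.get(11).and_then(|s| s.parse().ok()).unwrap_or(0);
    let stime: u64 = fields.get(12).and_then(|s| s.parse().ok()).unwrap_or(0);
    utime + stime
}

// Every other target compiles this stub instead, mirroring the zero
// fallback this patch adds for get_cpu_time in the next hunk.
#[cfg(not(target_os = "linux"))]
fn cpu_time_ticks() -> u64 {
    0
}

fn main() {
    println!("CPU time used so far: {} clock ticks", cpu_time_ticks());
}
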
@@ -79,6 +82,11 @@ pub fn get_cpu_time() -> u64 { (stat.utime + stat.stime + stat.cstime + stat.cutime) as u64 / (_SC_CLK_TCK as u64) } +#[cfg(not(target_os = "linux"))] +pub fn get_cpu_time() -> u64 { + 0 +} + impl OperationHandler { pub fn new( master_interface: Arc, From 7290ef083201cfa207665c12424c50d9a737e9da Mon Sep 17 00:00:00 2001 From: Conor Griffin Date: Sun, 15 Apr 2018 23:34:15 +0100 Subject: [PATCH 33/58] Make task cancelling an atomic operation --- master/src/scheduling/scheduler.rs | 12 +--- master/src/worker_management/state.rs | 61 ++++++++----------- .../src/worker_management/worker_manager.rs | 34 +++++------ 3 files changed, 41 insertions(+), 66 deletions(-) diff --git a/master/src/scheduling/scheduler.rs b/master/src/scheduling/scheduler.rs index d9459103..a2dc1e3d 100644 --- a/master/src/scheduling/scheduler.rs +++ b/master/src/scheduling/scheduler.rs @@ -206,17 +206,9 @@ impl Scheduler { return Ok(false); } - let workers = self.worker_manager - .get_workers_running_job(job_id) - .chain_err(|| format!("Unable to get list of workers running job {}", job_id))?; - - self.worker_manager - .remove_queued_tasks_for_job(job_id) - .chain_err(|| "Unable to remove queued task from state")?; - self.worker_manager - .cancel_workers_tasks(workers) - .chain_err(|| "Unable to cancel task on workers")?; + .cancel_job(job_id) + .chain_err(|| "Error cancelling job")?; info!("Succesfully cancelled job {}", job_id); Ok(cancelled) diff --git a/master/src/worker_management/state.rs b/master/src/worker_management/state.rs index 5b051751..132887b5 100644 --- a/master/src/worker_management/state.rs +++ b/master/src/worker_management/state.rs @@ -53,11 +53,9 @@ impl State { } } - pub fn remove_queued_tasks_for_job(&mut self, job_id: &str) -> Result<()> { + pub fn remove_queued_tasks_for_job(&mut self, job_id: &str) { let mut new_priority_queue: BinaryHeap = BinaryHeap::new(); - self.tasks.retain(|_, v| v.job_id != job_id); - for priority_task in self.priority_task_queue.drain() { if let Some(task) = self.tasks.get_mut(&priority_task.id.clone()) { if task.job_id != job_id { @@ -67,11 +65,9 @@ impl State { } self.priority_task_queue = new_priority_queue; - - Ok(()) } - fn get_in_progress_tasks_for_job(&self, job_id: &str) -> Result> { + fn get_in_progress_tasks_for_job(&self, job_id: &str) -> Vec { let mut tasks: Vec = Vec::new(); for task in self.tasks.values() { @@ -80,7 +76,7 @@ impl State { } } - Ok(tasks) + tasks } pub fn get_worker_count(&self) -> u32 { @@ -272,21 +268,6 @@ impl State { Ok(Some(assigned_task)) } - pub fn get_workers_running_job(&self, job_id: &str) -> Result> { - let mut worker_ids = Vec::new(); - - let workers = self.get_workers(); - for worker in workers { - if let Some(task) = self.tasks.get(&worker.current_task_id) { - if task.job_id == job_id { - worker_ids.push(worker.worker_id.clone()); - } - } - } - - Ok(worker_ids) - } - // Mark that a given worker has returned a result for it's task. pub fn set_worker_operation_completed( &mut self, @@ -507,18 +488,27 @@ impl State { } } - // Clears the workers current_task_id and returns the previous value. 
- pub fn cancel_task_for_worker(&mut self, worker_id: &str) -> Result { - let worker = self.workers - .get_mut(worker_id) - .chain_err(|| format!("Worker with ID {} not found.", worker_id))?; + pub fn cancel_job(&mut self, job_id: &str) -> Vec<(String, String)> { + let mut workers_running_job = Vec::new(); - let previous_task_id = worker.current_task_id.clone(); - worker.current_task_id = String::new(); + // Remove assigned workers + for worker in self.workers.values_mut() { + if let Some(task) = self.tasks.get(&worker.current_task_id) { + if task.job_id == job_id { + workers_running_job + .push((worker.worker_id.clone(), worker.current_task_id.clone())); + worker.current_task_id = String::new(); + } + } + } - self.tasks.remove(&previous_task_id); + // Remove queued tasks + self.remove_queued_tasks_for_job(job_id); - Ok(previous_task_id) + // Remove tasks + self.tasks.retain(|_, v| v.job_id != job_id); + + workers_running_job } pub fn has_task(&self, task_id: &str) -> bool { @@ -566,8 +556,7 @@ impl State { .chain_err(|| format!("Unable to get map task with ID {}", task_id))? .clone(); - let queued_tasks = self.get_in_progress_tasks_for_job(&map_task.job_id) - .chain_err(|| format!("Unable to get queued tasks for job {}", map_task.job_id))?; + let queued_tasks = self.get_in_progress_tasks_for_job(&map_task.job_id); let mut remove_tasks: Vec = Vec::new(); for queued_task_id in queued_tasks.clone() { @@ -587,8 +576,7 @@ impl State { } } - self.remove_tasks_from_queue(&remove_tasks) - .chain_err(|| "Unable to remove tasks from queue")?; + self.remove_tasks_from_queue(&remove_tasks); // Reschedule the map task let mut new_map_task = map_task.clone(); @@ -647,7 +635,7 @@ impl State { } } - pub fn remove_tasks_from_queue(&mut self, task_ids: &[String]) -> Result<()> { + pub fn remove_tasks_from_queue(&mut self, task_ids: &[String]) { let mut new_priority_queue: BinaryHeap = BinaryHeap::new(); for task_id in task_ids { @@ -661,7 +649,6 @@ impl State { } self.priority_task_queue = new_priority_queue; - Ok(()) } } diff --git a/master/src/worker_management/worker_manager.rs b/master/src/worker_management/worker_manager.rs index aa0d9208..51f210e7 100644 --- a/master/src/worker_management/worker_manager.rs +++ b/master/src/worker_management/worker_manager.rs @@ -183,15 +183,9 @@ impl WorkerManager { } } - pub fn cancel_workers_tasks(&self, workers: Vec) -> Result<()> { - let mut state = self.state.lock().unwrap(); - - for worker_id in workers { - // Clear the task from the worker so that we can ignore it's result. - let task_id = state - .cancel_task_for_worker(&worker_id) - .chain_err(|| format!("Error cancelling task on worker: {}", worker_id))?; - + // Takes a list of worker id, task id pairs and tells each worker to stop running its tasks. + pub fn cancel_workers_tasks(&self, workers: Vec<(String, String)>) -> Result<()> { + for (worker_id, task_id) in workers { // Create a request to cancel the task the worker is currently running. 
let mut cancel_request = pb::CancelTaskRequest::new(); cancel_request.task_id = task_id; @@ -204,6 +198,18 @@ impl WorkerManager { Ok(()) } + pub fn cancel_job(&self, job_id: &str) -> Result<()> { + let workers_running_job = { + let mut state = self.state.lock().unwrap(); + state.cancel_job(job_id) + }; + + self.cancel_workers_tasks(workers_running_job) + .chain_err(|| "Error cancelling tasks on workers")?; + + Ok(()) + } + fn remove_workers(&self, workers: Vec) { let mut state = self.state.lock().unwrap(); for worker_id in workers { @@ -378,16 +384,6 @@ impl WorkerManager { state.has_task(task_id) } - pub fn remove_queued_tasks_for_job(&self, job_id: &str) -> Result<()> { - let mut state = self.state.lock().unwrap(); - state.remove_queued_tasks_for_job(job_id) - } - - pub fn get_workers_running_job(&self, job_id: &str) -> Result> { - let state = self.state.lock().unwrap(); - state.get_workers_running_job(job_id) - } - pub fn requeue_slow_task(&self, task_id: &str) -> Result<()> { let mut state = self.state.lock().unwrap(); state.requeue_slow_task(task_id) From 7690af03c5e5cda370d5e10d98c64ca52b0f42b0 Mon Sep 17 00:00:00 2001 From: Conor Griffin Date: Mon, 16 Apr 2018 13:03:39 +0100 Subject: [PATCH 34/58] Disable CI caching --- .travis.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 245720df..97db614d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,10 +5,6 @@ language: rust rust: - nightly -cache: - - cargo: true - - timeout: 1000 - addons: apt: packages: From 98fe6ab73392baaf2105e9d920164ebc88259c29 Mon Sep 17 00:00:00 2001 From: Darragh Griffin Date: Mon, 16 Apr 2018 00:31:24 +0100 Subject: [PATCH 35/58] Add logs viewing feature to cluster dashboard --- Cargo.lock | 13 +++- master/content/dashboard.js | 71 +++++++++++++++++++ master/content/index.html | 19 ++++- master/content/stylesheet.css | 46 ++++++++++++ master/src/dashboard/fetch_logs.rs | 36 ++++++++++ master/src/dashboard/mod.rs | 2 + master/src/dashboard/server.rs | 42 +++++++++++ master/src/initialization/dashboard_server.rs | 2 + master/src/initialization/master_resources.rs | 3 +- master/src/main.rs | 15 +++- master/src/parser.rs | 16 +++++ master/src/worker_management/state.rs | 9 +++ .../src/worker_management/worker_manager.rs | 6 ++ proto/worker.proto | 11 +++ util/Cargo.toml | 4 +- util/src/lib.rs | 2 +- util/src/logging.rs | 70 +++++++++++------- worker/src/communication/worker_interface.rs | 7 +- worker/src/initialization/grpc_server.rs | 6 +- worker/src/initialization/worker_resources.rs | 3 +- worker/src/main.rs | 14 ++-- worker/src/parser.rs | 16 +++++ .../src/server/intermediate_data_service.rs | 3 +- worker/src/server/log_service.rs | 40 +++++++++++ worker/src/server/mod.rs | 8 +++ 25 files changed, 418 insertions(+), 46 deletions(-) create mode 100644 master/src/dashboard/fetch_logs.rs create mode 100644 worker/src/server/log_service.rs diff --git a/Cargo.lock b/Cargo.lock index b6dbd368..1bff88c2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -373,6 +373,14 @@ name = "fake-simd" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "fern" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "foreign-types" version = "0.3.2" @@ -1752,11 +1760,11 @@ version = "0.6.0" dependencies = [ "cerberus-proto 0.6.0", "chrono 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", - 
"env_logger 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)", "error-chain 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", + "fern 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)", "futures 0.1.18 (registry+https://github.com/rust-lang/crates.io-index)", "grpc 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", - "log 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)", "protobuf 1.4.4 (registry+https://github.com/rust-lang/crates.io-index)", "rand 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", "rusoto_core 0.32.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -1914,6 +1922,7 @@ dependencies = [ "checksum error 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "a6e606f14042bb87cc02ef6a14db6c90ab92ed6f62d87e69377bc759fd7987cc" "checksum error-chain 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ff511d5dc435d703f4971bc399647c9bc38e20cb41452e3b9feb4765419ed3f3" "checksum fake-simd 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed" +"checksum fern 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)" = "de237898aa785d93b869e965132f62a525b90cce5c0bf2a395f03e62e085bc5c" "checksum foreign-types 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" "checksum foreign-types-shared 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" "checksum fs_extra 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5f2a4a2034423744d2cc7ca2068453168dcdb82c438419e639a26bd87839c674" diff --git a/master/content/dashboard.js b/master/content/dashboard.js index 0c6d76f7..c05caa9f 100644 --- a/master/content/dashboard.js +++ b/master/content/dashboard.js @@ -39,6 +39,64 @@ function addButton(text, clickedFuncCreator, container) { .appendTo(container); } +function showWorkerLogs(workerId, logsText) { + var workerLogsText = $("#worker-logs-text"); + workerLogsText.text(logsText); + + var workerScroll = $("#worker-scroll-box"); + + var logsBox = $("#worker-logs-box"); + logsBox.text("Worker ID: " + workerId); + + workerScroll.appendTo(logsBox); + + var logsView = $("#worker-logs"); + logsView.css({ + "visibility": "visible", + }); +} + +function closeWorkerLogs() { + var logsView = $("#worker-logs"); + logsView.css({ + "visibility": "hidden", + }) +} + +function showLogsFunction(workerId) { + return function(button) { + return function() { + button.attr("disabled", true); + button.text("Requesting logs"); + button.css({ + "background-color": "#B3E5FC", + }); + + $.ajax({ + url: "/api/workerlogs/query?worker_id=" + encodeURIComponent(workerId), + dataType: "text", + success: function(logsText) { + showWorkerLogs(workerId, logsText); + button.attr("disabled", false); + button.text("View Logs"); + button.css({ + "background-color": "#008CBA", + }); + }, + error: function(xhr, status, error) { + console.log("Error getting worker logs:"); + console.log(error); + button.attr("disabled", false); + button.text("View Logs"); + button.css({ + "background-color": "#008CBA", + }); + } + }); + } + } +} + function updateWorkersList() { var workersBox = $("#workers"); @@ -57,6 +115,7 @@ function updateWorkersList() { addProperty("Operation Status", workerInfo.operation_status, 
container); addProperty("Current Task ID", workerInfo.current_task_id, container); addProperty("Task Assignments Failed", workerInfo.task_assignments_failed, container); + addButton("View Logs", showLogsFunction(workerInfo.worker_id), container.parent()); }); } }); @@ -136,10 +195,22 @@ function updateTasksList() { }); } +function updateMasterLog() { + var logs = $("#master-logs"); + $.ajax({ + url: "/api/logs", + dataType: "text", + success: function(logsText) { + logs.text(logsText); + } + }); +} + function updateFunction() { updateWorkersList(); updateJobsList(); updateTasksList(); + updateMasterLog(); } function processScheduleMapReduceForm(e) { diff --git a/master/content/index.html b/master/content/index.html index 82318547..a38a83ef 100644 --- a/master/content/index.html +++ b/master/content/index.html @@ -26,6 +26,11 @@

     Cluster Dashboard
 
+    <div class="logs">
+        Master Logs
+        <div class="logs-scroll">
+            <div class="logs-text" id="master-logs"></div>
+        </div>
+    </div>
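For orientation: `updateMasterLog()` in dashboard.js polls the master's new `/api/logs` endpoint and drops the response into the `#master-logs` element above. A minimal sketch of what the endpoint does (the real `get_master_logs` handler, added to master/src/dashboard/server.rs later in this patch, wraps the same read in an Iron `Response` and chains errors):

    // Sketch only: the core of the /api/logs request path.
    use std::fs::File;
    use std::io::{self, Read};

    fn master_log_contents(log_file_path: &str) -> io::Result<String> {
        let mut contents = String::new();
        File::open(log_file_path)?.read_to_string(&mut contents)?;
        Ok(contents)
    }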
@@ -33,7 +38,7 @@
 [hunk garbled by extraction: a one-line change to the "×" close link on the
 Schedule MapReduce form]
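The hunk below adds the hidden Worker Logs popup. The text it displays makes three hops: the browser calls `/api/workerlogs/query?worker_id=...` on the master, the master looks up the worker's address and issues the new `GetWorkerLogs` RPC, and the worker reads its own log file from disk. A condensed sketch of the master-side glue, mirroring `dashboard::fetch_logs::fetch_worker_log` (added later in this patch) with the error chaining elided; the `worker_log_text` name is illustrative only:

    use std::sync::Arc;

    use grpc::RequestOptions;

    use cerberus_proto::worker as pb;
    use cerberus_proto::worker_grpc as grpc_pb;
    use cerberus_proto::worker_grpc::WorkerLogService;
    use worker_management::WorkerManager;

    // Resolve a worker ID to its address, open a plain gRPC client and
    // fetch the worker's log file contents.
    fn worker_log_text(worker_id: &str, manager: &Arc<WorkerManager>) -> Option<String> {
        let addr = manager.get_worker_address(worker_id).ok()?;
        let client = grpc_pb::WorkerLogServiceClient::new_plain(
            &addr.ip().to_string(),
            addr.port(),
            Default::default(),
        ).ok()?;
        let response = client
            .get_worker_logs(RequestOptions::new(), pb::EmptyMessage::new())
            .wait()
            .ok()?;
        // response.1 is the LogsResult message; its log_contents string is
        // what /api/workerlogs hands back to the browser.
        Some(response.1.get_log_contents().to_string())
    }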
@@ -69,6 +74,18 @@

     Cluster Dashboard
 
+    <div class="hidden-logs" id="worker-logs">
+        <a href="javascript:void(0);" class="exit-form" onclick="closeWorkerLogs()">×</a>
+        Worker Logs
+        <div class="logs" id="worker-logs-box">
+            Worker ID
+            <div class="logs-scroll" id="worker-scroll-box">
+                <div class="logs-text" id="worker-logs-text"></div>
+            </div>
+        </div>
+    </div>
+ diff --git a/master/content/stylesheet.css b/master/content/stylesheet.css index 567ad25e..ba637114 100644 --- a/master/content/stylesheet.css +++ b/master/content/stylesheet.css @@ -265,3 +265,49 @@ a.exit-form { position: absolute; text-decoration: None; } + +.logs { + background-color: #ffffff; + border-color: #bbdefb; + border-style: double; + border-width: medium; + color: #212121; + float: left; + font-size: 20px; + font-weight: bold; + height: 400px; + overflow: hidden; + padding: 10px; + text-align: center; + width: 100%; +} + +.logs-scroll { + background-color: #ffffff; + border: none; + height: 370px; + overflow-y: scroll; +} + +.logs-text { + font-size: 16px; + font-weight: normal; + text-align: left; + white-space: pre-wrap; +} + +.hidden-logs { + background-color: #bbdefb; + border-color: black; + border-radius: 5px; + border-width: 1px; + box-shadow: 3px 4px 4px 0 rgba(0, 0, 0, .3); + height: 480px; + left: 50%; + margin-left: -175px; + min-height: 300px; + position: fixed; + top: 15%; + visibility: hidden; + width: 700px; +} diff --git a/master/src/dashboard/fetch_logs.rs b/master/src/dashboard/fetch_logs.rs new file mode 100644 index 00000000..8be10ebc --- /dev/null +++ b/master/src/dashboard/fetch_logs.rs @@ -0,0 +1,36 @@ +use std::sync::Arc; + +use grpc::RequestOptions; + +use cerberus_proto::worker as pb; +use cerberus_proto::worker_grpc as grpc_pb; +use cerberus_proto::worker_grpc::WorkerLogService; +use errors::*; +use worker_management::WorkerManager; + +pub fn fetch_worker_log(worker_id: &str, worker_manager: &Arc) -> Result { + let worker_addr = worker_manager + .get_worker_address(worker_id) + .chain_err(|| "Failed to get worker address")?; + + let client = grpc_pb::WorkerLogServiceClient::new_plain( + &worker_addr.ip().to_string(), + worker_addr.port(), + Default::default(), + ).chain_err(|| format!("Error building client for worker {}", worker_addr))?; + + info!("Fetching log file from worker {}", worker_id); + let response = client + .get_worker_logs(RequestOptions::new(), pb::EmptyMessage::new()) + .wait(); + + match response { + Ok(res) => Ok(res.1.get_log_contents().to_string()), + Err(err) => Err(format!( + "Error retrieving log from worker id {}, address: {:?}, error: {}", + worker_id, + worker_addr, + err.to_string() + ).into()), + } +} diff --git a/master/src/dashboard/mod.rs b/master/src/dashboard/mod.rs index 98aa56c8..66fbba3c 100644 --- a/master/src/dashboard/mod.rs +++ b/master/src/dashboard/mod.rs @@ -1,4 +1,6 @@ // The dashboard module contains code used for serving the cluster dashboard web page. 
+pub mod fetch_logs; pub mod server; +pub use self::fetch_logs::fetch_worker_log; pub use self::server::DashboardServer; diff --git a/master/src/dashboard/server.rs b/master/src/dashboard/server.rs index fc64d3fc..7ae9e315 100644 --- a/master/src/dashboard/server.rs +++ b/master/src/dashboard/server.rs @@ -1,4 +1,6 @@ use std::collections::HashMap; +use std::fs::File; +use std::io::{BufReader, Read}; use std::path::Path; use std::sync::Arc; @@ -10,6 +12,7 @@ use staticfile::Static; use urlencoded::UrlEncodedQuery; use common::{Job, JobOptions}; +use dashboard::fetch_logs::fetch_worker_log; use errors::*; use scheduling::Scheduler; use util::data_layer::AbstractionLayer; @@ -20,11 +23,29 @@ use worker_management::WorkerManager; const DEFAULT_PRIORITY: u32 = 3; const DEFAULT_MAP_SIZE: u32 = 64; +fn read_local_file>(path: P) -> Result { + debug!("Attempting to read local file: {:?}", path.as_ref()); + let file = File::open(&path) + .chain_err(|| format!("unable to open file {}", path.as_ref().to_string_lossy()))?; + + let mut buf_reader = BufReader::new(file); + let mut value = String::new(); + buf_reader.read_to_string(&mut value).chain_err(|| { + format!( + "unable to read content of {}", + path.as_ref().to_string_lossy() + ) + })?; + + Ok(value) +} + #[derive(Clone)] struct ApiHandler { scheduler_arc: Arc, worker_manager_arc: Arc, data_abstraction_layer_arc: Arc, + log_file_path: String, } impl ApiHandler { @@ -188,6 +209,23 @@ impl ApiHandler { Ok(Response::with((iron::status::Ok, "{{ success: true }}"))) } + fn get_master_logs(&self, _req: &mut Request) -> Result { + match read_local_file(&self.log_file_path) { + Ok(log_file_contents) => Ok(Response::with((iron::status::Ok, log_file_contents))), + Err(err) => Err(err.chain_err(|| "Unable to read master log file")), + } + } + + fn get_worker_logs(&self, req: &mut Request) -> Result { + let worker_id = self.get_parameter(req, "worker_id") + .chain_err(|| "Failed to get worker_id")?; + + let log_contents = fetch_worker_log(&worker_id, &self.worker_manager_arc) + .chain_err(|| format!("Failed to get log for worker with id {}", worker_id))?; + + Ok(Response::with((iron::status::Ok, log_contents))) + } + fn handle_endpoint(&self, endpoint: &str, req: &mut Request) -> IronResult { let result = { match endpoint { @@ -196,6 +234,8 @@ impl ApiHandler { "jobs" => self.jobs(req), "canceljob" => self.cancel_job(req), "schedule" => self.schedule_job(req), + "logs" => self.get_master_logs(req), + "workerlogs" => self.get_worker_logs(req), _ => Err("Invalid endpoint".into()), } }; @@ -236,11 +276,13 @@ impl DashboardServer { scheduler_arc: Arc, worker_manager_arc: Arc, data_abstraction_layer_arc: Arc, + log_file_path: String, ) -> Result { let handler = ApiHandler { scheduler_arc, worker_manager_arc, data_abstraction_layer_arc, + log_file_path, }; let mut router = Router::new(); diff --git a/master/src/initialization/dashboard_server.rs b/master/src/initialization/dashboard_server.rs index 9a1163f8..56ac3820 100644 --- a/master/src/initialization/dashboard_server.rs +++ b/master/src/initialization/dashboard_server.rs @@ -15,6 +15,7 @@ pub fn initialize_dashboard_server( worker_manager: &Arc, scheduler: &Arc, data_layer: &Arc, + log_file_path: &str, ) -> Result { let dashboard_address = matches .value_of("dashboard-address") @@ -25,6 +26,7 @@ pub fn initialize_dashboard_server( Arc::clone(scheduler), Arc::clone(worker_manager), Arc::clone(data_layer), + log_file_path.to_string(), ).chain_err(|| "Failed to create cluster dashboard server.")?; 
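        // The handler keeps only the log file *path*: /api/logs re-reads the
        // file on every request, so the dashboard always serves the latest
        // output without the master buffering logs in memory.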
     Ok(dashboard)
diff --git a/master/src/initialization/master_resources.rs b/master/src/initialization/master_resources.rs
index 1a47df6c..b9852aa1 100644
--- a/master/src/initialization/master_resources.rs
+++ b/master/src/initialization/master_resources.rs
@@ -22,7 +22,7 @@ pub struct MasterResources {
 }

 impl MasterResources {
-    pub fn new(matches: &ArgMatches) -> Result<MasterResources> {
+    pub fn new(matches: &ArgMatches, log_file_path: &str) -> Result<MasterResources> {
         let (worker_info_sender, worker_info_receiver) = channel();

         let (data_abstraction_layer_arc, filesystem_manager) =
@@ -46,6 +46,7 @@ impl MasterResources {
                 &worker_manager,
                 &scheduler,
                 &data_abstraction_layer_arc,
+                log_file_path,
             ).chain_err(|| "Error initializing cluster dashboard")?,

             grpc_server: initialize_grpc_server(
diff --git a/master/src/main.rs b/master/src/main.rs
index 9d17cc4b..e7612855 100644
--- a/master/src/main.rs
+++ b/master/src/main.rs
@@ -53,12 +53,21 @@ use errors::*;
 use initialization::MasterResources;
 use util::init_logger;

+const DEFAULT_LOG_FILE_PATH: &str = "/tmp/cerberus/logs/master.log";
+
 fn run() -> Result<()> {
     println!("Cerberus Master!");
-    init_logger().chain_err(|| "Failed to initialise logging.")?;
-
     let matches = parser::parse_command_line();
-    let resources = MasterResources::new(&matches).chain_err(|| "Error initializing master")?;
+
+    let log_file_path = matches
+        .value_of("log-file-path")
+        .unwrap_or(DEFAULT_LOG_FILE_PATH);
+
+    init_logger(log_file_path, matches.is_present("verbose-logging"))
+        .chain_err(|| "Failed to initialise logging.")?;
+
+    let resources =
+        MasterResources::new(&matches, log_file_path).chain_err(|| "Error initializing master")?;

     // Startup worker management loops
     worker_management::run_task_assigment_loop(Arc::clone(&resources.worker_manager));
diff --git a/master/src/parser.rs b/master/src/parser.rs
index 0a1b4829..babb844f 100644
--- a/master/src/parser.rs
+++ b/master/src/parser.rs
@@ -29,6 +29,22 @@ pub fn parse_command_line<'a>() -> ArgMatches<'a> {
                 .takes_value(false)
                 .required(false),
         )
+        .arg(
+            Arg::with_name("verbose-logging")
+                .long("verbose-logging")
+                .short("v")
+                .help("Removes all log filters")
+                .takes_value(false)
+                .required(false),
+        )
+        .arg(
+            Arg::with_name("log-file-path")
+                .long("log-file-path")
+                .short("l")
+                .help("Location to write log file")
+                .takes_value(true)
+                .required(false),
+        )
         .arg(
             Arg::with_name("state-location")
                 .long("state-location")
diff --git a/master/src/worker_management/state.rs b/master/src/worker_management/state.rs
index c58ad861..7e16411e 100644
--- a/master/src/worker_management/state.rs
+++ b/master/src/worker_management/state.rs
@@ -1,5 +1,6 @@
 use std::collections::BinaryHeap;
 use std::collections::HashMap;
+use std::net::SocketAddr;
 use std::path::Path;
 use std::sync::Arc;

@@ -83,6 +84,14 @@ impl State {
         Ok(tasks)
     }

+    pub fn get_worker_address(&self, worker_id: &str) -> Result<SocketAddr> {
+        let worker = self.workers
+            .get(worker_id)
+            .chain_err(|| format!("Worker with ID {} not found.", worker_id))?;
+
+        Ok(worker.address.clone())
+    }
+
     pub fn get_worker_count(&self) -> u32 {
         self.workers.len() as u32
     }
diff --git a/master/src/worker_management/worker_manager.rs b/master/src/worker_management/worker_manager.rs
index 8ef4706c..5ad7c675 100644
--- a/master/src/worker_management/worker_manager.rs
+++ b/master/src/worker_management/worker_manager.rs
@@ -1,3 +1,4 @@
+use std::net::SocketAddr;
 use std::sync::mpsc::{channel, Receiver, Sender};
 use std::sync::{Arc, Mutex};
 use std::{thread, time};
@@ -458,6 +459,11 @@ impl WorkerManager {
         let mut
state = self.state.lock().unwrap(); state.handle_worker_report(request) } + + pub fn get_worker_address(&self, worker_id: &str) -> Result { + let state = self.state.lock().unwrap(); + state.get_worker_address(worker_id) + } } impl SimpleStateHandling for WorkerManager { diff --git a/proto/worker.proto b/proto/worker.proto index 98e1f25d..1d6939bc 100644 --- a/proto/worker.proto +++ b/proto/worker.proto @@ -175,3 +175,14 @@ message IntermediateDataRequest { message IntermediateData { bytes data = 1; } + +//////////////////////////////////////////////////////////////////////////////// + +// WorkerLogService allows the master to get logs from workers. +service WorkerLogService { + rpc GetWorkerLogs (EmptyMessage) returns (LogsResult); +} + +message LogsResult { + string log_contents = 1; +} diff --git a/util/Cargo.toml b/util/Cargo.toml index 49c3b8a1..bd30b50e 100644 --- a/util/Cargo.toml +++ b/util/Cargo.toml @@ -6,10 +6,10 @@ authors = ["Cerberus Authors "] [dependencies] cerberus-proto = { path = "../proto" } chrono = "0.4" -env_logger = "0.4.3" error-chain = "0.11.0" +fern = "0.5.5" grpc = "0.2.1" -log = "0.3.8" +log = { version = "0.4", features = ["std"] } protobuf = "1.4.1" rand = "0.4" serde = "1.0" diff --git a/util/src/lib.rs b/util/src/lib.rs index 8481d52d..07d4d70d 100644 --- a/util/src/lib.rs +++ b/util/src/lib.rs @@ -1,8 +1,8 @@ extern crate cerberus_proto; extern crate chrono; -extern crate env_logger; #[macro_use] extern crate error_chain; +extern crate fern; extern crate grpc; #[macro_use] extern crate log; diff --git a/util/src/logging.rs b/util/src/logging.rs index 87cc96a6..78a0f637 100644 --- a/util/src/logging.rs +++ b/util/src/logging.rs @@ -1,33 +1,55 @@ -use std::env; +use std::fs; +use std::io; +use std::path::Path; use chrono::Utc; -use env_logger::LogBuilder; use error_chain::ChainedError; -use log::LogRecord; +use fern; +use log; use errors::*; -// This makes the default logging level info for everything but the grpc modules which will use -// error. This is because grpc modules output an unnecessary level of logging for info. 
-const DEFAULT_LOG_CONFIG: &str = "info,httpbis=error,grpc=error";
-
-pub fn init_logger() -> Result<()> {
-    let format = |record: &LogRecord| {
-        format!(
-            "{}:{}:{}: {}",
-            record.level(),
-            Utc::now().format("%T%.3f"),
-            record.location().module_path(),
-            record.args()
-        )
-    };
-    let log_config = env::var("RUST_LOG").unwrap_or_else(|_| DEFAULT_LOG_CONFIG.to_owned());
-
-    LogBuilder::new()
-        .format(format)
-        .parse(log_config.as_str())
-        .init()
-        .chain_err(|| "Failed to build env_logger")?;
+pub fn init_logger(log_file_path: &str, verbose_logging: bool) -> Result<()> {
+    let mut log_directory = Path::new(log_file_path).to_path_buf();
+    log_directory.pop();
+    fs::create_dir_all(log_directory).chain_err(|| "Error creating logs output directory")?;
+
+    let log_file = fern::log_file(log_file_path)
+        .chain_err(|| format!("Error creating log file with path: {}", log_file_path))?;
+
+    let mut fern_logger = fern::Dispatch::new().format(|out, _message, record| {
+        if let Some(path) = record.module_path() {
+            out.finish(format_args!(
+                "{}:{}:{}: {}",
+                record.level(),
+                Utc::now().format("%T%.3f"),
+                path,
+                record.args(),
+            ))
+        } else {
+            out.finish(format_args!(
+                "{}:{}: {}",
+                record.level(),
+                Utc::now().format("%T%.3f"),
+                record.args(),
+            ))
+        }
+    });
+
+    if verbose_logging {
+        // Verbose mode removes all level filters: LevelFilter::Trace lets
+        // every message through.
+        fern_logger = fern_logger.level(log::LevelFilter::Trace);
+    } else {
+        fern_logger = fern_logger
+            .level(log::LevelFilter::Info)
+            .level_for("httpbis", log::LevelFilter::Error)
+            .level_for("grpc", log::LevelFilter::Error);
+    }
+
+    fern_logger
+        .chain(io::stdout())
+        .chain(log_file)
+        .apply()
+        .chain_err(|| "Failed to initialize logging")?;

     Ok(())
 }
diff --git a/worker/src/communication/worker_interface.rs b/worker/src/communication/worker_interface.rs
index 062f3a68..ba5f5d08 100644
--- a/worker/src/communication/worker_interface.rs
+++ b/worker/src/communication/worker_interface.rs
@@ -1,16 +1,15 @@
+use std::error::Error;
 use std::net::SocketAddr;
 use std::path::Path;
 use std::str::FromStr;

 use grpc::RequestOptions;

-use std::error::Error;
-
-use errors::*;
-use operations::io;
 use cerberus_proto::worker as pb;
 use cerberus_proto::worker_grpc as grpc_pb;
 use cerberus_proto::worker_grpc::IntermediateDataService;
+use errors::*;
+use operations::io;
 use operations::OperationResources; // For pub functions only

 const INTERMEDIATE_DATA_RETRIES: u8 = 3;
diff --git a/worker/src/initialization/grpc_server.rs b/worker/src/initialization/grpc_server.rs
index 7cfa23ef..f906b123 100644
--- a/worker/src/initialization/grpc_server.rs
+++ b/worker/src/initialization/grpc_server.rs
@@ -5,7 +5,8 @@ use clap::ArgMatches;

 use errors::*;
 use operations::OperationHandler;
-use server::{FileSystemService, IntermediateDataService, ScheduleOperationService, Server};
+use server::{FileSystemService, IntermediateDataService, ScheduleOperationService, Server,
+             WorkerLogService};
 use util::distributed_filesystem::LocalFileManager;

 // Setting the port to 0 means a random available port will be selected
@@ -14,6 +15,7 @@ const DEFAULT_PORT: &str = "0";
 pub fn initialize_grpc_server(
     matches: &ArgMatches,
     operation_handler: &Arc<OperationHandler>,
+    log_file_path: &str,
     local_file_manager: Option<Arc<LocalFileManager>>,
 ) -> Result<Server> {
     let port = u16::from_str(matches.value_of("port").unwrap_or(DEFAULT_PORT))

     let scheduler_service = ScheduleOperationService::new(Arc::clone(operation_handler));
     let interm_data_service = IntermediateDataService;
+    let log_service = WorkerLogService::new(log_file_path.to_string());
     let
filesystem_service = FileSystemService::new(local_file_manager); let server = Server::new( port, scheduler_service, interm_data_service, + log_service, filesystem_service, ).chain_err(|| "Error building grpc server")?; diff --git a/worker/src/initialization/worker_resources.rs b/worker/src/initialization/worker_resources.rs index 6c3ee913..2b19d431 100644 --- a/worker/src/initialization/worker_resources.rs +++ b/worker/src/initialization/worker_resources.rs @@ -21,7 +21,7 @@ pub struct WorkerResources { } impl WorkerResources { - pub fn new(matches: &ArgMatches) -> Result { + pub fn new(matches: &ArgMatches, log_file_path: &str) -> Result { let master_addr = SocketAddr::from_str( matches.value_of("master").unwrap_or(DEFAULT_MASTER_ADDR), ).chain_err(|| "Error parsing master address")?; @@ -41,6 +41,7 @@ impl WorkerResources { grpc_server: initialize_grpc_server( matches, &operation_handler, + log_file_path, local_file_manager.clone(), ).chain_err(|| "Error initializing grpc server")?, diff --git a/worker/src/main.rs b/worker/src/main.rs index 140ba5c2..75a3d5cd 100644 --- a/worker/src/main.rs +++ b/worker/src/main.rs @@ -54,15 +54,21 @@ use initialization::{register_worker, WorkerResources}; use util::init_logger; const DEFAULT_WORKER_IP: &str = "[::]"; +const DEFAULT_LOG_FILE_PATH: &str = "/tmp/cerberus/logs/worker.log"; fn run() -> Result<()> { println!("Cerberus Worker!"); - init_logger().chain_err(|| "Failed to initialise logging.")?; - let matches = parser::parse_command_line(); - let mut resources = - WorkerResources::new(&matches).chain_err(|| "Error initializing worker resources")?; + let log_file_path = matches + .value_of("log-file-path") + .unwrap_or(DEFAULT_LOG_FILE_PATH); + + init_logger(log_file_path, matches.is_present("verbose-logging")) + .chain_err(|| "Failed to initialise logging.")?; + + let mut resources = WorkerResources::new(&matches, log_file_path) + .chain_err(|| "Error initializing worker resources")?; let local_ip_addr = matches.value_of("ip").unwrap_or(DEFAULT_WORKER_IP); let local_addr = SocketAddr::from_str(&format!( diff --git a/worker/src/parser.rs b/worker/src/parser.rs index 2c5c1a5f..ae97dc6c 100644 --- a/worker/src/parser.rs +++ b/worker/src/parser.rs @@ -73,6 +73,22 @@ pub fn parse_command_line<'a>() -> ArgMatches<'a> { .takes_value(false) .required(false), ) + .arg( + Arg::with_name("verbose-logging") + .long("verbose-logging") + .short("v") + .help("Removes all log filters") + .takes_value(false) + .required(false), + ) + .arg( + Arg::with_name("log-file-path") + .long("log-file-path") + .short("l") + .help("Location to write log file") + .takes_value(true) + .required(false), + ) .arg( Arg::with_name("state-location") .long("state-location") diff --git a/worker/src/server/intermediate_data_service.rs b/worker/src/server/intermediate_data_service.rs index 0775bb80..09b15764 100644 --- a/worker/src/server/intermediate_data_service.rs +++ b/worker/src/server/intermediate_data_service.rs @@ -1,9 +1,8 @@ use grpc::{Error, RequestOptions, SingleResponse}; -use operations::io; - use cerberus_proto::worker as pb; use cerberus_proto::worker_grpc as grpc_pb; +use operations::io; use util; const DATA_NOT_AVAILABLE: &str = "Data not available"; diff --git a/worker/src/server/log_service.rs b/worker/src/server/log_service.rs new file mode 100644 index 00000000..46bf995b --- /dev/null +++ b/worker/src/server/log_service.rs @@ -0,0 +1,40 @@ +use grpc::{Error, RequestOptions, SingleResponse}; + +use cerberus_proto::worker as pb; +use cerberus_proto::worker_grpc as 
grpc_pb;
+use operations::io;
+use util;
+
+const FILE_NOT_AVAILABLE: &str = "Log file not available";
+
+pub struct WorkerLogService {
+    log_file_path: String,
+}
+
+impl WorkerLogService {
+    pub fn new(log_file_path: String) -> WorkerLogService {
+        WorkerLogService { log_file_path }
+    }
+}
+
+impl grpc_pb::WorkerLogService for WorkerLogService {
+    fn get_worker_logs(
+        &self,
+        _o: RequestOptions,
+        _req: pb::EmptyMessage,
+    ) -> SingleResponse<pb::LogsResult> {
+        info!("Serving log file {}", self.log_file_path);
+        match io::read_local(&self.log_file_path) {
+            Ok(log_file_contents) => {
+                let mut res = pb::LogsResult::new();
+                res.set_log_contents(log_file_contents);
+
+                SingleResponse::completed(res)
+            }
+            Err(err) => {
+                util::output_error(&err.chain_err(|| "Unable to read log file"));
+                SingleResponse::err(Error::Other(FILE_NOT_AVAILABLE))
+            }
+        }
+    }
+}
diff --git a/worker/src/server/mod.rs b/worker/src/server/mod.rs
index 6f482a2a..15c65b1b 100644
--- a/worker/src/server/mod.rs
+++ b/worker/src/server/mod.rs
@@ -4,11 +4,14 @@ mod filesystem_service;
 /// `intermediate_data_service` is responsible for handling traffic coming from other workers
 /// requesting intermediate data created by the map task.
 mod intermediate_data_service;
+/// `log_service` is responsible for serving the log file of this worker to the master.
+mod log_service;
 /// `master_service` is responsible for handling data incoming from the master.
 mod master_service;

 pub use self::filesystem_service::FileSystemService;
 pub use self::intermediate_data_service::IntermediateDataService;
+pub use self::log_service::WorkerLogService;
 pub use self::master_service::ScheduleOperationService;

 use cerberus_proto::filesystem_grpc;
@@ -28,6 +31,7 @@ impl Server {
         port: u16,
         scheduler_service: ScheduleOperationService,
         interm_data_service: IntermediateDataService,
+        logs_service: WorkerLogService,
         filesystem_service: FileSystemService,
     ) -> Result<Server> {
         let mut server_builder = grpc::ServerBuilder::new_plain();
@@ -44,6 +48,10 @@ impl Server {
         server_builder.add_service(worker_grpc::IntermediateDataServiceServer::new_service_def(
             interm_data_service,
         ));
+        // Register WorkerLogService
+        server_builder.add_service(worker_grpc::WorkerLogServiceServer::new_service_def(
+            logs_service,
+        ));

         // Register FileSystemService
         server_builder.add_service(
From 1cd7130979de7bb569b51beb311e8f2bd6f29f37 Mon Sep 17 00:00:00 2001
From: Darragh Griffin
Date: Mon, 16 Apr 2018 13:48:22 +0100
Subject: [PATCH 36/58] Center worker logs view

---
 master/content/stylesheet.css | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/master/content/stylesheet.css b/master/content/stylesheet.css
index ba637114..81b5fb7e 100644
--- a/master/content/stylesheet.css
+++ b/master/content/stylesheet.css
@@ -304,7 +304,7 @@ a.exit-form {
     box-shadow: 3px 4px 4px 0 rgba(0, 0, 0, .3);
     height: 480px;
     left: 50%;
-    margin-left: -175px;
+    margin-left: -350px;
     min-height: 300px;
     position: fixed;
     top: 15%;
From f91d3b15088280be7d346e93c20db5103ce1706b Mon Sep 17 00:00:00 2001
From: Darragh Griffin
Date: Mon, 16 Apr 2018 13:55:27 +0100
Subject: [PATCH 37/58] Improve schedule form

---
 master/content/dashboard.js | 19 +++++++++++--------
 master/content/index.html  |  4 ++--
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/master/content/dashboard.js b/master/content/dashboard.js
index c05caa9f..428de529 100644
--- a/master/content/dashboard.js
+++ b/master/content/dashboard.js
@@ -213,6 +213,14 @@ function updateFunction() {
     updateMasterLog();
 }

+var scheduleFormToggled = false;
+
+function toggleScheduleForm() {
+    scheduleFormToggled = !scheduleFormToggled;
+    var scheduleForm = document.getElementById("schedule-form");
+    scheduleForm.style.visibility = scheduleFormToggled ? "visible" : "hidden";
+}
+
 function processScheduleMapReduceForm(e) {
     if (e.preventDefault) {
         e.preventDefault();
@@ -260,6 +268,9 @@ function processScheduleMapReduceForm(e) {
         dataType: "json",
         complete: function() {
             submitButton.val("Successfully scheduled");
+            if (scheduleFormToggled) {
+                toggleScheduleForm();
+            }
             restoreAnimation();
             updateFunction();
         }
@@ -268,14 +279,6 @@ function processScheduleMapReduceForm(e) {
     return false;
 }

-var scheduleFormToggled = false;
-
-function toggleScheduleForm() {
-    scheduleFormToggled = !scheduleFormToggled;
-    var scheduleForm = document.getElementById("schedule-form");
-    scheduleForm.style.visibility = scheduleFormToggled ? "visible" : "hidden";
-}
-
 $(document).ready(function() {
     var scheduleMapReduceForm = document.getElementById("schedule-job");
     if (scheduleMapReduceForm.attachEvent) {
diff --git a/master/content/index.html b/master/content/index.html
index a38a83ef..fbf82aa8 100644
--- a/master/content/index.html
+++ b/master/content/index.html
@@ -38,7 +38,7 @@

 [hunk garbled by extraction: a one-line change to the "×" close link on the
 Schedule MapReduce form]
@@ -75,7 +75,7 @@

 [hunk garbled by extraction: a one-line change to the "×" close link on the
 Worker Logs popup; the stray "Worker ID" fragment below is trailing context
 from this hunk]
Worker ID From a765843e32e4c9dc88040759e89f058655c3292f Mon Sep 17 00:00:00 2001 From: Darragh Griffin Date: Mon, 16 Apr 2018 14:31:35 +0100 Subject: [PATCH 38/58] Fix card layout --- master/content/stylesheet.css | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/master/content/stylesheet.css b/master/content/stylesheet.css index 81b5fb7e..7737e3f5 100644 --- a/master/content/stylesheet.css +++ b/master/content/stylesheet.css @@ -62,9 +62,10 @@ tr:hover { border-radius: 5px; box-shadow: 0 4px 8px 0 rgba(0, 0, 0, .2); color: #212121; - float: left; + display: inline-block; margin-bottom: 20px; margin-right: 20px; + text-align: center; transition: .3s; width: 45%; } @@ -80,6 +81,7 @@ tr:hover { .container-grid { display: auto; padding: 8px 16px; + text-align: left; } .lower { From 85f85569b5fd41d1b0bb10e609bca3a43e93ad9c Mon Sep 17 00:00:00 2001 From: Conor Griffin Date: Mon, 16 Apr 2018 15:04:33 +0100 Subject: [PATCH 39/58] Make overscheduled task priority lower --- master/src/worker_management/state.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/master/src/worker_management/state.rs b/master/src/worker_management/state.rs index 132887b5..0a2124b1 100644 --- a/master/src/worker_management/state.rs +++ b/master/src/worker_management/state.rs @@ -19,6 +19,7 @@ const MAX_TASK_ASSIGNMENT_FAILURE: u16 = 5; const DEFAULT_TASK_PRIORITY: u32 = 10; const FAILED_TASK_PRIORITY: u32 = 20; const REQUEUED_TASK_PRIORITY: u32 = 15; +const OVERSCHEDULED_TASK_PRIORITY: u32 = 9; // Max tasks to consider from the top of the task queue when trying to find the best task to assign. const MAX_TASKS_TO_CONSIDER: u32 = 5; @@ -337,7 +338,7 @@ impl State { self.priority_task_queue.push(PriorityTask::new( task_id.to_string(), - REQUEUED_TASK_PRIORITY * task.job_priority, + OVERSCHEDULED_TASK_PRIORITY * task.job_priority, )); Ok(()) From 2e8b3b00ef883c9ecf1fa3cce04a14d2874d9e4a Mon Sep 17 00:00:00 2001 From: Conor Griffin Date: Mon, 16 Apr 2018 22:11:54 +0100 Subject: [PATCH 40/58] Update typo in readme launch template details --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 13c689b1..718f4041 100644 --- a/README.md +++ b/README.md @@ -79,8 +79,8 @@ pip install boto3 | Template Name | Tag | |----|----| - | Master | Key: "type", Value: "worker" | - | Worker | Key: "type", Value: "master" | + | Master | Key: "type", Value: "master" | + | Worker | Key: "type", Value: "worker" | 6. **Deploy Instances** From d5cdd793b459451ae22ae8dabc416b6cb577e5ab Mon Sep 17 00:00:00 2001 From: Darragh Griffin Date: Mon, 16 Apr 2018 22:29:29 +0100 Subject: [PATCH 41/58] Change dashboard so cards aren't deleted and re-added --- master/content/dashboard.js | 297 +++++++++++++++++++++++++++++++----- 1 file changed, 258 insertions(+), 39 deletions(-) diff --git a/master/content/dashboard.js b/master/content/dashboard.js index 428de529..4b25be59 100644 --- a/master/content/dashboard.js +++ b/master/content/dashboard.js @@ -21,16 +21,18 @@ function createCard(parent, halfSize) { return table; } -function addProperty(name, value, container) { + +function addProperty(name, value, id, container) { var row = $("").appendTo(container); $("").text(name).appendTo(row); - $("").text(value).appendTo(row); + $("").text(value).attr("id", id).appendTo(row); } -function addButton(text, clickedFuncCreator, container) { +function addButton(text, clickedFuncCreator, id, container) { var button = $("
[The remainder of this section was garbled by extraction. The surviving
fragments are the rest of the PATCH 41/58 dashboard.js card helpers
(addProperty and addButton now take element IDs so existing cards can be
updated in place rather than deleted and re-added) and further hunks around
the "Cluster Dashboard", "Master Logs", and "Worker Logs" markup.]