diff --git a/GPU.lua b/GPU.lua new file mode 100644 index 000000000..0efddd1d0 --- /dev/null +++ b/GPU.lua @@ -0,0 +1,267 @@ +------------------------------------------------------------------------ +--[[ GPU ]]-- +-- Decorates a module such that its parameters are +-- hosted on a specified GPU device. +-- The operations are also executed on that device. +-- Arguments input and gradOutput are converted to the specified device +-- before being fed to the decorated module. +-- Returned output is on the specified outdevice (defaults to device). +-- Returned gradInput is allocated on the same device as the input. +-- The unit test is located in cunn. +------------------------------------------------------------------------ +local GPU, parent = torch.class("nn.GPU", "nn.Container") + +function GPU:__init(module, device, outdevice) + parent.__init(self) + assert(torch.type(device) == 'number') + self.device = device + self.outdevice = outdevice or device + + assert(torch.isTypeOf(module, 'nn.Module')) + self.modules[1] = module + + if module:type() == 'torch.CudaTensor' then + self:cuda() + end +end + +-- set the device of the decorated module +function GPU:setDevice(device) + self.device = device or self.device + + local function recursiveModuleDevice(obj) + if type(obj) == 'table' and not torch.isTypeOf(obj, 'nn.GPU') then + for k,v in pairs(obj) do + obj[k] = recursiveModuleDevice(v) + end + elseif torch.type(obj):match('torch.Cuda.*Tensor') then + if obj:getDevice() ~= self.device then + obj = obj:clone() -- this will reallocate it to self.device + local newdevice = obj:getDevice() + -- when nElement() == 0 newdevice is 0 + assert(newdevice == self.device or newdevice == 0) + end + end + assert(obj ~= nil) + return obj + end + + assert(self.modules[1]) + self.modules[1] = cutorch.withDevice(self.device, function() return recursiveModuleDevice(self.modules[1]) end) + return self +end + +-- returns a dst that has device device for each element in src +function 
GPU:recursiveSetDevice(dst, src, device) + if torch.type(src) == 'table' then + dst = torch.type(dst) == 'table' and dst or {} + for k,v in ipairs(src) do + dst[k] = self:recursiveSetDevice(dst[k], v, device) + end + for k=#src+1,#dst do + dst[k] = nil + end + elseif torch.type(src):match('torch.Cuda.*Tensor') and src:getDevice() ~= device and src:getDevice() ~= 0 then + if not (torch.type(dst):match('torch.Cuda.*Tensor') and dst:getDevice() == device) then + dst = src.new() + end + dst:resizeAs(src):copy(src) + else + dst = src + end + return dst +end + +-- makes sure dst is identical to src but on the same device as proto +function GPU:recursiveSetDeviceAs(dst, src, proto) + if torch.type(src) == 'table' then + dst = torch.type(dst) == 'table' and dst or {} + for k,v in ipairs(src) do + dst[k] = self:recursiveSetDeviceAs(dst[k], v, proto[k]) + end + for k=#src+1,#dst do + dst[k] = nil + end + elseif torch.type(src):match('torch.Cuda.*Tensor') and src:getDevice() ~= proto:getDevice() and src:getDevice() ~= 0 then + local device = proto:getDevice() + if not (torch.type(dst):match('torch.Cuda.*Tensor') and dst:getDevice() == device) then + dst = src.new() + end + cutorch.withDevice(device, function() dst:resizeAs(src):copy(src) end) + else + dst = src + end + return dst +end + +function GPU:updateOutput(input) + if self._type == 'torch.CudaTensor' then + local output = cutorch.withDevice(self.device, function() + self._input = self:recursiveSetDevice(self._input, input, self.device) + return self.modules[1]:updateOutput(self._input) + end) + + if self.device ~= self.outdevice then + self.output = cutorch.withDevice(self.outdevice, function() + return self:recursiveSetDevice(self.output, output, self.outdevice) + end) + else + self.output = output + end + else + self.output = self.modules[1]:updateOutput(input) + end + + return self.output +end + +function GPU:updateGradInput(input, gradOutput) + if self._type == 'torch.CudaTensor' then + local gradInput = 
cutorch.withDevice(self.device, function() + self._gradOutput = self:recursiveSetDevice(self._gradOutput, gradOutput, self.device) + return self.modules[1]:updateGradInput(self._input, self._gradOutput) + end) + + self.gradInput = self:recursiveSetDeviceAs(self.gradInput, gradInput, input) + else + self.gradInput = self.modules[1]:updateGradInput(input, gradOutput) + end + + return self.gradInput +end + +function GPU:accGradParameters(input, gradOutput, scale) + if self._type == 'torch.CudaTensor' then + cutorch.withDevice(self.device, function() + self.modules[1]:accGradParameters(self._input, self._gradOutput, scale) + end) + else + self.modules[1]:accGradParameters(input, gradOutput, scale) + end +end + +function GPU:apply(callback) + if self._type == 'torch.CudaTensor' then + cutorch.withDevice(self.device, function() parent.apply(self, callback) end) + else + parent.apply(self, callback) + end +end + +function GPU:type(type, typecache) + if type and type == 'torch.CudaTensor' then + cutorch.withDevice(self.device, function() parent.type(self, type, typecache) end) + self:setDevice() + else + self.output = nil + self.gradInput = nil + self._input = nil + self._gradOutput = nil + parent.type(self, type, typecache) + end + return self +end + +function GPU:clearState() + self.output = nil + self.gradInput = nil + self._input = nil + self._gradOutput = nil + if self._type == 'torch.CudaTensor' then + cutorch.withDevice(self.device, function() parent.clearState(self) end) + else + parent.clearState(self) + end +end + +function GPU:zeroGradParameters() + if self._type == 'torch.CudaTensor' then + cutorch.withDevice(self.device, function() parent.zeroGradParameters(self) end) + else + parent.zeroGradParameters(self) + end +end + +function GPU:updateParameters(lr) + if self._type == 'torch.CudaTensor' then + cutorch.withDevice(self.device, function() parent.updateParameters(self, lr) end) + else + parent.updateParameters(self, lr) + end +end + +function GPU:training() 
+ if self._type == 'torch.CudaTensor' then + cutorch.withDevice(self.device, function() parent.training(self) end) + else + parent.training(self) + end +end + +function GPU:evaluate() + if self._type == 'torch.CudaTensor' then + cutorch.withDevice(self.device, function() parent.evaluate(self) end) + else + parent.evaluate(self) + end +end + +function GPU:share(mlp, ...) + local args = {...} + if self._type == 'torch.CudaTensor' then + cutorch.withDevice(self.device, function() parent.share(self, mlp, unpack(args)) end) + else + parent.share(self, mlp, unpack(args)) + end + return self +end + +function GPU:clone(...) + local args = {...} + if self._type == 'torch.CudaTensor' then + return cutorch.withDevice(self.device, function() parent.clone(self, unpack(args)) end) + else + return parent.clone(self, unpack(args)) + end +end + +function GPU:write(file) + -- Write all values in the object as a table. + local object = {} + for k, v in pairs(self) do + object[k] = v + end + local header = {self._type, self.device} + file:writeObject(header) + file:writeObject(object) +end + +function GPU:read(file) + local header = file:readObject() + local object + if header[1] == 'torch.CudaTensor' then + object = cutorch.withDevice(header[2], function() return file:readObject() end) + else + object = file:readObject() + end + + for k, v in pairs(object) do + self[k] = v + end +end + +function GPU:__tostring__() + if self.modules[1].__tostring__ then + return torch.type(self) .. '(' .. self.device ..') @ ' .. self.modules[1]:__tostring__() + else + return torch.type(self) .. '(' .. self.device ..') @ ' .. 
torch.type(self.modules[1]) + end +end + +function GPU:accUpdateGradParameters(input, gradOutput, lr) + error"Not Implemented" +end + +function GPU:sharedAccUpdateGradParameters(input, gradOutput, lr) + error"Not Implemented" +end diff --git a/doc/simple.md b/doc/simple.md index e29813c03..2cbef531d 100644 --- a/doc/simple.md +++ b/doc/simple.md @@ -51,6 +51,7 @@ Simple Modules are used for various tasks like adapting Tensor methods and provi * [Padding](#nn.Padding) : adds padding to a dimension ; * [L1Penalty](#nn.L1Penalty) : adds an L1 penalty to an input (for sparsity) ; * [GradientReversal](#nn.GradientReversal) : reverses the gradient (to maximize an objective function) ; + * [GPU](#nn.GPU) : decorates a module so that it can be executed on a specific GPU device. ## Linear ## @@ -1357,3 +1358,50 @@ One can also call: module:setLambda(lambda) ``` to set the hyper-parameter `lambda` dynamically during training. + + +## GPU ## + +```lua +gpu = nn.GPU(module, device, [outdevice]) +require 'cunn' +gpu:cuda() +``` + +Decorates an encapsulated `module` so that it can be executed on a specific GPU `device`. +The decorated module's `parameters` are thus hosted on the specified GPU `device`. +All operations on the `gpu` module are executed on that device. +Calls to `forward`/`backward` will transfer arguments `input` and `gradOutput` to the specified `device`, +which are then fed as arguments to the decorated `module`. +Returned `output` is located on the specified `outdevice` (defaults to `device`). +Returned `gradInput` is allocated on the same device as the `input`. + +When serialized/deserialized, the `gpu` module will be run on the same `device` that it was serialized with. 
+To prevent this from happening, the module can be converted to float/double before serialization: + +```lua +gpu:float() +gpustr = torch.serialize(gpu) +``` + +The module is located in the __nn__ package instead of __cunn__ as this allows +it to be used in CPU-only environments, which are common for production models. + +The module supports nested table `input` and `gradOutput` tensors originating from multiple devices. +Each nested tensor in the returned `gradInput` will be transferred to the device of its commensurate tensor in the `input`. + +The intended use-case is not for model-parallelism where the models are executed in parallel on multiple devices, but +for sequential models where a single GPU doesn't have enough memory. + +Example using 4 GPUs: + +```lua +mlp = nn.Sequential() + :add(nn.GPU(nn.Linear(10000,10000), 1)) + :add(nn.GPU(nn.Linear(10000,10000), 2)) + :add(nn.GPU(nn.Linear(10000,10000), 3)) + :add(nn.GPU(nn.Linear(10000,10000), 4, cutorch.getDevice())) +``` + +Note how the last `GPU` instance will return an `output` tensor on the same device as the current device (`cutorch.getDevice`). + diff --git a/init.lua b/init.lua index 516f29b19..94e92d415 100644 --- a/init.lua +++ b/init.lua @@ -124,6 +124,8 @@ require('nn.VolumetricMaxUnpooling') require('nn.VolumetricAveragePooling') require('nn.VolumetricBatchNormalization') +require('nn.GPU') + require('nn.ParallelTable') require('nn.Identity') require('nn.ConcatTable') diff --git a/test.lua b/test.lua index 4f0a3e89b..f78967438 100644 --- a/test.lua +++ b/test.lua @@ -6228,6 +6228,11 @@ function nntest.ErrorHandling() ) end +function nntest.GPU() + -- this is a placeholder to let you know that the nn.GPU unit test + -- is located in cunn package. +end + mytester:add(nntest) jac = nn.Jacobian