Getting the below error when trying to run the Llama 2 70B benchmark as given in the link Here, with the latest Gaudi stack 1.17.0 and PyTorch version 2.3.1:
Parameter Offload: Total persistent parameters: 1318912 in 161 params
[rank0]: Traceback (most recent call last):
[rank0]:   File "/root/MLPERF/Intel-HabanaLabs/benchmarks/llm_finetune/scripts/train.py", line 249, in <module>
[rank0]:     main(script_args, training_args)
[rank0]:   File "/root/MLPERF/Intel-HabanaLabs/benchmarks/llm_finetune/scripts/train.py", line 231, in main
[rank0]:     trainer.train()
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/transformers/trainer.py", line 526, in train
[rank0]:     return inner_training_loop(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/transformers/trainer.py", line 701, in _inner_training_loop
[rank0]:     model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1220, in prepare
[rank0]:     result = self._prepare_deepspeed(*args)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/accelerate/accelerator.py", line 637, in _prepare_deepspeed
[rank0]:     engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/__init__.py", line 172, in initialize
[rank0]:     engine = DeepSpeedEngine(args=args,
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 308, in __init__
[rank0]:     self._configure_optimizer(optimizer, model_parameters)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1261, in _configure_optimizer
[rank0]:     self.optimizer = self._configure_zero_optimizer(basic_optimizer)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1595, in _configure_zero_optimizer
[rank0]:     optimizer = DeepSpeedZeroOptimizer_Stage3(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 313, in __init__
[rank0]:     self._create_fp16_partitions_with_defragmentation(self.trainable_param_groups)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 654, in _create_fp16_partitions_with_defragmentation
[rank0]:     param_groups: List[List[Parameter]] = tuple(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 655, in <genexpr>
[rank0]:     self._create_fp16_sub_groups(param_group["params"]) for param_group in fp16_param_groups)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 889, in _create_fp16_sub_groups
[rank0]:     params_group_numel = sum([param.partition_numel() for param in params_group])
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 889, in <listcomp>
[rank0]:     params_group_numel = sum([param.partition_numel() for param in params_group])
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/habana_frameworks/torch/core/weight_sharing.py", line 24, in __getattribute__
[rank0]:     return object.__getattribute__(self, name)
[rank0]: AttributeError: 'HabanaParameterWrapper' object has no attribute 'partition_numel'
[rank3]: Traceback (most recent call last):
[rank3]:   File "/root/MLPERF/Intel-HabanaLabs/benchmarks/llm_finetune/scripts/train.py", line 249, in <module>
[rank3]:     main(script_args, training_args)
[rank3]:   File "/root/MLPERF/Intel-HabanaLabs/benchmarks/llm_finetune/scripts/train.py", line 231, in main
[rank3]:     trainer.train()
[rank3]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/transformers/trainer.py", line 526, in train
[rank3]:     return inner_training_loop(
[rank3]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/transformers/trainer.py", line 701, in _inner_training_loop
[rank3]:     model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
[rank3]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1220, in prepare
[rank3]:     result = self._prepare_deepspeed(*args)
[rank3]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/accelerate/accelerator.py", line 637, in _prepare_deepspeed
[rank3]:     engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
[rank3]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/__init__.py", line 172, in initialize
[rank3]:     engine = DeepSpeedEngine(args=args,
[rank3]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 308, in __init__
[rank3]:     self._configure_optimizer(optimizer, model_parameters)
[rank3]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1261, in _configure_optimizer
[rank3]:     self.optimizer = self._configure_zero_optimizer(basic_optimizer)
[rank3]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1595, in _configure_zero_optimizer
[rank3]:     optimizer = DeepSpeedZeroOptimizer_Stage3(
[rank3]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 313, in __init__
[rank3]:     self._create_fp16_partitions_with_defragmentation(self.trainable_param_groups)
[rank3]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 654, in _create_fp16_partitions_with_defragmentation
[rank3]:     param_groups: List[List[Parameter]] = tuple(
[rank3]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 655, in <genexpr>
[rank3]:     self._create_fp16_sub_groups(param_group["params"]) for param_group in fp16_param_groups)
[rank3]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 889, in _create_fp16_sub_groups
[rank3]:     params_group_numel = sum([param.partition_numel() for param in params_group])
[rank3]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 889, in <listcomp>
[rank3]:     params_group_numel = sum([param.partition_numel() for param in params_group])
[rank3]:   File "/usr/local/lib/python3.10/dist-packages/habana_frameworks/torch/core/weight_sharing.py", line 24, in __getattribute__
[rank3]:     return object.__getattribute__(self, name)
[rank3]: AttributeError: 'HabanaParameterWrapper' object has no attribute 'partition_numel'
[rank5]: Traceback (most recent call last):
[rank5]:   File "/root/MLPERF/Intel-HabanaLabs/benchmarks/llm_finetune/scripts/train.py", line 249, in <module>
[rank5]:     main(script_args, training_args)
[rank5]:   File "/root/MLPERF/Intel-HabanaLabs/benchmarks/llm_finetune/scripts/train.py", line 231, in main
[rank5]:     trainer.train()
[rank5]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/transformers/trainer.py", line 526, in train
[rank5]:     return inner_training_loop(
[rank5]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/transformers/trainer.py", line 701, in _inner_training_loop
[rank5]:     model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
[rank5]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1220, in prepare
[rank5]:     result = self._prepare_deepspeed(*args)
[rank5]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/accelerate/accelerator.py", line 637, in _prepare_deepspeed
[rank5]:     engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
[rank5]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/__init__.py", line 172, in initialize
[rank5]:     engine = DeepSpeedEngine(args=args,
[rank5]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 308, in __init__
[rank5]:     self._configure_optimizer(optimizer, model_parameters)
[rank5]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1261, in _configure_optimizer
[rank5]:     self.optimizer = self._configure_zero_optimizer(basic_optimizer)
[rank5]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1595, in _configure_zero_optimizer
[rank5]:     optimizer = DeepSpeedZeroOptimizer_Stage3(
[rank5]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 313, in __init__
[rank5]:     self._create_fp16_partitions_with_defragmentation(self.trainable_param_groups)
[rank5]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 654, in _create_fp16_partitions_with_defragmentation
[rank5]:     param_groups: List[List[Parameter]] = tuple(
[rank5]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 655, in <genexpr>
[rank5]:     self._create_fp16_sub_groups(param_group["params"]) for param_group in fp16_param_groups)
[rank5]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 889, in _create_fp16_sub_groups
[rank5]:     params_group_numel = sum([param.partition_numel() for param in params_group])
[rank5]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 889, in <listcomp>
[rank5]:     params_group_numel = sum([param.partition_numel() for param in params_group])
[rank5]:   File "/usr/local/lib/python3.10/dist-packages/habana_frameworks/torch/core/weight_sharing.py", line 24, in __getattribute__
[rank5]:     return object.__getattribute__(self, name)
[rank5]: AttributeError: 'HabanaParameterWrapper' object has no attribute 'partition_numel'
[rank2]: Traceback (most recent call last):
[rank2]:   File "/root/MLPERF/Intel-HabanaLabs/benchmarks/llm_finetune/scripts/train.py", line 249, in <module>
[rank2]:     main(script_args, training_args)
[rank2]:   File "/root/MLPERF/Intel-HabanaLabs/benchmarks/llm_finetune/scripts/train.py", line 231, in main
[rank2]:     trainer.train()
[rank2]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/transformers/trainer.py", line 526, in train
[rank2]:     return inner_training_loop(
[rank2]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/transformers/trainer.py", line 701, in _inner_training_loop
[rank2]:     model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
[rank2]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1220, in prepare
[rank2]:     result = self._prepare_deepspeed(*args)
[rank2]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/accelerate/accelerator.py", line 637, in _prepare_deepspeed
[rank2]:     engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
[rank2]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/__init__.py", line 172, in initialize
[rank2]:     engine = DeepSpeedEngine(args=args,
[rank2]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 308, in __init__
[rank2]:     self._configure_optimizer(optimizer, model_parameters)
[rank2]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1261, in _configure_optimizer
[rank2]:     self.optimizer = self._configure_zero_optimizer(basic_optimizer)
[rank2]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1595, in _configure_zero_optimizer
[rank2]:     optimizer = DeepSpeedZeroOptimizer_Stage3(
[rank2]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 313, in __init__
[rank2]:     self._create_fp16_partitions_with_defragmentation(self.trainable_param_groups)
[rank2]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 654, in _create_fp16_partitions_with_defragmentation
[rank2]:     param_groups: List[List[Parameter]] = tuple(
[rank2]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 655, in <genexpr>
[rank2]:     self._create_fp16_sub_groups(param_group["params"]) for param_group in fp16_param_groups)
[rank2]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 889, in _create_fp16_sub_groups
[rank2]:     params_group_numel = sum([param.partition_numel() for param in params_group])
[rank2]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 889, in <listcomp>
[rank2]:     params_group_numel = sum([param.partition_numel() for param in params_group])
[rank2]:   File "/usr/local/lib/python3.10/dist-packages/habana_frameworks/torch/core/weight_sharing.py", line 24, in __getattribute__
[rank2]:     return object.__getattribute__(self, name)
[rank2]: AttributeError: 'HabanaParameterWrapper' object has no attribute 'partition_numel'
[rank4]: Traceback (most recent call last):
[rank4]:   File "/root/MLPERF/Intel-HabanaLabs/benchmarks/llm_finetune/scripts/train.py", line 249, in <module>
[rank4]:     main(script_args, training_args)
[rank4]:   File "/root/MLPERF/Intel-HabanaLabs/benchmarks/llm_finetune/scripts/train.py", line 231, in main
[rank4]:     trainer.train()
[rank4]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/transformers/trainer.py", line 526, in train
[rank4]:     return inner_training_loop(
[rank4]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/transformers/trainer.py", line 701, in _inner_training_loop
[rank4]:     model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
[rank4]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1220, in prepare
[rank4]:     result = self._prepare_deepspeed(*args)
[rank4]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/accelerate/accelerator.py", line 637, in _prepare_deepspeed
[rank4]:     engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
[rank4]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/__init__.py", line 172, in initialize
[rank4]:     engine = DeepSpeedEngine(args=args,
[rank4]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 308, in __init__
[rank4]:     self._configure_optimizer(optimizer, model_parameters)
[rank4]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1261, in _configure_optimizer
[rank4]:     self.optimizer = self._configure_zero_optimizer(basic_optimizer)
[rank4]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1595, in _configure_zero_optimizer
[rank4]:     optimizer = DeepSpeedZeroOptimizer_Stage3(
[rank4]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 313, in __init__
[rank4]:     self._create_fp16_partitions_with_defragmentation(self.trainable_param_groups)
[rank4]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 654, in _create_fp16_partitions_with_defragmentation
[rank4]:     param_groups: List[List[Parameter]] = tuple(
[rank4]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 655, in <genexpr>
[rank4]:     self._create_fp16_sub_groups(param_group["params"]) for param_group in fp16_param_groups)
[rank4]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 889, in _create_fp16_sub_groups
[rank4]:     params_group_numel = sum([param.partition_numel() for param in params_group])
[rank4]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 889, in <listcomp>
[rank4]:     params_group_numel = sum([param.partition_numel() for param in params_group])
[rank4]:   File "/usr/local/lib/python3.10/dist-packages/habana_frameworks/torch/core/weight_sharing.py", line 24, in __getattribute__
[rank4]:     return object.__getattribute__(self, name)
[rank4]: AttributeError: 'HabanaParameterWrapper' object has no attribute 'partition_numel'
[rank6]: Traceback (most recent call last):
[rank6]:   File "/root/MLPERF/Intel-HabanaLabs/benchmarks/llm_finetune/scripts/train.py", line 249, in <module>
[rank6]:     main(script_args, training_args)
[rank6]:   File "/root/MLPERF/Intel-HabanaLabs/benchmarks/llm_finetune/scripts/train.py", line 231, in main
[rank6]:     trainer.train()
[rank6]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/transformers/trainer.py", line 526, in train
[rank6]:     return inner_training_loop(
[rank6]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/transformers/trainer.py", line 701, in _inner_training_loop
[rank6]:     model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
[rank6]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1220, in prepare
[rank6]:     result = self._prepare_deepspeed(*args)
[rank6]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/accelerate/accelerator.py", line 637, in _prepare_deepspeed
[rank6]:     engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
[rank6]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/__init__.py", line 172, in initialize
[rank6]:     engine = DeepSpeedEngine(args=args,
[rank6]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 308, in __init__
[rank6]:     self._configure_optimizer(optimizer, model_parameters)
[rank6]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1261, in _configure_optimizer
[rank6]:     self.optimizer = self._configure_zero_optimizer(basic_optimizer)
[rank6]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1595, in _configure_zero_optimizer
[rank6]:     optimizer = DeepSpeedZeroOptimizer_Stage3(
[rank6]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 313, in __init__
[rank6]:     self._create_fp16_partitions_with_defragmentation(self.trainable_param_groups)
[rank6]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 654, in _create_fp16_partitions_with_defragmentation
[rank6]:     param_groups: List[List[Parameter]] = tuple(
[rank6]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 655, in <genexpr>
[rank6]:     self._create_fp16_sub_groups(param_group["params"]) for param_group in fp16_param_groups)
[rank6]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 889, in _create_fp16_sub_groups
[rank6]:     params_group_numel = sum([param.partition_numel() for param in params_group])
[rank6]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 889, in <listcomp>
[rank6]:     params_group_numel = sum([param.partition_numel() for param in params_group])
[rank6]:   File "/usr/local/lib/python3.10/dist-packages/habana_frameworks/torch/core/weight_sharing.py", line 24, in __getattribute__
[rank6]:     return object.__getattribute__(self, name)
[rank6]: AttributeError: 'HabanaParameterWrapper' object has no attribute 'partition_numel'
[rank7]: Traceback (most recent call last):
[rank7]:   File "/root/MLPERF/Intel-HabanaLabs/benchmarks/llm_finetune/scripts/train.py", line 249, in <module>
[rank7]:     main(script_args, training_args)
[rank7]:   File "/root/MLPERF/Intel-HabanaLabs/benchmarks/llm_finetune/scripts/train.py", line 231, in main
[rank7]:     trainer.train()
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/transformers/trainer.py", line 526, in train
[rank7]:     return inner_training_loop(
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/transformers/trainer.py", line 701, in _inner_training_loop
[rank7]:     model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1220, in prepare
[rank7]:     result = self._prepare_deepspeed(*args)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/accelerate/accelerator.py", line 637, in _prepare_deepspeed
[rank7]:     engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/__init__.py", line 172, in initialize
[rank7]:     engine = DeepSpeedEngine(args=args,
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 308, in __init__
[rank7]:     self._configure_optimizer(optimizer, model_parameters)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1261, in _configure_optimizer
[rank7]:     self.optimizer = self._configure_zero_optimizer(basic_optimizer)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1595, in _configure_zero_optimizer
[rank7]:     optimizer = DeepSpeedZeroOptimizer_Stage3(
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 313, in __init__
[rank7]:     self._create_fp16_partitions_with_defragmentation(self.trainable_param_groups)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 654, in _create_fp16_partitions_with_defragmentation
[rank7]:     param_groups: List[List[Parameter]] = tuple(
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 655, in <genexpr>
[rank7]:     self._create_fp16_sub_groups(param_group["params"]) for param_group in fp16_param_groups)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 889, in _create_fp16_sub_groups
[rank7]:     params_group_numel = sum([param.partition_numel() for param in params_group])
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 889, in <listcomp>
[rank7]:     params_group_numel = sum([param.partition_numel() for param in params_group])
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/habana_frameworks/torch/core/weight_sharing.py", line 24, in __getattribute__
[rank7]:     return object.__getattribute__(self, name)
[rank7]: AttributeError: 'HabanaParameterWrapper' object has no attribute 'partition_numel'
[rank1]: Traceback (most recent call last):
[rank1]:   File "/root/MLPERF/Intel-HabanaLabs/benchmarks/llm_finetune/scripts/train.py", line 249, in <module>
[rank1]:     main(script_args, training_args)
[rank1]:   File "/root/MLPERF/Intel-HabanaLabs/benchmarks/llm_finetune/scripts/train.py", line 231, in main
[rank1]:     trainer.train()
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/transformers/trainer.py", line 526, in train
[rank1]:     return inner_training_loop(
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/transformers/trainer.py", line 701, in _inner_training_loop
[rank1]:     model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1220, in prepare
[rank1]:     result = self._prepare_deepspeed(*args)
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/optimum/habana/accelerate/accelerator.py", line 637, in _prepare_deepspeed
[rank1]:     engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/__init__.py", line 172, in initialize
[rank1]:     engine = DeepSpeedEngine(args=args,
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 308, in __init__
[rank1]:     self._configure_optimizer(optimizer, model_parameters)
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1261, in _configure_optimizer
[rank1]:     self.optimizer = self._configure_zero_optimizer(basic_optimizer)
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1595, in _configure_zero_optimizer
[rank1]:     optimizer = DeepSpeedZeroOptimizer_Stage3(
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 313, in __init__
[rank1]:     self._create_fp16_partitions_with_defragmentation(self.trainable_param_groups)
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 654, in _create_fp16_partitions_with_defragmentation
[rank1]:     param_groups: List[List[Parameter]] = tuple(
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 655, in <genexpr>
[rank1]:     self._create_fp16_sub_groups(param_group["params"]) for param_group in fp16_param_groups)
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 889, in _create_fp16_sub_groups
[rank1]:     params_group_numel = sum([param.partition_numel() for param in params_group])
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 889, in <listcomp>
[rank1]:     params_group_numel = sum([param.partition_numel() for param in params_group])
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/habana_frameworks/torch/core/weight_sharing.py", line 24, in __getattribute__
[rank1]:     return object.__getattribute__(self, name)
[rank1]: AttributeError: 'HabanaParameterWrapper' object has no attribute 'partition_numel'
[2024-08-23 05:53:18,483] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 7697
[2024-08-23 05:53:18,581] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 7698
[2024-08-23 05:53:18,837] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 7699
[2024-08-23 05:53:19,093] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 7700
[2024-08-23 05:53:19,097] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 7701
[2024-08-23 05:53:19,194] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 7702
[2024-08-23 05:53:19,251] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 7703
[2024-08-23 05:53:19,254] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 7704
[2024-08-23 05:53:19,254] [ERROR] [launch.py:321:sigkill_handler] ['/usr/bin/python3', '-u', 'scripts/train.py', '--dataset_path', '/root/datasets/scrolls_gov_report_8k', '--model_path', '/root/model/llama2-70b-fused-qkv-mlperf', '--max_seq_len', '8192', '--bf16', 'True', '--logging_steps', '24', '--eval_steps', '48', '--save_steps', '999', '--output_dir', '/tmp/llama-70b', '--dataloader_num_workers', '4', '--per_device_train_batch_size', '1', '--gradient_accumulation_steps', '1', '--lr_scheduler_type', 'cosine', '--learning_rate', '4e-4', '--weight_decay', '0.0001', '--warmup_ratio', '0', '--max_grad_norm', '0.3', '--use_gradient_checkpointing', 'True', '--target_eval_loss', '0.925', '--use_peft_lora', 'True', '--lora_r', '16', '--lora_alpha', '32', '--lora_dropout', '0.1', '--max_steps', '1024', '--seed', '19034', '--use_habana', '--use_lazy_mode', '--deepspeed', 'configs/ds_zero3.json', '--evaluation_strategy', 'steps', '--gradient_checkpointing', 'True', '--cache_dir', '/tmp/datasets', '--adjust_throughput', '--flash_attention_fast_softmax_enable', 'true', '--flash_attention_recompute_enable', 'false', '--lora_target_modules', 'qkv_proj,o_proj', '--fp8'] exits with return code = 1
[ERROR|distributed_runner.py:222] 2024-08-23 05:53:20,189 >> deepspeed --num_nodes 1 --num_gpus 8 --no_local_rank --master_port 29500 scripts/train.py --dataset_path /root/datasets/scrolls_gov_report_8k --model_path /root/model/llama2-70b-fused-qkv-mlperf --max_seq_len 8192 --bf16 True --logging_steps 24 --eval_steps 48 --save_steps 999 --output_dir /tmp/llama-70b --dataloader_num_workers 4 --per_device_train_batch_size 1 --gradient_accumulation_steps 1 --lr_scheduler_type cosine --learning_rate 4e-4 --weight_decay 0.0001 --warmup_ratio 0 --max_grad_norm 0.3 --use_gradient_checkpointing True --target_eval_loss 0.925 --use_peft_lora True --lora_r 16 --lora_alpha 32 --lora_dropout 0.1 --max_steps 1024 --seed 19034 --use_habana --use_lazy_mode --deepspeed configs/ds_zero3.json --evaluation_strategy steps --gradient_checkpointing True --cache_dir /tmp/datasets --adjust_throughput --flash_attention_fast_softmax_enable true --flash_attention_recompute_enable false --lora_target_modules qkv_proj,o_proj --fp8 exited with status = 1
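For context: the failure happens while DeepSpeed ZeRO stage 3 sums `param.partition_numel()` over each parameter group. `partition_numel` is an attribute DeepSpeed attaches to parameters when it converts them during `deepspeed.initialize`, and here the parameters are `HabanaParameterWrapper` objects (from `habana_frameworks/torch/core/weight_sharing.py`) that do not expose it, which suggests a mismatch between the 1.17.0 Gaudi stack and the DeepSpeed/optimum-habana versions the MLPerf scripts were validated against. As a first step it may help to record the installed versions of the packages that appear in the traceback; a minimal sketch (package names are taken from the traceback paths, except `habana-torch-plugin`, which is assumed; the expected version pins are not part of this log):

```python
# Print installed versions of the packages seen in the traceback, to
# compare against the versions the MLPERF llm_finetune setup expects.
import importlib.metadata as md

packages = (
    "torch",
    "deepspeed",
    "optimum-habana",
    "accelerate",
    "transformers",
    "habana-torch-plugin",  # assumed distribution name for habana_frameworks.torch
)
for pkg in packages:
    try:
        print(f"{pkg}=={md.version(pkg)}")
    except md.PackageNotFoundError:
        print(f"{pkg}: not installed")
```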