Error:
Tokenizing dataset: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 167/167 [00:00<00:00, 185.33it/s]
11/06 16:32:06 - mmengine - INFO - xtuner_dataset_timeout = 1:00:00
11/06 16:32:06 - mmengine - WARNING - Dataset PreferenceDataset has no metainfo. dataset_meta in visualizer will be None.
11/06 16:32:07 - mmengine - WARNING - "FileClient" will be deprecated in future. Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io
11/06 16:32:07 - mmengine - WARNING - "HardDiskBackend" is the alias of "LocalBackend" and the former will be deprecated in future.
11/06 16:32:07 - mmengine - INFO - Checkpoints will be saved to /root/xtuner_for_my/xtuner/tools/work_dirs/internlm2_chat_reward_qlora_with_load_v_head_and_finetune.
/root/miniconda3/envs/py38/lib/python3.8/site-packages/mmengine/optim/scheduler/param_scheduler.py:198: UserWarning: Detected call of scheduler.step() before optimizer.step(). In PyTorch 1.1.0 and later, you should call them in the opposite order: optimizer.step() before scheduler.step(). Failure to do this will result in PyTorch skipping the first value of the parameter value schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
warnings.warn(
11/06 16:32:12 - mmengine - INFO - Iter(train) [ 10/1670] lr: 1.8368e-05 eta: 0:13:07 time: 0.4747 data_time: 0.0173 memory: 2865 loss: 0.3013 acc: 1.0000 chosen_score_mean: 0.6914 rejected_score_mean: -1.2861 num_samples: 1.0000 num_tokens: 1001.0000
Traceback (most recent call last):
File "/root/xtuner_for_my/xtuner/tools/train.py", line 360, in
main()
File "/root/xtuner_for_my/xtuner/tools/train.py", line 356, in main
runner.train()
File "/root/miniconda3/envs/py38/lib/python3.8/site-packages/mmengine/runner/runner.py", line 1777, in train
model = self.train_loop.run() # type: ignore
File "/root/miniconda3/envs/py38/lib/python3.8/site-packages/mmengine/runner/loops.py", line 289, in run
self.run_iter(data_batch)
File "/root/miniconda3/envs/py38/lib/python3.8/site-packages/mmengine/runner/loops.py", line 313, in run_iter
outputs = self.runner.model.train_step(
File "/root/miniconda3/envs/py38/lib/python3.8/site-packages/mmengine/model/base_model/base_model.py", line 116, in train_step
optim_wrapper.update_params(parsed_losses)
File "/root/miniconda3/envs/py38/lib/python3.8/site-packages/mmengine/optim/optimizer/optimizer_wrapper.py", line 201, in update_params
self.step(**step_kwargs)
File "/root/miniconda3/envs/py38/lib/python3.8/site-packages/mmengine/optim/scheduler/param_scheduler.py", line 115, in wrapper
return wrapped(*args, **kwargs)
File "/root/miniconda3/envs/py38/lib/python3.8/site-packages/mmengine/optim/optimizer/amp_optimizer_wrapper.py", line 137, in step
self.loss_scaler.unscale_(self.optimizer)
File "/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/amp/grad_scaler.py", line 338, in unscale_
optimizer_state["found_inf_per_device"] = self.unscale_grads(
File "/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/amp/grad_scaler.py", line 279, in unscale_grads
torch.amp_foreach_non_finite_check_and_unscale(
RuntimeError: "amp_foreach_non_finite_check_and_unscale_cuda" not implemented for 'BFloat16'Tokenizing dataset: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 167/167 [00:00<00:00, 185.33it/s]
Environment:
python=3.8
The repo doesn't pin a torch version either, so maybe this isn't a torch issue?
Launch command: NPROC_PER_NODE=2 xtuner train ./internlm2_chat_reward_qlora_with_load_v_head_and_finetune.py
I modified reward.py myself; a rough sketch of the optim_wrapper pattern involved is at the end of this post.
Don't ask why I'm not using Python 3.10; the answer is I already tried it.
Is it just me being inexperienced, or is this framework really this hard to use? Is the compatibility really this bad?
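For reference, the optim_wrapper block I'm working from follows the usual xtuner reward-config pattern, roughly like the sketch below (field names are mmengine's AmpOptimWrapper; the concrete values are placeholders, not my exact file). My untested assumption is that loss_scale='dynamic' is what routes the update through GradScaler.unscale_(), and that either keeping everything in float16 or using a torch build whose unscale kernel supports bf16 would sidestep the error, but I haven't verified either.

```python
# Rough sketch only: approximate shape of the optim_wrapper section in an
# xtuner-style config. Values are placeholders; whether changing dtype or
# loss_scale actually avoids the bf16 unscale kernel is an assumption I have
# not verified.
from mmengine.optim import AmpOptimWrapper
from torch.optim import AdamW

optim_wrapper = dict(
    type=AmpOptimWrapper,
    optimizer=dict(type=AdamW, lr=2e-5, betas=(0.9, 0.999), weight_decay=0.01),
    clip_grad=dict(max_norm=1.0, error_if_nonfinite=False),
    accumulative_counts=1,
    loss_scale='dynamic',  # 'dynamic' pulls in GradScaler and its unscale_() step
    dtype='float16',       # with bfloat16 here (and bf16 grads) the unscale kernel is what fails, as far as I can tell
)
```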