You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Setting TOKENIZERS_PARALLELISM=false for forked processes.
Traceback (most recent call last):
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/xtuner/tools/train.py", line 360, in <module>
main()
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/xtuner/tools/train.py", line 356, in main
runner.train()
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/runner/runner.py", line 1728, in train
self._train_loop = self.build_train_loop(
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/runner/runner.py", line 1520, in build_train_loop
loop = LOOPS.build(
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build
return self.build_func(cfg, *args, **kwargs, registry=self)
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
obj = obj_cls(**args) # type: ignore
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/xtuner/engine/runner/loops.py", line 32, in __init__
dataloader = runner.build_dataloader(
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/runner/runner.py", line 1370, in build_dataloader
dataset = DATASETS.build(dataset_cfg)
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build
return self.build_func(cfg, *args, **kwargs, registry=self)
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
obj = obj_cls(**args) # type: ignore
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/xtuner/dataset/huggingface.py", line 308, in process_hf_dataset
dataset = process(**kwargs)
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/xtuner/dataset/huggingface.py", line 204, in process
assert {'input_ids', 'labels'}.issubset(dataset.column_names)
AssertionError
[rank1]:[E ProcessGroupGloo.cpp:144] Rank 1 successfully reached monitoredBarrier, but received errors while waiting for send/recv from rank 0. Please check rank 0 logs for faulty rank.
Traceback (most recent call last):
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/xtuner/tools/train.py", line 360, in <module>
main()
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/xtuner/tools/train.py", line 356, in main
runner.train()
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/runner/runner.py", line 1728, in train
self._train_loop = self.build_train_loop(
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/runner/runner.py", line 1520, in build_train_loop
loop = LOOPS.build(
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build
[rank2]:[E ProcessGroupGloo.cpp:144] Rank 2 successfully reached monitoredBarrier, but received errors while waiting for send/recv from rank 0. Please check rank 0 logs for faulty rank.
return self.build_func(cfg, *args, **kwargs, registry=self)
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
obj = obj_cls(**args) # type: ignore
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/xtuner/engine/runner/loops.py", line 32, in __init__
dataloader = runner.build_dataloader(
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/runner/runner.py", line 1370, in build_dataloader
dataset = DATASETS.build(dataset_cfg)
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build
return self.build_func(cfg, *args, **kwargs, registry=self)
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
obj = obj_cls(**args) # type: ignore
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/xtuner/dataset/huggingface.py", line 313, in process_hf_dataset
dist.monitored_barrier(group=group_gloo, timeout=xtuner_dataset_timeout)
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 3519, in monitored_barrier
return group_to_use.monitored_barrier(timeout, wait_all_ranks=wait_all_ranks)
RuntimeError: Rank 1 successfully reached monitoredBarrier, but received errors while waiting for send/recv from rank 0. Please check rank 0 logs for faulty rank.
Original exception:
[../third_party/gloo/gloo/transport/tcp/pair.cc:534] Connection closed by peer [127.0.1.1]:12167
Traceback (most recent call last):
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/xtuner/tools/train.py", line 360, in <module>
main()
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/xtuner/tools/train.py", line 356, in main
runner.train()
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/runner/runner.py", line 1728, in train
self._train_loop = self.build_train_loop(
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/runner/runner.py", line 1520, in build_train_loop
loop = LOOPS.build(
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build
return self.build_func(cfg, *args, **kwargs, registry=self)
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
obj = obj_cls(**args) # type: ignore
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/xtuner/engine/runner/loops.py", line 32, in __init__
dataloader = runner.build_dataloader(
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/runner/runner.py", line 1370, in build_dataloader
dataset = DATASETS.build(dataset_cfg)
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build
return self.build_func(cfg, *args, **kwargs, registry=self)
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
obj = obj_cls(**args) # type: ignore
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/xtuner/dataset/huggingface.py", line 313, in process_hf_dataset
dist.monitored_barrier(group=group_gloo, timeout=xtuner_dataset_timeout)
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 3519, in monitored_barrier
return group_to_use.monitored_barrier(timeout, wait_all_ranks=wait_all_ranks)
RuntimeError: Rank 2 successfully reached monitoredBarrier, but received errors while waiting for send/recv from rank 0. Please check rank 0 logs for faulty rank.
Original exception:
[../third_party/gloo/gloo/transport/tcp/pair.cc:534] Connection closed by peer [127.0.1.1]:12167
[rank3]:[E ProcessGroupGloo.cpp:144] Rank 3 successfully reached monitoredBarrier, but received errors while waiting for send/recv from rank 0. Please check rank 0 logs for faulty rank.
Traceback (most recent call last):
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/xtuner/tools/train.py", line 360, in <module>
main()
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/xtuner/tools/train.py", line 356, in main
runner.train()
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/runner/runner.py", line 1728, in train
self._train_loop = self.build_train_loop(
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/runner/runner.py", line 1520, in build_train_loop
loop = LOOPS.build(
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build
return self.build_func(cfg, *args, **kwargs, registry=self)
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
obj = obj_cls(**args) # type: ignore
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/xtuner/engine/runner/loops.py", line 32, in __init__
dataloader = runner.build_dataloader(
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/runner/runner.py", line 1370, in build_dataloader
dataset = DATASETS.build(dataset_cfg)
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build
return self.build_func(cfg, *args, **kwargs, registry=self)
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
obj = obj_cls(**args) # type: ignore
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/xtuner/dataset/huggingface.py", line 313, in process_hf_dataset
dist.monitored_barrier(group=group_gloo, timeout=xtuner_dataset_timeout)
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 3519, in monitored_barrier
return group_to_use.monitored_barrier(timeout, wait_all_ranks=wait_all_ranks)
RuntimeError: Rank 3 successfully reached monitoredBarrier, but received errors while waiting for send/recv from rank 0. Please check rank 0 logs for faulty rank.
Original exception:
[../third_party/gloo/gloo/transport/tcp/pair.cc:534] Connection closed by peer [127.0.0.1]:59082
[2024-10-08 19:59:55,691] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 3078333) of binary: /home/yihua/.conda/envs/xtuner/bin/python
Traceback (most recent call last):
File "/home/yihua/.conda/envs/xtuner/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
return f(*args, **kwargs)
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/torch/distributed/run.py", line 812, in main
run(args)
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/torch/distributed/run.py", line 803, in run
elastic_launch(
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 135, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
/home/yihua/.conda/envs/xtuner/lib/python3.10/site-packages/xtuner/tools/train.py FAILED
------------------------------------------------------------
Failures:
[1]:
time : 2024-10-08_19:59:55
host : server-4090x4
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 3078334)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2024-10-08_19:59:55
host : server-4090x4
rank : 2 (local_rank: 2)
exitcode : 1 (pid: 3078335)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2024-10-08_19:59:55
host : server-4090x4
rank : 3 (local_rank: 3)
exitcode : 1 (pid: 3078336)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2024-10-08_19:59:55
host : server-4090x4
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 3078333)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
The text was updated successfully, but these errors were encountered:
使用 xtuner 训练 llama3:8b 时遇到下面的错误。数据集扩大后就没有问题,但我现在需要在小数据集上训练。
The text was updated successfully, but these errors were encountered: