Training Error #5
Open
JamesWandy opened this issue Mar 29, 2024 · 1 comment

@JamesWandy

Running CUDA_LAUNCH_BLOCKING=1 python train.py ngpus=4 model=medium noise.type=loglinear graph.type=absorb training.accum=1 fails with the error below:

Traceback (most recent call last):
  File "/app/train.py", line 45, in main
    mp.spawn(run_train.run_multiprocess, args=(ngpus, cfg, port), nprocs=ngpus, join=True)
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 239, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 197, in start_processes
    while not context.join():
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 160, in join
    raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:

-- Process 2 terminated with the following error:
Traceback (most recent call last):
  File "/app/model/rotary.py", line 48, in apply_rotary_pos_emb
    return flash_attn.layers.rotary.apply_rotary_emb_qkv_(
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/flash_attn/layers/rotary.py", line 233, in apply_rotary_emb_qkv_
    return ApplyRotaryEmbQKV_.apply(qkv, cos, sin, cos_k, sin_k, interleaved, seqlen_offsets)
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/autograd/function.py", line 506, in apply
    return super().apply(*args, **kwargs) # type: ignore[misc]
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/flash_attn/layers/rotary.py", line 150, in forward
    qk = qkv[:, :, :2].reshape(batch, seqlen, -1, headdim)
RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1, 64] because the unspecified dimension size -1 can be any value and is ambiguous

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
    fn(i, *args)
  File "/app/run_train.py", line 83, in run_multiprocess
    _run(rank, world_size, cfg)
  File "/app/run_train.py", line 193, in _run
    loss = train_step_fn(state, batch)
  File "/app/losses.py", line 104, in step_fn
    loss = loss_fn(model, batch, cond=cond).mean() / accum
  File "/app/losses.py", line 29, in loss_fn
    log_score = log_score_fn(perturbed_batch, sigma)
  File "/app/model/utils.py", line 47, in score_fn
    score = model_fn(x, sigma)
  File "/app/model/utils.py", line 34, in model_fn
    return model(x, sigma)
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 748, in forward
    output = self._fsdp_wrapped_module(*args, **kwargs)
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/app/model/transformer.py", line 269, in forward
    x = self.blocks[i](x, rotary_cos_sin, c, seqlens=None)
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/app/model/transformer.py", line 169, in forward
    qkv = rotary.apply_rotary_pos_emb(
  File "/app/model/rotary.py", line 52, in apply_rotary_pos_emb
    return _apply_rotary_pos_emb_torchscript(qkv, cos, sin)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
  File "/app/model/rotary.py", line 40, in
    @torch.jit.script
    def _apply_rotary_pos_emb_torchscript(qkv, cos, sin):
        return (qkv * cos) + (rotate_half(qkv) * sin)
                ~~~~~~~~~ <--- HERE
RuntimeError: The size of tensor a (64) must match the size of tensor b (32) at non-singleton dimension 4
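
The two RuntimeErrors are chained: apply_rotary_pos_emb first calls flash_attn's fused apply_rotary_emb_qkv_, and only after that call raises does it fall back to _apply_rotary_pos_emb_torchscript, which then fails because cos/sin arrive with size 32 on their last dimension while qkv's head dimension is 64. A minimal sketch that reproduces the same broadcast failure (the shapes and names here are illustrative assumptions, not the repo's actual tensors):

import torch

# Assumed, illustrative shapes: qkv with head dimension 64, cos covering only
# 32 entries on its last dimension, matching the sizes in the error above.
batch, seqlen, nheads, headdim = 1, 128, 12, 64
qkv = torch.randn(batch, seqlen, 3, nheads, headdim)
cos = torch.randn(seqlen, headdim // 2)[:, None, None, :]  # last dim = 32

try:
    qkv * cos  # same elementwise product as (qkv * cos) in the TorchScript fallback
except RuntimeError as e:
    print(e)  # size mismatch: 64 vs 32 at the last non-singleton dimension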

@omerlux

omerlux commented Jan 30, 2025

Updating the flash-attn package fixed the issue for me.
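
For context, the traceback shows the flash_attn kernel raising first and the TorchScript path only running as a fallback, so a flash-attn version whose apply_rotary_emb_qkv_ handles these inputs avoids both errors. A quick way to check what is installed (a sketch; flash_attn exposes __version__ in recent releases):

import flash_attn

print(flash_attn.__version__)
# Upgrading is typically done with: pip install -U flash-attn --no-build-isolation
# (--no-build-isolation is the install flag the flash-attn README recommends).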
