Training Error #5
Open
JamesWandy opened this issue Mar 29, 2024 · 1 comment

@JamesWandy

Running CUDA_LAUNCH_BLOCKING=1 python train.py ngpus=4 model=medium noise.type=loglinear graph.type=absorb training.accum=1 fails with the error below:

Traceback (most recent call last):
  File "/app/train.py", line 45, in main
    mp.spawn(run_train.run_multiprocess, args=(ngpus, cfg, port), nprocs=ngpus, join=True)
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 239, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 197, in start_processes
    while not context.join():
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 160, in join
    raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:

-- Process 2 terminated with the following error:
Traceback (most recent call last):
  File "/app/model/rotary.py", line 48, in apply_rotary_pos_emb
    return flash_attn.layers.rotary.apply_rotary_emb_qkv_(
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/flash_attn/layers/rotary.py", line 233, in apply_rotary_emb_qkv_
    return ApplyRotaryEmbQKV_.apply(qkv, cos, sin, cos_k, sin_k, interleaved, seqlen_offsets)
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/autograd/function.py", line 506, in apply
    return super().apply(*args, **kwargs) # type: ignore[misc]
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/flash_attn/layers/rotary.py", line 150, in forward
    qk = qkv[:, :, :2].reshape(batch, seqlen, -1, headdim)
RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1, 64] because the unspecified dimension size -1 can be any value and is ambiguous

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
    fn(i, *args)
  File "/app/run_train.py", line 83, in run_multiprocess
    _run(rank, world_size, cfg)
  File "/app/run_train.py", line 193, in _run
    loss = train_step_fn(state, batch)
  File "/app/losses.py", line 104, in step_fn
    loss = loss_fn(model, batch, cond=cond).mean() / accum
  File "/app/losses.py", line 29, in loss_fn
    log_score = log_score_fn(perturbed_batch, sigma)
  File "/app/model/utils.py", line 47, in score_fn
    score = model_fn(x, sigma)
  File "/app/model/utils.py", line 34, in model_fn
    return model(x, sigma)
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 748, in forward
    output = self._fsdp_wrapped_module(*args, **kwargs)
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/app/model/transformer.py", line 269, in forward
    x = self.blocks[i](x, rotary_cos_sin, c, seqlens=None)
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/app/model/transformer.py", line 169, in forward
    qkv = rotary.apply_rotary_pos_emb(
  File "/app/model/rotary.py", line 52, in apply_rotary_pos_emb
    return _apply_rotary_pos_emb_torchscript(qkv, cos, sin)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
  File "/app/model/rotary.py", line 40, in
    @torch.jit.script
    def _apply_rotary_pos_emb_torchscript(qkv, cos, sin):
        return (qkv * cos) + (rotate_half(qkv) * sin)
                ~~~~~~~~~ <--- HERE
RuntimeError: The size of tensor a (64) must match the size of tensor b (32) at non-singleton dimension 4
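
The two RuntimeErrors are chained: apply_rotary_pos_emb first calls flash_attn's fused apply_rotary_emb_qkv_, and only after that call raises does it fall back to _apply_rotary_pos_emb_torchscript, which then fails because cos/sin arrive with size 32 on their last dimension while qkv's head dimension is 64. A minimal sketch that reproduces the same broadcast failure (the shapes and names here are illustrative assumptions, not the repo's actual tensors):

import torch

# Assumed, illustrative shapes: qkv with head dimension 64, cos covering only
# 32 entries on its last dimension, matching the sizes in the error above.
batch, seqlen, nheads, headdim = 1, 128, 12, 64
qkv = torch.randn(batch, seqlen, 3, nheads, headdim)
cos = torch.randn(seqlen, headdim // 2)[:, None, None, :]  # last dim = 32

try:
    qkv * cos  # same elementwise product as (qkv * cos) in the TorchScript fallback
except RuntimeError as e:
    print(e)  # size mismatch: 64 vs 32 at the last non-singleton dimension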

@omerlux

omerlux commented Jan 30, 2025

Updating the flash-attn package fixed the issue for me.
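
For context, the traceback shows the flash_attn kernel raising first and the TorchScript path only running as a fallback, so a flash-attn version whose apply_rotary_emb_qkv_ handles these inputs avoids both errors. A quick way to check what is installed (a sketch; flash_attn exposes __version__ in recent releases):

import flash_attn

print(flash_attn.__version__)
# Upgrading is typically done with: pip install -U flash-attn --no-build-isolation
# (--no-build-isolation is the install flag the flash-attn README recommends).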
