Traceback (most recent call last):
  File "/app/train.py", line 45, in main
    mp.spawn(run_train.run_multiprocess, args=(ngpus, cfg, port), nprocs=ngpus, join=True)
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 239, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 197, in start_processes
    while not context.join():
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 160, in join
    raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:

-- Process 2 terminated with the following error:
Traceback (most recent call last):
  File "/app/model/rotary.py", line 48, in apply_rotary_pos_emb
    return flash_attn.layers.rotary.apply_rotary_emb_qkv_(
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/flash_attn/layers/rotary.py", line 233, in apply_rotary_emb_qkv_
    return ApplyRotaryEmbQKV_.apply(qkv, cos, sin, cos_k, sin_k, interleaved, seqlen_offsets)
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/autograd/function.py", line 506, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/flash_attn/layers/rotary.py", line 150, in forward
    qk = qkv[:, :, :2].reshape(batch, seqlen, -1, headdim)
RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1, 64] because the unspecified dimension size -1 can be any value and is ambiguous

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
    fn(i, *args)
  File "/app/run_train.py", line 83, in run_multiprocess
    _run(rank, world_size, cfg)
  File "/app/run_train.py", line 193, in _run
    loss = train_step_fn(state, batch)
  File "/app/losses.py", line 104, in step_fn
    loss = loss_fn(model, batch, cond=cond).mean() / accum
  File "/app/losses.py", line 29, in loss_fn
    log_score = log_score_fn(perturbed_batch, sigma)
  File "/app/model/utils.py", line 47, in score_fn
    score = model_fn(x, sigma)
  File "/app/model/utils.py", line 34, in model_fn
    return model(x, sigma)
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 748, in forward
    output = self._fsdp_wrapped_module(*args, **kwargs)
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/app/model/transformer.py", line 269, in forward
    x = self.blocks[i](x, rotary_cos_sin, c, seqlens=None)
  File "/miniconda/envs/myenv/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/app/model/transformer.py", line 169, in forward
    qkv = rotary.apply_rotary_pos_emb(
  File "/app/model/rotary.py", line 52, in apply_rotary_pos_emb
    return _apply_rotary_pos_emb_torchscript(qkv, cos, sin)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
  File "/app/model/rotary.py", line 40, in
    @torch.jit.script
    def _apply_rotary_pos_emb_torchscript(qkv, cos, sin):
        return (qkv * cos) + (rotate_half(qkv) * sin)
                ~~~~~~~~~ <--- HERE
RuntimeError: The size of tensor a (64) must match the size of tensor b (32) at non-singleton dimension 4
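For reference, the final size mismatch can be reproduced in isolation. The sketch below is not the repository's code; the batch size, sequence length, head count, and the rotate_half helper are assumptions, with only head_dim = 64 and the cos/sin width of 32 taken from the error message:

import torch

def rotate_half(x):
    # standard rotary helper, assumed to match the repo's rotate_half
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

# qkv: (batch, seqlen, 3, heads, head_dim); cos/sin broadcast over batch and heads
qkv = torch.randn(1, 1024, 3, 12, 64)
cos = torch.randn(1, 1024, 1, 1, 32)   # last dim 32, i.e. half of head_dim
sin = torch.randn(1, 1024, 1, 1, 32)

# Same expression as _apply_rotary_pos_emb_torchscript: qkv and cos cannot be
# broadcast at dimension 4 (64 vs. 32), which is exactly the error above.
out = (qkv * cos) + (rotate_half(qkv) * sin)

In other words, the cos/sin tensors that reach the TorchScript fallback are half the width of the head dimension.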
Running CUDA_LAUNCH_BLOCKING=1 python train.py ngpus=4 model=medium noise.type=loglinear graph.type=absorb training.accum=1 terminates with the identical traceback: process 2 again hits the zero-element reshape in flash_attn's apply_rotary_emb_qkv_, followed by the 64-vs-32 size mismatch in the TorchScript fallback.
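The first exception in the chain suggests the qkv tensor reaching flash_attn has zero elements, i.e. a sequence length of 0 on that rank. A hypothetical illustration; only the target shape [1, 0, -1, 64] comes from the error message, the rest is assumed:

import torch

# batch = 1, seqlen = 0, 3 (q, k, v), 12 heads (arbitrary), head_dim = 64
qkv = torch.empty(1, 0, 3, 12, 64)
batch, seqlen, _, _, headdim = qkv.shape

# Mirrors the reshape at flash_attn/layers/rotary.py line 150: with zero
# elements the -1 dimension cannot be inferred, hence the "ambiguous" error.
qk = qkv[:, :, :2].reshape(batch, seqlen, -1, headdim)

If that reading is right, the flash_attn failure is a symptom of an empty per-rank batch, and the TorchScript fallback then fails for the separate cos/sin width reason shown above.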