I encountered the following problem during stage 2 training.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1134, in _try_get_data
    data = self._data_queue.get(timeout=timeout)
  File "/opt/conda/lib/python3.8/multiprocessing/queues.py", line 107, in get
    if not self._poll(timeout):
  File "/opt/conda/lib/python3.8/multiprocessing/connection.py", line 257, in poll
    return self._poll(timeout)
  File "/opt/conda/lib/python3.8/multiprocessing/connection.py", line 424, in _poll
    r = wait([self], timeout)
  File "/opt/conda/lib/python3.8/multiprocessing/connection.py", line 936, in wait
    timeout = deadline - time.monotonic()
  File "/opt/conda/lib/python3.8/site-packages/torch/utils/data/_utils/signal_handling.py", line 66, in handler
    _error_if_any_worker_fails()
RuntimeError: DataLoader worker (pid 33959) is killed by signal: Killed.
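
The first traceback (a DataLoader worker killed by SIGKILL) usually means the operating system's OOM killer terminated a worker process because host RAM ran out. A minimal sketch of easing dataloader pressure in an mmdet/mmcv-style config; the exact keys and values here are assumptions, not UniAD's shipped settings:

```python
# Sketch only: fewer worker processes (and optionally a smaller batch) lowers
# host-RAM usage, the usual cause of
# "DataLoader worker ... is killed by signal: Killed".
data = dict(
    samples_per_gpu=1,   # batch size per GPU (standard mmdet config key)
    workers_per_gpu=2,   # fewer dataloader workers -> lower peak host memory
)
```
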
Traceback (most recent call last):
  File "/home/gpu4/UniAD/./tools/train.py", line 256, in <module>
    main()
  File "/home/gpu4/UniAD/./tools/train.py", line 245, in main
    custom_train_model(
  File "/home/gpu4/UniAD/projects/mmdet3d_plugin/uniad/apis/train.py", line 21, in custom_train_model
    custom_train_detector(
  File "/home/gpu4/UniAD/projects/mmdet3d_plugin/uniad/apis/mmdet_train.py", line 194, in custom_train_detector
    runner.run(data_loaders, cfg.workflow)
  File "/home/gpu4/mmcv/mmcv/runner/epoch_based_runner.py", line 136, in run
    epoch_runner(data_loaders[i], **kwargs)
  File "/home/gpu4/mmcv/mmcv/runner/epoch_based_runner.py", line 53, in train
    self.run_iter(data_batch, train_mode=True, **kwargs)
  File "/home/gpu4/mmcv/mmcv/runner/epoch_based_runner.py", line 31, in run_iter
    outputs = self.model.train_step(data_batch, self.optimizer,
  File "/home/gpu4/mmcv/mmcv/parallel/distributed.py", line 63, in train_step
    output = self.module.train_step(*inputs[0], **kwargs[0])
  File "/home/gpu4/miniconda3/envs/uniad2.0/lib/python3.9/site-packages/mmdet/models/detectors/base.py", line 248, in train_step
    losses = self(**data)
  File "/home/gpu4/miniconda3/envs/uniad2.0/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/gpu4/UniAD/projects/mmdet3d_plugin/uniad/detectors/uniad_e2e.py", line 81, in forward
    return self.forward_train(**kwargs)
  File "/home/gpu4/mmcv/mmcv/runner/fp16_utils.py", line 116, in new_func
    return old_func(*args, **kwargs)
  File "/home/gpu4/UniAD/projects/mmdet3d_plugin/uniad/detectors/uniad_e2e.py", line 187, in forward_train
    ret_dict_motion = self.motion_head.forward_train(bev_embed,
  File "/home/gpu4/UniAD/projects/mmdet3d_plugin/uniad/dense_heads/motion_head.py", line 137, in forward_train
    losses = self.loss(*loss_inputs)
  File "/home/gpu4/mmcv/mmcv/runner/fp16_utils.py", line 205, in new_func
    return old_func(*args, **kwargs)
  File "/home/gpu4/UniAD/projects/mmdet3d_plugin/uniad/dense_heads/motion_head.py", line 416, in loss
    gt_fut_traj_all, gt_fut_traj_mask_all = self.compute_matched_gt_traj(
  File "/home/gpu4/UniAD/projects/mmdet3d_plugin/uniad/dense_heads/motion_head.py", line 475, in compute_matched_gt_traj
    bboxes = track_bbox_results[i][0].tensor[valid_traj_masks]
RuntimeError: indices should be either on cpu or on the same device as the indexed tensor (cpu)
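
The second traceback is a plain device mismatch: `track_bbox_results[i][0].tensor` lives on the CPU while `valid_traj_masks` is a CUDA tensor, and PyTorch cannot index a CPU tensor with a CUDA boolean mask. One possible workaround is to move the mask onto the same device as the boxes before indexing; the helper below only illustrates that idea and is not the project's actual code:

```python
def select_valid_boxes(track_bbox_results, valid_traj_masks, i):
    """Illustrative fix for the failing line in compute_matched_gt_traj.

    The box tensor from the tracker may sit on the CPU while the mask was
    built on the GPU; aligning the mask with the indexed tensor's device
    avoids the "indices should be either on cpu or on the same device as
    the indexed tensor" RuntimeError.
    """
    boxes = track_bbox_results[i][0].tensor      # box tensor from the traceback (here on CPU)
    mask = valid_traj_masks.to(boxes.device)     # move the boolean mask to match it
    return boxes[mask]
```

The opposite direction (moving the boxes onto the mask's device with `.to(valid_traj_masks.device)`) should work just as well; which is preferable depends on where the downstream loss expects the boxes to live.
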