Description
What happened + What you expected to happen
(RayTrainWorker pid=342410) Process ForkServerProcess-8:
(RayTrainWorker pid=342410) Process ForkServerProcess-5:
(RayTrainWorker pid=342410) Traceback (most recent call last):
(RayTrainWorker pid=342410) Traceback (most recent call last):
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 802, in _callmethod
(RayTrainWorker pid=342410) conn = self._tls.connection
(RayTrainWorker pid=342410) AttributeError: 'ForkAwareLocal' object has no attribute 'connection'
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
(RayTrainWorker pid=342410) self.run()
(RayTrainWorker pid=342410)
(RayTrainWorker pid=342410) During handling of the above exception, another exception occurred:
(RayTrainWorker pid=342410)
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/process.py", line 108, in run
(RayTrainWorker pid=342410) self._target(*self._args, **self._kwargs)
(RayTrainWorker pid=342410) Traceback (most recent call last):
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
(RayTrainWorker pid=342410) raise exp
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
(RayTrainWorker pid=342410) func(*args, **kwargs)
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
(RayTrainWorker pid=342410) key, func_name, detail = resource_proxy[TASK_QUEUE].get()
(RayTrainWorker pid=342410) File "<string>", line 2, in get
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
(RayTrainWorker pid=342410) kind, result = conn.recv()
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/connection.py", line 250, in recv
(RayTrainWorker pid=342410) buf = self._recv_bytes()
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/connection.py", line 414, in _recv_bytes
(RayTrainWorker pid=342410) buf = self._recv(4)
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/connection.py", line 383, in _recv
(RayTrainWorker pid=342410) raise EOFError
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
(RayTrainWorker pid=342410) self.run()
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/process.py", line 108, in run
(RayTrainWorker pid=342410) self._target(*self._args, **self._kwargs)
(RayTrainWorker pid=342410) EOFError
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
(RayTrainWorker pid=342410) raise exp
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
(RayTrainWorker pid=342410) func(*args, **kwargs)
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 258, in task_distribute
(RayTrainWorker pid=342410) resource_proxy[SUB_PROCESS_STATE].append(True)
(RayTrainWorker pid=342410) File "<string>", line 2, in append
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 806, in _callmethod
(RayTrainWorker pid=342410) self._connect()
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 793, in _connect
(RayTrainWorker pid=342410) conn = self._Client(self._token.address, authkey=self._authkey)
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/connection.py", line 502, in Client
(RayTrainWorker pid=342410) c = SocketClient(address)
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/connection.py", line 630, in SocketClient
(RayTrainWorker pid=342410) s.connect(address)
(RayTrainWorker pid=342410) ConnectionRefusedError: [Errno 111] Connection refused
(RayTrainWorker pid=342410) Process ForkServerProcess-9:
(RayTrainWorker pid=342410) Traceback (most recent call last):
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 802, in _callmethod
(RayTrainWorker pid=342410) conn = self._tls.connection
(RayTrainWorker pid=342410) AttributeError: 'ForkAwareLocal' object has no attribute 'connection'
(RayTrainWorker pid=342410)
(RayTrainWorker pid=342410) During handling of the above exception, another exception occurred:
(RayTrainWorker pid=342410)
(RayTrainWorker pid=342410) Traceback (most recent call last):
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
(RayTrainWorker pid=342410) self.run()
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/process.py", line 108, in run
(RayTrainWorker pid=342410) self._target(*self._args, **self._kwargs)
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
(RayTrainWorker pid=342410) raise exp
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
(RayTrainWorker pid=342410) func(*args, **kwargs)
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 258, in task_distribute
(RayTrainWorker pid=342410) resource_proxy[SUB_PROCESS_STATE].append(True)
(RayTrainWorker pid=342410) File "", line 2, in append
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 806, in _callmethod
(RayTrainWorker pid=342410) self._connect()
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 793, in _connect
(RayTrainWorker pid=342410) conn = self._Client(self._token.address, authkey=self._authkey)
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/connection.py", line 502, in Client
(RayTrainWorker pid=342410) c = SocketClient(address)
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/connection.py", line 630, in SocketClient
(RayTrainWorker pid=342410) s.connect(address)
(RayTrainWorker pid=342410) ConnectionRefusedError: [Errno 111] Connection refused
2025-02-13 11:29:15,200 ERROR tune_controller.py:1331 -- Trial task failed for trial TorchTrainer_a00d4_00000
Traceback (most recent call last):
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
result = ray.get(future)
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
return fn(*args, **kwargs)
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
return func(*args, **kwargs)
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/ray/_private/worker.py", line 2755, in get
values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/ray/_private/worker.py", line 906, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): ray::_Inner.train() (pid=341574, ip=100.90.145.135, actor_id=f3f4c784db6968ca5dc6cfec02000000, repr=TorchTrainer)
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/ray/tune/trainable/trainable.py", line 331, in train
raise skipped from exception_cause(skipped)
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/ray/train/_internal/utils.py", line 57, in check_for_failure
ray.get(object_ref)
ray.exceptions.RayTaskError(RuntimeError): ray::_RayTrainWorker__execute.get_next() (pid=342410, ip=100.90.145.135, actor_id=e4b734452d806a35799576dd02000000, repr=<ray.train._internal.worker_group.RayTrainWorker object at 0xfffb5b315df0>)
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/ray/train/_internal/worker_group.py", line 33, in __execute
raise skipped from exception_cause(skipped)
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/ray/train/_internal/utils.py", line 176, in discard_return_wrapper
train_func(*args, **kwargs)
File "/home/ma-user/example/acge_text_embedding/train_ray.py", line 187, in train_loop_per_worker
trainer.train()
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/transformers/trainer.py", line 2171, in train
return inner_training_loop(
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/transformers/trainer.py", line 2330, in _inner_training_loop
model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/accelerate/accelerator.py", line 1263, in prepare
result = tuple(
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/accelerate/accelerator.py", line 1264, in <genexpr>
self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/accelerate/accelerator.py", line 1140, in _prepare_one
return self.prepare_model(obj, device_placement=device_placement)
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/accelerate/accelerator.py", line 1391, in prepare_model
model = torch.nn.parallel.DistributedDataParallel(
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 795, in __init__
_verify_param_shape_across_processes(self.process_group, parameters)
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/torch/distributed/utils.py", line 265, in _verify_param_shape_across_processes
return dist._verify_params_across_processes(process_group, tensors, logger)
RuntimeError: No backend type associated with device type npu
Training errored after 0 iterations at 2025-02-13 11:29:15. Total running time: 32s
Error file: /tmp/ray/session_2025-02-13_11-26-43_017359_326892/artifacts/2025-02-13_11-28-43/TorchTrainer_2025-02-13_11-28-42/driver_artifacts/TorchTrainer_a00d4_00000_0_2025-02-13_11-28-43/error.txt
(RayTrainWorker pid=342410) Process ForkServerProcess-4:
(RayTrainWorker pid=342410) Traceback (most recent call last):
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 802, in _callmethod
(RayTrainWorker pid=342410) conn = self._tls.connection
(RayTrainWorker pid=342410) AttributeError: 'ForkAwareLocal' object has no attribute 'connection'
(RayTrainWorker pid=342410)
(RayTrainWorker pid=342410) During handling of the above exception, another exception occurred:
(RayTrainWorker pid=342410)
(RayTrainWorker pid=342410) Traceback (most recent call last):
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
(RayTrainWorker pid=342410) self.run()
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/process.py", line 108, in run
(RayTrainWorker pid=342410) self._target(*self._args, **self._kwargs)
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
(RayTrainWorker pid=342410) raise exp
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
(RayTrainWorker pid=342410) func(*args, **kwargs)
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 258, in task_distribute
(RayTrainWorker pid=342410) resource_proxy[SUB_PROCESS_STATE].append(True)
(RayTrainWorker pid=342410) File "", line 2, in append
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 806, in _callmethod
(RayTrainWorker pid=342410) self._connect()
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 793, in _connect
(RayTrainWorker pid=342410) conn = self._Client(self._token.address, authkey=self._authkey)
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/connection.py", line 502, in Client
(RayTrainWorker pid=342410) c = SocketClient(address)
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/connection.py", line 630, in SocketClient
(RayTrainWorker pid=342410) s.connect(address)
(RayTrainWorker pid=342410) ConnectionRefusedError: [Errno 111] Connection refused
(RayTrainWorker pid=342410) Process ForkServerProcess-7:
(RayTrainWorker pid=342410) Traceback (most recent call last):
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 802, in _callmethod
(RayTrainWorker pid=342410) conn = self._tls.connection
(RayTrainWorker pid=342410) AttributeError: 'ForkAwareLocal' object has no attribute 'connection'
(RayTrainWorker pid=342410)
(RayTrainWorker pid=342410) During handling of the above exception, another exception occurred:
(RayTrainWorker pid=342410)
(RayTrainWorker pid=342410) Traceback (most recent call last):
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
(RayTrainWorker pid=342410) self.run()
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/process.py", line 108, in run
(RayTrainWorker pid=342410) self._target(*self._args, **self._kwargs)
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
(RayTrainWorker pid=342410) raise exp
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
(RayTrainWorker pid=342410) func(*args, **kwargs)
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 258, in task_distribute
(RayTrainWorker pid=342410) resource_proxy[SUB_PROCESS_STATE].append(True)
(RayTrainWorker pid=342410) File "", line 2, in append
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 806, in _callmethod
(RayTrainWorker pid=342410) self._connect()
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 793, in _connect
(RayTrainWorker pid=342410) conn = self._Client(self._token.address, authkey=self._authkey)
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/connection.py", line 502, in Client
(RayTrainWorker pid=342410) c = SocketClient(address)
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/connection.py", line 630, in SocketClient
(RayTrainWorker pid=342410) s.connect(address)
(RayTrainWorker pid=342410) ConnectionRefusedError: [Errno 111] Connection refused
(RayTrainWorker pid=342410) Process ForkServerProcess-10:
(RayTrainWorker pid=342410) Traceback (most recent call last):
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 802, in _callmethod
(RayTrainWorker pid=342410) conn = self._tls.connection
(RayTrainWorker pid=342410) AttributeError: 'ForkAwareLocal' object has no attribute 'connection'
(RayTrainWorker pid=342410)
(RayTrainWorker pid=342410) During handling of the above exception, another exception occurred:
(RayTrainWorker pid=342410)
(RayTrainWorker pid=342410) Traceback (most recent call last):
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
(RayTrainWorker pid=342410) self.run()
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/process.py", line 108, in run
(RayTrainWorker pid=342410) self._target(*self._args, **self._kwargs)
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/utils/common.py", line 103, in daemon_process
2025-02-13 11:29:15,212 INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/TorchTrainer_2025-02-13_11-28-42' in 0.0065s.
(RayTrainWorker pid=342410) running.set()
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 1081, in set
(RayTrainWorker pid=342410) return self._callmethod('set')
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 806, in _callmethod
(RayTrainWorker pid=342410) self._connect()
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 793, in _connect
(RayTrainWorker pid=342410) conn = self._Client(self._token.address, authkey=self._authkey)
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/connection.py", line 502, in Client
(RayTrainWorker pid=342410) c = SocketClient(address)
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/connection.py", line 630, in SocketClient
(RayTrainWorker pid=342410) s.connect(address)
(RayTrainWorker pid=342410) ConnectionRefusedError: [Errno 111] Connection refused
(RayTrainWorker pid=342410) Process ForkServerProcess-6:
(RayTrainWorker pid=342410) Traceback (most recent call last):
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 802, in _callmethod
2025-02-13 11:29:15,214 ERROR tune.py:1037 -- Trials did not complete: [TorchTrainer_a00d4_00000]
(RayTrainWorker pid=342410) conn = self._tls.connection
(RayTrainWorker pid=342410) AttributeError: 'ForkAwareLocal' object has no attribute 'connection'
(RayTrainWorker pid=342410)
(RayTrainWorker pid=342410) During handling of the above exception, another exception occurred:
(RayTrainWorker pid=342410)
(RayTrainWorker pid=342410) Traceback (most recent call last):
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
(RayTrainWorker pid=342410) self.run()
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/process.py", line 108, in run
(RayTrainWorker pid=342410) self._target(*self._args, **self._kwargs)
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
(RayTrainWorker pid=342410) raise exp
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
(RayTrainWorker pid=342410) func(*args, **kwargs)
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 258, in task_distribute
(RayTrainWorker pid=342410) resource_proxy[SUB_PROCESS_STATE].append(True)
(RayTrainWorker pid=342410) File "", line 2, in append
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 806, in _callmethod
(RayTrainWorker pid=342410) self._connect()
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 793, in _connect
(RayTrainWorker pid=342410) conn = self._Client(self._token.address, authkey=self._authkey)
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/connection.py", line 502, in Client
(RayTrainWorker pid=342410) c = SocketClient(address)
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/connection.py", line 630, in SocketClient
(RayTrainWorker pid=342410) s.connect(address)
(RayTrainWorker pid=342410) ConnectionRefusedError: [Errno 111] Connection refused
ray.exceptions.RayTaskError(RuntimeError): ray::_Inner.train() (pid=341574, ip=100.90.145.135, actor_id=f3f4c784db6968ca5dc6cfec02000000, repr=TorchTrainer)
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/ray/tune/trainable/trainable.py", line 331, in train
raise skipped from exception_cause(skipped)
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/ray/train/_internal/utils.py", line 57, in check_for_failure
ray.get(object_ref)
ray.exceptions.RayTaskError(RuntimeError): ray::_RayTrainWorker__execute.get_next() (pid=342410, ip=100.90.145.135, actor_id=e4b734452d806a35799576dd02000000, repr=<ray.train._internal.worker_group.RayTrainWorker object at 0xfffb5b315df0>)
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/ray/train/_internal/worker_group.py", line 33, in __execute
raise skipped from exception_cause(skipped)
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/ray/train/_internal/utils.py", line 176, in discard_return_wrapper
train_func(*args, **kwargs)
File "/home/ma-user/example/acge_text_embedding/train_ray.py", line 187, in train_loop_per_worker
trainer.train()
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/transformers/trainer.py", line 2171, in train
return inner_training_loop(
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/transformers/trainer.py", line 2330, in _inner_training_loop
model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/accelerate/accelerator.py", line 1263, in prepare
result = tuple(
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/accelerate/accelerator.py", line 1264, in <genexpr>
self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/accelerate/accelerator.py", line 1140, in _prepare_one
return self.prepare_model(obj, device_placement=device_placement)
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/accelerate/accelerator.py", line 1391, in prepare_model
model = torch.nn.parallel.DistributedDataParallel(
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 795, in __init__
_verify_param_shape_across_processes(self.process_group, parameters)
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/torch/distributed/utils.py", line 265, in _verify_param_shape_across_processes
return dist._verify_params_across_processes(process_group, tensors, logger)
RuntimeError: No backend type associated with device type npu
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/ma-user/example/acge_text_embedding/train_ray.py", line 245, in <module>
train_resnet(1)
File "/home/ma-user/example/acge_text_embedding/train_ray.py", line 239, in train_resnet
result = trainer.fit()
File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/ray/train/base_trainer.py", line 638, in fit
raise TrainingFailedError(
ray.train.base_trainer.TrainingFailedError: The Ray Train run failed. Please inspect the previous error messages for a cause. After fixing the issue (assuming that the error is not caused by your own application logic, but rather an error such as OOM), you can restart the run from scratch or continue this run.
To continue this run, you can use: trainer = TorchTrainer.restore("/root/ray_results/TorchTrainer_2025-02-13_11-28-42").
To start a new run that will retry on training failures, set train.RunConfig(failure_config=train.FailureConfig(max_failures)) in the Trainer's run_config, with max_failures > 0, or max_failures = -1 for unlimited retries.
(RayTrainWorker pid=342410) Process ForkServerProcess-3:
(RayTrainWorker pid=342410) Process ForkServerProcess-2:
(RayTrainWorker pid=342410) Traceback (most recent call last):
(RayTrainWorker pid=342410) Traceback (most recent call last):
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 802, in _callmethod
(RayTrainWorker pid=342410) conn = self._tls.connection
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 802, in _callmethod
(RayTrainWorker pid=342410) conn = self._tls.connection
(RayTrainWorker pid=342410) AttributeError: 'ForkAwareLocal' object has no attribute 'connection'
(RayTrainWorker pid=342410) AttributeError: 'ForkAwareLocal' object has no attribute 'connection'
(RayTrainWorker pid=342410)
(RayTrainWorker pid=342410) During handling of the above exception, another exception occurred:
(RayTrainWorker pid=342410)
(RayTrainWorker pid=342410)
(RayTrainWorker pid=342410) During handling of the above exception, another exception occurred:
(RayTrainWorker pid=342410)
(RayTrainWorker pid=342410) Traceback (most recent call last):
(RayTrainWorker pid=342410) Traceback (most recent call last):
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
(RayTrainWorker pid=342410) self.run()
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
(RayTrainWorker pid=342410) self.run()
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/process.py", line 108, in run
(RayTrainWorker pid=342410) self._target(*self._args, **self._kwargs)
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/process.py", line 108, in run
(RayTrainWorker pid=342410) self._target(*self._args, **self._kwargs)
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
(RayTrainWorker pid=342410) raise exp
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
(RayTrainWorker pid=342410) raise exp
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
(RayTrainWorker pid=342410) func(*args, **kwargs)
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
(RayTrainWorker pid=342410) func(*args, **kwargs)
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 258, in task_distribute
(RayTrainWorker pid=342410) resource_proxy[SUB_PROCESS_STATE].append(True)
(RayTrainWorker pid=342410) File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 258, in task_distribute
(RayTrainWorker pid=342410) resource_proxy[SUB_PROCESS_STATE].append(True)
(RayTrainWorker pid=342410) File "", line 2, in append
(RayTrainWorker pid=342410) File "", line 2, in append
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 806, in _callmethod
(RayTrainWorker pid=342410) self._connect()
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 806, in _callmethod
(RayTrainWorker pid=342410) self._connect()
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 793, in _connect
(RayTrainWorker pid=342410) conn = self._Client(self._token.address, authkey=self._authkey)
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/managers.py", line 793, in _connect
(RayTrainWorker pid=342410) conn = self._Client(self._token.address, authkey=self._authkey)
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/connection.py", line 502, in Client
(RayTrainWorker pid=342410) c = SocketClient(address)
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/connection.py", line 502, in Client
(RayTrainWorker pid=342410) c = SocketClient(address)
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/connection.py", line 630, in SocketClient
(RayTrainWorker pid=342410) s.connect(address)
(RayTrainWorker pid=342410) File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/multiprocessing/connection.py", line 630, in SocketClient
(RayTrainWorker pid=342410) s.connect(address)
(RayTrainWorker pid=342410) ConnectionRefusedError: [Errno 111] Connection refused
(RayTrainWorker pid=342410) ConnectionRefusedError: [Errno 111] Connection refused
[ERROR] 2025-02-13-11:29:15 (PID:340344, Device:-1, RankID:-1) ERR99999 UNKNOWN application exception
Versions / Dependencies
-- CANN 版本: 8.0.RC2
-- Pytorch版本:2.1.0
-- torch_npu:2.1.0.post6
-- Python 版本:3.9.21
-- 训练卡: 910
-- transformers:4.48.1
-- ray:2.40.0
Reproduction script
import os
from typing import Dict
from tempfile import TemporaryDirectory
import torch
import torch_npu
from filelock import FileLock
from torch import nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
from datasets import Dataset
from tqdm import tqdm
from transformers import (
Trainer,
TrainingArguments,
EvalPrediction,
AutoModelForSequenceClassification,
AutoTokenizer
)
import numpy as np
import ray
import ray.train as train
from ray.train import ScalingConfig, Checkpoint
from ray.train.torch import TorchTrainer
from ray.train.torch import TorchConfig
from ray.runtime_env import RuntimeEnv
import ray.train.huggingface.transformers
import pandas as pd
import os
os.environ['ASCEND_RT_VISIBLE_DEVICES'] = '0'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# 本地模型 (local model)
os.environ["TRANSFORMERS_OFFLINE"] = "1"
# 本地数据集 (local dataset)
os.environ["HF_DATASETS_OFFLINE"] = "1"
os.environ["HF_EVALUATE_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"
device = 'npu:3'
import habana_frameworks.torch.core as htcore
def train_loop_per_worker(configs):
    """Per-worker training loop executed by Ray Train's TorchTrainer.

    Builds a tiny in-memory text-classification dataset, tokenizes it with a
    local BERT-style tokenizer, and fine-tunes a sequence-classification model
    via the Hugging Face Trainer, reporting back to Ray Train through
    RayTrainReportCallback.

    Args:
        configs: per-worker config dict passed by Ray Train (unused here —
            TorchTrainer is constructed without train_loop_config).
    """
    #
    print(ray.train.torch.get_device())
    # Datasets
    # Path to a locally mirrored model/tokenizer (offline mode is forced above).
    model_path = '/home/ma-user/ceshi/acge_text_embedding'
    os.makedirs(model_path, exist_ok=True)
    # os.system(f"mc mirror --overwrite --remove oss/ray-model/bert-base-chinese {model_path}")
    #os.makedirs(MODEL_PATH, exist_ok=True)
    #os.system(f"mc mirror --overwrite --remove oss/ray-model/{pretrain_model} {MODEL_PATH}")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # NOTE(review): dataset_path is assigned three times; only the last value
    # survives, and it is never read because the CSV load below is commented out.
    dataset = 'ray-dataset/test.csv'
    dataset_name = dataset.split('/')[-1]
    dataset_path = os.path.join('/home/ma-user/example/acge_text_embedding/datasets', dataset_name)
    dataset_path = 'test.csv'
    dataset_path = '/home/ma-user/example/acge_text_embedding/datasets/test.csv'
    # os.system(f"mc mirror --overwrite --remove oss/npu/test.csv {dataset_path}")
    # Inline sample data: customer-service call transcripts with a binary
    # "是"/"否" (yes/no) label.
    data = [
        {
            "text": "客服:您好很高兴为您服务。用户:你帮我查一下我这个号下面有有宽带没有,。客服:就那个副卡副卡里面是没有宽带的它的主卡里面有个宽带。用户:我主卡里那个宽带是在哪里呢。客服:主卡宽带在哪里是吗我看一下。客服:在泉州的显示的话。用户:深圳哪个地方。客服:漳州市只能在看才显示在城区。客服:具体哪个位置我看不到了。用户:是不是那个b栋22309。客服:我来看一下。客服:嗯行你把你把那个具体位置告诉我我帮你核对一下。用户:国庆北路苏宁易购b栋2309。客服:嗯。客服:国基北路对了。客服:然后新世界的苏宁苏宁易购,。用户:那为什么这个宽带用不了了你看看能不能帮我去看一下可以吗。客服:肯定用不了了吧。客服:嗯这个问题什么时候出现的。用户:这个什么意思。用户:呃不知道我因为我出差了在外地待了大0.5年回去就用不了。客服:我看它显示的话是你看它那个光猫那边是不是亮红灯了呢。用户:嗯我不在家你派个人去看家里的老人家在家他们不会弄。客服:啊。客服:哦行那我这边帮你安排个师傅上门帮你查看一下。用户:好好谢谢。客服:那到时候我们上门的话那个电话是联系哪个号码。用户:我报给你行吗。客服:嗯可以。用户:185。客服:185。用户:2821。客服:然后呢。用户:6967。客服:18528216967就联系这个号码是吗。用户:对嗯。客服:嗯好的先生那我这边去帮您做登记我帮您安排个师傅上门帮您检查一下。用户:好谢谢。客服:嗯好的不客气还有什么查询的吗。用户:嗯不用了。",
            "label": "是"
        },
        {
            "text": "客服:您好很高兴为您服务。用户:啊喂你好。用户:我打扰一下我想问。用户:我如果把那个他呃我。用户:网络加高啊加到一千兆我要是怎么搞,。客服:1000M是吗。客服:哦我这边帮您看一下。客服:嗯对如果要加到1000M的话您这边稍后就可以办理一个。客服:宽带提速包或者是也可以。客服:直接去申请更改套餐。用户:我知道不是我手机这个流量加了1000M的是我。用户:我家里的网呢。客服:嗯就是宽带嘛你介绍的话都可以去办理一个宽带提速包。客服:就带了1件罩。用户:呃多少钱一个月。客服:我看一下提速包多少钱一个月。客服:请稍等一下。用户:嗯。客服:呃先生您好这边为您查询到的话他这边是二十块钱一个月,。用户:多少。客服:20块钱。用户:20块钱一个月。客服:是的。用户:那个无线网。客服:20块钱一个月。用户:到哪里到哪里办呢。客服:他这个的话他这个的话如果要办理这个提速包的话他这边。客服:要前往那个营业厅办理啊。用户:我等于我是。用户:我的网是在抚州这边查询这边我到优先办可不可以。客服:有线。用户:等于我是属于塔顶管啊等于。客服:还是。客服:苏州是吗。用户:对对对株洲。客服:苏州市内的话都可以呀。用户:全国统1只要是湖南省的都可以是吧。客服:嗯这边现在如果是归属地的营业厅都可以呀。用户:等于20块钱一个月我其他费用。用户:话费一起等于就是,。用户:这个。客服:这边帮您承诺的话就是20块钱一个月。用户:啊啊。用户:本1吗。客服:是的。用户:啊好了好了好了。用户:嗯他这个1000M的无线网是读数的还是。用户:九百九百九十多了。客服:就1000M就是1000M的。用户:就是1000多的。用户:他这个以前那我想问啊因为我带几个摄像头啊还是带的动吧。客服:带的动啊。用户:大概。用户:10个10个摄像头左右。客服:但是1000M的话它的那个网速也是比较快的。用户:问题就是我跟你说说问题就是我们这边服务不怎么样知道不嗯。用户:我这个老是掉网老是掉网三天两头就掉网了。客服:那您这边稍等您这边是。客服:嗯然后续的话留意一下看一下他这边的话就是。客服:呃1000M的话这边就是25块钱。用户:等于加20块钱一个月就可以了。客服:对对对。用户:呃路由器应该不用换吧。客服:那到时候的话。客服:路由器的话我看一下。客服:那到时候可能呃您这边办理成功后可能。客服:呃到时候会存在那个荧光猫或路由器机顶盒等。客服:这些关。客服:的速率不匹配。客服:或不保留呃或导致不达标。用户:喂。用户:等于不达标不达标的话那要怎么搞呢。客服:就是他要。客服:更换一下那个终端设备。用户:呃那个路由器还有光猫还有什么都要换啊。客服:嗯是的。用户:这2个都要换是吧。客服:对。用户:嗯然后外面的线要不要重新搞了。客服:先不用。用户:先不用只是把房间里面那个路由器跟。用户:那些东西都要换。客服:是的。用户:好了好了那谢谢了麻烦了。客服:那先生请问还有什么可以帮您。用户:啊那没有了没有了。客服:啊那请您不要挂机稍后听到语音评价满意请帮忙按一个1。客服:谢谢您麻烦您了再见。用户:好的好的好的。",
            "label": "是"
        },
        {
            "text": "客服:您好很高兴为您服务。用户:喂你好我现在有个彩铃你们。用户:帮我取消没有啊。客服:嗯我看一下哈稍等一下。用户:我以前没有办那个彩铃你们怎么给我办那个彩铃呢。用户:我没有自己没有办宽带。客服:嗯确实很抱歉给您带来不便了这边查看到的话呢是有一个那个视频彩铃电商会员是吧这个业务已经帮您退订了取消掉了。用户:那那个彩铃是彩铃是多彩铃是多少的。客服:29元。用户:哦。客服:嗯已经帮您退订了这个业务取消掉了的。用户:是什么时候给我弄弄的彩铃呀。客服:这业务的话呢是10016外呼帮你办理的你有没有接到他们的电话统1订购这个业务。用户:啊那个电那个电话我没有接到啊。客服:110016帮您办的。客服:那现在已经帮您取消掉了取消掉了。用户:就。用户:可能就是。用户:10016嗯。客服:对。用户:是嗯是那个是开始办卡的时候。用户:Forthepass。客服:嗯什么。用户:我说是不是开始办这个联通的卡是不是。用户:就是嗯那个110016给我搞的这个彩铃。客服:不是你这个彩铃是11月1号才生效就是这个月才生效不是办卡的时候办的。客服:你这个办办卡的是8月份吗。客服:好的已经退订了不会再扣费了还有其他可以帮您的吗。用户:啊。用户:嗯好没有了。客服:嗯好的那感谢来电稍后听到语音评价之后帮忙按个1评价可以吗谢谢您。客服:妈妈按一个1谢谢你。",
            "label": "否"
        },
        {
            "text": "客服:很高兴为您服务您好。用户:喂你好我想问一下这个号码可以申请那个宽带业务。客服:呃您这个号码的话嗯目前您这个套餐类型它是不支持安装宽带的哦如果要装宽带的话可能需要换套餐之类的。用户:换换哪种套餐呢。客服:比如说是冰淇淋那种业务之类的比如说要那种什么呃99元之类的那种套餐。用户:那99元以上的套餐是吧。客服:对99和99以上那种套餐才实时宽带的。用户:嗯那如果如果安装那个宽带是有要有额外要。用户:有额外费用。客服:要一个安装费用的。用户:一个安装费,。客服:对。用户:那个光猫有吗。客服:光猫的话有1些地市他那边是支持就是免费给您配送配送完就是你这边会不用宽带的时候要还回去那设备的但具体的话要看当地那边营业厅的。用户:去当地营业厅办理的。客服:如果说嗯对如果说你这边要需要了解的话我可以给你发个链接你填一下当地位置。客服:然后当地那边给您回电的帮您介绍详细内容。客服:就没有说一定要办理只是说给你回电来介绍。用户:嗯可以可以。客服:嗯我给您发个链接吧稍后电话结束我看一下信息然后点一下提交一下当地位置。客服:让当地那边营业厅跟你联系。用户:好的。客服:嗯好那您看还有其他业务需要咨询的吗这边稍后给您发过去。用户:嗯谢谢。客服:好的嗯稍后也想请您帮忙按个1给我这边个人服务态度一个满意好评谢谢您了啊给您报修了看一下信息。",
            "label": "否"
        },
        {
            "text": "客服:很高兴为您服务您好。用户:您好。用户:麻烦我问一下现在装宽带是怎么装的。客服:呃如果说目前要了解宽带装安装宽带的话我可以给您发个链接您提交一下后面当地那边给您回电来进行介绍办理。用户:但是我可以在联通那上面下单但是我不知道现在。用户:装宽带。用户:光猫要不要钱啊。客服:光猫如果说您都没有的话看当地地市游戏是免费的但有1些的话它是需要交那些押金之类的。客服:详情的话呃要了解到我可以给您发链接1样的你提交一下我们当地那边给您回电介绍没没有。",
            "label": "是"
        }
    ]
    # Define the column names (optional, but recommended)
    columns = ['text', 'label']
    # Convert the list of dicts into a DataFrame
    df = pd.DataFrame(data, columns=columns)
    # Show the DataFrame
    print(df)
    # Label handling: map raw labels ("是"/"否") to string class IDs.
    #df = pd.read_csv(dataset_path)
    df, num_labels, label_to_id, id_to_label = convert_label(df)
    # df.to_csv(dataset_path,index=False)
    # Only CSV is supported for now
    raw_datasets = Dataset.from_pandas(df)
    # device = torch.device("npu")
    def tokenize_function(examples):
        # Batched tokenizer: pads/truncates every example to 512 tokens and
        # casts the string label IDs back to ints for the Trainer.
        # result = tokenizer(examples["text"], padding="max_length", truncation=True)
        # result["label"] = [int(l) for l in examples["label"]]
        result = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
        # NOTE(review): these tensors are built only for the debug print below;
        # the returned dict carries plain Python lists.
        input_ids = torch.tensor(result.input_ids)
        token_type_ids = torch.tensor(result.token_type_ids)
        attention_mask = torch.tensor(result.attention_mask)
        # # result["label"] = [(label_to_id[str(l[0])] if l != -1 else -1) for l in examples["label"]]
        # # Make sure the label_to_id dict can handle special values such as -1
        # # (commonly meaning "no label"); here -1 would be kept as -1, unconverted.
        result["label"] = [int(l) for l in examples["label"]] # assumes each label is a numeric-string ID
        # # result["label"] = examples["label"]
        label = torch.tensor(result.label)
        result2 = {}
        result2['label'] = result["label"] # label
        result2['input_ids'] = result.input_ids
        result2['token_type_ids'] = result.token_type_ids
        result2['attention_mask'] = result.attention_mask
        print(label)
        return result2
    # small_train_dataset = (
    # raw_datasets["train"].select(range(600)).map(tokenize_function, batched=True)
    # )
    # small_eval_dataset = (
    # raw_datasets["test"].select(range(200)).map(tokenize_function, batched=True)
    # )
    # NOTE(review): train and eval reuse the same 5-example dataset — fine for a
    # repro script, not for real evaluation.
    small_train_dataset = raw_datasets.map(tokenize_function, batched=True)
    # print(type(dataset_train['label'][0]))
    small_eval_dataset = raw_datasets.map(tokenize_function, batched=True)
    small_train_dataset = small_train_dataset.remove_columns(['text'])
    # Model
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path, num_labels=num_labels, ignore_mismatched_sizes=True
    )
    # Evaluation Metrics
    # metric = evaluate.load("accuracy")
    def compute_metrics(eval_pred):
        # Placeholder metric: always reports f1=1 (predictions are computed
        # but unused).
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return {"f1": 1}
    # Hugging Face Trainer
    training_args = TrainingArguments(
        output_dir="test_trainer",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        report_to="none",
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        num_train_epochs=1,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        compute_metrics=compute_metrics,
    )
    # model.to(device)
    # [2] Report Metrics and Checkpoints to Ray Train
    # ===============================================
    callback = ray.train.huggingface.transformers.RayTrainReportCallback()
    trainer.add_callback(callback)
    # [3] Prepare Transformers Trainer
    # ================================
    trainer = ray.train.huggingface.transformers.prepare_trainer(trainer)
    # Start Training
    trainer.train()
def convert_label(df):
    """Replace raw label values with string class IDs, in place.

    IDs are assigned in first-seen order over the column's unique values:
    the first distinct label becomes "0", the next "1", and so on.

    Args:
        df: DataFrame with a 'label' column; mutated in place.

    Returns:
        Tuple of (df, number of distinct labels, label->id dict, id->label dict).
    """
    unique_labels = list(df['label'].unique())
    label_to_id = {lab: str(idx) for idx, lab in enumerate(unique_labels)}
    id_to_label = {str(idx): lab for idx, lab in enumerate(unique_labels)}
    # Rewrite the column through the mapping (IDs stay strings, as callers
    # downstream do int(l) themselves).
    df['label'] = df['label'].map(label_to_id)
    return df, len(unique_labels), label_to_id, id_to_label
@ray.remote(resources={"NPU":1})
def train_resnet(num_workers=1):
    """Ray remote task that launches a TorchTrainer run on NPU workers.

    Despite the name, it runs `train_loop_per_worker` (a transformers
    fine-tuning loop), not a ResNet.

    Args:
        num_workers: number of Ray Train workers to request.
    """
    # NOTE(review): global_batch_size and train_loop_config are dead code —
    # train_loop_config is commented out in the TorchTrainer call below.
    global_batch_size = 16
    train_loop_config = {
        "input_size": 224, # Input image size (224 x 224)
        "batch_size": 32, # Batch size for training
        "num_epochs": 10, # Number of epochs to train for
        "lr": 0.001, # Learning Rate
        "momentum": 0.9, # SGD optimizer momentum
    }
    # Configure computation resources
    # In ScalingConfig, require an HPU for each worker
    # , resources_per_worker={"CPU": 10, "NPU": 1}
    scaling_config = ScalingConfig(num_workers=num_workers,use_gpu=False,resources_per_worker={"CPU": 10, "NPU": 1})
    # Set backend to hccl in TorchConfig
    #torch_config = TorchConfig(backend = "hccl")
    #ray.init(address='172.17.0.6:6379')
    # Initialize a Ray TorchTrainer
    trainer = TorchTrainer(
        train_loop_per_worker=train_loop_per_worker,
        #train_loop_config=train_loop_config,
        #?torch_config=torch_config,
        scaling_config=scaling_config,
    )
    #print(ray.get_runtime_context().get_accelerator_ids())
    #print("GPU IDs: {}".format(ray.get_runtime_context().get_accelerator_ids()["GPU"]))
    print("CUDA_VISIBLE_DEVICES: {}".format(os.environ["CUDA_VISIBLE_DEVICES"]))
    print(ray.train.torch.get_device())
    # Blocks until the distributed training run finishes (or fails).
    result = trainer.fit()
    print(f"Training result: {result}")
# Script entry point. The pasted issue showed `if name == 'main':` — the
# markdown renderer stripped the double underscores from the dunder names.
if __name__ == '__main__':
    ray.init(address='172.17.0.6:6379')
    # train_resnet is decorated with @ray.remote, so it must be invoked via
    # .remote(); calling it directly raises a TypeError in Ray.
    ray.get(train_resnet.remote(1))
Issue Severity
None