Description
Hello, I am a novice user of UnifyFS, and I encountered an error when trying to test a simple program on two nodes. I've come here seeking help, and I would greatly appreciate any guidance.
Operating System linux
OS Version Centos7
Architecture x86_64
UnifyFS Version 1.1
Describe the problem you're observing
This is the job script and program I'm using:
[root@gv240 work]# cat start_unifyfs.slurm
#!/bin/bash
#SBATCH -N 2
#SBATCH -n 2
#SBATCH -w gv[240,241]
#SBATCH --job-name=dog
# Slurm batch job: start UnifyFS, run the MPI test program against the
# /unifyfs mount point, then terminate UnifyFS.
#
# Exit status: 1 if UnifyFS could not be started; otherwise the srun status
# if the application failed; otherwise the status of `unifyfs terminate`.

# UnifyFS configuration passed through the environment.
export UNIFYFS_LOGIO_SPILL_DIR=/mnt/ssd
export UNIFYFS_LOG_DIR=/tmp/work/logs
export UNIFYFS_LOG_VERBOSITY=5

# NOTE(review): --share-dir presumably has to be a directory that every node
# in the job can see; /tmp is commonly node-local, so confirm that
# /tmp/shared really lives on a shared filesystem.
if ! unifyfs start --share-dir=/tmp/shared --mount=/unifyfs; then
  echo "unifyfs start failed; not launching the application" >&2
  exit 1
fi

# Remember the application status instead of discarding it, so the servers
# are still shut down below even when the application fails.
app_status=0
srun --overlap --mpi=pmix -N 2 -n 2 /tmp/work/helloauto -m /unifyfs \
  || app_status=$?

term_status=0
unifyfs terminate || term_status=$?

# An application failure takes precedence over a terminate failure.
if (( app_status != 0 )); then
  echo "application exited with status ${app_status}" >&2
  exit "$app_status"
fi
exit "$term_status"
[root@gv240 work]# cat helloauto.c
#include <mpi.h>
#include <stdio.h>
//#include <unifyfs.h>
/*
 * Every MPI rank opens /unifyfs/dsetmpiauto.txt for writing and writes a
 * greeting that contains its rank number.
 *
 * The explicit unifyfs_mount()/unifyfs_unmount() calls are left commented
 * out, as in the original program.
 *
 * Returns 0 on success and 1 if the file could not be opened, written, or
 * closed. MPI_Finalize() is called on both paths.
 */
int main(int argc, char * argv[]) {
    int rank, rank_num;
    int status = 0;
    //int rc;
    FILE *fp;

    //program initialization
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &rank_num);
    //rc = unifyfs_mount("/unifyfs", rank, rank_num);

    //perform I/O
    fp = fopen("/unifyfs/dsetmpiauto.txt", "w");
    if (fp == NULL) {
        /* Passing a NULL stream to fprintf()/fclose() is undefined
         * behaviour, so report the failure and skip the write. */
        perror("fopen /unifyfs/dsetmpiauto.txt");
        fprintf(stderr, "rank %d: could not open output file\n", rank);
        status = 1;
    } else {
        if (fprintf(fp, "Hello World! I'm rank %d", rank) < 0) {
            fprintf(stderr, "rank %d: write to output file failed\n", rank);
            status = 1;
        }
        /* Buffered data is flushed on close, so a write error can
         * surface only here. */
        if (fclose(fp) != 0) {
            perror("fclose /unifyfs/dsetmpiauto.txt");
            status = 1;
        }
    }

    //clean up
    //unifyfs_unmount();
    MPI_Finalize();
    return status;
}
Describe how to reproduce the problem
Submit the job from the node named 'gv240':
[root@gv240 work]# sbatch start_unifyfs.slurm
Include any warnings or errors or relevant debugging data
The error messages in the UnifyFS log file on the second node named 'gv241':
UNIFYFS CONFIG: server.max_app_clients = 256
UNIFYFS CONFIG: sharedfs.dir = /tmp/shared
2023-09-11T13:20:59 tid=6410 @ process_servers_hostfile() [unifyfs_server.c:165] failed to open hostfile /tmp/shared/unifyfsd.hosts
2023-09-11T13:20:59 tid=6410 @ get_server_rank_and_size() [unifyfs_server.c:237] failed to gather server information
The error messages on node gv240:
2023-09-11T13:23:06 tid=6932 @ signal_new_requests() [unifyfs_request_manager.c:269] signaling new requests
2023-09-11T13:23:06 tid=7015 @ request_manager_thread() [unifyfs_request_manager.c:1738] I am request manager [app=1511587981:client=1] thread!
2023-09-11T13:23:06 tid=7015 @ rm_process_client_requests() [unifyfs_request_manager.c:1617] processing 1 client requests
2023-09-11T13:23:06 tid=7015 @ process_metaset_rpc() [unifyfs_request_manager.c:1282] setting metadata for gfid=1511587981
2023-09-11T13:23:06 tid=7015 @ unifyfs_fskv_lookup_remote() [../../common/src/unifyfs_keyval.c:825] failed to open kvstore entry /tmp/shared/kvstore/1/unifyfsd.margo-svr
2023-09-11T13:23:06 tid=7015 @ unifyfs_keyval_lookup_remote() [../../common/src/unifyfs_keyval.c:1036] remote keyval lookup for 'unifyfsd.margo-svr' failed
2023-09-11T13:23:06 tid=7015 @ margo_connect_server() [margo_server.c:495] server index=1 - margo server lookup failed
2023-09-11T13:23:06 tid=7015 @ get_p2p_request_handle() [unifyfs_p2p_rpc.c:40] missing margo address for rank=1
2023-09-11T13:23:06 tid=7015 @ process_metaset_rpc() [unifyfs_request_manager.c:1290] unifyfs_fops_metaset() failed
2023-09-11T13:23:06 tid=7015 @ rm_process_client_requests() [unifyfs_request_manager.c:1677] client rpc request 0 failed ("Mercury/Argobots operation error")
2023-09-11T13:23:06 tid=7015 @ request_manager_thread() [unifyfs_request_manager.c:1754] failed to process client rpc requests
2023-09-11T13:23:07 tid=6931 @ signal_new_requests() [unifyfs_request_manager.c:269] signaling new requests
2023-09-11T13:23:07 tid=7015 @ rm_process_client_requests() [unifyfs_request_manager.c:1617] processing 1 client requests
2023-09-11T13:23:07 tid=7015 @ process_attach_rpc() [unifyfs_request_manager.c:1065] attaching client 1511587981:1
2023-09-11T13:23:07 tid=7015 @ unifyfs_logio_init() [../../common/src/unifyfs_logio.c:221] shmem header - hdr_sz=4096, data_sz=264241152, data_offset=4096
2023-09-11T13:23:07 tid=7015 @ get_page_size() [../../common/src/unifyfs_logio.c:83] returning page size 4096 B
2023-09-11T13:23:07 tid=7015 @ map_spillfile() [../../common/src/unifyfs_logio.c:165] mapping spillfile - fd=24, pgsz=4096
2023-09-11T13:23:07 tid=7015 @ unifyfs_logio_init() [../../common/src/unifyfs_logio.c:251] spill header - hdr_sz=4096, data_sz=4290772992, data_offset=4096
2023-09-11T13:23:07 tid=7015 @ unifyfs_logio_init() [../../common/src/unifyfs_logio.c:270] logio_context for client [1511587981:1] - shmem(sz=268435456, hdr=0x7f7334001ae0), spill(sz=4294967296, hdr=0x7f7364b8b000)
2023-09-11T13:23:07 tid=7015 @ get_page_size() [../../common/src/unifyfs_logio.c:83] returning page size 4096 B
2023-09-11T13:23:07 tid=6934 @ signal_new_requests() [unifyfs_request_manager.c:269] signaling new requests
2023-09-11T13:23:07 tid=7015 @ rm_process_client_requests() [unifyfs_request_manager.c:1617] processing 1 client requests
2023-09-11T13:23:07 tid=7015 @ process_metaget_rpc() [unifyfs_request_manager.c:1220] getting metadata for gfid=1511587981
2023-09-11T13:23:07 tid=7015 @ unifyfs_invoke_metaget_rpc() [unifyfs_p2p_rpc.c:785] using cached attributes for gfid=1511587981
2023-09-11T13:23:08 tid=6933 @ signal_new_requests() [unifyfs_request_manager.c:269] signaling new requests
2023-09-11T13:23:08 tid=7015 @ request_manager_thread() [unifyfs_request_manager.c:1788] RM[1511587981:1] got work
2023-09-11T13:23:08 tid=7015 @ rm_process_client_requests() [unifyfs_request_manager.c:1617] processing 1 client requests
2023-09-11T13:23:08 tid=7015 @ process_metaset_rpc() [unifyfs_request_manager.c:1282] setting metadata for gfid=1877017155
2023-09-11T13:23:08 tid=7015 @ unifyfs_fskv_lookup_remote() [../../common/src/unifyfs_keyval.c:825] failed to open kvstore entry /tmp/shared/kvstore/1/unifyfsd.margo-svr
2023-09-11T13:23:08 tid=7015 @ unifyfs_keyval_lookup_remote() [../../common/src/unifyfs_keyval.c:1036] remote keyval lookup for 'unifyfsd.margo-svr' failed
2023-09-11T13:23:08 tid=7015 @ margo_connect_server() [margo_server.c:495] server index=1 - margo server lookup failed
2023-09-11T13:23:08 tid=7015 @ get_p2p_request_handle() [unifyfs_p2p_rpc.c:40] missing margo address for rank=1
2023-09-11T13:23:08 tid=7015 @ process_metaset_rpc() [unifyfs_request_manager.c:1290] unifyfs_fops_metaset() failed
2023-09-11T13:23:08 tid=7015 @ rm_process_client_requests() [unifyfs_request_manager.c:1677] client rpc request 0 failed ("Mercury/Argobots operation error")
2023-09-11T13:23:08 tid=7015 @ request_manager_thread() [unifyfs_request_manager.c:1754] failed to process client rpc requests
2023-09-11T13:23:10 tid=6929 @ unifyfs_publish_server_pids() [unifyfs_server_pid.c:151] server initialization timeout
2023-09-11T13:23:10 tid=6929 @ unifyfs_publish_server_pids() [unifyfs_server_pid.c:167] 1 of 2 servers reported their pids
2023-09-11T13:23:10 tid=6929 @ main() [unifyfs_server.c:493] failed to publish server pid file: Connection timed out