e***@info.sgu.ru
2017-01-31 15:33:42 UTC
Hi,
I am trying to write a trivial master-slave program. The master simply creates
the slaves and sends them a string; they print it out and exit. Everything works
just fine; however, when I add a delay (2 sec or more) before calling
MPI_Init on the slave, MPI fails with MPI_ERR_SPAWN. I am pretty sure that
MPI_Comm_spawn has some kind of timeout while waiting for the slaves to call
MPI_Init, and if they fail to respond in time, it returns an error.
I believe there is a way to change this behaviour, but I wasn't able to
find any suggestions or ideas on the Internet.
I would appreciate it if someone could help with this.
---
--- Terminal command I use to run the program:
mpirun -n 1 hello 2 2 // the first argument to "hello" is the number of
slaves, the second is the delay in seconds
--- Error message I get when delay is >=2 sec:
[host:2231] *** An error occurred in MPI_Comm_spawn
[host:2231] *** reported by process [3453419521,0]
[host:2231] *** on communicator MPI_COMM_SELF
[host:2231] *** MPI_ERR_SPAWN: could not spawn processes
[host:2231] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will
now abort,
[host:2231] *** and potentially your MPI job)
--- The program itself:
#include "stdlib.h"
#include "stdio.h"
#include "mpi.h"
#include "unistd.h"
MPI_Comm slave_comm;
MPI_Comm new_world;
#define MESSAGE_SIZE 40
/*
 * Slave side of the demo: obtain the intercommunicator to the parent,
 * merge it into a single intracommunicator (new_world), receive the
 * MESSAGE_SIZE-byte broadcast from rank 0 and print it.
 *
 * The globals slave_comm and new_world are filled in here; the caller
 * frees them.
 *
 * NOTE(review): if this process was not started via MPI_Comm_spawn, the
 * MPI standard says MPI_Comm_get_parent yields MPI_COMM_NULL and the merge
 * below will fail. Consider checking for that before merging.
 */
void slave() {
    /* Zero-filled so the buffer is a valid (empty) string even before the
     * broadcast writes into it. */
    char message[MESSAGE_SIZE] = { 0 };
    int slave_rank;

    printf("Slave initialized; ");
    MPI_Comm_get_parent(&slave_comm);
    /* high = 1: this group is ordered after the parent's group. */
    MPI_Intercomm_merge(slave_comm, 1, &new_world);
    MPI_Comm_rank(new_world, &slave_rank);
    MPI_Bcast(message, MESSAGE_SIZE, MPI_CHAR, 0, new_world);
    /* Never trust the sender to have terminated the payload: %s below
     * would otherwise read past the end of the buffer. */
    message[MESSAGE_SIZE - 1] = '\0';
    printf("Slave %d received message from master: %s\n", slave_rank, message);
}
/*
 * Master side of the demo: spawn slave_count copies of `executable`,
 * merge parent and children into one intracommunicator (new_world) and
 * broadcast a greeting to every slave.
 *
 * slave_count  number of processes to spawn
 * executable   program to launch for each slave
 * delay        passed to each slave as its only command-line argument
 *
 * The globals slave_comm and new_world are filled in here; the caller
 * frees them.
 */
void master(int slave_count, char* executable, char* delay) {
    /* argv handed to MPI_Comm_spawn does not include the program name. */
    char* slave_argv[] = { delay, NULL };
    /* The broadcast sends exactly MESSAGE_SIZE bytes, so the send buffer
     * must be that large. Broadcasting straight from a shorter string
     * literal would read past its end. The initializer zero-fills the
     * remainder, which also guarantees NUL termination for the receiver. */
    char message[MESSAGE_SIZE] = "Hello New World!";

    MPI_Comm_spawn( executable,
                    slave_argv,
                    slave_count,
                    MPI_INFO_NULL,
                    0,
                    MPI_COMM_SELF,
                    &slave_comm,
                    MPI_ERRCODES_IGNORE);
    /* high = 0: the master's group is ordered first, so the master is
     * rank 0 (the broadcast root) in the merged communicator. */
    MPI_Intercomm_merge(slave_comm, 0, &new_world);
    MPI_Bcast(message, MESSAGE_SIZE, MPI_CHAR, 0, new_world);
    printf("Processes spawned!\n");
}
/*
 * Parse a base-10 integer command-line argument.
 *
 * Rejects empty input, non-numeric input, trailing garbage, values below
 * min_value and values that do not round-trip through int.
 * Returns 0 and stores the result in *out on success, -1 otherwise.
 */
static int parse_int_arg(const char *text, int min_value, int *out) {
    char *end;
    long value = strtol(text, &end, 10);

    if (end == text || *end != '\0') {
        return -1;
    }
    if (value < min_value || value != (long)(int)value) {
        return -1;
    }
    *out = (int)value;
    return 0;
}

/*
 * Entry point shared by the master and the spawned slaves.
 *
 *   hello <slave_count> <delay_seconds>   master mode (argc > 2)
 *   hello <delay_seconds>                 slave mode, as launched by master()
 *
 * In slave mode the process sleeps for the requested delay before calling
 * MPI_Init. Returns EXIT_FAILURE on missing or malformed arguments.
 */
int main(int argc, char* argv[]) {
    int delay = 0;

    /* Without this guard argv[1] is NULL and parsing it is undefined. */
    if (argc < 2) {
        fprintf(stderr, "usage: %s <slave_count> <delay_seconds>\n",
                argc > 0 ? argv[0] : "hello");
        return EXIT_FAILURE;
    }

    if (argc > 2) {
        int slave_count = 0;

        MPI_Init(&argc, &argv);
        /* Validate both values up front so a bad delay is reported here
         * rather than by every spawned slave. */
        if (parse_int_arg(argv[1], 1, &slave_count) != 0 ||
            parse_int_arg(argv[2], 0, &delay) != 0) {
            fprintf(stderr, "invalid slave count or delay\n");
            MPI_Finalize();
            return EXIT_FAILURE;
        }
        master(slave_count, argv[0], argv[2]);
    } else {
        /* A negative delay would convert to a huge unsigned sleep. */
        if (parse_int_arg(argv[1], 0, &delay) != 0) {
            fprintf(stderr, "invalid delay: %s\n", argv[1]);
            return EXIT_FAILURE;
        }
        sleep((unsigned int)delay); /* delay before MPI_Init */
        MPI_Init(&argc, &argv);
        slave();
    }
    MPI_Comm_free(&new_world);
    MPI_Comm_free(&slave_comm);
    MPI_Finalize();
    return EXIT_SUCCESS;
}
Thank you,
Andrew Elistratov
I am trying to write a trivial master-slave program. The master simply creates
the slaves and sends them a string; they print it out and exit. Everything works
just fine; however, when I add a delay (2 sec or more) before calling
MPI_Init on the slave, MPI fails with MPI_ERR_SPAWN. I am pretty sure that
MPI_Comm_spawn has some kind of timeout while waiting for the slaves to call
MPI_Init, and if they fail to respond in time, it returns an error.
I believe there is a way to change this behaviour, but I wasn't able to
find any suggestions or ideas on the Internet.
I would appreciate it if someone could help with this.
---
--- Terminal command I use to run the program:
mpirun -n 1 hello 2 2 // the first argument to "hello" is the number of
slaves, the second is the delay in seconds
--- Error message I get when delay is >=2 sec:
[host:2231] *** An error occurred in MPI_Comm_spawn
[host:2231] *** reported by process [3453419521,0]
[host:2231] *** on communicator MPI_COMM_SELF
[host:2231] *** MPI_ERR_SPAWN: could not spawn processes
[host:2231] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will
now abort,
[host:2231] *** and potentially your MPI job)
--- The program itself:
#include "stdlib.h"
#include "stdio.h"
#include "mpi.h"
#include "unistd.h"
MPI_Comm slave_comm;
MPI_Comm new_world;
#define MESSAGE_SIZE 40
/*
 * Slave side of the demo: obtain the intercommunicator to the parent,
 * merge it into a single intracommunicator (new_world), receive the
 * MESSAGE_SIZE-byte broadcast from rank 0 and print it.
 *
 * The globals slave_comm and new_world are filled in here; the caller
 * frees them.
 *
 * NOTE(review): if this process was not started via MPI_Comm_spawn, the
 * MPI standard says MPI_Comm_get_parent yields MPI_COMM_NULL and the merge
 * below will fail. Consider checking for that before merging.
 */
void slave() {
    /* Zero-filled so the buffer is a valid (empty) string even before the
     * broadcast writes into it. */
    char message[MESSAGE_SIZE] = { 0 };
    int slave_rank;

    printf("Slave initialized; ");
    MPI_Comm_get_parent(&slave_comm);
    /* high = 1: this group is ordered after the parent's group. */
    MPI_Intercomm_merge(slave_comm, 1, &new_world);
    MPI_Comm_rank(new_world, &slave_rank);
    MPI_Bcast(message, MESSAGE_SIZE, MPI_CHAR, 0, new_world);
    /* Never trust the sender to have terminated the payload: %s below
     * would otherwise read past the end of the buffer. */
    message[MESSAGE_SIZE - 1] = '\0';
    printf("Slave %d received message from master: %s\n", slave_rank, message);
}
/*
 * Master side of the demo: spawn slave_count copies of `executable`,
 * merge parent and children into one intracommunicator (new_world) and
 * broadcast a greeting to every slave.
 *
 * slave_count  number of processes to spawn
 * executable   program to launch for each slave
 * delay        passed to each slave as its only command-line argument
 *
 * The globals slave_comm and new_world are filled in here; the caller
 * frees them.
 */
void master(int slave_count, char* executable, char* delay) {
    /* argv handed to MPI_Comm_spawn does not include the program name. */
    char* slave_argv[] = { delay, NULL };
    /* The broadcast sends exactly MESSAGE_SIZE bytes, so the send buffer
     * must be that large. Broadcasting straight from a shorter string
     * literal would read past its end. The initializer zero-fills the
     * remainder, which also guarantees NUL termination for the receiver. */
    char message[MESSAGE_SIZE] = "Hello New World!";

    MPI_Comm_spawn( executable,
                    slave_argv,
                    slave_count,
                    MPI_INFO_NULL,
                    0,
                    MPI_COMM_SELF,
                    &slave_comm,
                    MPI_ERRCODES_IGNORE);
    /* high = 0: the master's group is ordered first, so the master is
     * rank 0 (the broadcast root) in the merged communicator. */
    MPI_Intercomm_merge(slave_comm, 0, &new_world);
    MPI_Bcast(message, MESSAGE_SIZE, MPI_CHAR, 0, new_world);
    printf("Processes spawned!\n");
}
/*
 * Parse a base-10 integer command-line argument.
 *
 * Rejects empty input, non-numeric input, trailing garbage, values below
 * min_value and values that do not round-trip through int.
 * Returns 0 and stores the result in *out on success, -1 otherwise.
 */
static int parse_int_arg(const char *text, int min_value, int *out) {
    char *end;
    long value = strtol(text, &end, 10);

    if (end == text || *end != '\0') {
        return -1;
    }
    if (value < min_value || value != (long)(int)value) {
        return -1;
    }
    *out = (int)value;
    return 0;
}

/*
 * Entry point shared by the master and the spawned slaves.
 *
 *   hello <slave_count> <delay_seconds>   master mode (argc > 2)
 *   hello <delay_seconds>                 slave mode, as launched by master()
 *
 * In slave mode the process sleeps for the requested delay before calling
 * MPI_Init. Returns EXIT_FAILURE on missing or malformed arguments.
 */
int main(int argc, char* argv[]) {
    int delay = 0;

    /* Without this guard argv[1] is NULL and parsing it is undefined. */
    if (argc < 2) {
        fprintf(stderr, "usage: %s <slave_count> <delay_seconds>\n",
                argc > 0 ? argv[0] : "hello");
        return EXIT_FAILURE;
    }

    if (argc > 2) {
        int slave_count = 0;

        MPI_Init(&argc, &argv);
        /* Validate both values up front so a bad delay is reported here
         * rather than by every spawned slave. */
        if (parse_int_arg(argv[1], 1, &slave_count) != 0 ||
            parse_int_arg(argv[2], 0, &delay) != 0) {
            fprintf(stderr, "invalid slave count or delay\n");
            MPI_Finalize();
            return EXIT_FAILURE;
        }
        master(slave_count, argv[0], argv[2]);
    } else {
        /* A negative delay would convert to a huge unsigned sleep. */
        if (parse_int_arg(argv[1], 0, &delay) != 0) {
            fprintf(stderr, "invalid delay: %s\n", argv[1]);
            return EXIT_FAILURE;
        }
        sleep((unsigned int)delay); /* delay before MPI_Init */
        MPI_Init(&argc, &argv);
        slave();
    }
    MPI_Comm_free(&new_world);
    MPI_Comm_free(&slave_comm);
    MPI_Finalize();
    return EXIT_SUCCESS;
}
Thank you,
Andrew Elistratov