Commit 99634760 authored by Papatheodore, Thomas's avatar Papatheodore, Thomas
Browse files

initial commit

parents
Loading
Loading
Loading
Loading

.gitignore

0 → 100644
+2 −0
Original line number Diff line number Diff line
*.o
hello_srun

Makefile

0 → 100644
+16 −0
Original line number Diff line number Diff line
# Build hello_jobstep with hipcc (HIP + OpenMP) on an OLCF system.
COMP  = hipcc
# NOTE(review): --amdgpu-target is deprecated in newer ROCm; the
# replacement is --offload-arch -- confirm against the installed hipcc.
FLAGS = --amdgpu-target=gfx906,gfx908 -fopenmp

# MPI and ROCm-SMI paths come from environment modules (OLCF_OPENMPI_ROOT,
# ROCM_PATH); loading the corresponding modules is required before building.
INCLUDES  = -I$(OLCF_OPENMPI_ROOT)/include -I$(ROCM_PATH)/rocm_smi/include
LIBRARIES = -L$(OLCF_OPENMPI_ROOT)/lib -lmpi -L$(ROCM_PATH)/lib -lrocm_smi64

# Link rule: object files must precede the libraries so that single-pass
# linkers can resolve their symbols against -lmpi / -lrocm_smi64.
hello_jobstep: hello_jobstep.o
	$(COMP) $(FLAGS) hello_jobstep.o $(LIBRARIES) -o $@

# Compile rule: $< is the first prerequisite (the .cpp file).
hello_jobstep.o: hello_jobstep.cpp
	$(COMP) $(FLAGS) $(INCLUDES) -c $< -o $@

.PHONY: clean

clean:
	rm -f hello_jobstep *.o

README.md

0 → 100644
+13 −0
Original line number Diff line number Diff line
# hello_jobstep

For each job step launched with a job launcher, this program prints the hardware thread IDs that each MPI rank and OpenMP thread runs on, and the GPU IDs that each rank/thread has access to.

## Compiling

To compile, you'll need to have HIP and MPI installed, and use an OpenMP-capable compiler.

## Usage

To run, simply launch the code with your favorite job launcher.

> OPTIONAL: There is an `example_map.sh` script that can be modified and called "in front of" `hello_jobstep` (or any other executable really). The script uses `numactl` to map hardware threads and GPUs to node-local MPI ranks.

example_map.sh

0 → 100755
+19 −0
Original line number Diff line number Diff line
#!/bin/bash

# Wrapper placed "in front of" an application by the job launcher, e.g.
#   srun ./example_map.sh ./hello_jobstep
# Binds each node-local MPI rank to a fixed set of hardware threads
# (via numactl) and GPUs (via HIP_VISIBLE_DEVICES).

# The application to launch is the first (and only) argument.
export APP=$1

# Node-local rank, assuming 4 MPI ranks per node (SLURM_PROCID mod 4).
lrank=$(($SLURM_PROCID % 4))
export OMP_NUM_THREADS=4
export OMP_PLACES=cores

# Per-local-rank mapping: 2 GPUs and 4 contiguous hardware threads each.
# NOTE(review): only local ranks 0 and 1 are handled here; ranks 2 and 3
# (possible given the mod-4 above) match no case and launch nothing --
# confirm the intended ranks-per-node, or add cases, before reusing.
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1
numactl --physcpubind=64,65,66,67 $APP
  ;;

[1])
export HIP_VISIBLE_DEVICES=2,3
numactl --physcpubind=68,69,70,71 $APP
  ;;
esac

hello_jobstep.cpp

0 → 100644
+111 −0
Original line number Diff line number Diff line
/**********************************************************

"Hello World"-type program to test different srun layouts.

Written by Tom Papatheodore

**********************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iomanip>
#include <iostream>
#include <string>
#include <mpi.h>
#include <sched.h>
#include <hip/hip_runtime.h>
#include <omp.h>

// Macro for checking errors in HIP API calls.
// On failure, prints the failing file/line and HIP's error string, then
// terminates with a NONZERO status: the original exit(0) reported success
// to the shell and job scheduler even when a HIP call had failed.
#define hipErrorCheck(call)                                                                 \
do{                                                                                         \
    hipError_t hipErr = call;                                                               \
    if(hipSuccess != hipErr){                                                               \
        printf("HIP Error - %s:%d: '%s'\n", __FILE__, __LINE__, hipGetErrorString(hipErr)); \
        exit(1);                                                                            \
    }                                                                                       \
}while(0)

// For each MPI rank (and each OpenMP thread within it), print the hardware
// thread it runs on, the node name, and -- when GPUs are present -- the
// runtime GPU IDs, HIP_VISIBLE_DEVICES contents, and PCI bus IDs it can see.
int main(int argc, char *argv[]){

	MPI_Init(&argc, &argv);

	int size;
	MPI_Comm_size(MPI_COMM_WORLD, &size);

	int rank;
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

	// Name of the node this rank is running on.
	char name[MPI_MAX_PROCESSOR_NAME];
	int resultlength;
	MPI_Get_processor_name(name, &resultlength);

	// HIP_VISIBLE_DEVICES may be unset, in which case getenv returns NULL;
	// streaming a null char* into std::cout is undefined behavior, so
	// substitute a placeholder instead.
	const char* gpu_id_env = getenv("HIP_VISIBLE_DEVICES");
	const char* gpu_id_list = (gpu_id_env != NULL) ? gpu_id_env : "N/A";

	// Find how many GPUs HIP runtime says are available
	int num_devices = 0;
	hipErrorCheck( hipGetDeviceCount(&num_devices) );

	int hwthread;
	int thread_id = 0;

	if(num_devices == 0){
		// CPU-only report: one line per OpenMP thread. The critical section
		// keeps each thread's chained << insertions from interleaving.
		#pragma omp parallel default(shared) private(hwthread, thread_id)
		{
			#pragma omp critical
			{
				thread_id = omp_get_thread_num();
				hwthread = sched_getcpu();

				std::cout << "MPI " << rank << " - OMP " << thread_id << " - HWT "
				          << hwthread << " - Node " << name << std::endl;
			}
		}
	}
	else{

		char busid[64];

		std::string busid_list = "";
		std::string rt_gpu_id_list = "";

		// Loop over the GPUs available to each MPI rank, collecting the
		// runtime device indices and PCI bus numbers as comma-separated lists.
		for(int i=0; i<num_devices; i++){

			hipErrorCheck( hipSetDevice(i) );

			// Get the PCIBusId string for device i (e.g. "0000:03:00.0").
			hipErrorCheck( hipDeviceGetPCIBusId(busid, 64, i) );

			if(i > 0) rt_gpu_id_list.append(",");
			rt_gpu_id_list.append(std::to_string(i));

			std::string temp_busid(busid);

			// Characters 5-6 of the PCIBusId string are the PCI bus number.
			if(i > 0) busid_list.append(",");
			busid_list.append(temp_busid.substr(5,2));

		}

		// GPU report: one line per OpenMP thread, serialized by the critical
		// section so each line stays intact.
		#pragma omp parallel default(shared) private(hwthread, thread_id)
		{
			#pragma omp critical
			{
				thread_id = omp_get_thread_num();
				hwthread = sched_getcpu();

				std::cout << "MPI " << rank << " - OMP " << thread_id << " - HWT "
				          << hwthread << " - Node " << name << " - RT_GPU_ID " << rt_gpu_id_list
				          << " - GPU_ID " << gpu_id_list << " - Bus_ID " << busid_list << std::endl;
			}
		}
	}

	MPI_Finalize();

	return 0;
}