# Invocation command line:
# /lustre/fsw/devtech/hpc-devtech/cponder/SPEC-HPG/2022-09-03.documentation/bin/harness/runhpc --flagsurl nvhpc_flags.xml --configfile /lustre/fsw/devtech/hpc-devtech/cponder/SPEC-HPG/2022-09-03.documentation/3030084.run_medium.11_0.REPORTABLE.064_NODES/REPORTABLE.cfg --define CONTROL=Reportable --iterations 3 --nobuild --action run --tune base --nopower --runmode speed --tune base --size ref medium
# output_root was not used for this run
############################################################################
env_vars=1
teeout = yes
backup_config = no
preENV_SPEC_NO_RUNDIR_DEL = on
output_format = text,cfg,html,pdf,rsf
reportable = 1
mean_anyway = 0
strict_rundir_verify = 1
ignore_errors = no
tune = base,peak
iterations=5
showtimer = 0
%define TIMELIMIT
default:
license_num = 019
test_sponsor = NVIDIA Corporation
tester = NVIDIA Corporation
default:
system_vendor = NVIDIA Corporation
interconnect_fs_hw_switch_fs_model = NVIDIA Quantum QM8700
interconnect_comm_hw_switch_comm_model = NVIDIA Quantum QM8700
system_name000 = Selene: NVIDIA DGX SuperPOD
system_name001 = (AMD EPYC 7742 2.25 GHz, Tesla A100-SXM-80 GB)
node_compute_sw_accel_driver = NVIDIA UNIX x86_64 Kernel Module 470.103.01
hw_avail = Jul-2020
sw_avail = Mar-2022
prepared_by000 = Carl Ponder (cponder@nvidia.com) &
prepared_by001 = Mathew Colgrove (mcolgrove@nvidia.com)
default:
system_class = SMP
interconnect_comm_syslbl = Multi-rail InfiniBand HDR fabric
interconnect_comm_order = 2
interconnect_comm_purpose = Inter-process communication
interconnect_comm_hw_vendor = NVIDIA
interconnect_comm_hw_model = N/A
interconnect_comm_hw_switch_comm_count = 164
interconnect_comm_hw_switch_comm_ports = 40
interconnect_comm_hw_topo = Full three-level fat-tree
interconnect_comm_hw_switch_comm_data_rate = 200 Gb/s per port
interconnect_comm_hw_switch_comm_firmware = MLNX-OS v3.10.2202
interconnect_fs_syslbl = DDN EXAScaler file system
interconnect_fs_order = 3
interconnect_fs_purpose = Global storage
interconnect_fs_hw_vendor = NVIDIA
interconnect_fs_hw_model = N/A
interconnect_fs_hw_switch_fs_count = 26
interconnect_fs_hw_switch_fs_ports = 40
interconnect_fs_hw_topo = Full three-level fat-tree
interconnect_fs_hw_switch_fs_data_rate = 200 Gb/s per port
interconnect_fs_hw_switch_fs_firmware = MLNX-OS v3.10.2202
notes_submit_000 = MPI startup command:
notes_submit_005 = srun command was used to start MPI jobs.
notes_submit_010 =
notes_submit_015 = Individual ranks were bound to NUMA nodes, GPUs and NICs using this "wrapper.GPU" bash script for the case of 1 rank per GPU:
notes_submit_020 =
notes_submit_025 = ln -s -f libnuma.so.1 /usr/lib/x86_64-linux-gnu/libnuma.so
notes_submit_030 = export LD_LIBRARY_PATH+=:/usr/lib/x86_64-linux-gnu
notes_submit_035 = export LD_RUN_PATH+=:/usr/lib/x86_64-linux-gnu
notes_submit_040 = declare -a NUMA_LIST
notes_submit_045 = declare -a GPU_LIST
notes_submit_050 = declare -a NIC_LIST
notes_submit_055 = NUMA_LIST=($NUMAS)
notes_submit_060 = GPU_LIST=($GPUS)
notes_submit_065 = NIC_LIST=($NICS)
notes_submit_070 = export UCX_NET_DEVICES=${NIC_LIST[$SLURM_LOCALID]}:1
notes_submit_075 = export OMPI_MCA_btl_openib_if_include=${NIC_LIST[$SLURM_LOCALID]}
notes_submit_080 = export CUDA_VISIBLE_DEVICES=${GPU_LIST[$SLURM_LOCALID]}
notes_submit_085 = numactl -l -N ${NUMA_LIST[$SLURM_LOCALID]} $*
notes_submit_090 =
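# Illustrative effect of wrapper.GPU (comment only, not part of the submitted settings):
# with the NUMAS/GPUS/NICS lists exported from the ENV_NUMAS/ENV_GPUS/ENV_NICS values
# defined later in this file, each local rank indexes all three lists by $SLURM_LOCALID,
# so for example
#   SLURM_LOCALID=0 -> numactl -l -N 2, CUDA_VISIBLE_DEVICES=0, UCX_NET_DEVICES=mlx5_0:1
#   SLURM_LOCALID=4 -> numactl -l -N 6, CUDA_VISIBLE_DEVICES=4, UCX_NET_DEVICES=mlx5_6:1
#   SLURM_LOCALID=7 -> numactl -l -N 5, CUDA_VISIBLE_DEVICES=7, UCX_NET_DEVICES=mlx5_9:1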
notes_submit_095 = and this "wrapper.MPS" bash script for the oversubscribed case:
notes_submit_100 =
notes_submit_105 = ln -s -f libnuma.so.1 /usr/lib/x86_64-linux-gnu/libnuma.so
notes_submit_110 = export LD_LIBRARY_PATH+=:/usr/lib/x86_64-linux-gnu
notes_submit_115 = export LD_RUN_PATH+=:/usr/lib/x86_64-linux-gnu
notes_submit_120 = declare -a NUMA_LIST
notes_submit_125 = declare -a GPU_LIST
notes_submit_130 = declare -a NIC_LIST
notes_submit_135 = NUMA_LIST=($NUMAS)
notes_submit_140 = GPU_LIST=($GPUS)
notes_submit_145 = NIC_LIST=($NICS)
notes_submit_150 = NUM_GPUS=${#GPU_LIST[@]}
notes_submit_155 = RANKS_PER_GPU=$((SLURM_NTASKS_PER_NODE / NUM_GPUS))
notes_submit_160 = GPU_LOCAL_RANK=$((SLURM_LOCALID / RANKS_PER_GPU))
notes_submit_165 = export UCX_NET_DEVICES=${NIC_LIST[$GPU_LOCAL_RANK]}:1
notes_submit_170 = export OMPI_MCA_btl_openib_if_include=${NIC_LIST[$GPU_LOCAL_RANK]}
notes_submit_175 = set +e
notes_submit_180 = nvidia-cuda-mps-control -d 1>&2
notes_submit_185 = set -e
notes_submit_190 = export CUDA_VISIBLE_DEVICES=${GPU_LIST[$GPU_LOCAL_RANK]}
notes_submit_195 = numactl -l -N ${NUMA_LIST[$GPU_LOCAL_RANK]} $*
notes_submit_200 = if [ $SLURM_LOCALID -eq 0 ]
notes_submit_205 = then
notes_submit_210 = echo 'quit' | nvidia-cuda-mps-control 1>&2
notes_submit_215 = fi
default:
ENV_GPUS = 0 1 2 3 4 5 6 7
ENV_NICS = mlx5_0 mlx5_1 mlx5_2 mlx5_3 mlx5_6 mlx5_7 mlx5_8 mlx5_9
ENV_NUMAS = 2 3 0 1 6 7 4 5
default:
%define GPUS_PER_NODE 8
%define GPUS_PER_NODEx2 (%{GPUS_PER_NODE}*2)
%define RANKS_PROCS (%{ENV_SLURM_JOB_NUM_NODES}*%{ENV_SLURM_NTASKS_PER_NODE})
%define RANKS_1xGPUS (1*%{ENV_SLURM_JOB_NUM_NODES}*%{GPUS_PER_NODE})
%define RANKS_2xGPUS (2*%{ENV_SLURM_JOB_NUM_NODES}*%{GPUS_PER_NODE})
%define THREADS_PROCS (256/%{ENV_SLURM_NTASKS_PER_NODE})
%define THREADS_1xGPUS (256/%{GPUS_PER_NODE})
%define THREADS_2xGPUS (256/%{GPUS_PER_NODE}/2)
%define HALF_NUMA_THREADS (128/%{GPUS_PER_NODE}/2)
large=base=default:
threads=%{HALF_NUMA_THREADS}
default:
ENV_ENROOT_LOGIN_SHELL=n
ENV_NVIDIA_DISABLE_REQUIRE=1
SRUN_OPTS=
SRUN_OPTS+= --mpi=pmix
SRUN_OPTS+= --container-entrypoint
SRUN_OPTS+= --container-name spec-hpc
SRUN_OPTS+= --container-mounts $[top] --container-workdir `/bin/pwd`
SRUN_OPTS+= -N %{ENV_SLURM_JOB_NUM_NODES}
ranks=%{RANKS_2xGPUS}
threads=%{THREADS_2xGPUS}
submit = srun ${SRUN_OPTS} -n $[ranks] -c $[threads] --ntasks-per-node %{GPUS_PER_NODEx2} %{TIMELIMIT} $[top]/wrapper.MPS $command
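# Worked expansion of the default submit rule (comment only, assuming the reported
# 64-node run, i.e. SLURM_JOB_NUM_NODES=64 and GPUS_PER_NODE=8):
#   RANKS_2xGPUS    = 2*64*8  = 1024 MPI ranks
#   THREADS_2xGPUS  = 256/8/2 = 16 threads per rank
#   GPUS_PER_NODEx2 = 8*2     = 16 tasks per node (2 ranks per GPU, shared through MPS)
# so the harness would launch approximately
#   srun --mpi=pmix --container-entrypoint --container-name spec-hpc \
#        --container-mounts <top> --container-workdir <cwd> -N 64 \
#        -n 1024 -c 16 --ntasks-per-node 16 <top>/wrapper.MPS <benchmark command>
# where <top>, <cwd> and <benchmark command> are filled in by runhpc and %{TIMELIMIT}
# expands to the empty value given by "%define TIMELIMIT" above. Inside wrapper.MPS this
# gives RANKS_PER_GPU = 16/8 = 2 and GPU_LOCAL_RANK = SLURM_LOCALID/2, so local ranks
# 0-1 share GPU 0, ranks 2-3 share GPU 1, and so on.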
705.lbm_m=peak=default:
513.soma_t,613.soma_s=peak=default:
ranks=%{RANKS_1xGPUS}
threads=%{THREADS_1xGPUS}
submit = srun ${SRUN_OPTS} -n $[ranks] -c $[threads] --ntasks-per-node %{GPUS_PER_NODE} %{TIMELIMIT} $[top]/wrapper.GPU $command
518.tealeaf_t,618.tealeaf_s,718.tealeaf_m,818.tealeaf_l=peak=default:
ranks=%{RANKS_1xGPUS}
threads=%{THREADS_1xGPUS}
submit = srun ${SRUN_OPTS} -n $[ranks] -c $[threads] --ntasks-per-node %{GPUS_PER_NODE} %{TIMELIMIT} $[top]/wrapper.GPU $command
519.clvleaf_t=peak=default:
ranks=%{RANKS_2xGPUS}
threads=%{THREADS_2xGPUS}
submit = srun ${SRUN_OPTS} -n $[ranks] -c $[threads] --ntasks-per-node %{GPUS_PER_NODEx2} %{TIMELIMIT} $[top]/wrapper.MPS $command
619.clvleaf_s,719.clvleaf_m,819.clvleaf_l=peak=default:
ranks=%{RANKS_PROCS}
threads=%{THREADS_PROCS}
submit = srun ${SRUN_OPTS} -n $[ranks] -c $[threads] --ntasks-per-node %{ENV_SLURM_NTASKS_PER_NODE} %{TIMELIMIT} $[top]/wrapper.MPS $command
819.clvleaf_l=peak=default:
ENV_NV_ACC_GANGLIMIT=3000000
ENV_NUMAS = 3 3 1 1 7 7 5 5
threads=%{HALF_NUMA_THREADS}
521.miniswp_t=peak=default:
621.miniswp_s=peak=default:
basepeak=1
528.pot3d_t,628.pot3d_s,728.pot3d_m=peak=default:
828.pot3d_l=peak=default:
ranks=%{RANKS_1xGPUS}
threads=%{THREADS_1xGPUS}
submit = srun ${SRUN_OPTS} -n $[ranks] -c $[threads] --ntasks-per-node %{GPUS_PER_NODE} %{TIMELIMIT} $[top]/wrapper.GPU $command
532.sph_exa_t,632.sph_exa_s=peak=default:
ENV_NV_ACC_GANGLIMIT=3000000
ranks=%{RANKS_PROCS}
threads=%{THREADS_PROCS}
submit = srun ${SRUN_OPTS} -n $[ranks] -c $[threads] --ntasks-per-node %{ENV_SLURM_NTASKS_PER_NODE} %{TIMELIMIT} $[top]/wrapper.MPS $command
534.hpgmgfv_t=peak=default:
ranks=%{RANKS_2xGPUS}
threads=%{THREADS_2xGPUS}
submit = srun ${SRUN_OPTS} -n $[ranks] -c $[threads] --ntasks-per-node %{GPUS_PER_NODEx2} %{TIMELIMIT} $[top]/wrapper.MPS $command
634.hpgmgfv_s,734.hpgmgfv_m,834.hpgmgfv_l=peak=default:
ranks=%{RANKS_PROCS}
threads=%{THREADS_PROCS}
submit = srun ${SRUN_OPTS} -n $[ranks] -c $[threads] --ntasks-per-node %{ENV_SLURM_NTASKS_PER_NODE} %{TIMELIMIT} $[top]/wrapper.MPS $command
635.weather_s=peak=default:
basepeak=1
735.weather_m=peak=default:
ENV_NUMAS = 3 3 1 1 7 7 5 5
threads=%{HALF_NUMA_THREADS}
default:
node_compute_syslbl = DGX A100
node_compute_order = 1
node_compute_count = 64
node_compute_purpose = compute
node_compute_hw_vendor = NVIDIA Corporation
node_compute_hw_model = NVIDIA DGX A100 System
node_compute_hw_cpu_name = AMD EPYC 7742
node_compute_hw_ncpuorder = 2 chips
node_compute_hw_nchips = 2
node_compute_hw_ncores = 128
node_compute_hw_ncoresperchip = 64
node_compute_hw_nthreadspercore = 2
node_compute_hw_cpu_char = Turbo Boost up to 3400 MHz
node_compute_hw_cpu_mhz = 2250
node_compute_hw_pcache = 32 KB I + 32 KB D on chip per core
node_compute_hw_scache = 512 KB I+D on chip per core
node_compute_hw_tcache000 = 256 MB I+D on chip per chip
node_compute_hw_tcache001 = (16 MB shared / 4 cores)
node_compute_hw_ocache = None
node_compute_hw_memory = 2 TB (32 x 64 GB 2Rx8 PC4-3200AA-R)
node_compute_hw_disk000 = OS: 2 TB U.2 NVMe SSD drive
node_compute_hw_disk001 = Internal Storage: 30 TB (8x 3.84 TB U.2 NVMe SSD
node_compute_hw_disk002 = drives)
node_compute_hw_other = None
node_compute_hw_adapter_comm_interconnect = InfiniBand / Communication
node_compute_hw_adapter_comm_model = NVIDIA ConnectX-6 MT28908
node_compute_hw_adapter_comm_count = 8
node_compute_hw_adapter_comm_ports_used = 1
node_compute_hw_adapter_comm_slot_type = PCIe Gen4
node_compute_hw_adapter_comm_data_rate = 200 Gb/s
node_compute_hw_adapter_comm_driver = InfiniBand: 5.4-3.4.0.0
node_compute_hw_adapter_comm_firmware = InfiniBand: 20.32.1010
node_compute_hw_adapter_fs_interconnect = InfiniBand / FileSystem
node_compute_hw_adapter_fs_model = NVIDIA ConnectX-6 MT28908
node_compute_hw_adapter_fs_count = 2
node_compute_hw_adapter_fs_ports_used = 2
node_compute_hw_adapter_fs_slot_type = PCIe Gen4
node_compute_hw_adapter_fs_data_rate = 200 Gb/s
node_compute_hw_adapter_fs_driver = Ethernet: 5.4-3.4.0.0
node_compute_hw_adapter_fs_firmware = Ethernet: 20.32.1010
node_compute_sw_os000 = Ubuntu 20.04
node_compute_sw_os001 = 5.4.0-121-generic
node_compute_sw_localfile = ext4
node_compute_sw_sharedfile = Lustre
node_compute_sw_state = Multi-user, run level 3
node_compute_sw_other = None
default:
node_compute_hw_accel_model = Tesla A100-SXM-80 GB
node_compute_hw_accel_count = 8
node_compute_hw_accel_vendor = NVIDIA Corporation
node_compute_hw_accel_type = GPU
node_compute_hw_accel_connect = NVLINK 3.0, NVSWITCH 2.0 600 GB/s
node_compute_hw_accel_ecc = Yes
node_compute_hw_accel_desc = See Notes
default:
notes_plat_000 = Detailed A100 Information from nvaccelinfo
notes_plat_005 = CUDA Driver Version: 11040
notes_plat_010 = NVRM version: NVIDIA UNIX x86_64 Kernel Module 470.7.01
notes_plat_015 = Device Number: 0
notes_plat_020 = Device Name: NVIDIA A100-SXM-80 GB
notes_plat_025 = Device Revision Number: 8.0
notes_plat_030 = Global Memory Size: 85198045184
notes_plat_035 = Number of Multiprocessors: 108
notes_plat_040 = Concurrent Copy and Execution: Yes
notes_plat_045 = Total Constant Memory: 65536
notes_plat_050 = Total Shared Memory per Block: 49152
notes_plat_055 = Registers per Block: 65536
notes_plat_060 = Warp Size: 32
notes_plat_065 = Maximum Threads per Block: 1024
notes_plat_070 = Maximum Block Dimensions: 1024, 1024, 64
notes_plat_075 = Maximum Grid Dimensions: 2147483647 x 65535 x 65535
notes_plat_080 = Maximum Memory Pitch: 2147483647B
notes_plat_085 = Texture Alignment: 512B
notes_plat_090 = Clock Rate: 1410 MHz
notes_plat_095 = Execution Timeout: No
notes_plat_100 = Integrated Device: No
notes_plat_105 = Can Map Host Memory: Yes
notes_plat_110 = Compute Mode: default
notes_plat_115 = Concurrent Kernels: Yes
notes_plat_120 = ECC Enabled: Yes
notes_plat_125 = Memory Clock Rate: 1593 MHz
notes_plat_130 = Memory Bus Width: 5120 bits
notes_plat_135 = L2 Cache Size: 41943040 bytes
notes_plat_140 = Max Threads Per SMP: 2048
notes_plat_145 = Async Engines: 3
notes_plat_150 = Unified Addressing: Yes
notes_plat_155 = Managed Memory: Yes
notes_plat_160 = Concurrent Managed Memory: Yes
notes_plat_165 = Preemption Supported: Yes
notes_plat_170 = Cooperative Launch: Yes
notes_plat_175 = Multi-Device: Yes
notes_plat_180 = Default Target: cc80
default:
makeflags = -j 40
flagsurl000 = http://www.spec.org/hpc2021/flags/nv2021_flags_v1.0.3.2022-11-03.xml
default:
sw_compiler000 = C/C++/Fortran: Version 22.3 of
sw_compiler001 = NVIDIA HPC SDK for Linux
sw_mpi_library = OpenMPI Version 4.1.2rc4
sw_mpi_other = HPC-X Software Toolkit Version 2.10
sw_other = None
default:
label = %{CONTROL}.11_0
default:
CC = mpicc
CXX = mpicxx
FC = mpif90
CC_VERSION_OPTION = -V
CXX_VERSION_OPTION = -V
FC_VERSION_OPTION = -V
ENV_OMPI_MCA_pml=ucx
ENV_OMPI_MCA_topo=basic
ENV_UCX_LOG_LEVEL=error
ENV_OMPI_MCA_coll_hcoll_enable=1
ENV_CUDA_CACHE_DISABLE=1
ENV_HCOLL_BUFFER_POOL_MEM_PER_NODE=1024Mb
ENV_RETRY_COUNT=1000
ENV_UCX_TLS=rc,cuda_copy,cuda_ipc,gdr_copy
ENV_UCX_RNDV_SCHEME=get_zcopy
ENV_UCX_RNDV_THRESH=8192
ENV_UCX_MAX_RNDV_RAILS=1
ENV_OMPI_MCA_pml_ucx_devices=any
ENV_OMPI_MCA_pml_ucx_tls=any
default:
pmodel=ACC
OPTIMIZE = -w -fast -DSPEC_ACCEL_AWARE_MPI -acc=gpu -gpu=cuda11.0,cc80 -Mstack_arrays -Mfprelaxed -Mnouniform -tp=zen2
CXXPORTABILITY = --c++17
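# Illustrative build command implied by the settings above (comment only, not taken
# from a build log): for a Fortran benchmark compiled under pmodel=ACC the harness
# would invoke roughly
#   mpif90 -w -fast -DSPEC_ACCEL_AWARE_MPI -acc=gpu -gpu=cuda11.0,cc80 \
#          -Mstack_arrays -Mfprelaxed -Mnouniform -tp=zen2 <sources> -o <exe>
# with the per-benchmark PORTABILITY/OPTIMIZE overrides below applied on top; this
# particular run used --nobuild, so the binaries had already been built from this
# configuration.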
505.lbm_t,605.lbm_s,705.lbm_m,805.lbm_l=default=default:
CPORTABILITY = -DSPEC_OPENACC_NO_SELF
505.lbm_t,605.lbm_s,705.lbm_m,805.lbm_l=peak=default:
OPTIMIZE = -w -O3 -DSPEC_ACCEL_AWARE_MPI -acc=gpu -gpu=cuda11.0,cc80,maxregcount:128 -Mstack_arrays -Mfprelaxed -Mnouniform -tp=zen2 -mp
513.soma_t,613.soma_s=peak=default:
518.tealeaf_t,618.tealeaf_s,718.tealeaf_m,818.tealeaf_l=peak=default:
OPTIMIZE = -w -O3 -DSPEC_ACCEL_AWARE_MPI -acc=gpu -gpu=cuda11.0,cc80 -Mstack_arrays -Mfprelaxed -Mnouniform -tp=zen2 -mp
COPTIMIZE = -Msafeptr
519.clvleaf_t,619.clvleaf_s,719.clvleaf_m,819.clvleaf_l=peak=default:
OPTIMIZE = -w -fast -DSPEC_ACCEL_AWARE_MPI -acc=gpu -gpu=cuda11.0,cc80 -Mstack_arrays -Mfprelaxed -Mnouniform -tp=zen2 -mp
521.miniswp_t,621.miniswp_s=peak=default:
OPTIMIZE = -w -O3 -DSPEC_ACCEL_AWARE_MPI -acc=gpu -gpu=cuda11.0,cc80 -Mstack_arrays -Mfprelaxed -Mnouniform -tp=zen2
COPTIMIZE = -Msafeptr
528.pot3d_t,628.pot3d_s,728.pot3d_m,828.pot3d_l=peak=default:
ENV_HCOLL_BUFFER_POOL_MEM_PER_NODE=512Mb
srcalt=acc_async
532.sph_exa_t,632.sph_exa_s=peak=default:
OPTIMIZE = -w -fast -DSPEC_ACCEL_AWARE_MPI -acc=gpu -gpu=cuda11.0,cc80 -Mstack_arrays -Mfprelaxed -Mnouniform -tp=zen2 -Mquad
COPTIMIZE = -Msafeptr
534.hpgmgfv_t,634.hpgmgfv_s,734.hpgmgfv_m,834.hpgmgfv_l=peak=default:
OPTIMIZE = -w -fast -DSPEC_ACCEL_AWARE_MPI -acc=gpu -gpu=cuda11.0,cc80 -Mstack_arrays -Mfprelaxed -Mnouniform -tp=zen2
COPTIMIZE = -Msafeptr
535.weather_t,635.weather_s,735.weather_m,835.weather_l=peak=default:
srcalt=acc_collapse
default:
notes_comp_000 = Binaries built and run within an NVHPC SDK 22.3 / CUDA 11.0 / Ubuntu 20.04
notes_comp_005 = container available from NVIDIA GPU Cloud (NGC):
notes_comp_010 = https://ngc.nvidia.com/catalog/containers/nvidia:nvhpc
notes_comp_015 = https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nvhpc/tags
notes_comp_020 =
default:
notes_000 = Full system details documented here:
notes_005 = https://images.nvidia.com/aem-dam/Solutions/Data-Center/gated-resources/nvidia-dgx-superpod-a100.pdf
notes_010 =
notes_015 = Environment variables set by runhpc before the start of the run:
notes_020 = SPEC_NO_RUNDIR_DEL = "on"
notes_025 =