MPI - Distributed Memory Parallelism

Last updated on 2024-02-06

Estimated time: 12 minutes

Overview

Questions

  • How do you use more than one shared-memory node?

Objectives

  • Demonstrate that distributed memory parallelism is useful for working with large data
  • Demonstrate that distributed memory parallelism can lead to improved time to solution

Introduction


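Shared memory parallelism is limited to the cores of a single node. To go beyond one node, we use distributed memory parallelism: several copies of the same R script run as MPI ranks (processes), each works on its own piece of the problem, and collective operations combine the results. The pbdMPI package provides this SPMD (Single Program, Multiple Data) style of programming in R, and the scripts are launched with mpirun.

As a warm-up, here is a minimal SPMD sketch (not part of the lesson code; the file name hello_mpi.R is just an example) that prints one line per rank:

R

## hello_mpi.R -- launch with, e.g.: mpirun -np 4 Rscript hello_mpi.R
library(pbdMPI, quiet = TRUE)

## Every rank runs the same script; comm.rank() and comm.size()
## identify this process and the total number of processes.
comm.cat("Hello from rank", comm.rank(), "of", comm.size(), "\n",
         all.rank = TRUE)

finalize()
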
Distributed Memory Random Forest

Letter Recognition

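The script below adapts the serial random forest code (code_2/rf_serial.R, timed for comparison in the batch scripts further down) to pbdMPI; the lines that change are marked with #<<.
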
R

suppressPackageStartupMessages(library(randomForest))
data(LetterRecognition, package = "mlbench")
library(pbdMPI, quiet = TRUE)                #<<
comm.set.seed(seed = 7654321, diff = FALSE)      #<<

n = nrow(LetterRecognition)
n_test = floor(0.2 * n)
i_test = sample.int(n, n_test)
train = LetterRecognition[-i_test, ]
test = LetterRecognition[i_test, ][comm.chunk(n_test, form = "vector"), ]    #<<

comm.set.seed(seed  = 1234, diff = TRUE)          #<<
my.rf = randomForest(lettr ~ ., train, ntree = comm.chunk(500), norm.votes = FALSE) #<<
rf.all = allgather(my.rf)                  #<<
rf.all = do.call(combine, rf.all)          #<<
pred = as.vector(predict(rf.all, test))

correct = allreduce(sum(pred == test$lettr))  #<<
comm.cat("Proportion Correct:", correct/(n_test), "\n")

finalize()

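comm.set.seed(diff = FALSE) gives every rank the same random number stream, so all ranks draw the same train/test split, and comm.chunk(n_test, form = "vector") then hands each rank its own slice of the test set. Switching to diff = TRUE gives each rank a different stream, so each rank grows a different set of comm.chunk(500) trees (roughly 500 divided by the number of ranks). allgather() collects the per-rank forests on every rank, combine() merges them into a single 500-tree forest, each rank predicts its own test chunk, and allreduce() sums the counts of correct predictions before rank 0 prints the overall proportion correct.

To see how comm.chunk() splits work across ranks, here is a small sketch (not part of the lesson code; the file name chunk_demo.R is just an example):

R

## chunk_demo.R -- launch with, e.g.: mpirun -np 4 Rscript chunk_demo.R
library(pbdMPI, quiet = TRUE)

my_ntree = comm.chunk(500)                  # this rank's share of 500 trees
my_rows  = comm.chunk(20, form = "vector")  # this rank's indices out of 1:20

comm.cat("Rank", comm.rank(), "grows", my_ntree, "trees\n", all.rank = TRUE)
comm.print(my_rows, all.rank = TRUE)

finalize()
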
Diamond Price Regression

R

library(randomForest)
data(diamonds, package = "ggplot2")
library(pbdMPI)                                  #<<
comm.set.seed(seed = 7654321, diff = FALSE)      #<<

n = nrow(diamonds)
n_test = floor(0.5 * n)
i_test = sample.int(n, n_test)
train = diamonds[-i_test, ]
test = diamonds[i_test, ][comm.chunk(n_test, form = "vector"), ]    #<<

comm.set.seed(seed = 1e6 * runif(1), diff = TRUE)          #<<
my.rf = randomForest(price ~ ., train, ntree = comm.chunk(100), norm.votes = FALSE) #<<
rf.all = allgather(my.rf)                  #<<
rf.all = do.call(combine, rf.all)          #<<
pred = as.vector(predict(rf.all, test))

sse = sum((pred - test$price)^2)
comm.cat("MSE =", reduce(sse)/n_test, "\n")

finalize()          #<<

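The diamonds example follows the same pattern, but price is numeric, so randomForest() grows a regression forest rather than a classifier. Each rank grows its share of the 100 trees and predicts its own chunk of the test set; reduce() then combines the per-rank sums of squared errors onto rank 0, where comm.cat() prints the mean squared error.

The following Slurm batch script (written for the Andes cluster at OLCF; the project account, partition, and module/FlexiBLAS setup are site-specific) times the serial version and then the MPI version with 1, 2, 4, 8, 16, and 32 ranks on a single node:
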
BASH

#!/bin/bash
#SBATCH -J rf
#SBATCH -A CSC143
#SBATCH -p batch
#SBATCH --nodes=1
#SBATCH -t 00:40:00
#SBATCH --mem=0
#SBATCH -e ./rf.e
#SBATCH -o ./rf.o
#SBATCH --open-mode=truncate

cd ~/R4HPC/code_5
pwd

## modules are specific to andes.olcf.ornl.gov
module load openblas/0.3.17-omp
module load flexiblas
flexiblas add OpenBLAS $OLCF_OPENBLAS_ROOT/lib/libopenblas.so
export LD_PRELOAD=$OLCF_FLEXIBLAS_ROOT/lib64/libflexiblas.so
module load r
echo -e "loaded R with FlexiBLAS"
module list

time Rscript ../code_2/rf_serial.R
time mpirun --map-by ppr:1:node Rscript rf_mpi.R
time mpirun --map-by ppr:2:node Rscript rf_mpi.R
time mpirun --map-by ppr:4:node Rscript rf_mpi.R
time mpirun --map-by ppr:8:node Rscript rf_mpi.R
time mpirun --map-by ppr:16:node Rscript rf_mpi.R
time mpirun --map-by ppr:32:node Rscript rf_mpi.R

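An equivalent batch script for a PBS-based cluster (queue, resource selection, and module names are again site-specific) runs the same set of timings:
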
BASH

#!/bin/bash
#PBS -N rf
#PBS -l select=1:ncpus=32
#PBS -l walltime=00:05:00
#PBS -q qexp
#PBS -e rf.e
#PBS -o rf.o

cd ~/R4HPC/code_5
pwd

module load R
echo "loaded R"

time Rscript ../code_2/rf_serial.R
time mpirun --map-by ppr:1:node Rscript rf_mpi.R
time mpirun --map-by ppr:2:node Rscript rf_mpi.R
time mpirun --map-by ppr:4:node Rscript rf_mpi.R
time mpirun --map-by ppr:8:node Rscript rf_mpi.R
time mpirun --map-by ppr:16:node Rscript rf_mpi.R
time mpirun --map-by ppr:32:node Rscript rf_mpi.R

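Compare the elapsed times across the runs: each rank grows only its share of the trees, so training time should drop as ranks are added, while reading the data, gathering and combining the forests, and prediction add overhead that limits the overall speedup.
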
Key Points

  • Random forests handle both classification (predicting letters) and regression (predicting diamond prices)
  • Distributed memory parallelism can speed up training by growing trees in parallel across ranks