/ - Diff - BoT - Forge du Centre Blaise Pascal

Révision b412345d

     Package: bot
     Maintainer: Florent Chuffart <florent.chuffart@ens-lyon.fr>
     Author: Florent Chuffart
     Version: 0.9.1
     License: CeCILL
     Title: bot
     Depends: fork
     Description: BoT (which stands for Bag of Tasks) is an R package that distributes independent tasks over many cores and many computing nodes. Because BoT relies only on process forking and on task locking through the file system, it is compatible with most computing infrastructures (multicore, clusters, grids, clouds). Using BoT, each task is a set of parameters associated with a user-defined function built on an R process. The next step consists in forking this R process for each core of the computing node. Finally, the forked set of tasks is randomized and executed in parallel. When a task starts, a distributed lock is taken. This avoids redundant task execution. When a task ends, its result is dumped into a file. BoT is used to compute NGS data in the context of the SiGHT project (ERC-StG2011-281359). BoT has been tested on two infrastructures: the Grid'5000 experimental testbed (https://www.grid5000.fr) and the PSMN computing center of ENS de Lyon (http://www.ens-lyon.fr/PSMN). For more information on how to use bot, have a look at the examples in the help pages ?bot::run_engine and ?bot::botapply.
     URL: http://www.ens-lyon.fr/LBMC/gisv/index.php/en/protocols/bioinformatics  http://www.ens-lyon.fr/LBMC/gisv

b/src/NAMESPACE
	1	export(run_engine, bot_stats, botapply)

     ######################################
     # DISTRIBUTED COMPUTING TASKS ENGINE #
     ######################################
     get_nb_proc <- function ## Empirical function to find the number of cores of the current computing node.
     ### This function probes the \emph{/proc/cpuinfo} file (Linux) and then \emph{sysctl} (Mac OS X). A probe that fails, prints nothing or prints something that is not a number is ignored, and 1 is returned when no probe succeeds. This function can be extended or overridden by the user to adapt it to their own system.
     () {
       # Site-specific override: the node named "cremone" is reported as 12 cores.
       if (Sys.info()[["nodename"]] == "cremone") {
         return(12)
       }
       # Run one shell probe and turn its first output line into a core count.
       # Any failure (command error, empty output, non-numeric output) yields 0
       # so that the caller can simply move on to the next probe.
       count_from <- function(command) {
         out <- tryCatch(
           suppressWarnings(system(command, intern = TRUE, ignore.stderr = TRUE)),
           error = function(e) character(0)
         )
         n <- suppressWarnings(as.integer(out[1]))
         if (length(n) != 1 || is.na(n) || n < 0) {
           return(0L)
         }
         n
       }
       # Probes are tried in order; the first strictly positive count wins.
       probes <- c(
         "cat /proc/cpuinfo | grep 'core id' | wc -l",
         "cat /proc/cpuinfo | grep 'cpuid level' | wc -l",
         "sysctl -a | grep machdep.cpu.core_count | cut -d ' ' -f 2"
       )
       for (command in probes) {
         nb_found <- count_from(command)
         if (nb_found > 0) {
           return(nb_found)
         }
       }
       # No probe succeeded: assume a single core.
       1L
     ### The number of cores that the current computing node owns (at least 1).
     }
     run_engine <- structure(function (## Execute a bag of tasks in parallel, on as many cores as the current computing node owns.
     ### This bag of tasks engine forks processes on as many cores as the current computing node owns (or on \emph{nb_proc} cores when it is given). Each sub-process walks the list of tasks in random order. For each task that is neither locked nor already cached, it starts by taking a lock on this task (creating a file named out_filename.lock in \emph{bot_cache_dir}). Next, it executes the task_processor (a function) using the corresponding set of parameters (task). When this execution is completed, it dumps the task_processor result into a results file (named out_filename.RData). All tasks are validated before anything is started: a task without \emph{out_filename} raises an error.
     tasks,  ##<< A list of tasks, each task is a list of key values that will be passed as arguments to the task_processor. Note that task$out_filename is a mandatory parameter.
     task_processor, ##<< A function that will be called for each task in the task list \emph{tasks}.
     debug=FALSE, ##<< If \emph{TRUE} no process will be forked, the list of tasks will be executed in the current process.
     starter_name="~/.start_best_effort_jobs", ##<< Path to file that will be deleted after the execution of all tasks if \emph{rm_starter} is set to \emph{TRUE}.
     rm_starter=TRUE, ##<< If \emph{TRUE} the file \emph{starter_name} will be deleted after the execution of all tasks.
     log_dir="log", ##<< Path to the \emph{log} directory.
     bot_cache_dir = "cache",   ##<< the directory where task results are cached
     nb_proc=NULL, ##<< If not NULL fix the number of core on which tasks must be computed.
     ... ##<< Other arguments that will be passed to \emph{task_processor}.
     ){
       # Check the mandatory task attribute up front, in the calling process,
       # so that an invalid bag of tasks fails before any worker is started.
       for (task in tasks) {
         if (is.null(task$out_filename)) {
           stop("Attribute task$out_filename is mandatory.", call. = FALSE)
         }
       }
       if (!file.exists(log_dir)) {
         dir.create(log_dir, recursive = TRUE)
       }
       if (!file.exists(bot_cache_dir)) {
         dir.create(bot_cache_dir, recursive = TRUE)
       }
       print(paste("#tasks: ", length(tasks)))
       # Body of one worker. It reads proc_id and UID from the enclosing
       # run_engine() frame, where they are assigned before each call/fork.
       forked_part <- function() {
         stats <- list()
         stats$proc_id <- proc_id
         stats$UID <- UID
         set.seed(proc_id + UID)
         if (!debug) {
           # Random start-up delay (1 to 29 seconds) before the worker begins.
           Sys.sleep(floor(runif(1, 1, 30)))
         }
         hostname <- system("hostname", intern = TRUE)
         stats$hostname <- hostname
         sink(paste0(log_dir, "/proc_id_hostname_", proc_id, "_", hostname, ".log"), type = c("output", "message"), split = TRUE)
         # Restore the output stream even when task_processor raises an error.
         on.exit(sink(), add = TRUE)
         worker_tag <- paste0("[proc_", proc_id, "] ")
         need_rerun <- TRUE
         nb_loop <- 0
         # Walk the bag until a full pass neither computes nor skips a locked task.
         while (need_rerun) {
           need_rerun <- FALSE
           for (task in sample(tasks)) {
             # Files used to track the task inside the cache directory.
             lock_filename <- paste0(bot_cache_dir, "/", task$out_filename, ".lock")
             save_filename <- paste0(bot_cache_dir, "/", task$out_filename, ".RData")
             lock_filename_bis <- paste0(bot_cache_dir, "/", task$out_filename, "_", (proc_id + UID), ".lock4no")
             for_nothing_filename <- paste0(bot_cache_dir, "/", task$out_filename, "_", (proc_id + UID), ".RData4no")
             stats$start_date <- as.integer(format(Sys.time(), "%s"))
             if (nb_loop == 0 && file.exists(lock_filename)) {
               # Locks are only honoured during the first pass; a task still
               # unsaved on a later pass is computed by this worker.
               print(paste0(worker_tag, date(), " ", save_filename, " is locked... skipping."))
               need_rerun <- TRUE
             } else if (file.exists(save_filename)) {
               print(paste0(worker_tag, date(), " ", save_filename, " exists... skipping."))
             } else {
               need_rerun <- TRUE
               print(paste0(worker_tag, date(), " taking lock on ", lock_filename, " and computing..."))
               save(stats, file = lock_filename)
               save(stats, file = lock_filename_bis)
               task_result <- task_processor(task, ...)
               stats$stop_date <- as.integer(format(Sys.time(), "%s"))
               if (!file.exists(save_filename)) {
                 save(task_result, stats, file = save_filename)
               } else {
                 # Another worker saved the result in the meantime: keep this
                 # redundant result aside in a .RData4no file.
                 print(paste0(worker_tag, stats$stop_date, " ", save_filename, " already exists... So it have been computed for nothing."))
                 save(task_result, stats, file = for_nothing_filename)
               }
               file.remove(lock_filename)
               file.remove(lock_filename_bis)
             }
           }
           nb_loop <- nb_loop + 1
         }
         if (rm_starter) {
           print(paste0("all task have been processed. Removing ", starter_name, "..."))
           # Several workers reach this point; only remove the file if it is
           # still there.
           if (file.exists(starter_name)) {
             file.remove(starter_name)
           }
         } else {
           print("all task have been processed.")
         }
         invisible(NULL)
       }
       UID <- round(runif(1, 1, 1000000))
       if (debug) {
         nb_proc <- 1
         proc_id <- 0
         print("running engine witout forking (DEBUG MODE)...")
         forked_part()
       } else {
         if (is.null(nb_proc)) {
           nb_proc <- get_nb_proc()
         }
         print(paste0("running engine over ", nb_proc, " proc(s)..."))
         pids <- c()
         # A plain for loop is required here: proc_id must be assigned in this
         # frame so that forked_part() sees it.
         for (proc_id in seq_len(nb_proc)) {
           # Here we fork!
           pids <- c(pids, fork(forked_part))
         }
         # Wait until every child has finished.
         for (pid in pids) {
           wait(pid)
         }
       }
       invisible(NULL)
     ### Nothing (NULL, invisibly). Results are written to \emph{bot_cache_dir}.
     }, ex=function(){
       # We define a basic task_processor
       sum_a_b = function(task) {
         return(task$a + task$b)
       }
       # We define 9 tasks
       tasks = list()
       for (a in 1:3) {
         for (b in 4:6) {
           tasks[[length(tasks) + 1]] = list(a=a, b=b, out_filename=paste("sum_a_b", a, b, sep="_"))
         }
       }
       # We execute the 9 tasks
       run_engine(tasks, sum_a_b)
       # We collect 9 task results
       for (a in 1:3) {
         for (b in 4:6) {
           out_filename = paste("sum_a_b", a, b, sep="_")
           out_filename = paste("cache/", out_filename, ".RData", sep="")
           load(out_filename)
           print(task_result)
         }
       }
       # Better way to do that
       apply(t(tasks), 2, function(task) {
         out_filename = task[[1]]$out_filename
         out_filename = paste("cache/", out_filename, ".RData", sep="")
         load(out_filename)
         print(task_result)
       })
       # Viewing statistics about the campaign.
       bot_stats()
     })
     botapply <- structure(function(## Use bot features in an apply fashion.
     ### With botapply you can write an independent loop in an apply fashion: the tasks are run through \emph{run_engine}, then the cached result of each task is read back from \emph{bot_cache_dir} and the results are returned together.
     tasks,  ##<< A list of tasks, each task is a list of key values that will be passed as arguments to the task_processor. Note that task$out_filename is a mandatory parameter.
     task_processor, ##<< A function that will be called for each task in the task list \emph{tasks}.
     bot_cache_dir = "cache",   ##<< the directory where task results are cached
     ... ##<< Other arguments that will be passed to \emph{run_engine}.
     ) {
       print(bot_cache_dir)
       run_engine(tasks, task_processor, bot_cache_dir=bot_cache_dir, ...)
       # Read the task_result object stored in the cache file of one task.
       # `column` is one column of the 1 x n list-matrix t(tasks), i.e. a
       # length-one list wrapping the task. Yields NULL when the file holds no
       # task_result object.
       read_cached_result <- function(column) {
         cached_file <- paste0(bot_cache_dir, "/", column[[1]]$out_filename, ".RData")
         loaded <- new.env()
         load(cached_file, envir = loaded)
         loaded$task_result
       }
       apply(t(tasks), 2, read_cached_result)
     # It returns the computed task results, as assembled by apply().
     }, ex=function(){
       four_tasks <- list(
         list(a=1, b=10, out_filename="task1"),
         list(a=2, b=20, out_filename="task2"),
         list(a=3, b=30, out_filename="task3"),
         list(a=4, b=40, out_filename="task4"))
       add_a_b <- function(task) {
         task$a + task$b
       }
       botapply(four_tasks, add_a_b)
       # botapply(list(list(a=1, b=10, out_filename="task1")),function(task) {return(task$a + task$b)})
     })
     bot_stats <- function(## Compute and display statistics about a campaign.
     ### This function browses the bot_cache_dir directory and collects the statistics saved with each task result (files ending in .RData). Next, it displays a Gantt chart, a task duration chart and how the computing elements are used. Finally, it prints some statistics about the campaign. An error is raised when the directory holds no task result.
     bot_cache_dir = "cache",   ##<< the directory where task results are cached
     WHITH4NO = FALSE  ##<< TRUE if you want to include redundant submissions (files ending in .RData4no) in the stats.
     ) {
       # Load the `stats` list saved in every cache file whose name matches
       # the regular expression `pattern` and stack them, one row per file.
       # `efficient` tags the rows (1: useful result, 2: redundant result).
       # Returns NULL when no file provides statistics.
       collect_stats <- function(pattern, efficient) {
         per_file <- lapply(list.files(bot_cache_dir, pattern), function(file) {
           loaded <- new.env()
           load(paste0(bot_cache_dir, "/", file), envir = loaded)
           loaded$stats
         })
         per_file <- Filter(Negate(is.null), per_file)
         if (length(per_file) == 0) {
           return(NULL)
         }
         # unlist() flattens every field to a common type, so the collected
         # columns are character whenever one field (e.g. hostname) is.
         collected <- data.frame(t(matrix(unlist(per_file), length(per_file[[1]]))), stringsAsFactors = FALSE)
         names(collected) <- names(per_file[[1]])
         collected$efficient <- efficient
         collected
       }
       # Anchored patterns: ".RData" must end the name, so that redundant
       # ".RData4no" results and ".lock4no" files are not picked up here.
       campaign <- collect_stats("\\.RData$", 1)
       if (is.null(campaign)) {
         stop("No task result found in ", bot_cache_dir, call. = FALSE)
       }
       if (WHITH4NO) {
         # rbind() leaves `campaign` untouched when there is no redundant result.
         campaign <- rbind(campaign, collect_stats("\\.RData4no$", 2))
       }
       # Dates are stored in seconds; work in minutes since the first start.
       campaign$start_date <- as.integer(campaign$start_date) / 60
       campaign$stop_date <- as.integer(campaign$stop_date) / 60
       zero <- min(campaign$start_date)
       campaign$start_date <- campaign$start_date - zero
       campaign$stop_date <- campaign$stop_date - zero
       campaign$duration <- campaign$stop_date - campaign$start_date
       campaign$core <- paste(campaign$hostname, campaign$proc_id, sep = "_")
       cores <- sort(unique(campaign$core))
       nb_rows <- nrow(campaign)
       rows <- seq_len(nb_rows)
       # dev.new() opens the default graphics device of the current platform.
       dev.new(width = 16, height = 9)
       layout(matrix(1:3, nrow = 1), respect = TRUE)
       # Panel 1: one horizontal segment per task, ordered by start date.
       campaign <- campaign[order(campaign$start_date), ]
       plot(0, 0, col = 0, xlim = c(0, max(campaign$stop_date)), ylim = c(0, nb_rows), main = paste("Gantt Chart for", bot_cache_dir), xlab = "Time (min)", ylab = "Task")
       segments(campaign$start_date, rows, campaign$stop_date, rows, col = campaign$efficient)
       # Panel 2: task durations, longest first.
       campaign <- campaign[order(campaign$duration, decreasing = TRUE), ]
       plot(0, 0, col = 0, xlim = c(0, max(campaign$duration)), ylim = c(0, nb_rows), main = paste("Task duration for", bot_cache_dir), xlab = "Time (min)", ylab = "Task")
       segments(rep(0, nb_rows), rows, campaign$duration, rows, col = campaign$efficient)
       # Panel 3: the same segments, one line per computing element.
       plot(0, 0, col = 0, xlim = c(0, max(campaign$stop_date)), ylim = c(0, length(cores)), main = paste("Task repartition over computing elements for", bot_cache_dir), xlab = "Time (min)", ylab = "Computing Element")
       ys <- match(campaign$core, cores)
       segments(campaign$start_date, ys, campaign$stop_date, ys, col = campaign$efficient)
       # Format a duration given in minutes as hours:minutes:seconds.
       format_timediff <- function(diff) {
         hr <- diff %/% 60
         min <- floor(diff - hr * 60)
         sec <- round(diff %% 1 * 60, digits = 2)
         paste(hr, min, sec, sep = ":")
       }
       # The summary only accounts for useful (non redundant) results.
       useful <- campaign[campaign$efficient == 1, ]
       cpu_time <- sum(useful$duration)
       elapsed <- max(useful$stop_date)
       cat("#cores....................", length(cores), "\n", sep = "")
       cat("#tasks....................", nrow(useful), "\n", sep = "")
       cat("cpu.time..................", format_timediff(cpu_time), "\n", sep = "")
       cat("time......................", format_timediff(elapsed), "\n", sep = "")
       cat("speedup...................", cpu_time / elapsed, "\n", sep = "")
       cat("efficiency................", cpu_time / elapsed / length(cores), "\n", sep = "")
       return(campaign)
     ### The data.frame of collected information, one row per task result, ordered by decreasing duration.
     }

     \name{bot-package}
     \alias{bot-package}
     \alias{bot}
     \docType{package}
     \title{bot}
     \description{BoT (which stands for Bag Of Tasks) is an R package that distributes independent tasks over many cores and many computing nodes. Because BoT relies only on process forking and on task locking through the file system, it is compatible with most computing infrastructures (multicore, clusters, grids, clouds). Using BoT, each task is a set of parameters associated with a user-defined function built on an R process. The next step consists in forking this R process for each core of the computing node. Finally, the forked set of tasks is randomized and executed in parallel. When a task starts, a distributed lock is taken. This avoids redundant task execution. When a task ends, its result is dumped into a file. BoT is used to compute NGS data in the context of the SiGHT project (ERC-StG2011-281359). BoT has been tested on two infrastructures: the Grid'5000 experimental testbed (https://www.grid5000.fr) and the PSMN computing center of ENS de Lyon (http://www.ens-lyon.fr/PSMN). For more information on how to use bot, have a look at the example in the help page ?bot::run_engine.}
     \details{
     \tabular{ll}{Package: \tab bot\cr
     Maintainer: \tab Florent Chuffart <florent.chuffart@ens-lyon.fr>\cr
     Author: \tab Florent Chuffart\cr
     Version: \tab 0.9\cr
     License: \tab CeCILL\cr
     Title: \tab bot\cr
     Depends: \tab fork\cr
     URL: \tab http://www.ens-lyon.fr/LBMC/gisv/index.php/en/protocols/bioinformatics  http://www.ens-lyon.fr/LBMC/gisv\cr}
+    }
     \author{Florent Chuffart}
     \keyword{ package }

     \name{bot_stats}
     \alias{bot_stats}
     \title{Compute and display statistics about a campaign.}
     \description{This function browses the bot_cache_dir directory and collects information about tasks. Next, it displays a Gantt chart, a task chart and how the computing elements are used. Finally, it prints some statistics about the campaign to the output.}
     \usage{bot_stats(bot_cache_dir = "cache", WHITH4NO = FALSE)}
     \arguments{
       \item{bot_cache_dir}{the directory where task results are cached}
       \item{WHITH4NO}{TRUE if you want to include redundant submissions in the stats.}
+    }
     \author{Florent Chuffart}

     \name{botapply}
     \alias{botapply}
     \title{A function to use bot features in an apply fashion.}
     \description{With botapply you can write your independent loop in an apply fashion; results are collected and returned once all tasks are done.}
     \usage{botapply(tasks, task_processor, bot_cache_dir = "cache", ...)}
     \arguments{
       \item{tasks}{A list of tasks, each task is a list of key values that will be passed as arguments to the task_processor. Note that task$out_filename is a mandatory parameter.}
       \item{task_processor}{A function that will be called for each task in the task list \emph{tasks}.}
       \item{bot_cache_dir}{the directory where task results are cached}
       \item{\dots}{Other arguments that will be passed to \emph{run_engine}.}
+    }
     \author{Florent Chuffart}
     \examples{
     botapply(
       list(
         list(a=1, b=10, out_filename="task1"),
         list(a=2, b=20, out_filename="task2"),
         list(a=3, b=30, out_filename="task3"),
         list(a=4, b=40, out_filename="task4")),
       function(task) {
         return(task$a + task$b)})
     # botapply(list(list(a=1, b=10, out_filename="task1")),function(task) {return(task$a + task$b)})
+    }

     \name{get_nb_proc}
     \alias{get_nb_proc}
     \title{Empirical function to find the number of cores of the current computing node.}
     \description{This function uses the \emph{/proc/cpuinfo} file on Debian and \emph{sysctl} on Mac OS X. This function can be extended or overridden by the user to adapt it to their own system.}
     \usage{get_nb_proc()}
     \value{The number of cores that the current computing node owns.}
     \author{Florent Chuffart}

     \name{run_engine}
     \alias{run_engine}
     \title{Execute a bag of tasks in parallel, on as many cores as the current computing node owns.}
     \description{This bag of tasks engine forks processes on as many cores as the current computing node owns. Each sub-process takes a task randomly in the list of tasks. For each task, it starts by taking a lock on this task (creating a file named out_filename.lock). Next, it executes the task_processor (a function) using the corresponding set of parameters (task). When this execution is completed, it dumps task_processor results into a results file (named out_filename.RData).}
     \usage{run_engine(tasks, task_processor, debug = FALSE, starter_name = "~/.start_best_effort_jobs",
         rm_starter = TRUE, log_dir = "log", bot_cache_dir = "cache",
         nb_proc = NULL, ...)}
     \arguments{
       \item{tasks}{A list of tasks, each task is a list of key values that will be passed as arguments to the task_processor. Note that task$out_filename is a mandatory parameter.}
       \item{task_processor}{A function that will be called for each task in the task list \emph{tasks}.}
       \item{debug}{If \emph{TRUE} no process will be forked, the list of tasks will be executed in the current process.}
       \item{starter_name}{Path to file that will be deleted after the execution of all tasks if \emph{rm_starter} is set to \emph{TRUE}.}
       \item{rm_starter}{If \emph{TRUE} the file \emph{starter_name} will be deleted after the execution of all tasks.}
       \item{log_dir}{Path to the \emph{log} directory.}
       \item{bot_cache_dir}{the directory where task results are cached}
       \item{nb_proc}{If not NULL fix the number of core on which tasks must be computed.}
       \item{\dots}{Other arguments that will be passed to \emph{task_processor}.}
+    }
     \author{Florent Chuffart}
     \examples{
     # We define a basic task_processor
     sum_a_b = function(task) {
       return(task$a + task$b)
+    }
     # We define 9 tasks
     tasks = list()
     for (a in 1:3) {
       for (b in 4:6) {
         tasks[[length(tasks) + 1]] = list(a=a, b=b, out_filename=paste("sum_a_b", a, b, sep="_"))
+      }
+    }
     # We execute the 9 tasks
     run_engine(tasks, sum_a_b)
     # We collect 9 task results
     for (a in 1:3) {
       for (b in 4:6) {
         out_filename = paste("sum_a_b", a, b, sep="_")
         out_filename = paste("cache/", out_filename, ".RData", sep="")
         load(out_filename)
         print(task_result)
+      }
+    }
     # Better way to do that
     apply(t(tasks), 2, function(task) {
       out_filename = task[[1]]$out_filename
       out_filename = paste("cache/", out_filename, ".RData", sep="")
       load(out_filename)
       print(task_result)
     })
     # Viewing statistics about the campaign.
     bot_stats()
+    }

Formats disponibles : Unified diff

LBMC » BoT

Révision b412345d