001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.yarn.applications.distributedshell;
020    
021    import java.io.BufferedReader;
022    import java.io.IOException;
023    import java.io.InputStreamReader;
024    import java.net.URI;
025    import java.net.URISyntaxException;
026    import java.nio.ByteBuffer;
027    import java.util.ArrayList;
028    import java.util.HashMap;
029    import java.util.List;
030    import java.util.Map;
031    import java.util.Vector;
032    import java.util.concurrent.ConcurrentHashMap;
033    import java.util.concurrent.ConcurrentMap;
034    import java.util.concurrent.atomic.AtomicInteger;
035    
036    import org.apache.commons.cli.CommandLine;
037    import org.apache.commons.cli.GnuParser;
038    import org.apache.commons.cli.HelpFormatter;
039    import org.apache.commons.cli.Options;
040    import org.apache.commons.cli.ParseException;
041    import org.apache.commons.logging.Log;
042    import org.apache.commons.logging.LogFactory;
043    import org.apache.hadoop.classification.InterfaceAudience;
044    import org.apache.hadoop.classification.InterfaceStability;
045    import org.apache.hadoop.conf.Configuration;
046    import org.apache.hadoop.yarn.api.ContainerManagementProtocol;
047    import org.apache.hadoop.yarn.api.ApplicationMasterProtocol;
048    import org.apache.hadoop.yarn.api.ApplicationConstants;
049    import org.apache.hadoop.yarn.api.ApplicationConstants.Environment;
050    import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest;
051    import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
052    import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterRequest;
053    import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
054    import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest;
055    import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
056    import org.apache.hadoop.yarn.api.records.Container;
057    import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
058    import org.apache.hadoop.yarn.api.records.ContainerId;
059    import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
060    import org.apache.hadoop.yarn.api.records.ContainerState;
061    import org.apache.hadoop.yarn.api.records.ContainerStatus;
062    import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
063    import org.apache.hadoop.yarn.api.records.LocalResource;
064    import org.apache.hadoop.yarn.api.records.LocalResourceType;
065    import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
066    import org.apache.hadoop.yarn.api.records.NodeReport;
067    import org.apache.hadoop.yarn.api.records.Priority;
068    import org.apache.hadoop.yarn.api.records.Resource;
069    import org.apache.hadoop.yarn.api.records.ResourceRequest;
070    import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;
071    import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
072    import org.apache.hadoop.yarn.client.api.async.NMClientAsync;
073    import org.apache.hadoop.yarn.client.api.async.impl.NMClientAsyncImpl;
074    import org.apache.hadoop.yarn.conf.YarnConfiguration;
075    import org.apache.hadoop.yarn.exceptions.YarnException;
076    import org.apache.hadoop.yarn.util.ConverterUtils;
077    import org.apache.hadoop.yarn.util.Records;
078    
079    /**
080     * An ApplicationMaster for executing shell commands on a set of launched
081     * containers using the YARN framework.
082     * 
083     * <p>
084     * This class is meant to act as an example on how to write yarn-based
085     * application masters.
086     * </p>
087     * 
088     * <p>
089     * The ApplicationMaster is started on a container by the
090     * <code>ResourceManager</code>'s launcher. The first thing that the
091     * <code>ApplicationMaster</code> needs to do is to connect and register itself
092     * with the <code>ResourceManager</code>. The registration sets up information
093     * within the <code>ResourceManager</code> regarding what host:port the
094     * ApplicationMaster is listening on to provide any form of functionality to a
095     * client as well as a tracking url that a client can use to keep track of
096     * status/job history if needed.
097     * </p>
098     * 
099     * <p>
100     * The <code>ApplicationMaster</code> needs to send a heartbeat to the
101     * <code>ResourceManager</code> at regular intervals to inform the
102     * <code>ResourceManager</code> that it is up and alive. The
103     * {@link ApplicationMasterProtocol#allocate} to the <code>ResourceManager</code> from the
104     * <code>ApplicationMaster</code> acts as a heartbeat.
105     * 
106     * <p>
107     * For the actual handling of the job, the <code>ApplicationMaster</code> has to
108     * request the <code>ResourceManager</code> via {@link AllocateRequest} for the
109     * required no. of containers using {@link ResourceRequest} with the necessary
110     * resource specifications such as node location, computational
111     * (memory/disk/cpu) resource requirements. The <code>ResourceManager</code>
112     * responds with an {@link AllocateResponse} that informs the
113     * <code>ApplicationMaster</code> of the set of newly allocated containers,
114     * completed containers as well as current state of available resources.
115     * </p>
116     * 
117     * <p>
118     * For each allocated container, the <code>ApplicationMaster</code> can then set
119     * up the necessary launch context via {@link ContainerLaunchContext} to specify
120     * the allocated container id, local resources required by the executable, the
121     * environment to be setup for the executable, commands to execute, etc. and
122     * submit a {@link StartContainerRequest} to the {@link ContainerManagementProtocol} to
123     * launch and execute the defined commands on the given allocated container.
124     * </p>
125     * 
126     * <p>
127     * The <code>ApplicationMaster</code> can monitor the launched container by
128     * either querying the <code>ResourceManager</code> using
129     * {@link ApplicationMasterProtocol#allocate} to get updates on completed containers or via
130     * the {@link ContainerManagementProtocol} by querying for the status of the allocated
131     * container's {@link ContainerId}.
132     *
133     * <p>
134     * After the job has been completed, the <code>ApplicationMaster</code> has to
135     * send a {@link FinishApplicationMasterRequest} to the
136     * <code>ResourceManager</code> to inform it that the
137     * <code>ApplicationMaster</code> has been completed.
138     */
139    @InterfaceAudience.Public
140    @InterfaceStability.Unstable
141    public class ApplicationMaster {
142    
143      private static final Log LOG = LogFactory.getLog(ApplicationMaster.class);
144    
145      // Configuration
146      private Configuration conf;
147    
148      // Handle to communicate with the Resource Manager
149      @SuppressWarnings("rawtypes")
150      private AMRMClientAsync resourceManager;
151    
152      // Handle to communicate with the Node Manager
153      private NMClientAsync nmClientAsync;
154      // Listen to process the response from the Node Manager
155      private NMCallbackHandler containerListener;
156      
157      // Application Attempt Id ( combination of attemptId and fail count )
158      private ApplicationAttemptId appAttemptID;
159    
160      // TODO
161      // For status update for clients - yet to be implemented
162      // Hostname of the container
163      private String appMasterHostname = "";
164      // Port on which the app master listens for status updates from clients
165      private int appMasterRpcPort = 0;
166      // Tracking url to which app master publishes info for clients to monitor
167      private String appMasterTrackingUrl = "";
168    
169      // App Master configuration
170      // No. of containers to run shell command on
171      private int numTotalContainers = 1;
172      // Memory to request for the container on which the shell command will run
173      private int containerMemory = 10;
174      // Priority of the request
175      private int requestPriority;
176    
177      // Counter for completed containers ( complete denotes successful or failed )
178      private AtomicInteger numCompletedContainers = new AtomicInteger();
179      // Allocated container count so that we know how many containers has the RM
180      // allocated to us
181      private AtomicInteger numAllocatedContainers = new AtomicInteger();
182      // Count of failed containers
183      private AtomicInteger numFailedContainers = new AtomicInteger();
184      // Count of containers already requested from the RM
185      // Needed as once requested, we should not request for containers again.
186      // Only request for more if the original requirement changes.
187      private AtomicInteger numRequestedContainers = new AtomicInteger();
188    
189      // Shell command to be executed
190      private String shellCommand = "";
191      // Args to be passed to the shell command
192      private String shellArgs = "";
193      // Env variables to be setup for the shell command
194      private Map<String, String> shellEnv = new HashMap<String, String>();
195    
196      // Location of shell script ( obtained from info set in env )
197      // Shell script path in fs
198      private String shellScriptPath = "";
199      // Timestamp needed for creating a local resource
200      private long shellScriptPathTimestamp = 0;
201      // File length needed for local resource
202      private long shellScriptPathLen = 0;
203    
204      // Hardcoded path to shell script in launch container's local env
205      private final String ExecShellStringPath = "ExecShellScript.sh";
206    
207      private volatile boolean done;
208      private volatile boolean success;
209      
210      // Launch threads
211      private List<Thread> launchThreads = new ArrayList<Thread>();
212    
213      /**
214       * @param args Command line args
215       */
216      public static void main(String[] args) {
217        boolean result = false;
218        try {
219          ApplicationMaster appMaster = new ApplicationMaster();
220          LOG.info("Initializing ApplicationMaster");
221          boolean doRun = appMaster.init(args);
222          if (!doRun) {
223            System.exit(0);
224          }
225          result = appMaster.run();
226        } catch (Throwable t) {
227          LOG.fatal("Error running ApplicationMaster", t);
228          System.exit(1);
229        }
230        if (result) {
231          LOG.info("Application Master completed successfully. exiting");
232          System.exit(0);
233        } else {
234          LOG.info("Application Master failed. exiting");
235          System.exit(2);
236        }
237      }
238    
239      /**
240       * Dump out contents of $CWD and the environment to stdout for debugging
241       */
242      private void dumpOutDebugInfo() {
243    
244        LOG.info("Dump debug output");
245        Map<String, String> envs = System.getenv();
246        for (Map.Entry<String, String> env : envs.entrySet()) {
247          LOG.info("System env: key=" + env.getKey() + ", val=" + env.getValue());
248          System.out.println("System env: key=" + env.getKey() + ", val="
249              + env.getValue());
250        }
251    
252        String cmd = "ls -al";
253        Runtime run = Runtime.getRuntime();
254        Process pr = null;
255        try {
256          pr = run.exec(cmd);
257          pr.waitFor();
258    
259          BufferedReader buf = new BufferedReader(new InputStreamReader(
260              pr.getInputStream()));
261          String line = "";
262          while ((line = buf.readLine()) != null) {
263            LOG.info("System CWD content: " + line);
264            System.out.println("System CWD content: " + line);
265          }
266          buf.close();
267        } catch (IOException e) {
268          e.printStackTrace();
269        } catch (InterruptedException e) {
270          e.printStackTrace();
271        }
272      }
273    
274      public ApplicationMaster() throws Exception {
275        // Set up the configuration and RPC
276        conf = new YarnConfiguration();
277      }
278    
279      /**
280       * Parse command line options
281       *
282       * @param args Command line args
283       * @return Whether init successful and run should be invoked
284       * @throws ParseException
285       * @throws IOException
286       */
287      public boolean init(String[] args) throws ParseException, IOException {
288    
289        Options opts = new Options();
290        opts.addOption("app_attempt_id", true,
291            "App Attempt ID. Not to be used unless for testing purposes");
292        opts.addOption("shell_command", true,
293            "Shell command to be executed by the Application Master");
294        opts.addOption("shell_script", true,
295            "Location of the shell script to be executed");
296        opts.addOption("shell_args", true, "Command line args for the shell script");
297        opts.addOption("shell_env", true,
298            "Environment for shell script. Specified as env_key=env_val pairs");
299        opts.addOption("container_memory", true,
300            "Amount of memory in MB to be requested to run the shell command");
301        opts.addOption("num_containers", true,
302            "No. of containers on which the shell command needs to be executed");
303        opts.addOption("priority", true, "Application Priority. Default 0");
304        opts.addOption("debug", false, "Dump out debug information");
305    
306        opts.addOption("help", false, "Print usage");
307        CommandLine cliParser = new GnuParser().parse(opts, args);
308    
309        if (args.length == 0) {
310          printUsage(opts);
311          throw new IllegalArgumentException(
312              "No args specified for application master to initialize");
313        }
314    
315        if (cliParser.hasOption("help")) {
316          printUsage(opts);
317          return false;
318        }
319    
320        if (cliParser.hasOption("debug")) {
321          dumpOutDebugInfo();
322        }
323    
324        Map<String, String> envs = System.getenv();
325    
326        if (!envs.containsKey(Environment.CONTAINER_ID.name())) {
327          if (cliParser.hasOption("app_attempt_id")) {
328            String appIdStr = cliParser.getOptionValue("app_attempt_id", "");
329            appAttemptID = ConverterUtils.toApplicationAttemptId(appIdStr);
330          } else {
331            throw new IllegalArgumentException(
332                "Application Attempt Id not set in the environment");
333          }
334        } else {
335          ContainerId containerId = ConverterUtils.toContainerId(envs
336              .get(Environment.CONTAINER_ID.name()));
337          appAttemptID = containerId.getApplicationAttemptId();
338        }
339    
340        if (!envs.containsKey(ApplicationConstants.APP_SUBMIT_TIME_ENV)) {
341          throw new RuntimeException(ApplicationConstants.APP_SUBMIT_TIME_ENV
342              + " not set in the environment");
343        }
344        if (!envs.containsKey(Environment.NM_HOST.name())) {
345          throw new RuntimeException(Environment.NM_HOST.name()
346              + " not set in the environment");
347        }
348        if (!envs.containsKey(Environment.NM_HTTP_PORT.name())) {
349          throw new RuntimeException(Environment.NM_HTTP_PORT
350              + " not set in the environment");
351        }
352        if (!envs.containsKey(Environment.NM_PORT.name())) {
353          throw new RuntimeException(Environment.NM_PORT.name()
354              + " not set in the environment");
355        }
356    
357        LOG.info("Application master for app" + ", appId="
358            + appAttemptID.getApplicationId().getId() + ", clustertimestamp="
359            + appAttemptID.getApplicationId().getClusterTimestamp()
360            + ", attemptId=" + appAttemptID.getAttemptId());
361    
362        if (!cliParser.hasOption("shell_command")) {
363          throw new IllegalArgumentException(
364              "No shell command specified to be executed by application master");
365        }
366        shellCommand = cliParser.getOptionValue("shell_command");
367    
368        if (cliParser.hasOption("shell_args")) {
369          shellArgs = cliParser.getOptionValue("shell_args");
370        }
371        if (cliParser.hasOption("shell_env")) {
372          String shellEnvs[] = cliParser.getOptionValues("shell_env");
373          for (String env : shellEnvs) {
374            env = env.trim();
375            int index = env.indexOf('=');
376            if (index == -1) {
377              shellEnv.put(env, "");
378              continue;
379            }
380            String key = env.substring(0, index);
381            String val = "";
382            if (index < (env.length() - 1)) {
383              val = env.substring(index + 1);
384            }
385            shellEnv.put(key, val);
386          }
387        }
388    
389        if (envs.containsKey(DSConstants.DISTRIBUTEDSHELLSCRIPTLOCATION)) {
390          shellScriptPath = envs.get(DSConstants.DISTRIBUTEDSHELLSCRIPTLOCATION);
391    
392          if (envs.containsKey(DSConstants.DISTRIBUTEDSHELLSCRIPTTIMESTAMP)) {
393            shellScriptPathTimestamp = Long.valueOf(envs
394                .get(DSConstants.DISTRIBUTEDSHELLSCRIPTTIMESTAMP));
395          }
396          if (envs.containsKey(DSConstants.DISTRIBUTEDSHELLSCRIPTLEN)) {
397            shellScriptPathLen = Long.valueOf(envs
398                .get(DSConstants.DISTRIBUTEDSHELLSCRIPTLEN));
399          }
400    
401          if (!shellScriptPath.isEmpty()
402              && (shellScriptPathTimestamp <= 0 || shellScriptPathLen <= 0)) {
403            LOG.error("Illegal values in env for shell script path" + ", path="
404                + shellScriptPath + ", len=" + shellScriptPathLen + ", timestamp="
405                + shellScriptPathTimestamp);
406            throw new IllegalArgumentException(
407                "Illegal values in env for shell script path");
408          }
409        }
410    
411        containerMemory = Integer.parseInt(cliParser.getOptionValue(
412            "container_memory", "10"));
413        numTotalContainers = Integer.parseInt(cliParser.getOptionValue(
414            "num_containers", "1"));
415        if (numTotalContainers == 0) {
416          throw new IllegalArgumentException(
417              "Cannot run distributed shell with no containers");
418        }
419        requestPriority = Integer.parseInt(cliParser
420            .getOptionValue("priority", "0"));
421    
422        return true;
423      }
424    
425      /**
426       * Helper function to print usage
427       *
428       * @param opts Parsed command line options
429       */
430      private void printUsage(Options opts) {
431        new HelpFormatter().printHelp("ApplicationMaster", opts);
432      }
433    
434      /**
435       * Main run function for the application master
436       *
437       * @throws YarnException
438       * @throws IOException
439       */
440      @SuppressWarnings({ "unchecked" })
441      public boolean run() throws YarnException, IOException {
442        LOG.info("Starting ApplicationMaster");
443    
444        AMRMClientAsync.CallbackHandler allocListener = new RMCallbackHandler();
445        resourceManager =
446            AMRMClientAsync.createAMRMClientAsync(1000, allocListener);
447        resourceManager.init(conf);
448        resourceManager.start();
449    
450        containerListener = new NMCallbackHandler();
451        nmClientAsync = new NMClientAsyncImpl(containerListener);
452        nmClientAsync.init(conf);
453        nmClientAsync.start();
454    
455        // Setup local RPC Server to accept status requests directly from clients
456        // TODO need to setup a protocol for client to be able to communicate to
457        // the RPC server
458        // TODO use the rpc port info to register with the RM for the client to
459        // send requests to this app master
460    
461        // Register self with ResourceManager
462        // This will start heartbeating to the RM
463        RegisterApplicationMasterResponse response = resourceManager
464            .registerApplicationMaster(appMasterHostname, appMasterRpcPort,
465                appMasterTrackingUrl);
466        // Dump out information about cluster capability as seen by the
467        // resource manager
468        int maxMem = response.getMaximumResourceCapability().getMemory();
469        LOG.info("Max mem capabililty of resources in this cluster " + maxMem);
470    
471        // A resource ask cannot exceed the max.
472        if (containerMemory > maxMem) {
473          LOG.info("Container memory specified above max threshold of cluster."
474              + " Using max value." + ", specified=" + containerMemory + ", max="
475              + maxMem);
476          containerMemory = maxMem;
477        }
478    
479    
480        // Setup ask for containers from RM
481        // Send request for containers to RM
482        // Until we get our fully allocated quota, we keep on polling RM for
483        // containers
484        // Keep looping until all the containers are launched and shell script
485        // executed on them ( regardless of success/failure).
486        for (int i = 0; i < numTotalContainers; ++i) {
487          ContainerRequest containerAsk = setupContainerAskForRM();
488          resourceManager.addContainerRequest(containerAsk);
489        }
490        numRequestedContainers.set(numTotalContainers);
491    
492        while (!done) {
493          try {
494            Thread.sleep(200);
495          } catch (InterruptedException ex) {}
496        }
497        finish();
498        
499        return success;
500      }
501      
502      private void finish() {
503        // Join all launched threads
504        // needed for when we time out
505        // and we need to release containers
506        for (Thread launchThread : launchThreads) {
507          try {
508            launchThread.join(10000);
509          } catch (InterruptedException e) {
510            LOG.info("Exception thrown in thread join: " + e.getMessage());
511            e.printStackTrace();
512          }
513        }
514    
515        // When the application completes, it should stop all running containers
516        LOG.info("Application completed. Stopping running containers");
517        nmClientAsync.stop();
518    
519        // When the application completes, it should send a finish application
520        // signal to the RM
521        LOG.info("Application completed. Signalling finish to RM");
522    
523        FinalApplicationStatus appStatus;
524        String appMessage = null;
525        success = true;
526        if (numFailedContainers.get() == 0 && 
527            numCompletedContainers.get() == numTotalContainers) {
528          appStatus = FinalApplicationStatus.SUCCEEDED;
529        } else {
530          appStatus = FinalApplicationStatus.FAILED;
531          appMessage = "Diagnostics." + ", total=" + numTotalContainers
532              + ", completed=" + numCompletedContainers.get() + ", allocated="
533              + numAllocatedContainers.get() + ", failed="
534              + numFailedContainers.get();
535          success = false;
536        }
537        try {
538          resourceManager.unregisterApplicationMaster(appStatus, appMessage, null);
539        } catch (YarnException ex) {
540          LOG.error("Failed to unregister application", ex);
541        } catch (IOException e) {
542          LOG.error("Failed to unregister application", e);
543        }
544        
545        done = true;
546        resourceManager.stop();
547      }
548      
549      private class RMCallbackHandler implements AMRMClientAsync.CallbackHandler {
550        @SuppressWarnings("unchecked")
551        @Override
552        public void onContainersCompleted(List<ContainerStatus> completedContainers) {
553          LOG.info("Got response from RM for container ask, completedCnt="
554              + completedContainers.size());
555          for (ContainerStatus containerStatus : completedContainers) {
556            LOG.info("Got container status for containerID="
557                + containerStatus.getContainerId() + ", state="
558                + containerStatus.getState() + ", exitStatus="
559                + containerStatus.getExitStatus() + ", diagnostics="
560                + containerStatus.getDiagnostics());
561    
562            // non complete containers should not be here
563            assert (containerStatus.getState() == ContainerState.COMPLETE);
564    
565            // increment counters for completed/failed containers
566            int exitStatus = containerStatus.getExitStatus();
567            if (0 != exitStatus) {
568              // container failed
569              if (ContainerExitStatus.ABORTED != exitStatus) {
570                // shell script failed
571                // counts as completed
572                numCompletedContainers.incrementAndGet();
573                numFailedContainers.incrementAndGet();
574              } else {
575                // container was killed by framework, possibly preempted
576                // we should re-try as the container was lost for some reason
577                numAllocatedContainers.decrementAndGet();
578                numRequestedContainers.decrementAndGet();
579                // we do not need to release the container as it would be done
580                // by the RM
581              }
582            } else {
583              // nothing to do
584              // container completed successfully
585              numCompletedContainers.incrementAndGet();
586              LOG.info("Container completed successfully." + ", containerId="
587                  + containerStatus.getContainerId());
588            }
589          }
590          
591          // ask for more containers if any failed
592          int askCount = numTotalContainers - numRequestedContainers.get();
593          numRequestedContainers.addAndGet(askCount);
594    
595          if (askCount > 0) {
596            for (int i = 0; i < askCount; ++i) {
597              ContainerRequest containerAsk = setupContainerAskForRM();
598              resourceManager.addContainerRequest(containerAsk);
599            }
600          }
601          
602          if (numCompletedContainers.get() == numTotalContainers) {
603            done = true;
604          }
605        }
606    
607        @Override
608        public void onContainersAllocated(List<Container> allocatedContainers) {
609          LOG.info("Got response from RM for container ask, allocatedCnt="
610              + allocatedContainers.size());
611          numAllocatedContainers.addAndGet(allocatedContainers.size());
612          for (Container allocatedContainer : allocatedContainers) {
613            LOG.info("Launching shell command on a new container."
614                + ", containerId=" + allocatedContainer.getId()
615                + ", containerNode=" + allocatedContainer.getNodeId().getHost()
616                + ":" + allocatedContainer.getNodeId().getPort()
617                + ", containerNodeURI=" + allocatedContainer.getNodeHttpAddress()
618                + ", containerResourceMemory"
619                + allocatedContainer.getResource().getMemory());
620            // + ", containerToken"
621            // +allocatedContainer.getContainerToken().getIdentifier().toString());
622    
623            LaunchContainerRunnable runnableLaunchContainer =
624                new LaunchContainerRunnable(allocatedContainer, containerListener);
625            Thread launchThread = new Thread(runnableLaunchContainer);
626    
627            // launch and start the container on a separate thread to keep
628            // the main thread unblocked
629            // as all containers may not be allocated at one go.
630            launchThreads.add(launchThread);
631            launchThread.start();
632          }
633        }
634    
635        @Override
636        public void onShutdownRequest() {
637          done = true;
638        }
639    
640        @Override
641        public void onNodesUpdated(List<NodeReport> updatedNodes) {}
642    
643        @Override
644        public float getProgress() {
645          // set progress to deliver to RM on next heartbeat
646          float progress = (float) numCompletedContainers.get()
647              / numTotalContainers;
648          return progress;
649        }
650    
651        @Override
652        public void onError(Throwable e) {
653          done = true;
654          resourceManager.stop();
655        }
656      }
657    
658      private class NMCallbackHandler implements NMClientAsync.CallbackHandler {
659    
660        private ConcurrentMap<ContainerId, Container> containers =
661            new ConcurrentHashMap<ContainerId, Container>();
662    
663        public void addContainer(ContainerId containerId, Container container) {
664          containers.putIfAbsent(containerId, container);
665        }
666    
667        @Override
668        public void onContainerStopped(ContainerId containerId) {
669          if (LOG.isDebugEnabled()) {
670            LOG.debug("Succeeded to stop Container " + containerId);
671          }
672          containers.remove(containerId);
673        }
674    
675        @Override
676        public void onContainerStatusReceived(ContainerId containerId,
677            ContainerStatus containerStatus) {
678          if (LOG.isDebugEnabled()) {
679            LOG.debug("Container Status: id=" + containerId + ", status=" +
680                containerStatus);
681          }
682        }
683    
684        @Override
685        public void onContainerStarted(ContainerId containerId,
686            Map<String, ByteBuffer> allServiceResponse) {
687          if (LOG.isDebugEnabled()) {
688            LOG.debug("Succeeded to start Container " + containerId);
689          }
690          Container container = containers.get(containerId);
691          if (container != null) {
692            nmClientAsync.getContainerStatusAsync(containerId, container.getNodeId());
693          }
694        }
695    
696        @Override
697        public void onStartContainerError(ContainerId containerId, Throwable t) {
698          LOG.error("Failed to start Container " + containerId);
699          containers.remove(containerId);
700        }
701    
702        @Override
703        public void onGetContainerStatusError(
704            ContainerId containerId, Throwable t) {
705          LOG.error("Failed to query the status of Container " + containerId);
706        }
707    
708        @Override
709        public void onStopContainerError(ContainerId containerId, Throwable t) {
710          LOG.error("Failed to stop Container " + containerId);
711          containers.remove(containerId);
712        }
713      }
714    
715      /**
716       * Thread to connect to the {@link ContainerManagementProtocol} and launch the container
717       * that will execute the shell command.
718       */
719      private class LaunchContainerRunnable implements Runnable {
720    
721        // Allocated container
722        Container container;
723    
724        NMCallbackHandler containerListener;
725    
726        /**
727         * @param lcontainer Allocated container
728         * @param containerListener Callback handler of the container
729         */
730        public LaunchContainerRunnable(
731            Container lcontainer, NMCallbackHandler containerListener) {
732          this.container = lcontainer;
733          this.containerListener = containerListener;
734        }
735    
736        @Override
737        /**
738         * Connects to CM, sets up container launch context 
739         * for shell command and eventually dispatches the container 
740         * start request to the CM. 
741         */
742        public void run() {
743          LOG.info("Setting up container launch container for containerid="
744              + container.getId());
745          ContainerLaunchContext ctx = Records
746              .newRecord(ContainerLaunchContext.class);
747    
748          // Set the environment
749          ctx.setEnvironment(shellEnv);
750    
751          // Set the local resources
752          Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
753    
754          // The container for the eventual shell commands needs its own local
755          // resources too.
756          // In this scenario, if a shell script is specified, we need to have it
757          // copied and made available to the container.
758          if (!shellScriptPath.isEmpty()) {
759            LocalResource shellRsrc = Records.newRecord(LocalResource.class);
760            shellRsrc.setType(LocalResourceType.FILE);
761            shellRsrc.setVisibility(LocalResourceVisibility.APPLICATION);
762            try {
763              shellRsrc.setResource(ConverterUtils.getYarnUrlFromURI(new URI(
764                  shellScriptPath)));
765            } catch (URISyntaxException e) {
766              LOG.error("Error when trying to use shell script path specified"
767                  + " in env, path=" + shellScriptPath);
768              e.printStackTrace();
769    
770              // A failure scenario on bad input such as invalid shell script path
771              // We know we cannot continue launching the container
772              // so we should release it.
773              // TODO
774              numCompletedContainers.incrementAndGet();
775              numFailedContainers.incrementAndGet();
776              return;
777            }
778            shellRsrc.setTimestamp(shellScriptPathTimestamp);
779            shellRsrc.setSize(shellScriptPathLen);
780            localResources.put(ExecShellStringPath, shellRsrc);
781          }
782          ctx.setLocalResources(localResources);
783    
784          // Set the necessary command to execute on the allocated container
785          Vector<CharSequence> vargs = new Vector<CharSequence>(5);
786    
787          // Set executable command
788          vargs.add(shellCommand);
789          // Set shell script path
790          if (!shellScriptPath.isEmpty()) {
791            vargs.add(ExecShellStringPath);
792          }
793    
794          // Set args for the shell command if any
795          vargs.add(shellArgs);
796          // Add log redirect params
797          vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout");
798          vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr");
799    
800          // Get final commmand
801          StringBuilder command = new StringBuilder();
802          for (CharSequence str : vargs) {
803            command.append(str).append(" ");
804          }
805    
806          List<String> commands = new ArrayList<String>();
807          commands.add(command.toString());
808          ctx.setCommands(commands);
809    
810          containerListener.addContainer(container.getId(), container);
811          nmClientAsync.startContainerAsync(container, ctx);
812        }
813      }
814    
815      /**
816       * Setup the request that will be sent to the RM for the container ask.
817       *
818       * @param numContainers Containers to ask for from RM
819       * @return the setup ResourceRequest to be sent to RM
820       */
821      private ContainerRequest setupContainerAskForRM() {
822        // setup requirements for hosts
823        // using * as any host will do for the distributed shell app
824        // set the priority for the request
825        Priority pri = Records.newRecord(Priority.class);
826        // TODO - what is the range for priority? how to decide?
827        pri.setPriority(requestPriority);
828    
829        // Set up resource type requirements
830        // For now, only memory is supported so we set memory requirements
831        Resource capability = Records.newRecord(Resource.class);
832        capability.setMemory(containerMemory);
833    
834        ContainerRequest request = new ContainerRequest(capability, null, null,
835            pri);
836        LOG.info("Requested container ask: " + request.toString());
837        return request;
838      }
839    }