001 /** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019 package org.apache.hadoop.yarn.applications.distributedshell; 020 021 import java.io.BufferedReader; 022 import java.io.IOException; 023 import java.io.InputStreamReader; 024 import java.net.URI; 025 import java.net.URISyntaxException; 026 import java.nio.ByteBuffer; 027 import java.util.ArrayList; 028 import java.util.HashMap; 029 import java.util.List; 030 import java.util.Map; 031 import java.util.Vector; 032 import java.util.concurrent.ConcurrentHashMap; 033 import java.util.concurrent.ConcurrentMap; 034 import java.util.concurrent.atomic.AtomicInteger; 035 036 import org.apache.commons.cli.CommandLine; 037 import org.apache.commons.cli.GnuParser; 038 import org.apache.commons.cli.HelpFormatter; 039 import org.apache.commons.cli.Options; 040 import org.apache.commons.cli.ParseException; 041 import org.apache.commons.logging.Log; 042 import org.apache.commons.logging.LogFactory; 043 import org.apache.hadoop.classification.InterfaceAudience; 044 import org.apache.hadoop.classification.InterfaceStability; 045 import org.apache.hadoop.conf.Configuration; 046 import org.apache.hadoop.yarn.api.ContainerManagementProtocol; 047 import org.apache.hadoop.yarn.api.ApplicationMasterProtocol; 048 import org.apache.hadoop.yarn.api.ApplicationConstants; 049 import org.apache.hadoop.yarn.api.ApplicationConstants.Environment; 050 import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest; 051 import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse; 052 import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterRequest; 053 import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse; 054 import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest; 055 import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; 056 import org.apache.hadoop.yarn.api.records.Container; 057 import org.apache.hadoop.yarn.api.records.ContainerExitStatus; 058 import org.apache.hadoop.yarn.api.records.ContainerId; 059 import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; 060 import org.apache.hadoop.yarn.api.records.ContainerState; 061 import org.apache.hadoop.yarn.api.records.ContainerStatus; 062 import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; 063 import org.apache.hadoop.yarn.api.records.LocalResource; 064 import org.apache.hadoop.yarn.api.records.LocalResourceType; 065 import org.apache.hadoop.yarn.api.records.LocalResourceVisibility; 066 import org.apache.hadoop.yarn.api.records.NodeReport; 067 import org.apache.hadoop.yarn.api.records.Priority; 068 import org.apache.hadoop.yarn.api.records.Resource; 069 import org.apache.hadoop.yarn.api.records.ResourceRequest; 070 import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest; 071 import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync; 072 import org.apache.hadoop.yarn.client.api.async.NMClientAsync; 073 import org.apache.hadoop.yarn.client.api.async.impl.NMClientAsyncImpl; 074 import org.apache.hadoop.yarn.conf.YarnConfiguration; 075 import org.apache.hadoop.yarn.exceptions.YarnException; 076 import org.apache.hadoop.yarn.util.ConverterUtils; 077 import org.apache.hadoop.yarn.util.Records; 078 079 /** 080 * An ApplicationMaster for executing shell commands on a set of launched 081 * containers using the YARN framework. 082 * 083 * <p> 084 * This class is meant to act as an example on how to write yarn-based 085 * application masters. 086 * </p> 087 * 088 * <p> 089 * The ApplicationMaster is started on a container by the 090 * <code>ResourceManager</code>'s launcher. The first thing that the 091 * <code>ApplicationMaster</code> needs to do is to connect and register itself 092 * with the <code>ResourceManager</code>. The registration sets up information 093 * within the <code>ResourceManager</code> regarding what host:port the 094 * ApplicationMaster is listening on to provide any form of functionality to a 095 * client as well as a tracking url that a client can use to keep track of 096 * status/job history if needed. 097 * </p> 098 * 099 * <p> 100 * The <code>ApplicationMaster</code> needs to send a heartbeat to the 101 * <code>ResourceManager</code> at regular intervals to inform the 102 * <code>ResourceManager</code> that it is up and alive. The 103 * {@link ApplicationMasterProtocol#allocate} to the <code>ResourceManager</code> from the 104 * <code>ApplicationMaster</code> acts as a heartbeat. 105 * 106 * <p> 107 * For the actual handling of the job, the <code>ApplicationMaster</code> has to 108 * request the <code>ResourceManager</code> via {@link AllocateRequest} for the 109 * required no. of containers using {@link ResourceRequest} with the necessary 110 * resource specifications such as node location, computational 111 * (memory/disk/cpu) resource requirements. The <code>ResourceManager</code> 112 * responds with an {@link AllocateResponse} that informs the 113 * <code>ApplicationMaster</code> of the set of newly allocated containers, 114 * completed containers as well as current state of available resources. 115 * </p> 116 * 117 * <p> 118 * For each allocated container, the <code>ApplicationMaster</code> can then set 119 * up the necessary launch context via {@link ContainerLaunchContext} to specify 120 * the allocated container id, local resources required by the executable, the 121 * environment to be setup for the executable, commands to execute, etc. and 122 * submit a {@link StartContainerRequest} to the {@link ContainerManagementProtocol} to 123 * launch and execute the defined commands on the given allocated container. 124 * </p> 125 * 126 * <p> 127 * The <code>ApplicationMaster</code> can monitor the launched container by 128 * either querying the <code>ResourceManager</code> using 129 * {@link ApplicationMasterProtocol#allocate} to get updates on completed containers or via 130 * the {@link ContainerManagementProtocol} by querying for the status of the allocated 131 * container's {@link ContainerId}. 132 * 133 * <p> 134 * After the job has been completed, the <code>ApplicationMaster</code> has to 135 * send a {@link FinishApplicationMasterRequest} to the 136 * <code>ResourceManager</code> to inform it that the 137 * <code>ApplicationMaster</code> has been completed. 138 */ 139 @InterfaceAudience.Public 140 @InterfaceStability.Unstable 141 public class ApplicationMaster { 142 143 private static final Log LOG = LogFactory.getLog(ApplicationMaster.class); 144 145 // Configuration 146 private Configuration conf; 147 148 // Handle to communicate with the Resource Manager 149 @SuppressWarnings("rawtypes") 150 private AMRMClientAsync resourceManager; 151 152 // Handle to communicate with the Node Manager 153 private NMClientAsync nmClientAsync; 154 // Listen to process the response from the Node Manager 155 private NMCallbackHandler containerListener; 156 157 // Application Attempt Id ( combination of attemptId and fail count ) 158 private ApplicationAttemptId appAttemptID; 159 160 // TODO 161 // For status update for clients - yet to be implemented 162 // Hostname of the container 163 private String appMasterHostname = ""; 164 // Port on which the app master listens for status updates from clients 165 private int appMasterRpcPort = 0; 166 // Tracking url to which app master publishes info for clients to monitor 167 private String appMasterTrackingUrl = ""; 168 169 // App Master configuration 170 // No. of containers to run shell command on 171 private int numTotalContainers = 1; 172 // Memory to request for the container on which the shell command will run 173 private int containerMemory = 10; 174 // Priority of the request 175 private int requestPriority; 176 177 // Counter for completed containers ( complete denotes successful or failed ) 178 private AtomicInteger numCompletedContainers = new AtomicInteger(); 179 // Allocated container count so that we know how many containers has the RM 180 // allocated to us 181 private AtomicInteger numAllocatedContainers = new AtomicInteger(); 182 // Count of failed containers 183 private AtomicInteger numFailedContainers = new AtomicInteger(); 184 // Count of containers already requested from the RM 185 // Needed as once requested, we should not request for containers again. 186 // Only request for more if the original requirement changes. 187 private AtomicInteger numRequestedContainers = new AtomicInteger(); 188 189 // Shell command to be executed 190 private String shellCommand = ""; 191 // Args to be passed to the shell command 192 private String shellArgs = ""; 193 // Env variables to be setup for the shell command 194 private Map<String, String> shellEnv = new HashMap<String, String>(); 195 196 // Location of shell script ( obtained from info set in env ) 197 // Shell script path in fs 198 private String shellScriptPath = ""; 199 // Timestamp needed for creating a local resource 200 private long shellScriptPathTimestamp = 0; 201 // File length needed for local resource 202 private long shellScriptPathLen = 0; 203 204 // Hardcoded path to shell script in launch container's local env 205 private final String ExecShellStringPath = "ExecShellScript.sh"; 206 207 private volatile boolean done; 208 private volatile boolean success; 209 210 // Launch threads 211 private List<Thread> launchThreads = new ArrayList<Thread>(); 212 213 /** 214 * @param args Command line args 215 */ 216 public static void main(String[] args) { 217 boolean result = false; 218 try { 219 ApplicationMaster appMaster = new ApplicationMaster(); 220 LOG.info("Initializing ApplicationMaster"); 221 boolean doRun = appMaster.init(args); 222 if (!doRun) { 223 System.exit(0); 224 } 225 result = appMaster.run(); 226 } catch (Throwable t) { 227 LOG.fatal("Error running ApplicationMaster", t); 228 System.exit(1); 229 } 230 if (result) { 231 LOG.info("Application Master completed successfully. exiting"); 232 System.exit(0); 233 } else { 234 LOG.info("Application Master failed. exiting"); 235 System.exit(2); 236 } 237 } 238 239 /** 240 * Dump out contents of $CWD and the environment to stdout for debugging 241 */ 242 private void dumpOutDebugInfo() { 243 244 LOG.info("Dump debug output"); 245 Map<String, String> envs = System.getenv(); 246 for (Map.Entry<String, String> env : envs.entrySet()) { 247 LOG.info("System env: key=" + env.getKey() + ", val=" + env.getValue()); 248 System.out.println("System env: key=" + env.getKey() + ", val=" 249 + env.getValue()); 250 } 251 252 String cmd = "ls -al"; 253 Runtime run = Runtime.getRuntime(); 254 Process pr = null; 255 try { 256 pr = run.exec(cmd); 257 pr.waitFor(); 258 259 BufferedReader buf = new BufferedReader(new InputStreamReader( 260 pr.getInputStream())); 261 String line = ""; 262 while ((line = buf.readLine()) != null) { 263 LOG.info("System CWD content: " + line); 264 System.out.println("System CWD content: " + line); 265 } 266 buf.close(); 267 } catch (IOException e) { 268 e.printStackTrace(); 269 } catch (InterruptedException e) { 270 e.printStackTrace(); 271 } 272 } 273 274 public ApplicationMaster() throws Exception { 275 // Set up the configuration and RPC 276 conf = new YarnConfiguration(); 277 } 278 279 /** 280 * Parse command line options 281 * 282 * @param args Command line args 283 * @return Whether init successful and run should be invoked 284 * @throws ParseException 285 * @throws IOException 286 */ 287 public boolean init(String[] args) throws ParseException, IOException { 288 289 Options opts = new Options(); 290 opts.addOption("app_attempt_id", true, 291 "App Attempt ID. Not to be used unless for testing purposes"); 292 opts.addOption("shell_command", true, 293 "Shell command to be executed by the Application Master"); 294 opts.addOption("shell_script", true, 295 "Location of the shell script to be executed"); 296 opts.addOption("shell_args", true, "Command line args for the shell script"); 297 opts.addOption("shell_env", true, 298 "Environment for shell script. Specified as env_key=env_val pairs"); 299 opts.addOption("container_memory", true, 300 "Amount of memory in MB to be requested to run the shell command"); 301 opts.addOption("num_containers", true, 302 "No. of containers on which the shell command needs to be executed"); 303 opts.addOption("priority", true, "Application Priority. Default 0"); 304 opts.addOption("debug", false, "Dump out debug information"); 305 306 opts.addOption("help", false, "Print usage"); 307 CommandLine cliParser = new GnuParser().parse(opts, args); 308 309 if (args.length == 0) { 310 printUsage(opts); 311 throw new IllegalArgumentException( 312 "No args specified for application master to initialize"); 313 } 314 315 if (cliParser.hasOption("help")) { 316 printUsage(opts); 317 return false; 318 } 319 320 if (cliParser.hasOption("debug")) { 321 dumpOutDebugInfo(); 322 } 323 324 Map<String, String> envs = System.getenv(); 325 326 if (!envs.containsKey(Environment.CONTAINER_ID.name())) { 327 if (cliParser.hasOption("app_attempt_id")) { 328 String appIdStr = cliParser.getOptionValue("app_attempt_id", ""); 329 appAttemptID = ConverterUtils.toApplicationAttemptId(appIdStr); 330 } else { 331 throw new IllegalArgumentException( 332 "Application Attempt Id not set in the environment"); 333 } 334 } else { 335 ContainerId containerId = ConverterUtils.toContainerId(envs 336 .get(Environment.CONTAINER_ID.name())); 337 appAttemptID = containerId.getApplicationAttemptId(); 338 } 339 340 if (!envs.containsKey(ApplicationConstants.APP_SUBMIT_TIME_ENV)) { 341 throw new RuntimeException(ApplicationConstants.APP_SUBMIT_TIME_ENV 342 + " not set in the environment"); 343 } 344 if (!envs.containsKey(Environment.NM_HOST.name())) { 345 throw new RuntimeException(Environment.NM_HOST.name() 346 + " not set in the environment"); 347 } 348 if (!envs.containsKey(Environment.NM_HTTP_PORT.name())) { 349 throw new RuntimeException(Environment.NM_HTTP_PORT 350 + " not set in the environment"); 351 } 352 if (!envs.containsKey(Environment.NM_PORT.name())) { 353 throw new RuntimeException(Environment.NM_PORT.name() 354 + " not set in the environment"); 355 } 356 357 LOG.info("Application master for app" + ", appId=" 358 + appAttemptID.getApplicationId().getId() + ", clustertimestamp=" 359 + appAttemptID.getApplicationId().getClusterTimestamp() 360 + ", attemptId=" + appAttemptID.getAttemptId()); 361 362 if (!cliParser.hasOption("shell_command")) { 363 throw new IllegalArgumentException( 364 "No shell command specified to be executed by application master"); 365 } 366 shellCommand = cliParser.getOptionValue("shell_command"); 367 368 if (cliParser.hasOption("shell_args")) { 369 shellArgs = cliParser.getOptionValue("shell_args"); 370 } 371 if (cliParser.hasOption("shell_env")) { 372 String shellEnvs[] = cliParser.getOptionValues("shell_env"); 373 for (String env : shellEnvs) { 374 env = env.trim(); 375 int index = env.indexOf('='); 376 if (index == -1) { 377 shellEnv.put(env, ""); 378 continue; 379 } 380 String key = env.substring(0, index); 381 String val = ""; 382 if (index < (env.length() - 1)) { 383 val = env.substring(index + 1); 384 } 385 shellEnv.put(key, val); 386 } 387 } 388 389 if (envs.containsKey(DSConstants.DISTRIBUTEDSHELLSCRIPTLOCATION)) { 390 shellScriptPath = envs.get(DSConstants.DISTRIBUTEDSHELLSCRIPTLOCATION); 391 392 if (envs.containsKey(DSConstants.DISTRIBUTEDSHELLSCRIPTTIMESTAMP)) { 393 shellScriptPathTimestamp = Long.valueOf(envs 394 .get(DSConstants.DISTRIBUTEDSHELLSCRIPTTIMESTAMP)); 395 } 396 if (envs.containsKey(DSConstants.DISTRIBUTEDSHELLSCRIPTLEN)) { 397 shellScriptPathLen = Long.valueOf(envs 398 .get(DSConstants.DISTRIBUTEDSHELLSCRIPTLEN)); 399 } 400 401 if (!shellScriptPath.isEmpty() 402 && (shellScriptPathTimestamp <= 0 || shellScriptPathLen <= 0)) { 403 LOG.error("Illegal values in env for shell script path" + ", path=" 404 + shellScriptPath + ", len=" + shellScriptPathLen + ", timestamp=" 405 + shellScriptPathTimestamp); 406 throw new IllegalArgumentException( 407 "Illegal values in env for shell script path"); 408 } 409 } 410 411 containerMemory = Integer.parseInt(cliParser.getOptionValue( 412 "container_memory", "10")); 413 numTotalContainers = Integer.parseInt(cliParser.getOptionValue( 414 "num_containers", "1")); 415 if (numTotalContainers == 0) { 416 throw new IllegalArgumentException( 417 "Cannot run distributed shell with no containers"); 418 } 419 requestPriority = Integer.parseInt(cliParser 420 .getOptionValue("priority", "0")); 421 422 return true; 423 } 424 425 /** 426 * Helper function to print usage 427 * 428 * @param opts Parsed command line options 429 */ 430 private void printUsage(Options opts) { 431 new HelpFormatter().printHelp("ApplicationMaster", opts); 432 } 433 434 /** 435 * Main run function for the application master 436 * 437 * @throws YarnException 438 * @throws IOException 439 */ 440 @SuppressWarnings({ "unchecked" }) 441 public boolean run() throws YarnException, IOException { 442 LOG.info("Starting ApplicationMaster"); 443 444 AMRMClientAsync.CallbackHandler allocListener = new RMCallbackHandler(); 445 resourceManager = 446 AMRMClientAsync.createAMRMClientAsync(1000, allocListener); 447 resourceManager.init(conf); 448 resourceManager.start(); 449 450 containerListener = new NMCallbackHandler(); 451 nmClientAsync = new NMClientAsyncImpl(containerListener); 452 nmClientAsync.init(conf); 453 nmClientAsync.start(); 454 455 // Setup local RPC Server to accept status requests directly from clients 456 // TODO need to setup a protocol for client to be able to communicate to 457 // the RPC server 458 // TODO use the rpc port info to register with the RM for the client to 459 // send requests to this app master 460 461 // Register self with ResourceManager 462 // This will start heartbeating to the RM 463 RegisterApplicationMasterResponse response = resourceManager 464 .registerApplicationMaster(appMasterHostname, appMasterRpcPort, 465 appMasterTrackingUrl); 466 // Dump out information about cluster capability as seen by the 467 // resource manager 468 int maxMem = response.getMaximumResourceCapability().getMemory(); 469 LOG.info("Max mem capabililty of resources in this cluster " + maxMem); 470 471 // A resource ask cannot exceed the max. 472 if (containerMemory > maxMem) { 473 LOG.info("Container memory specified above max threshold of cluster." 474 + " Using max value." + ", specified=" + containerMemory + ", max=" 475 + maxMem); 476 containerMemory = maxMem; 477 } 478 479 480 // Setup ask for containers from RM 481 // Send request for containers to RM 482 // Until we get our fully allocated quota, we keep on polling RM for 483 // containers 484 // Keep looping until all the containers are launched and shell script 485 // executed on them ( regardless of success/failure). 486 for (int i = 0; i < numTotalContainers; ++i) { 487 ContainerRequest containerAsk = setupContainerAskForRM(); 488 resourceManager.addContainerRequest(containerAsk); 489 } 490 numRequestedContainers.set(numTotalContainers); 491 492 while (!done) { 493 try { 494 Thread.sleep(200); 495 } catch (InterruptedException ex) {} 496 } 497 finish(); 498 499 return success; 500 } 501 502 private void finish() { 503 // Join all launched threads 504 // needed for when we time out 505 // and we need to release containers 506 for (Thread launchThread : launchThreads) { 507 try { 508 launchThread.join(10000); 509 } catch (InterruptedException e) { 510 LOG.info("Exception thrown in thread join: " + e.getMessage()); 511 e.printStackTrace(); 512 } 513 } 514 515 // When the application completes, it should stop all running containers 516 LOG.info("Application completed. Stopping running containers"); 517 nmClientAsync.stop(); 518 519 // When the application completes, it should send a finish application 520 // signal to the RM 521 LOG.info("Application completed. Signalling finish to RM"); 522 523 FinalApplicationStatus appStatus; 524 String appMessage = null; 525 success = true; 526 if (numFailedContainers.get() == 0 && 527 numCompletedContainers.get() == numTotalContainers) { 528 appStatus = FinalApplicationStatus.SUCCEEDED; 529 } else { 530 appStatus = FinalApplicationStatus.FAILED; 531 appMessage = "Diagnostics." + ", total=" + numTotalContainers 532 + ", completed=" + numCompletedContainers.get() + ", allocated=" 533 + numAllocatedContainers.get() + ", failed=" 534 + numFailedContainers.get(); 535 success = false; 536 } 537 try { 538 resourceManager.unregisterApplicationMaster(appStatus, appMessage, null); 539 } catch (YarnException ex) { 540 LOG.error("Failed to unregister application", ex); 541 } catch (IOException e) { 542 LOG.error("Failed to unregister application", e); 543 } 544 545 done = true; 546 resourceManager.stop(); 547 } 548 549 private class RMCallbackHandler implements AMRMClientAsync.CallbackHandler { 550 @SuppressWarnings("unchecked") 551 @Override 552 public void onContainersCompleted(List<ContainerStatus> completedContainers) { 553 LOG.info("Got response from RM for container ask, completedCnt=" 554 + completedContainers.size()); 555 for (ContainerStatus containerStatus : completedContainers) { 556 LOG.info("Got container status for containerID=" 557 + containerStatus.getContainerId() + ", state=" 558 + containerStatus.getState() + ", exitStatus=" 559 + containerStatus.getExitStatus() + ", diagnostics=" 560 + containerStatus.getDiagnostics()); 561 562 // non complete containers should not be here 563 assert (containerStatus.getState() == ContainerState.COMPLETE); 564 565 // increment counters for completed/failed containers 566 int exitStatus = containerStatus.getExitStatus(); 567 if (0 != exitStatus) { 568 // container failed 569 if (ContainerExitStatus.ABORTED != exitStatus) { 570 // shell script failed 571 // counts as completed 572 numCompletedContainers.incrementAndGet(); 573 numFailedContainers.incrementAndGet(); 574 } else { 575 // container was killed by framework, possibly preempted 576 // we should re-try as the container was lost for some reason 577 numAllocatedContainers.decrementAndGet(); 578 numRequestedContainers.decrementAndGet(); 579 // we do not need to release the container as it would be done 580 // by the RM 581 } 582 } else { 583 // nothing to do 584 // container completed successfully 585 numCompletedContainers.incrementAndGet(); 586 LOG.info("Container completed successfully." + ", containerId=" 587 + containerStatus.getContainerId()); 588 } 589 } 590 591 // ask for more containers if any failed 592 int askCount = numTotalContainers - numRequestedContainers.get(); 593 numRequestedContainers.addAndGet(askCount); 594 595 if (askCount > 0) { 596 for (int i = 0; i < askCount; ++i) { 597 ContainerRequest containerAsk = setupContainerAskForRM(); 598 resourceManager.addContainerRequest(containerAsk); 599 } 600 } 601 602 if (numCompletedContainers.get() == numTotalContainers) { 603 done = true; 604 } 605 } 606 607 @Override 608 public void onContainersAllocated(List<Container> allocatedContainers) { 609 LOG.info("Got response from RM for container ask, allocatedCnt=" 610 + allocatedContainers.size()); 611 numAllocatedContainers.addAndGet(allocatedContainers.size()); 612 for (Container allocatedContainer : allocatedContainers) { 613 LOG.info("Launching shell command on a new container." 614 + ", containerId=" + allocatedContainer.getId() 615 + ", containerNode=" + allocatedContainer.getNodeId().getHost() 616 + ":" + allocatedContainer.getNodeId().getPort() 617 + ", containerNodeURI=" + allocatedContainer.getNodeHttpAddress() 618 + ", containerResourceMemory" 619 + allocatedContainer.getResource().getMemory()); 620 // + ", containerToken" 621 // +allocatedContainer.getContainerToken().getIdentifier().toString()); 622 623 LaunchContainerRunnable runnableLaunchContainer = 624 new LaunchContainerRunnable(allocatedContainer, containerListener); 625 Thread launchThread = new Thread(runnableLaunchContainer); 626 627 // launch and start the container on a separate thread to keep 628 // the main thread unblocked 629 // as all containers may not be allocated at one go. 630 launchThreads.add(launchThread); 631 launchThread.start(); 632 } 633 } 634 635 @Override 636 public void onShutdownRequest() { 637 done = true; 638 } 639 640 @Override 641 public void onNodesUpdated(List<NodeReport> updatedNodes) {} 642 643 @Override 644 public float getProgress() { 645 // set progress to deliver to RM on next heartbeat 646 float progress = (float) numCompletedContainers.get() 647 / numTotalContainers; 648 return progress; 649 } 650 651 @Override 652 public void onError(Throwable e) { 653 done = true; 654 resourceManager.stop(); 655 } 656 } 657 658 private class NMCallbackHandler implements NMClientAsync.CallbackHandler { 659 660 private ConcurrentMap<ContainerId, Container> containers = 661 new ConcurrentHashMap<ContainerId, Container>(); 662 663 public void addContainer(ContainerId containerId, Container container) { 664 containers.putIfAbsent(containerId, container); 665 } 666 667 @Override 668 public void onContainerStopped(ContainerId containerId) { 669 if (LOG.isDebugEnabled()) { 670 LOG.debug("Succeeded to stop Container " + containerId); 671 } 672 containers.remove(containerId); 673 } 674 675 @Override 676 public void onContainerStatusReceived(ContainerId containerId, 677 ContainerStatus containerStatus) { 678 if (LOG.isDebugEnabled()) { 679 LOG.debug("Container Status: id=" + containerId + ", status=" + 680 containerStatus); 681 } 682 } 683 684 @Override 685 public void onContainerStarted(ContainerId containerId, 686 Map<String, ByteBuffer> allServiceResponse) { 687 if (LOG.isDebugEnabled()) { 688 LOG.debug("Succeeded to start Container " + containerId); 689 } 690 Container container = containers.get(containerId); 691 if (container != null) { 692 nmClientAsync.getContainerStatusAsync(containerId, container.getNodeId()); 693 } 694 } 695 696 @Override 697 public void onStartContainerError(ContainerId containerId, Throwable t) { 698 LOG.error("Failed to start Container " + containerId); 699 containers.remove(containerId); 700 } 701 702 @Override 703 public void onGetContainerStatusError( 704 ContainerId containerId, Throwable t) { 705 LOG.error("Failed to query the status of Container " + containerId); 706 } 707 708 @Override 709 public void onStopContainerError(ContainerId containerId, Throwable t) { 710 LOG.error("Failed to stop Container " + containerId); 711 containers.remove(containerId); 712 } 713 } 714 715 /** 716 * Thread to connect to the {@link ContainerManagementProtocol} and launch the container 717 * that will execute the shell command. 718 */ 719 private class LaunchContainerRunnable implements Runnable { 720 721 // Allocated container 722 Container container; 723 724 NMCallbackHandler containerListener; 725 726 /** 727 * @param lcontainer Allocated container 728 * @param containerListener Callback handler of the container 729 */ 730 public LaunchContainerRunnable( 731 Container lcontainer, NMCallbackHandler containerListener) { 732 this.container = lcontainer; 733 this.containerListener = containerListener; 734 } 735 736 @Override 737 /** 738 * Connects to CM, sets up container launch context 739 * for shell command and eventually dispatches the container 740 * start request to the CM. 741 */ 742 public void run() { 743 LOG.info("Setting up container launch container for containerid=" 744 + container.getId()); 745 ContainerLaunchContext ctx = Records 746 .newRecord(ContainerLaunchContext.class); 747 748 // Set the environment 749 ctx.setEnvironment(shellEnv); 750 751 // Set the local resources 752 Map<String, LocalResource> localResources = new HashMap<String, LocalResource>(); 753 754 // The container for the eventual shell commands needs its own local 755 // resources too. 756 // In this scenario, if a shell script is specified, we need to have it 757 // copied and made available to the container. 758 if (!shellScriptPath.isEmpty()) { 759 LocalResource shellRsrc = Records.newRecord(LocalResource.class); 760 shellRsrc.setType(LocalResourceType.FILE); 761 shellRsrc.setVisibility(LocalResourceVisibility.APPLICATION); 762 try { 763 shellRsrc.setResource(ConverterUtils.getYarnUrlFromURI(new URI( 764 shellScriptPath))); 765 } catch (URISyntaxException e) { 766 LOG.error("Error when trying to use shell script path specified" 767 + " in env, path=" + shellScriptPath); 768 e.printStackTrace(); 769 770 // A failure scenario on bad input such as invalid shell script path 771 // We know we cannot continue launching the container 772 // so we should release it. 773 // TODO 774 numCompletedContainers.incrementAndGet(); 775 numFailedContainers.incrementAndGet(); 776 return; 777 } 778 shellRsrc.setTimestamp(shellScriptPathTimestamp); 779 shellRsrc.setSize(shellScriptPathLen); 780 localResources.put(ExecShellStringPath, shellRsrc); 781 } 782 ctx.setLocalResources(localResources); 783 784 // Set the necessary command to execute on the allocated container 785 Vector<CharSequence> vargs = new Vector<CharSequence>(5); 786 787 // Set executable command 788 vargs.add(shellCommand); 789 // Set shell script path 790 if (!shellScriptPath.isEmpty()) { 791 vargs.add(ExecShellStringPath); 792 } 793 794 // Set args for the shell command if any 795 vargs.add(shellArgs); 796 // Add log redirect params 797 vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"); 798 vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr"); 799 800 // Get final commmand 801 StringBuilder command = new StringBuilder(); 802 for (CharSequence str : vargs) { 803 command.append(str).append(" "); 804 } 805 806 List<String> commands = new ArrayList<String>(); 807 commands.add(command.toString()); 808 ctx.setCommands(commands); 809 810 containerListener.addContainer(container.getId(), container); 811 nmClientAsync.startContainerAsync(container, ctx); 812 } 813 } 814 815 /** 816 * Setup the request that will be sent to the RM for the container ask. 817 * 818 * @param numContainers Containers to ask for from RM 819 * @return the setup ResourceRequest to be sent to RM 820 */ 821 private ContainerRequest setupContainerAskForRM() { 822 // setup requirements for hosts 823 // using * as any host will do for the distributed shell app 824 // set the priority for the request 825 Priority pri = Records.newRecord(Priority.class); 826 // TODO - what is the range for priority? how to decide? 827 pri.setPriority(requestPriority); 828 829 // Set up resource type requirements 830 // For now, only memory is supported so we set memory requirements 831 Resource capability = Records.newRecord(Resource.class); 832 capability.setMemory(containerMemory); 833 834 ContainerRequest request = new ContainerRequest(capability, null, null, 835 pri); 836 LOG.info("Requested container ask: " + request.toString()); 837 return request; 838 } 839 }