/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.URL;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.ClusterStatus.BlackListInfo;
import org.apache.hadoop.mapreduce.Cluster;
import org.apache.hadoop.mapreduce.ClusterMetrics;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.QueueInfo;
import org.apache.hadoop.mapreduce.TaskTrackerInfo;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.security.token.delegation.DelegationTokenIdentifier;
import org.apache.hadoop.mapreduce.tools.CLI;
import org.apache.hadoop.mapreduce.util.ConfigUtil;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.security.token.TokenRenewer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * <code>JobClient</code> is the primary interface for the user-job to interact
 * with the cluster.
 *
 * <code>JobClient</code> provides facilities to submit jobs, track their
 * progress, access component-tasks' reports/logs, get the Map-Reduce cluster
 * status information etc.
 *
 * <p>The job submission process involves:
 * <ol>
 *   <li>
 *   Checking the input and output specifications of the job.
 *   </li>
 *   <li>
 *   Computing the {@link InputSplit}s for the job.
 *   </li>
 *   <li>
 *   Setting up the requisite accounting information for the
 *   {@link DistributedCache} of the job, if necessary.
 *   </li>
 *   <li>
 *   Copying the job's jar and configuration to the map-reduce system directory
 *   on the distributed file-system.
 *   </li>
 *   <li>
 *   Submitting the job to the cluster and optionally monitoring
 *   its status.
 *   </li>
 * </ol></p>
 *
 * Normally the user creates the application, describes various facets of the
 * job via {@link JobConf} and then uses the <code>JobClient</code> to submit
 * the job and monitor its progress.
 *
 * <p>Here is an example of how to use <code>JobClient</code>:</p>
 * <p><blockquote><pre>
 *     // Create a new JobConf
 *     JobConf job = new JobConf(new Configuration(), MyJob.class);
 *
 *     // Specify various job-specific parameters
 *     job.setJobName("myjob");
 *
 *     job.setInputPath(new Path("in"));
 *     job.setOutputPath(new Path("out"));
 *
 *     job.setMapperClass(MyJob.MyMapper.class);
 *     job.setReducerClass(MyJob.MyReducer.class);
 *
 *     // Submit the job, then poll for progress until the job is complete
 *     JobClient.runJob(job);
 * </pre></blockquote></p>
 *
 * <h4 id="JobControl">Job Control</h4>
 *
 * <p>At times clients would chain map-reduce jobs to accomplish complex tasks
 * which cannot be done via a single map-reduce job. This is fairly easy since
 * the output of one job typically goes to the distributed file-system and can
 * be used as the input for the next job.</p>
 *
 * <p>However, this also means that the onus of ensuring jobs are complete
 * (success/failure) lies squarely on the clients. In such situations the
 * various job-control options are:
 * <ol>
 *   <li>
 *   {@link #runJob(JobConf)} : submits the job and returns only after
 *   the job has completed.
 *   </li>
 *   <li>
 *   {@link #submitJob(JobConf)} : only submits the job; the returned handle
 *   to the {@link RunningJob} can then be polled to query status and make
 *   scheduling decisions (see the sketch below).
 *   </li>
 *   <li>
 *   {@link JobConf#setJobEndNotificationURI(String)} : sets up a notification
 *   upon job-completion, thus avoiding polling.
 *   </li>
 * </ol></p>
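 *
 * <p>For example, here is a minimal sketch of the second option above
 * (submit, then poll); <code>MyJob</code> is the same illustrative class as
 * in the earlier example, and exception handling is omitted for brevity:</p>
 * <p><blockquote><pre>
 *     JobConf job = new JobConf(new Configuration(), MyJob.class);
 *     // ... configure the job as shown above ...
 *
 *     JobClient client = new JobClient(job);
 *     RunningJob running = client.submitJob(job);
 *     while (!running.isComplete()) {
 *       Thread.sleep(5000);   // poll at a modest interval
 *     }
 *     if (!running.isSuccessful()) {
 *       // react to the failure, e.g. skip jobs that depend on this one
 *     }
 * </pre></blockquote></p>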
 *
 * @see JobConf
 * @see ClusterStatus
 * @see Tool
 * @see DistributedCache
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class JobClient extends CLI {

  @InterfaceAudience.Private
  public static final String MAPREDUCE_CLIENT_RETRY_POLICY_ENABLED_KEY =
      "mapreduce.jobclient.retry.policy.enabled";
  @InterfaceAudience.Private
  public static final boolean MAPREDUCE_CLIENT_RETRY_POLICY_ENABLED_DEFAULT =
      false;
  @InterfaceAudience.Private
  public static final String MAPREDUCE_CLIENT_RETRY_POLICY_SPEC_KEY =
      "mapreduce.jobclient.retry.policy.spec";
  @InterfaceAudience.Private
  public static final String MAPREDUCE_CLIENT_RETRY_POLICY_SPEC_DEFAULT =
      "10000,6,60000,10"; // t1,n1,t2,n2,...

  public static enum TaskStatusFilter { NONE, KILLED, FAILED, SUCCEEDED, ALL }
  private TaskStatusFilter taskOutputFilter = TaskStatusFilter.FAILED;

  static {
    ConfigUtil.loadResources();
  }

  /**
   * A NetworkedJob is an implementation of RunningJob.  It holds
   * a JobProfile object to provide some info, and interacts with the
   * remote service to provide certain functionality.
   */
  static class NetworkedJob implements RunningJob {
    Job job;
    /**
     * We store a JobProfile and a timestamp for when we last
     * acquired the job profile.  If the job is null, then we cannot
     * perform any of the tasks.  The job might be null if the cluster
     * has completely forgotten about the job.  (e.g., 24 hours after the
     * job completes.)
     */
    public NetworkedJob(JobStatus status, Cluster cluster) throws IOException {
      job = Job.getInstance(cluster, status, new JobConf(status.getJobFile()));
    }

    public NetworkedJob(Job job) throws IOException {
      this.job = job;
    }

    public Configuration getConfiguration() {
      return job.getConfiguration();
    }

    /**
     * An identifier for the job
     */
    public JobID getID() {
      return JobID.downgrade(job.getJobID());
    }

    /** @deprecated This method is deprecated and will be removed. Applications should
     * rather use {@link #getID()}.*/
    @Deprecated
    public String getJobID() {
      return getID().toString();
    }

    /**
     * The user-specified job name
     */
    public String getJobName() {
      return job.getJobName();
    }

    /**
     * The name of the job file
     */
    public String getJobFile() {
      return job.getJobFile();
    }

    /**
     * A URL where the job's status can be seen
     */
    public String getTrackingURL() {
      return job.getTrackingURL();
    }

    /**
     * A float between 0.0 and 1.0, indicating the % of map work
     * completed.
     */
    public float mapProgress() throws IOException {
      return job.mapProgress();
    }

    /**
     * A float between 0.0 and 1.0, indicating the % of reduce work
     * completed.
     */
    public float reduceProgress() throws IOException {
      return job.reduceProgress();
    }

    /**
     * A float between 0.0 and 1.0, indicating the % of cleanup work
     * completed.
     */
    public float cleanupProgress() throws IOException {
      try {
        return job.cleanupProgress();
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      }
    }

    /**
     * A float between 0.0 and 1.0, indicating the % of setup work
     * completed.
     */
    public float setupProgress() throws IOException {
      return job.setupProgress();
    }

    /**
     * Returns immediately whether the whole job is done yet or not.
     */
    public synchronized boolean isComplete() throws IOException {
      return job.isComplete();
    }

    /**
     * True iff job completed successfully.
     */
    public synchronized boolean isSuccessful() throws IOException {
      return job.isSuccessful();
    }

    /**
     * Blocks until the job is finished
     */
    public void waitForCompletion() throws IOException {
      try {
        job.waitForCompletion(false);
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      } catch (ClassNotFoundException ce) {
        throw new IOException(ce);
      }
    }

    /**
     * Tells the service to get the state of the current job.
     */
    public synchronized int getJobState() throws IOException {
      try {
        return job.getJobState().getValue();
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      }
    }

    /**
     * Tells the service to terminate the current job.
     */
    public synchronized void killJob() throws IOException {
      job.killJob();
    }

    /** Set the priority of the job.
     * @param priority new priority of the job.
     */
    public synchronized void setJobPriority(String priority)
        throws IOException {
      try {
        job.setPriority(
            org.apache.hadoop.mapreduce.JobPriority.valueOf(priority));
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      }
    }

    /**
     * Kill indicated task attempt.
     * @param taskId the id of the task to kill.
     * @param shouldFail if true the task is failed and added to the failed
     *        tasks list, otherwise it is just killed, without affecting the
     *        job's failure status.
     */
    public synchronized void killTask(TaskAttemptID taskId,
        boolean shouldFail) throws IOException {
      if (shouldFail) {
        job.failTask(taskId);
      } else {
        job.killTask(taskId);
      }
    }

    /** @deprecated Applications should rather use {@link #killTask(TaskAttemptID, boolean)}*/
    @Deprecated
    public synchronized void killTask(String taskId, boolean shouldFail) throws IOException {
      killTask(TaskAttemptID.forName(taskId), shouldFail);
    }

    /**
     * Fetch task completion events from the cluster for this job.
     */
    public synchronized TaskCompletionEvent[] getTaskCompletionEvents(
        int startFrom) throws IOException {
      try {
        org.apache.hadoop.mapreduce.TaskCompletionEvent[] events =
            job.getTaskCompletionEvents(startFrom, 10);
        TaskCompletionEvent[] ret = new TaskCompletionEvent[events.length];
        for (int i = 0; i < events.length; i++) {
          ret[i] = TaskCompletionEvent.downgrade(events[i]);
        }
        return ret;
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      }
    }

    /**
     * Dump stats to screen.
     */
    @Override
    public String toString() {
      return job.toString();
    }

    /**
     * Returns the counters for this job
     */
    public Counters getCounters() throws IOException {
      Counters result = null;
      org.apache.hadoop.mapreduce.Counters temp = job.getCounters();
      if (temp != null) {
        result = Counters.downgrade(temp);
      }
      return result;
    }

    @Override
    public String[] getTaskDiagnostics(TaskAttemptID id) throws IOException {
      try {
        return job.getTaskDiagnostics(id);
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      }
    }

    public String getHistoryUrl() throws IOException {
      try {
        return job.getHistoryUrl();
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      }
    }

    public boolean isRetired() throws IOException {
      try {
        return job.isRetired();
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      }
    }

    boolean monitorAndPrintJob() throws IOException, InterruptedException {
      return job.monitorAndPrintJob();
    }

    @Override
    public String getFailureInfo() throws IOException {
      try {
        return job.getStatus().getFailureInfo();
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      }
    }

    @Override
    public JobStatus getJobStatus() throws IOException {
      try {
        return JobStatus.downgrade(job.getStatus());
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      }
    }
  }

  /**
   * The UGI of the client.  We store this UGI when the client is created and
   * then make sure that the same UGI is used to run the various protocols.
   */
  UserGroupInformation clientUgi;

  /**
   * Create a job client.
   */
  public JobClient() {
  }

  /**
   * Build a job client with the given {@link JobConf}, and connect to the
   * default cluster.
   *
   * @param conf the job configuration.
   * @throws IOException
   */
  public JobClient(JobConf conf) throws IOException {
    init(conf);
  }

  /**
   * Build a job client with the given {@link Configuration},
   * and connect to the default cluster.
   *
   * @param conf the configuration.
   * @throws IOException
   */
  public JobClient(Configuration conf) throws IOException {
    init(new JobConf(conf));
  }

  /**
   * Connect to the default cluster.
   * @param conf the job configuration.
   * @throws IOException
   */
  public void init(JobConf conf) throws IOException {
    setConf(conf);
    cluster = new Cluster(conf);
    clientUgi = UserGroupInformation.getCurrentUser();
  }

  /**
   * Build a job client, connect to the indicated job tracker.
   *
   * @param jobTrackAddr the job tracker to connect to.
   * @param conf configuration.
   */
  public JobClient(InetSocketAddress jobTrackAddr,
                   Configuration conf) throws IOException {
    cluster = new Cluster(jobTrackAddr, conf);
    clientUgi = UserGroupInformation.getCurrentUser();
  }

  /**
   * Close the <code>JobClient</code>.
   */
  public synchronized void close() throws IOException {
    cluster.close();
  }

  /**
   * Get a filesystem handle.  We need this to prepare jobs
   * for submission to the MapReduce system.
   *
   * @return the filesystem handle.
   */
  public synchronized FileSystem getFs() throws IOException {
    try {
      return cluster.getFileSystem();
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Get a handle to the Cluster.
   */
  public Cluster getClusterHandle() {
    return cluster;
  }

  /**
   * Submit a job to the MR system.
   *
   * This returns a handle to the {@link RunningJob} which can be used to track
   * the running-job.
   *
   * @param jobFile the job configuration.
   * @return a handle to the {@link RunningJob} which can be used to track the
   *         running-job.
   * @throws FileNotFoundException
   * @throws InvalidJobConfException
   * @throws IOException
   */
  public RunningJob submitJob(String jobFile) throws FileNotFoundException,
                                                     InvalidJobConfException,
                                                     IOException {
    // Load in the submitted job details
    JobConf job = new JobConf(jobFile);
    return submitJob(job);
  }

  /**
   * Submit a job to the MR system.
   * This returns a handle to the {@link RunningJob} which can be used to track
   * the running-job.
   *
   * @param conf the job configuration.
   * @return a handle to the {@link RunningJob} which can be used to track the
   *         running-job.
   * @throws FileNotFoundException
   * @throws IOException
   */
  public RunningJob submitJob(final JobConf conf) throws FileNotFoundException,
                                                         IOException {
    return submitJobInternal(conf);
  }

  @InterfaceAudience.Private
  public RunningJob submitJobInternal(final JobConf conf)
      throws FileNotFoundException, IOException {
    try {
      conf.setBooleanIfUnset("mapred.mapper.new-api", false);
      conf.setBooleanIfUnset("mapred.reducer.new-api", false);
      Job job = clientUgi.doAs(new PrivilegedExceptionAction<Job> () {
        @Override
        public Job run() throws IOException, ClassNotFoundException,
            InterruptedException {
          Job job = Job.getInstance(conf);
          job.submit();
          return job;
        }
      });
      // update our Cluster instance with the one created by Job for submission
      // (we can't pass our Cluster instance to Job, since Job wraps the config
      // instance, and the two configs would then diverge)
      cluster = job.getCluster();
      return new NetworkedJob(job);
    } catch (InterruptedException ie) {
      throw new IOException("interrupted", ie);
    }
  }

  private Job getJobUsingCluster(final JobID jobid) throws IOException,
      InterruptedException {
    return clientUgi.doAs(new PrivilegedExceptionAction<Job>() {
      public Job run() throws IOException, InterruptedException {
        return cluster.getJob(jobid);
      }
    });
  }

  /**
   * Get a {@link RunningJob} object to track an ongoing job.  Returns
   * null if the id does not correspond to any known job.
   *
   * @param jobid the jobid of the job.
   * @return the {@link RunningJob} handle to track the job, null if the
   *         <code>jobid</code> doesn't correspond to any known job.
   * @throws IOException
   */
  public RunningJob getJob(final JobID jobid) throws IOException {
    try {
      Job job = getJobUsingCluster(jobid);
      if (job != null) {
        JobStatus status = JobStatus.downgrade(job.getStatus());
        if (status != null) {
          return new NetworkedJob(status, cluster);
        }
      }
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
    return null;
  }

  /**@deprecated Applications should rather use {@link #getJob(JobID)}.
   */
  @Deprecated
  public RunningJob getJob(String jobid) throws IOException {
    return getJob(JobID.forName(jobid));
  }

  private static final TaskReport[] EMPTY_TASK_REPORTS = new TaskReport[0];

  /**
   * Get the information of the current state of the map tasks of a job.
   *
   * @param jobId the job to query.
   * @return the list of all of the map tips.
   * @throws IOException
   */
  public TaskReport[] getMapTaskReports(JobID jobId) throws IOException {
    return getTaskReports(jobId, TaskType.MAP);
  }

  private TaskReport[] getTaskReports(final JobID jobId, TaskType type) throws
      IOException {
    try {
      Job j = getJobUsingCluster(jobId);
      if (j == null) {
        return EMPTY_TASK_REPORTS;
      }
      return TaskReport.downgradeArray(j.getTaskReports(type));
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**@deprecated Applications should rather use {@link #getMapTaskReports(JobID)}*/
  @Deprecated
  public TaskReport[] getMapTaskReports(String jobId) throws IOException {
    return getMapTaskReports(JobID.forName(jobId));
  }

  /**
   * Get the information of the current state of the reduce tasks of a job.
   *
   * @param jobId the job to query.
   * @return the list of all of the reduce tips.
   * @throws IOException
   */
  public TaskReport[] getReduceTaskReports(JobID jobId) throws IOException {
    return getTaskReports(jobId, TaskType.REDUCE);
  }

  /**
   * Get the information of the current state of the cleanup tasks of a job.
   *
   * @param jobId the job to query.
   * @return the list of all of the cleanup tips.
   * @throws IOException
   */
  public TaskReport[] getCleanupTaskReports(JobID jobId) throws IOException {
    return getTaskReports(jobId, TaskType.JOB_CLEANUP);
  }

  /**
   * Get the information of the current state of the setup tasks of a job.
   *
   * @param jobId the job to query.
   * @return the list of all of the setup tips.
   * @throws IOException
   */
  public TaskReport[] getSetupTaskReports(JobID jobId) throws IOException {
    return getTaskReports(jobId, TaskType.JOB_SETUP);
  }

  /**@deprecated Applications should rather use {@link #getReduceTaskReports(JobID)}*/
  @Deprecated
  public TaskReport[] getReduceTaskReports(String jobId) throws IOException {
    return getReduceTaskReports(JobID.forName(jobId));
  }

  /**
   * Display the information about a job's tasks, of a particular type and
   * in a particular state.
   *
   * @param jobId the ID of the job
   * @param type the type of the task (map/reduce/setup/cleanup)
   * @param state the state of the task
   *              (pending/running/completed/failed/killed)
   */
  public void displayTasks(final JobID jobId, String type, String state)
      throws IOException {
    try {
      Job job = getJobUsingCluster(jobId);
      super.displayTasks(job, type, state);
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Get status information about the Map-Reduce cluster.
   *
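   * <p>A rough usage sketch (illustrative only; <code>client</code> is assumed
   * to be a connected <code>JobClient</code>, and error handling is omitted):
   * <pre>
   *     ClusterStatus status = client.getClusterStatus();
   *     int trackers = status.getTaskTrackers();   // number of active trackers
   *     int mapSlots = status.getMaxMapTasks();    // total map slot capacity
   * </pre>
   *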
   * @return the status information about the Map-Reduce cluster as an object
   *         of {@link ClusterStatus}.
   * @throws IOException
   */
  public ClusterStatus getClusterStatus() throws IOException {
    try {
      return clientUgi.doAs(new PrivilegedExceptionAction<ClusterStatus>() {
        public ClusterStatus run() throws IOException, InterruptedException {
          ClusterMetrics metrics = cluster.getClusterStatus();
          return new ClusterStatus(metrics.getTaskTrackerCount(), metrics
              .getBlackListedTaskTrackerCount(), cluster
              .getTaskTrackerExpiryInterval(), metrics.getOccupiedMapSlots(),
              metrics.getOccupiedReduceSlots(), metrics.getMapSlotCapacity(),
              metrics.getReduceSlotCapacity(), cluster.getJobTrackerStatus(),
              metrics.getDecommissionedTaskTrackerCount(), metrics
              .getGrayListedTaskTrackerCount());
        }
      });
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  private Collection<String> arrayToStringList(TaskTrackerInfo[] objs) {
    Collection<String> list = new ArrayList<String>();
    for (TaskTrackerInfo info : objs) {
      list.add(info.getTaskTrackerName());
    }
    return list;
  }

  private Collection<BlackListInfo> arrayToBlackListInfo(TaskTrackerInfo[] objs) {
    Collection<BlackListInfo> list = new ArrayList<BlackListInfo>();
    for (TaskTrackerInfo info : objs) {
      BlackListInfo binfo = new BlackListInfo();
      binfo.setTrackerName(info.getTaskTrackerName());
      binfo.setReasonForBlackListing(info.getReasonForBlacklist());
      binfo.setBlackListReport(info.getBlacklistReport());
      list.add(binfo);
    }
    return list;
  }

  /**
   * Get status information about the Map-Reduce cluster.
   *
   * @param detailed if true then get a detailed status including the
   *                 tracker names
   * @return the status information about the Map-Reduce cluster as an object
   *         of {@link ClusterStatus}.
   * @throws IOException
   */
  public ClusterStatus getClusterStatus(boolean detailed) throws IOException {
    try {
      return clientUgi.doAs(new PrivilegedExceptionAction<ClusterStatus>() {
        public ClusterStatus run() throws IOException, InterruptedException {
          ClusterMetrics metrics = cluster.getClusterStatus();
          return new ClusterStatus(arrayToStringList(cluster.getActiveTaskTrackers()),
              arrayToBlackListInfo(cluster.getBlackListedTaskTrackers()),
              cluster.getTaskTrackerExpiryInterval(), metrics.getOccupiedMapSlots(),
              metrics.getOccupiedReduceSlots(), metrics.getMapSlotCapacity(),
              metrics.getReduceSlotCapacity(),
              cluster.getJobTrackerStatus());
        }
      });
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Get the jobs that are not completed and not failed.
   *
   * @return array of {@link JobStatus} for the running/to-be-run jobs.
   * @throws IOException
   */
  public JobStatus[] jobsToComplete() throws IOException {
    List<JobStatus> stats = new ArrayList<JobStatus>();
    for (JobStatus stat : getAllJobs()) {
      if (!stat.isJobComplete()) {
        stats.add(stat);
      }
    }
    return stats.toArray(new JobStatus[0]);
  }

  /**
   * Get the jobs that are submitted.
   *
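   * <p>A small illustrative sketch (again assumes a connected
   * <code>client</code>; error handling omitted):
   * <pre>
   *     for (JobStatus status : client.getAllJobs()) {
   *       System.out.println(status.getJobID() + " complete: "
   *           + status.isJobComplete());
   *     }
   * </pre>
   *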
   * @return array of {@link JobStatus} for the submitted jobs.
   * @throws IOException
   */
  public JobStatus[] getAllJobs() throws IOException {
    try {
      org.apache.hadoop.mapreduce.JobStatus[] jobs =
          clientUgi.doAs(new PrivilegedExceptionAction<
              org.apache.hadoop.mapreduce.JobStatus[]> () {
            public org.apache.hadoop.mapreduce.JobStatus[] run()
                throws IOException, InterruptedException {
              return cluster.getAllJobStatuses();
            }
          });
      JobStatus[] stats = new JobStatus[jobs.length];
      for (int i = 0; i < jobs.length; i++) {
        stats[i] = JobStatus.downgrade(jobs[i]);
      }
      return stats;
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Utility that submits a job, then polls for progress until the job is
   * complete.
   *
   * @param job the job configuration.
   * @throws IOException if the job fails
   */
  public static RunningJob runJob(JobConf job) throws IOException {
    JobClient jc = new JobClient(job);
    RunningJob rj = jc.submitJob(job);
    try {
      if (!jc.monitorAndPrintJob(job, rj)) {
        throw new IOException("Job failed!");
      }
    } catch (InterruptedException ie) {
      Thread.currentThread().interrupt();
    }
    return rj;
  }

  /**
   * Monitor a job and print status in real-time as progress is made and tasks
   * fail.
   * @param conf the job's configuration
   * @param job the job to track
   * @return true if the job succeeded
   * @throws IOException if communication to the JobTracker fails
   */
  public boolean monitorAndPrintJob(JobConf conf,
                                    RunningJob job
      ) throws IOException, InterruptedException {
    return ((NetworkedJob) job).monitorAndPrintJob();
  }

  static String getTaskLogURL(TaskAttemptID taskId, String baseUrl) {
    return (baseUrl + "/tasklog?plaintext=true&attemptid=" + taskId);
  }

  static Configuration getConfiguration(String jobTrackerSpec) {
    Configuration conf = new Configuration();
    if (jobTrackerSpec != null) {
      if (jobTrackerSpec.indexOf(":") >= 0) {
        conf.set("mapred.job.tracker", jobTrackerSpec);
      } else {
        String classpathFile = "hadoop-" + jobTrackerSpec + ".xml";
        URL validate = conf.getResource(classpathFile);
        if (validate == null) {
          throw new RuntimeException(classpathFile + " not found on CLASSPATH");
        }
        conf.addResource(classpathFile);
      }
    }
    return conf;
  }

  /**
   * Sets the output filter for tasks.  Only those tasks are printed whose
   * output matches the filter.
   * @param newValue task filter.
   */
  @Deprecated
  public void setTaskOutputFilter(TaskStatusFilter newValue) {
    this.taskOutputFilter = newValue;
  }

  /**
   * Get the task output filter out of the JobConf.
   *
   * @param job the JobConf to examine.
   * @return the filter level.
   */
  public static TaskStatusFilter getTaskOutputFilter(JobConf job) {
    return TaskStatusFilter.valueOf(job.get("jobclient.output.filter",
        "FAILED"));
  }

  /**
   * Modify the JobConf to set the task output filter.
   *
   * @param job the JobConf to modify.
   * @param newValue the value to set.
   */
  public static void setTaskOutputFilter(JobConf job,
                                         TaskStatusFilter newValue) {
    job.set("jobclient.output.filter", newValue.toString());
  }

  /**
   * Returns the task output filter.
   * @return task filter.
   */
  @Deprecated
  public TaskStatusFilter getTaskOutputFilter() {
    return this.taskOutputFilter;
  }

  protected long getCounter(org.apache.hadoop.mapreduce.Counters cntrs,
      String counterGroupName, String counterName) throws IOException {
    Counters counters = Counters.downgrade(cntrs);
    return counters.findCounter(counterGroupName, counterName).getValue();
  }

  /**
   * Get status information about the max available Maps in the cluster.
   *
   * @return the max available Maps in the cluster
   * @throws IOException
   */
  public int getDefaultMaps() throws IOException {
    try {
      return clientUgi.doAs(new PrivilegedExceptionAction<Integer>() {
        @Override
        public Integer run() throws IOException, InterruptedException {
          return cluster.getClusterStatus().getMapSlotCapacity();
        }
      });
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Get status information about the max available Reduces in the cluster.
   *
   * @return the max available Reduces in the cluster
   * @throws IOException
   */
  public int getDefaultReduces() throws IOException {
    try {
      return clientUgi.doAs(new PrivilegedExceptionAction<Integer>() {
        @Override
        public Integer run() throws IOException, InterruptedException {
          return cluster.getClusterStatus().getReduceSlotCapacity();
        }
      });
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Grab the jobtracker system directory path where job-specific files are
   * to be placed.
   *
   * @return the system directory where job-specific files are to be placed.
   */
  public Path getSystemDir() {
    try {
      return clientUgi.doAs(new PrivilegedExceptionAction<Path>() {
        @Override
        public Path run() throws IOException, InterruptedException {
          return cluster.getSystemDir();
        }
      });
    } catch (IOException ioe) {
      return null;
    } catch (InterruptedException ie) {
      return null;
    }
  }

  /**
   * Checks if the job directory is clean and has all the required components
   * (<code>job.xml</code> and <code>job.split</code>) for (re)starting the job.
   */
  public static boolean isJobDirValid(Path jobDirPath, FileSystem fs)
      throws IOException {
    FileStatus[] contents = fs.listStatus(jobDirPath);
    int matchCount = 0;
    if (contents != null && contents.length >= 2) {
      for (FileStatus status : contents) {
        if ("job.xml".equals(status.getPath().getName())) {
          ++matchCount;
        }
        if ("job.split".equals(status.getPath().getName())) {
          ++matchCount;
        }
      }
      if (matchCount == 2) {
        return true;
      }
    }
    return false;
  }

  /**
   * Fetch the staging area directory for the application.
   *
   * @return path to staging area directory
   * @throws IOException
   */
  public Path getStagingAreaDir() throws IOException {
    try {
      return clientUgi.doAs(new PrivilegedExceptionAction<Path>() {
        @Override
        public Path run() throws IOException, InterruptedException {
          return cluster.getStagingAreaDir();
        }
      });
    } catch (InterruptedException ie) {
      // throw RuntimeException instead for compatibility reasons
      throw new RuntimeException(ie);
    }
  }

  private JobQueueInfo getJobQueueInfo(QueueInfo queue) {
    JobQueueInfo ret = new JobQueueInfo(queue);
    // make sure to convert any children
    if (queue.getQueueChildren().size() > 0) {
      List<JobQueueInfo> childQueues = new ArrayList<JobQueueInfo>(queue
          .getQueueChildren().size());
      for (QueueInfo child : queue.getQueueChildren()) {
        childQueues.add(getJobQueueInfo(child));
      }
      ret.setChildren(childQueues);
    }
    return ret;
  }

  private JobQueueInfo[] getJobQueueInfoArray(QueueInfo[] queues)
      throws IOException {
    JobQueueInfo[] ret = new JobQueueInfo[queues.length];
    for (int i = 0; i < queues.length; i++) {
      ret[i] = getJobQueueInfo(queues[i]);
    }
    return ret;
  }

  /**
   * Returns an array of queue information objects about the root level queues
   * configured.
   *
   * @return the array of root level JobQueueInfo objects
   * @throws IOException
   */
  public JobQueueInfo[] getRootQueues() throws IOException {
    try {
      return clientUgi.doAs(new PrivilegedExceptionAction<JobQueueInfo[]>() {
        public JobQueueInfo[] run() throws IOException, InterruptedException {
          return getJobQueueInfoArray(cluster.getRootQueues());
        }
      });
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Returns an array of queue information objects about the immediate children
   * of queue <code>queueName</code>.
   *
   * @param queueName name of the queue.
   * @return the array of immediate children JobQueueInfo objects
   * @throws IOException
   */
  public JobQueueInfo[] getChildQueues(final String queueName) throws IOException {
    try {
      return clientUgi.doAs(new PrivilegedExceptionAction<JobQueueInfo[]>() {
        public JobQueueInfo[] run() throws IOException, InterruptedException {
          return getJobQueueInfoArray(cluster.getChildQueues(queueName));
        }
      });
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Return an array of queue information objects about all the Job Queues
   * configured.
   *
   * @return Array of JobQueueInfo objects
   * @throws IOException
   */
  public JobQueueInfo[] getQueues() throws IOException {
    try {
      return clientUgi.doAs(new PrivilegedExceptionAction<JobQueueInfo[]>() {
        public JobQueueInfo[] run() throws IOException, InterruptedException {
          return getJobQueueInfoArray(cluster.getQueues());
        }
      });
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Gets all the jobs which were added to a particular Job Queue.
   *
   * @param queueName name of the Job Queue
   * @return Array of jobs present in the job queue
   * @throws IOException
   */
  public JobStatus[] getJobsFromQueue(final String queueName) throws IOException {
    try {
      QueueInfo queue = clientUgi.doAs(new PrivilegedExceptionAction<QueueInfo>() {
        @Override
        public QueueInfo run() throws IOException, InterruptedException {
          return cluster.getQueue(queueName);
        }
      });
      if (queue == null) {
        return null;
      }
      org.apache.hadoop.mapreduce.JobStatus[] stats =
          queue.getJobStatuses();
      JobStatus[] ret = new JobStatus[stats.length];
      for (int i = 0; i < stats.length; i++) {
        ret[i] = JobStatus.downgrade(stats[i]);
      }
      return ret;
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Gets the queue information associated with a particular Job Queue.
   *
   * @param queueName name of the job queue.
   * @return Queue information associated with the particular queue.
   * @throws IOException
   */
  public JobQueueInfo getQueueInfo(final String queueName) throws IOException {
    try {
      QueueInfo queueInfo = clientUgi.doAs(new
          PrivilegedExceptionAction<QueueInfo>() {
        public QueueInfo run() throws IOException, InterruptedException {
          return cluster.getQueue(queueName);
        }
      });
      if (queueInfo != null) {
        return new JobQueueInfo(queueInfo);
      }
      return null;
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Gets the Queue ACLs for the current user.
   * @return array of QueueAclsInfo objects for the current user.
   * @throws IOException
   */
  public QueueAclsInfo[] getQueueAclsForCurrentUser() throws IOException {
    try {
      org.apache.hadoop.mapreduce.QueueAclsInfo[] acls =
          clientUgi.doAs(new
              PrivilegedExceptionAction<org.apache.hadoop.mapreduce.QueueAclsInfo[]>() {
            public org.apache.hadoop.mapreduce.QueueAclsInfo[] run()
                throws IOException, InterruptedException {
              return cluster.getQueueAclsForCurrentUser();
            }
          });
      QueueAclsInfo[] ret = new QueueAclsInfo[acls.length];
      for (int i = 0; i < acls.length; i++) {
        ret[i] = QueueAclsInfo.downgrade(acls[i]);
      }
      return ret;
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Get a delegation token for the user from the JobTracker.
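   *
   * <p>A small illustrative sketch (the renewer name is an arbitrary example;
   * error handling omitted):
   * <pre>
   *     Token&lt;DelegationTokenIdentifier&gt; token =
   *         client.getDelegationToken(new Text("renewer"));
   *     // later, renew or cancel via the token itself
   *     token.renew(conf);
   *     token.cancel(conf);
   * </pre>
   *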
   * @param renewer the user who can renew the token
   * @return the new token
   * @throws IOException
   */
  public Token<DelegationTokenIdentifier>
      getDelegationToken(final Text renewer) throws IOException, InterruptedException {
    return clientUgi.doAs(new
        PrivilegedExceptionAction<Token<DelegationTokenIdentifier>>() {
      public Token<DelegationTokenIdentifier> run() throws IOException,
          InterruptedException {
        return cluster.getDelegationToken(renewer);
      }
    });
  }

  /**
   * Renew a delegation token.
   * @param token the token to renew
   * @return the new expiration time of the token
   * @throws InvalidToken
   * @throws IOException
   * @deprecated Use {@link Token#renew} instead
   */
  public long renewDelegationToken(Token<DelegationTokenIdentifier> token
      ) throws InvalidToken, IOException,
               InterruptedException {
    return token.renew(getConf());
  }

  /**
   * Cancel a delegation token from the JobTracker.
   * @param token the token to cancel
   * @throws IOException
   * @deprecated Use {@link Token#cancel} instead
   */
  public void cancelDelegationToken(Token<DelegationTokenIdentifier> token
      ) throws InvalidToken, IOException,
               InterruptedException {
    token.cancel(getConf());
  }

  /**
   * Run the <code>JobClient</code> as a command-line tool.
   */
  public static void main(String argv[]) throws Exception {
    int res = ToolRunner.run(new JobClient(), argv);
    System.exit(res);
  }
}