001 /** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019 package org.apache.hadoop.mapreduce.lib.jobcontrol; 020 021 import java.io.IOException; 022 import java.util.ArrayList; 023 import java.util.Collection; 024 import java.util.Iterator; 025 import java.util.LinkedList; 026 import java.util.List; 027 028 import org.apache.commons.logging.Log; 029 import org.apache.commons.logging.LogFactory; 030 import org.apache.hadoop.classification.InterfaceAudience; 031 import org.apache.hadoop.classification.InterfaceStability; 032 import org.apache.hadoop.mapred.jobcontrol.Job; 033 import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob.State; 034 import org.apache.hadoop.util.StringUtils; 035 036 /** 037 * This class encapsulates a set of MapReduce jobs and its dependency. 038 * 039 * It tracks the states of the jobs by placing them into different tables 040 * according to their states. 041 * 042 * This class provides APIs for the client app to add a job to the group 043 * and to get the jobs in the group in different states. When a job is 044 * added, an ID unique to the group is assigned to the job. 045 * 046 * This class has a thread that submits jobs when they become ready, 047 * monitors the states of the running jobs, and updates the states of jobs 048 * based on the state changes of their depending jobs states. The class 049 * provides APIs for suspending/resuming the thread, and 050 * for stopping the thread. 051 * 052 */ 053 @InterfaceAudience.Public 054 @InterfaceStability.Evolving 055 public class JobControl implements Runnable { 056 private static final Log LOG = LogFactory.getLog(JobControl.class); 057 058 // The thread can be in one of the following state 059 public static enum ThreadState {RUNNING, SUSPENDED,STOPPED, STOPPING, READY}; 060 061 private ThreadState runnerState; // the thread state 062 063 private LinkedList<ControlledJob> jobsInProgress = new LinkedList<ControlledJob>(); 064 private LinkedList<ControlledJob> successfulJobs = new LinkedList<ControlledJob>(); 065 private LinkedList<ControlledJob> failedJobs = new LinkedList<ControlledJob>(); 066 067 private long nextJobID; 068 private String groupName; 069 070 /** 071 * Construct a job control for a group of jobs. 072 * @param groupName a name identifying this group 073 */ 074 public JobControl(String groupName) { 075 this.nextJobID = -1; 076 this.groupName = groupName; 077 this.runnerState = ThreadState.READY; 078 } 079 080 private static List<ControlledJob> toList( 081 LinkedList<ControlledJob> jobs) { 082 ArrayList<ControlledJob> retv = new ArrayList<ControlledJob>(); 083 synchronized (jobs) { 084 for (ControlledJob job : jobs) { 085 retv.add(job); 086 } 087 } 088 return retv; 089 } 090 091 synchronized private List<ControlledJob> getJobsIn(State state) { 092 LinkedList<ControlledJob> l = new LinkedList<ControlledJob>(); 093 for(ControlledJob j: jobsInProgress) { 094 if(j.getJobState() == state) { 095 l.add(j); 096 } 097 } 098 return l; 099 } 100 101 /** 102 * @return the jobs in the waiting state 103 */ 104 public List<ControlledJob> getWaitingJobList() { 105 return getJobsIn(State.WAITING); 106 } 107 108 /** 109 * @return the jobs in the running state 110 */ 111 public List<ControlledJob> getRunningJobList() { 112 return getJobsIn(State.RUNNING); 113 } 114 115 /** 116 * @return the jobs in the ready state 117 */ 118 public List<ControlledJob> getReadyJobsList() { 119 return getJobsIn(State.READY); 120 } 121 122 /** 123 * @return the jobs in the success state 124 */ 125 public List<ControlledJob> getSuccessfulJobList() { 126 return toList(this.successfulJobs); 127 } 128 129 public List<ControlledJob> getFailedJobList() { 130 return toList(this.failedJobs); 131 } 132 133 private String getNextJobID() { 134 nextJobID += 1; 135 return this.groupName + this.nextJobID; 136 } 137 138 /** 139 * Add a new controlled job. 140 * @param aJob the new controlled job 141 */ 142 synchronized public String addJob(ControlledJob aJob) { 143 String id = this.getNextJobID(); 144 aJob.setJobID(id); 145 aJob.setJobState(State.WAITING); 146 jobsInProgress.add(aJob); 147 return id; 148 } 149 150 /** 151 * Add a new job. 152 * @param aJob the new job 153 */ 154 synchronized public String addJob(Job aJob) { 155 return addJob((ControlledJob) aJob); 156 } 157 158 /** 159 * Add a collection of jobs 160 * 161 * @param jobs 162 */ 163 public void addJobCollection(Collection<ControlledJob> jobs) { 164 for (ControlledJob job : jobs) { 165 addJob(job); 166 } 167 } 168 169 /** 170 * @return the thread state 171 */ 172 public ThreadState getThreadState() { 173 return this.runnerState; 174 } 175 176 /** 177 * set the thread state to STOPPING so that the 178 * thread will stop when it wakes up. 179 */ 180 public void stop() { 181 this.runnerState = ThreadState.STOPPING; 182 } 183 184 /** 185 * suspend the running thread 186 */ 187 public void suspend () { 188 if (this.runnerState == ThreadState.RUNNING) { 189 this.runnerState = ThreadState.SUSPENDED; 190 } 191 } 192 193 /** 194 * resume the suspended thread 195 */ 196 public void resume () { 197 if (this.runnerState == ThreadState.SUSPENDED) { 198 this.runnerState = ThreadState.RUNNING; 199 } 200 } 201 202 synchronized public boolean allFinished() { 203 return jobsInProgress.isEmpty(); 204 } 205 206 /** 207 * The main loop for the thread. 208 * The loop does the following: 209 * Check the states of the running jobs 210 * Update the states of waiting jobs 211 * Submit the jobs in ready state 212 */ 213 public void run() { 214 try { 215 this.runnerState = ThreadState.RUNNING; 216 while (true) { 217 while (this.runnerState == ThreadState.SUSPENDED) { 218 try { 219 Thread.sleep(5000); 220 } 221 catch (Exception e) { 222 //TODO the thread was interrupted, do something!!! 223 } 224 } 225 226 synchronized(this) { 227 Iterator<ControlledJob> it = jobsInProgress.iterator(); 228 while(it.hasNext()) { 229 ControlledJob j = it.next(); 230 LOG.debug("Checking state of job "+j); 231 switch(j.checkState()) { 232 case SUCCESS: 233 successfulJobs.add(j); 234 it.remove(); 235 break; 236 case FAILED: 237 case DEPENDENT_FAILED: 238 failedJobs.add(j); 239 it.remove(); 240 break; 241 case READY: 242 j.submit(); 243 break; 244 case RUNNING: 245 case WAITING: 246 //Do Nothing 247 break; 248 } 249 } 250 } 251 252 if (this.runnerState != ThreadState.RUNNING && 253 this.runnerState != ThreadState.SUSPENDED) { 254 break; 255 } 256 try { 257 Thread.sleep(5000); 258 } 259 catch (Exception e) { 260 //TODO the thread was interrupted, do something!!! 261 } 262 if (this.runnerState != ThreadState.RUNNING && 263 this.runnerState != ThreadState.SUSPENDED) { 264 break; 265 } 266 } 267 }catch(Throwable t) { 268 LOG.error("Error while trying to run jobs.",t); 269 //Mark all jobs as failed because we got something bad. 270 failAllJobs(t); 271 } 272 this.runnerState = ThreadState.STOPPED; 273 } 274 275 synchronized private void failAllJobs(Throwable t) { 276 String message = "Unexpected System Error Occured: "+ 277 StringUtils.stringifyException(t); 278 Iterator<ControlledJob> it = jobsInProgress.iterator(); 279 while(it.hasNext()) { 280 ControlledJob j = it.next(); 281 try { 282 j.failJob(message); 283 } catch (IOException e) { 284 LOG.error("Error while tyring to clean up "+j.getJobName(), e); 285 } catch (InterruptedException e) { 286 LOG.error("Error while tyring to clean up "+j.getJobName(), e); 287 } finally { 288 failedJobs.add(j); 289 it.remove(); 290 } 291 } 292 } 293 }