001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.mapreduce.lib.jobcontrol;
020    
021    import java.io.IOException;
022    import java.util.ArrayList;
023    import java.util.Collection;
024    import java.util.Iterator;
025    import java.util.LinkedList;
026    import java.util.List;
027    
028    import org.apache.commons.logging.Log;
029    import org.apache.commons.logging.LogFactory;
030    import org.apache.hadoop.classification.InterfaceAudience;
031    import org.apache.hadoop.classification.InterfaceStability;
032    import org.apache.hadoop.mapred.jobcontrol.Job;
033    import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob.State;
034    import org.apache.hadoop.util.StringUtils;
035    
036    /** 
037     *  This class encapsulates a set of MapReduce jobs and its dependency.
038     *   
039     *  It tracks the states of the jobs by placing them into different tables
040     *  according to their states. 
041     *  
042     *  This class provides APIs for the client app to add a job to the group 
043     *  and to get the jobs in the group in different states. When a job is 
044     *  added, an ID unique to the group is assigned to the job. 
045     *  
046     *  This class has a thread that submits jobs when they become ready, 
047     *  monitors the states of the running jobs, and updates the states of jobs
048     *  based on the state changes of their depending jobs states. The class 
049     *  provides APIs for suspending/resuming the thread, and 
050     *  for stopping the thread.
051     *  
052     */
053    @InterfaceAudience.Public
054    @InterfaceStability.Evolving
055    public class JobControl implements Runnable {
056      private static final Log LOG = LogFactory.getLog(JobControl.class);
057    
058      // The thread can be in one of the following state
059      public static enum ThreadState {RUNNING, SUSPENDED,STOPPED, STOPPING, READY};
060            
061      private ThreadState runnerState;                      // the thread state
062            
063      private LinkedList<ControlledJob> jobsInProgress = new LinkedList<ControlledJob>();
064      private LinkedList<ControlledJob> successfulJobs = new LinkedList<ControlledJob>();
065      private LinkedList<ControlledJob> failedJobs = new LinkedList<ControlledJob>();
066            
067      private long nextJobID;
068      private String groupName;
069            
070      /** 
071       * Construct a job control for a group of jobs.
072       * @param groupName a name identifying this group
073       */
074      public JobControl(String groupName) {
075        this.nextJobID = -1;
076        this.groupName = groupName;
077        this.runnerState = ThreadState.READY;
078      }
079            
080      private static List<ControlledJob> toList(
081                       LinkedList<ControlledJob> jobs) {
082        ArrayList<ControlledJob> retv = new ArrayList<ControlledJob>();
083        synchronized (jobs) {
084          for (ControlledJob job : jobs) {
085            retv.add(job);
086          }
087        }
088        return retv;
089      }
090            
091      synchronized private List<ControlledJob> getJobsIn(State state) {
092        LinkedList<ControlledJob> l = new LinkedList<ControlledJob>();
093        for(ControlledJob j: jobsInProgress) {
094          if(j.getJobState() == state) {
095            l.add(j);
096          }
097        }
098        return l;
099      }
100      
101      /**
102       * @return the jobs in the waiting state
103       */
104      public List<ControlledJob> getWaitingJobList() {
105        return getJobsIn(State.WAITING);
106      }
107            
108      /**
109       * @return the jobs in the running state
110       */
111      public List<ControlledJob> getRunningJobList() {
112        return getJobsIn(State.RUNNING);
113      }
114            
115      /**
116       * @return the jobs in the ready state
117       */
118      public List<ControlledJob> getReadyJobsList() {
119        return getJobsIn(State.READY);
120      }
121            
122      /**
123       * @return the jobs in the success state
124       */
125      public List<ControlledJob> getSuccessfulJobList() {
126        return toList(this.successfulJobs);
127      }
128            
129      public List<ControlledJob> getFailedJobList() {
130        return toList(this.failedJobs);
131      }
132            
133      private String getNextJobID() {
134        nextJobID += 1;
135        return this.groupName + this.nextJobID;
136      }
137    
138      /**
139       * Add a new controlled job.
140       * @param aJob the new controlled job
141       */
142      synchronized public String addJob(ControlledJob aJob) {
143        String id = this.getNextJobID();
144        aJob.setJobID(id);
145        aJob.setJobState(State.WAITING);
146        jobsInProgress.add(aJob);
147        return id;  
148      }
149    
150      /**
151       * Add a new job.
152       * @param aJob the new job
153       */
154      synchronized public String addJob(Job aJob) {
155        return addJob((ControlledJob) aJob);
156      }
157    
158      /**
159       * Add a collection of jobs
160       * 
161       * @param jobs
162       */
163      public void addJobCollection(Collection<ControlledJob> jobs) {
164        for (ControlledJob job : jobs) {
165          addJob(job);
166        }
167      }
168            
169      /**
170       * @return the thread state
171       */
172      public ThreadState getThreadState() {
173        return this.runnerState;
174      }
175            
176      /**
177       * set the thread state to STOPPING so that the 
178       * thread will stop when it wakes up.
179       */
180      public void stop() {
181        this.runnerState = ThreadState.STOPPING;
182      }
183            
184      /**
185       * suspend the running thread
186       */
187      public void suspend () {
188        if (this.runnerState == ThreadState.RUNNING) {
189          this.runnerState = ThreadState.SUSPENDED;
190        }
191      }
192            
193      /**
194       * resume the suspended thread
195       */
196      public void resume () {
197        if (this.runnerState == ThreadState.SUSPENDED) {
198          this.runnerState = ThreadState.RUNNING;
199        }
200      }
201            
202      synchronized public boolean allFinished() {
203        return jobsInProgress.isEmpty();
204      }
205            
206      /**
207       *  The main loop for the thread.
208       *  The loop does the following:
209       *    Check the states of the running jobs
210       *    Update the states of waiting jobs
211       *    Submit the jobs in ready state
212       */
213      public void run() {
214        try {
215          this.runnerState = ThreadState.RUNNING;
216          while (true) {
217            while (this.runnerState == ThreadState.SUSPENDED) {
218              try {
219                Thread.sleep(5000);
220              }
221              catch (Exception e) {
222                //TODO the thread was interrupted, do something!!!
223              }
224            }
225            
226            synchronized(this) {
227              Iterator<ControlledJob> it = jobsInProgress.iterator();
228              while(it.hasNext()) {
229                ControlledJob j = it.next();
230                LOG.debug("Checking state of job "+j);
231                switch(j.checkState()) {
232                case SUCCESS:
233                  successfulJobs.add(j);
234                  it.remove();
235                  break;
236                case FAILED:
237                case DEPENDENT_FAILED:
238                  failedJobs.add(j);
239                  it.remove();
240                  break;
241                case READY:
242                  j.submit();
243                  break;
244                case RUNNING:
245                case WAITING:
246                  //Do Nothing
247                  break;
248                }
249              }
250            }
251            
252            if (this.runnerState != ThreadState.RUNNING && 
253                this.runnerState != ThreadState.SUSPENDED) {
254              break;
255            }
256            try {
257              Thread.sleep(5000);
258            }
259            catch (Exception e) {
260              //TODO the thread was interrupted, do something!!!
261            }
262            if (this.runnerState != ThreadState.RUNNING && 
263                this.runnerState != ThreadState.SUSPENDED) {
264              break;
265            }
266          }
267        }catch(Throwable t) {
268          LOG.error("Error while trying to run jobs.",t);
269          //Mark all jobs as failed because we got something bad.
270          failAllJobs(t);
271        }
272        this.runnerState = ThreadState.STOPPED;
273      }
274    
275      synchronized private void failAllJobs(Throwable t) {
276        String message = "Unexpected System Error Occured: "+
277        StringUtils.stringifyException(t);
278        Iterator<ControlledJob> it = jobsInProgress.iterator();
279        while(it.hasNext()) {
280          ControlledJob j = it.next();
281          try {
282            j.failJob(message);
283          } catch (IOException e) {
284            LOG.error("Error while tyring to clean up "+j.getJobName(), e);
285          } catch (InterruptedException e) {
286            LOG.error("Error while tyring to clean up "+j.getJobName(), e);
287          } finally {
288            failedJobs.add(j);
289            it.remove();
290          }
291        }
292      }
293    }