/* Schedwi
   Copyright (C) 2014, 2015 Herve Quatremain

   This file is part of Schedwi.

   Schedwi is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   Schedwi is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

/* @file update_status_jobset_tree.c
 * Update the jobs and jobsets status in the workload trees.
 */

#include <schedwi.h>

#if STDC_HEADERS
#include <stdlib.h>
#include <string.h>
#else
#if HAVE_STDLIB_H
#include <stdlib.h>
#endif
#if HAVE_STRING_H
#include <string.h>
#endif
#endif

#if HAVE_TIME_H
#include <time.h>
#endif

#include <lwc_log.h>
#include <sql_job.h>
#include <sql_status.h>
#include <sql_hierarchy.h>
#include <job_status_state.h>
#include <job_status_node.h>
#include <check_waiting_can_start.h>
#include <stopjob.h>
#include <xmem.h>
#include <update_status_jobset_tree.h>



/*
 * Error callback function handed to the SQL helpers used in this file
 * (sql_status_updated_workload_list(), sql_job_get_max_duration(),
 * sql_job_get_children() and sql_status_num_waiting_links()).
 *
 * @param[in] data Opaque user pointer (unused; callers in this file pass
 *                 NULL).
 * @param[in] msg Error message to log, or NULL to log a generic message.
 * @param[in] err_code Error code (unused).
 */
static void
sql_error_logger (void *data, const char *msg, int err_code)
{
	(void) data;
	(void) err_code;

	if (msg != NULL) {
		/*
		 * lwc_writeLog() takes a printf-like format.  The message is
		 * never used as the format itself so that a stray `%' in it
		 * cannot be interpreted as a conversion specification.
		 */
		lwc_writeLog (LOG_ERR, "%s", msg);
	}
	else {
		lwc_writeLog (LOG_ERR,
	    _("Database error while trying to retrieve the workloads"));
	}
}


/**
 * Retrieve the list of workloads for which a job/jobset status has been
 * modified since the previous successful call.  According to the original
 * documentation of this function, the workloads for yesterday and today are
 * always part of the returned list.
 *
 * The time of the previous successful call is kept in a static variable
 * (0 for the very first call) and is only refreshed when the query succeeds.
 *
 * @param[in] now The current time.  Remembered as the reference time for
 *                the next call.
 * @return The workload list (to be freed by the caller by
 *         lwc_delLL (rows, (void (*)(const void *)) sql_free_row);
 *         or NULL in case of error (a message has been logged by
 *         lwc_writeLog())
 */
static lwc_LL *
get_workloads (time_t now)
{
	static time_t previous_call = 0;
	lwc_LL *workloads;
	int rc;


	rc = sql_status_updated_workload_list (	&workloads, previous_call,
						sql_error_logger, NULL);
	if (rc == 0) {
		previous_call = now;
		return workloads;
	}
	return NULL;
}


/**
 * Check the status of the given job.
 *
 * A running job is stopped when it has been running for longer than its
 * maximum duration.  A waiting job is started (status set to running) when
 * check_waiting_can_start() allows it, or marked as failed when its start
 * time limit has been reached.
 *
 * @param[in] workload_date Workload date of the job (YYYYMMDD).
 * @param[in] job_id Job ID.
 * @param[in] job_name_with_path Full job name.
 * @param[in] hierarchy_list Hierarchy list (see sql_hierarchy.c) for this
 *                           job parent jobset.  The job is pushed on the
 *                           list for the duration of the call and popped
 *                           before returning.
 * @param[in] now Current time (as in time(2))
 * @param[in] parent_status Parent jobset status.
 * @param[out] status Status set for the given job.  May be NULL if the
 *                    caller is not interested in the value.  Only set on
 *                    success.
 * @param[in] num_iter Iteration number of the caller's loop for this
 *                     workload.  The maximum duration check is only done
 *                     when this value is 0 (or less), otherwise several stop
 *                     requests would be sent to the agent.
 * @return 0 on success or -1 in case of error (a message has been logged by
 *         lwc_writeLog())
 */
static int
update_status_job (	int workload_date,
			unsigned long long int job_id,
			const char *job_name_with_path,
			lwc_LL *hierarchy_list,
			time_t now,
			job_status_state parent_status,
			job_status_state *status,
			int num_iter)
{
	job_status_node_ptr ptr;
	char *err_msg;
	short int max_duration;
	int ret, result;


	ptr = job_status_node_get (workload_date, job_id, job_name_with_path);
	if (ptr == NULL) {
		return -1;
	}

	/* Add the job to the hierarchy list */
	err_msg = NULL;
	if (hierarchy_list_push_job (	hierarchy_list,
					workload_date, job_id,
					&err_msg) != 0)
	{
		if (err_msg != NULL) {
			/* Never use the message itself as the format */
			lwc_writeLog (LOG_ERR, "%s", err_msg);
			free (err_msg);
		}
		else {
			lwc_writeLog (LOG_ERR,
			_("Database error while retrieving a job details"));
		}
		/* Nothing has been pushed: only the node has to be freed */
		job_status_node_destroy (ptr);
		return -1;
	}

	/* From now on, every exit goes through the `cleanup' label */
	result = -1;

	/*
	 * If the job is running, check that it has not been running for too
	 * long (just for the first iteration, otherwise, we'll send multiple
	 * stop requests to the agent).
	 */
	if (ptr->status == JOB_STATUS_STATE_RUNNING && num_iter <= 0) {
		if (sql_job_get_max_duration (	workload_date,
						hierarchy_list,
						&max_duration,
						sql_error_logger, NULL) != 0)
		{
			goto cleanup;
		}
		/* max_duration is multiplied by 60 to compare it with now */
		if (	   max_duration > 0
			&& ptr->time_status_set + max_duration * 60 < now)
		{
			/* Too long - Stop the job */
			lwc_writeLog (	LOG_INFO,
	_("Workload %d: %s (id %lld): job running for too long. Stopping..."),
					workload_date,
					job_name_with_path,
					job_id);
			err_msg = NULL;
			/*
			 * A failure to stop the job is only logged: it is
			 * not an error for the status update itself.
			 */
			if (stopjob (	hierarchy_list, job_name_with_path,
					workload_date, job_id, &err_msg) != 0)
			{
				if (err_msg != NULL) {
					lwc_writeLog (	LOG_ERR,
			_("Workload %d: %s (id %lld): job failed to stop: %s"),
							workload_date,
							job_name_with_path,
							job_id,
							err_msg);
				}
				else {
					lwc_writeLog (	LOG_ERR,
			_("Workload %d: %s (id %lld): job failed to stop"),
							workload_date,
							job_name_with_path,
							job_id);
				}
			}
			free (err_msg);
		}
	}

	/*
	 * Check if the job can start. If yes, set its status to running and
	 * start the associated command.
	 */
	else if (ptr->status == JOB_STATUS_STATE_WAITING) {
		ret = check_waiting_can_start (	ptr, hierarchy_list, now,
						parent_status);
		if (ret < 0) {
			goto cleanup;
		}
		if (ret == 2) {
			/* Start limit reached */
			job_status_node_set_status (
					ptr, JOB_STATUS_STATE_FAILED, now);
			job_status_node_set_message (
					ptr, _("Start time limit reached"));
			if (job_status_node_set (hierarchy_list, ptr) != 0) {
				goto cleanup;
			}
		}
		else if (ret == 0) {
			/* Start the job */
			job_status_node_set_status (
					ptr, JOB_STATUS_STATE_RUNNING, now);
			if (job_status_node_set (hierarchy_list, ptr) != 0) {
				goto cleanup;
			}
		}
		/* Any other value: the job stays in the waiting state */
	}

	if (status != NULL) {
		*status = ptr->status;
	}
	result = 0;

cleanup:
	hierarchy_list_pop_job (hierarchy_list);
	job_status_node_destroy (ptr);
	return result;
}


/**
 * Recursively (depth first) update the jobs and jobsets status.
 *
 * The jobset is first started if it is waiting and allowed to start.  Then
 * every child is updated (update_status_job() for the jobs and a recursive
 * call for the jobsets).  Finally, the status of the jobset is derived from
 * the status of its children.
 *
 * @param[in] workload_date Workload date of the jobset (YYYYMMDD).
 * @param[in] jobset_id Jobset ID.
 * @param[in] jobset_name_with_path Full jobset name.
 * @param[in] hierarchy_list Hierarchy list (see sql_hierarchy.c) for this
 *                           jobset parent.  The jobset is pushed on the
 *                           list for the duration of the call and popped
 *                           before returning.
 * @param[in] now Current time (as in time(2))
 * @param[in] parent_status Parent jobset status.
 * @param[out] status Status set for the given jobset.  May be NULL if the
 *                    caller is not interested in the value.  Only set on
 *                    success.
 * @param[in] num_iter Iteration number of the caller's loop for this
 *                     workload.  Forwarded unchanged to the children.
 * @return 0 on success or -1 in case of error (a message has been logged by
 *         lwc_writeLog())
 */
static int
update_status_jobset (	int workload_date,
			unsigned long long int jobset_id,
			const char *jobset_name_with_path,
			lwc_LL *hierarchy_list,
			time_t now,
			job_status_state parent_status,
			job_status_state *status,
			int num_iter)
{
	job_status_node_ptr ptr;
	lwc_LL *children;
	row_item_t *child;
	unsigned long long int id;
	size_t parent_name_length;
	char *path, *err_msg;
	int num_children_per_status[NUM_JOB_STATUS_STATE] = { 0 };
	int num_children, ret, result;
	job_status_state new_status;


	ptr = job_status_node_get (	workload_date, jobset_id,
					jobset_name_with_path);
	if (ptr == NULL) {
		return -1;
	}

	/* Add the current jobset to the hierarchy list */
	err_msg = NULL;
	if (hierarchy_list_push_job (	hierarchy_list,
					workload_date, jobset_id,
					&err_msg) != 0)
	{
		if (err_msg != NULL) {
			/* Never use the message itself as the format */
			lwc_writeLog (LOG_ERR, "%s", err_msg);
			free (err_msg);
		}
		else {
			lwc_writeLog (LOG_ERR,
	    		_("Database error while retrieving a jobset details"));
		}
		/* Nothing has been pushed: only the node has to be freed */
		job_status_node_destroy (ptr);
		return -1;
	}

	/* From now on, every exit goes through the `cleanup' label */
	result = -1;

	/* Check if the jobset can start. If yes, set its status to running */
	if (ptr->status == JOB_STATUS_STATE_WAITING) {
		ret = check_waiting_can_start (	ptr, hierarchy_list, now,
						parent_status);
		if (ret < 0) {
			goto cleanup;
		}
		if (ret == 2) {
			/* Start limit reached */
			job_status_node_set_status (
					ptr, JOB_STATUS_STATE_FAILED, now);
			job_status_node_set_message (
					ptr, _("Start time limit reached"));
			if (job_status_node_set (hierarchy_list, ptr) != 0) {
				goto cleanup;
			}
		}
		else if (ret == 0) {
			/* Mark the jobset as started */
			job_status_node_set_status (
					ptr, JOB_STATUS_STATE_RUNNING, now);
			if (job_status_node_set (hierarchy_list, ptr) != 0) {
				goto cleanup;
			}
		}
		/* Any other value: the jobset stays in the waiting state */
	}


	/*
	 * Collect the status of the children
	 */

	if (sql_job_get_children (	workload_date, jobset_id, &children,
					sql_error_logger, NULL) != 0)
	{
		goto cleanup;
	}

	parent_name_length = strlen (jobset_name_with_path);
	num_children = lwc_getNumNode (children);
	while ((child = (row_item_t *) lwc_delStartLL (children)) != NULL) {
		/*
		 * child[0] --> ID
		 * child[1] --> Type (0: jobset and 1: job)
		 * child[2] --> Start time
		 * child[3] --> Job/Jobset name
		 */

		/* Get the job/jobset ID */
		id = (unsigned long long int) sql_row_item2ll (&(child[0]));

		/*
		 * Build the full path name of the job/jobset.  The buffer is
		 * large enough for the parent name, a `/', the child name
		 * and the ending '\0'.
		 */
		path = (char *)xmalloc (parent_name_length + child[3].len + 2);
		if (id == ROOT_JOBSET) {
			path[0] = '/';
			path[1] = '\0';
		}
		else if (jobset_id == ROOT_JOBSET) {
			/* Child of the root: no parent name to prepend */
			path[0] = '/';
			strcpy (path + 1, child[3].value_string);
		}
		else {
			strcpy (path, jobset_name_with_path);
			path[parent_name_length] = '/';
			strcpy (path + parent_name_length + 1,
				child[3].value_string);
		}

		/* Update the child.  Its status is returned in new_status */
		if (sql_row_item2ll (&(child[1])) == JOBSET) {
			/* It's a jobset */
			ret = update_status_jobset (	workload_date, id,
							path,
							hierarchy_list,
							now,
							ptr->status,
							&new_status,
							num_iter);
		}
		else {
			/* It's a job */
			ret = update_status_job (	workload_date, id, path,
							hierarchy_list, now,
							ptr->status,
							&new_status,
							num_iter);
		}
		free (path);
		sql_free_row (child);
		if (ret != 0) {
			/* Free the children that have not been visited yet */
			lwc_delLL (children,
				(void (*)(const void *)) sql_free_row);
			goto cleanup;
		}
		num_children_per_status[new_status]++;
	}
	/* All the rows have been removed and freed in the loop */
	lwc_delLL (children, NULL);

	/*
	 * Now, compute the status of the current jobset
	 */

	if (num_children == 0) {
		/* No children: a running jobset is immediately completed */
		if (ptr->status == JOB_STATUS_STATE_RUNNING) {
			new_status = JOB_STATUS_STATE_COMPLETED;
		}
		else {
			new_status = ptr->status;
		}
	}
	else if (num_children_per_status[JOB_STATUS_STATE_COMPLETED] >=
								num_children)
	{
		/* All children are completed */
		new_status = JOB_STATUS_STATE_COMPLETED;
	}
	else if (	  num_children_per_status[JOB_STATUS_STATE_FAILED]
			+ num_children_per_status[JOB_STATUS_STATE_COMPLETED]
			>= num_children)
	{
		/* All children are finished but at least one failed */
		new_status = JOB_STATUS_STATE_FAILED;
	}
	else if (num_children_per_status[JOB_STATUS_STATE_RUNNING] != 0) {
		/* At least one child is running */
		new_status = JOB_STATUS_STATE_RUNNING;
	}
	else {
		/* Nothing to change */
		new_status = ptr->status;
	}

	/* Only store the status if it is different from the current one */
	if (new_status != ptr->status) {
		job_status_node_set_status (ptr, new_status, now);
		if (job_status_node_set (hierarchy_list, ptr) != 0) {
			goto cleanup;
		}
	}
	if (status != NULL) {
		*status = new_status;
	}
	result = 0;

cleanup:
	hierarchy_list_pop_job (hierarchy_list);
	job_status_node_destroy (ptr);
	return result;
}


/**
 * Update the status of the jobsets.
 *
 * For every workload returned by get_workloads(), the whole job/jobset tree
 * is walked from the root jobset.  The walk is repeated as long as the
 * number of jobs/jobsets waiting on links keeps changing.
 *
 * @return 0 on success or -1 in case of error (a message has been logged by
 *         lwc_writeLog()).  An error on a workload does not prevent the
 *         other workloads from being processed.
 */
int
update_status_jobset_tree (void)
{
	lwc_LL *rows, *hierarchy_list;
	row_item_t *row;
	int ret, nb_waiting_on_links, i, workload_date, num_iter;
	time_t now;


	now = time (NULL);
	rows = get_workloads (now);
	if (rows == NULL) {
		return -1;
	}

	ret = 0;
	while ((row = (row_item_t *)lwc_delStartLL (rows)) != NULL) {
		/*
		 * row[0] is the workload date (YYYYMMDD)
		 */
		workload_date = (int) sql_row_item2ll (&(row[0]));
		/*
		 * Loop until no more jobs/jobsets have been started.  For
		 * instance a job may be waiting for a link to another job
		 * which gets started after the first one has been checked.
		 * In the next iteration, the link is then resolved and the
		 * first job can start.
		 */
		i = num_iter = 0;
		do {
			nb_waiting_on_links = i;
			hierarchy_list = lwc_newLL ();
			/* The root jobset is always seen as running */
			if (update_status_jobset (
					workload_date,
					ROOT_JOBSET,
					"/",
					hierarchy_list,
					now,
					JOB_STATUS_STATE_RUNNING,
					NULL,
					num_iter) != 0)
			{
				/* Remember the error but keep going */
				ret = -1;
			}
			num_iter++;
			hierarchy_list_destroy (hierarchy_list);
			/*
			 * NOTE(review): the value is only compared with 0 and
			 * with the previous count.  Presumably a negative
			 * value means a database error, which then only ends
			 * the loop - confirm in sql_status.c.
			 */
			i = sql_status_num_waiting_links (
					workload_date, sql_error_logger, NULL);
		} while (i > 0 && i != nb_waiting_on_links);
		sql_free_row (row);
	}
	/* All the rows have been removed and freed in the loop */
	lwc_delLL (rows, NULL);
	return ret;
}

/*------------------------======= End Of File =======------------------------*/
