#! /bin/bash

# Copyright (C) 2012 Charles Atkinson
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA

# Purpose: indexes the document collation using Xapian Omega's omindex

# Usage: 
#    * The current working directory must be this script's directory
#    * Options (see function usage)
#      -c: configuration directory.  Required
#      -h: print help and exit
#      -l: log file.  Required.  Use /dev/tty to get logging on screen
#      -t: directory for temporary files.  Required
#    * Automatically outputs to log if there is no tty to log to
#    * To force output to log: export SET_HAVE_TTY_FALSE=true

# Function call tree
#    +
#    |
#    +-- initialise
#    |   |
#    |   +-- parse_cmdline
#    |   |   |
#    |   |   +-- usage
#    |   |
#    |   +-- parse_omindex_sh_cfg (library)
#    |   |
#    |   +-- randomise_filter_choice
#    |
#    +-- index
#    |
#    +-- finalise
#
# Utility functions called from various places:
#    ck_file msg

# Function definitions in alphabetical order.  Execution begins after the last function definition.

#--------------------------
# Name: finalise
# Purpose: final logging and get out of here
# Arguments:
#   $1: return code to exit with (defaults to 0).  129, 130, 131 and 143
#       are logged as finalising on SIGHUP, SIGINT, SIGQUIT and SIGTERM
# Global variables read: fifo_fn, global_error_flag, global_warning_flag,
#   my_nam, parse_cfg_for_bash_rb_log_fn
# Does not return: always exits the script.  The exit code is raised to 1
#   when it would be less than 1 and a warning or error flag is set.
#--------------------------
function finalise {

    local msg rc

    # Remove temporary files
    # ~~~~~~~~~~~~~~~~~~~~~~
    # The names are only set once the files may exist so test before removing
    [[ ${parse_cfg_for_bash_rb_log_fn:-} != '' ]] && rm -f -- "$parse_cfg_for_bash_rb_log_fn"
    [[ ${fifo_fn:-} != '' ]] && rm -f -- "$fifo_fn"

    # Final logging
    # ~~~~~~~~~~~~~
    # Note: the local variable msg and the function msg are distinct; bash
    # keeps variables and functions in separate namespaces
    msg=
    rc=${1:-0}
    case $rc in
        129 )
            msg I "$my_nam: finalising on SIGHUP"
            ;;
        130 )
            msg I "$my_nam: finalising on SIGINT"
            ;;
        131 )
            msg I "$my_nam: finalising on SIGQUIT"
            ;;
        143 )
            msg I "$my_nam: finalising on SIGTERM"
            ;;
    esac
    if [[ ${global_warning_flag:-} ]]; then
        msg="$msg"$'\n'"  There was at least one WARNING"
    fi
    if [[ ${global_error_flag:-} ]]; then
        msg="$msg"$'\n'"  There was at least one ERROR"
    fi
    if [[ $msg != '' ]]; then
        # Warnings and errors must not result in a "success" exit code
        [[ $rc -lt 1 ]] && rc=1
        msg I "Error and warning summary:$msg"
    fi
    msg I "$my_nam: exiting with return code $rc"

    exit "$rc"

}  # end of function finalise

#--------------------------
# Name: index
# Purpose: indexes the document collation
# Arguments: none
# Global variables read: cfg_dir, collation_root_dir, filter_option_argument,
#   index_db_dir, log_dir, log_fn, my_nam, omindex_log_cleaning, tmp_dir, true
# Global variables set: bin_dir, clean_omindex_log_cfg_fn,
#   clean_omindex_log_log_fn, fifo_fn, now, omindex_log_created, omindex_log_fn
# Return value: that of ./analyse_omindex_log.sh (the last command run)
# Exits via finalise 1 when a required file or directory check or a cd fails
#--------------------------
function index {
    local buf filter_option i j mk_fifo_rc

    msg I "$my_nam: starting indexing (index database directory: $index_db_dir)"
    ck_file "$index_db_dir" d:rw: || finalise 1

    # Build omindex filter options array
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Each filter argument becomes two words: --filter <argument>
    if [[ ${filter_option_argument:-} != '' ]]; then
        for (( i=0; i<${#filter_option_argument[*]}; i++ ))
        do
            j=$(( i * 2 ))
            filter_option[j]='--filter'
            filter_option[j+1]=${filter_option_argument[i]}
        done
    fi

    # Set up redirection for the omindex command
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # stdout and stderr go either directly to the omindex error log or, when
    # log cleaning is enabled, to a FIFO read by clean_omindex_log.rb which
    # is given the omindex error log as its --output.
    now=$( date +'%y-%m-%d@%H:%M' )
    omindex_log_fn=$log_dir/omindex_error.$now.log
    msg I "$my_nam: omindex error log: $omindex_log_fn"
    omindex_log_created=$true
    exec 3>&1; exec 4>&2                  # Duplicate (save) existing file descriptors
    if [[ ! $omindex_log_cleaning ]]; then
        exec 1>"$omindex_log_fn" 2>&1
    else
        fifo_fn=$tmp_dir/omindex.$now.fifo
        clean_omindex_log_cfg_fn=$cfg_dir/clean_omindex_log.cfg
        ck_file "$clean_omindex_log_cfg_fn" f:r: || finalise 1
        clean_omindex_log_log_fn=$log_dir/clean_omindex_log.$now.log
        buf=$( mkfifo "$fifo_fn" 2>&1 )
        mk_fifo_rc=$?
        if [[ $mk_fifo_rc -eq 0 && $buf = '' ]]; then
            ./clean_omindex_log.rb \
                --config "$clean_omindex_log_cfg_fn" \
                --input "$fifo_fn" \
                --output "$omindex_log_fn" \
                --log "$clean_omindex_log_log_fn" \
                2>&1 &
            sleep 1    # Allow time for clean_omindex_log.rb to start reading the FIFO
            exec 1>"$fifo_fn" 2>&1
        else
            # Fall back to writing the uncleaned log directly
            msg E "$my_nam: mkfifo rc: $mk_fifo_rc, output: $buf"
            msg I "$my_nam: Disabling omindex log cleaning because of the error"
            exec 1>"$omindex_log_fn" 2>&1
        fi
    fi

    # Run omindex
    # ~~~~~~~~~~~
    # The command is run from the temporary directory; the current directory
    # is saved so the helper scripts can be found again afterwards.
    # Two variants are kept, rather than always expanding the filter_option
    # array, so an empty array is never expanded.
    # NOTE(review): the logged command text includes "time" only in the
    # no-filter variant; left unchanged in case log consumers match on it.
    bin_dir=$PWD
    cd "$tmp_dir" || finalise 1
    if [[ ${filter_option:-} = '' ]]; then
        buf=$( echo time omindex --db "$index_db_dir" \
            --stemmer=english \
            --url / \
            "$collation_root_dir" )
        msg I "$my_nam: omindex command: $buf"
        time omindex --db "$index_db_dir" \
            --stemmer=english \
            --url / \
            "$collation_root_dir"
    else
        buf=$( echo omindex --db "$index_db_dir" \
            "${filter_option[@]}" \
            --stemmer=english \
            --url / \
            "$collation_root_dir" )
        msg I "$my_nam: omindex command: $buf"
        time omindex --db "$index_db_dir" \
            "${filter_option[@]}" \
            --stemmer=english \
            --url / \
            "$collation_root_dir"
    fi

    # Dismantle the omindex command's redirection
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    if [[ $omindex_log_cleaning ]]; then
        # Ignore SIGPIPE in case the FIFO reader has already gone away
        trap '' 'PIPE'
        # Send termination message, enough to flush buffers
        for (( i=0; i<50; i++ ))
        do
            echo 'All done now. You can flush and exit' >&2
        done
        sleep 5    # Allow time to finish writing the cleaned log
    fi
    exec 1>&3; exec 2>&4                  # Restore file descriptors
    exec 3>&-; exec 4>&-                  # Free unused file descriptors

    # Analyse the omindex log
    # ~~~~~~~~~~~~~~~~~~~~~~~
    msg I "$my_nam: starting to analyse the omindex error log"
    cd "$bin_dir" || finalise 1
    ./analyse_omindex_log.sh -c "$cfg_dir" -l "$log_fn" -o "$omindex_log_fn" -t "$tmp_dir"

}  # end of function index

#--------------------------
# Name: initialise
# Purpose: sets up environment, parses command line, sets up logging and parses the config file
# Arguments: the script's command line arguments
# Global variables set: collation_root_dir, emsg, filter_option_argument,
#   omindex_log_cleaning, parse_cfg_for_bash_rb_log_fn, using_unoconv_wrapper
#   plus those set by parse_cmdline and by the library function
#   parse_omindex_sh_cfg (presumably index_db_dir, filters and java_opts,
#   which are read here but not assigned in this file)
# Environment variables exported: JAVA_OPTS (when java_opts has a value) and
#   UNOCONV_WRAPPER_LOG (when using_unoconv_wrapper is set)
# Exits: 1 when the library cannot be sourced; via finalise 1 on
#   configuration errors
#--------------------------
function initialise {

    local bash_lib buf buf2 cfg_fn extra_log_text i my_cfg_fn now

    # Source the bash library
    # ~~~~~~~~~~~~~~~~~~~~~~~
    bash_lib=./bash_lib.sh
    source "$bash_lib"
    if [[ $? -ne 0 ]]; then
        echo "Unable to read the bash library, '$bash_lib'. Exiting" >&2
        exit 1
    fi

    # Parse command line
    # ~~~~~~~~~~~~~~~~~~
    # Has to be done now to determine log directory
    parse_cmdline "${@:-}"

    # Redirect output to log and log startup command
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    exec 1>>"$log_fn"
    exec 2>>"$log_fn"
    msg I "$my_nam: started by: $0 $*"

    # Parse the common configuration file
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # emsg accumulates every configuration error found from here on so they
    # can all be reported together
    emsg=
    cfg_fn=$cfg_dir/collate.cfg
    ck_file "$cfg_fn" f:r: || finalise 1
    now=$( date +'%y-%m-%d@%H:%M' )
    parse_cfg_for_bash_rb_log_fn=$log_dir/parse_cfg_for_bash.rb.$now.log
    buf=$( ./parse_cfg_for_bash.rb --config "$cfg_fn" --log "$parse_cfg_for_bash_rb_log_fn" 2>&1 )
    if [[ ! $buf =~ ^Parameters ]]; then
        msg E "$my_nam: did not find 'Parameters' in parse_cfg_for_bash.rb output:"$'\n'"$buf"
        finalise 1
    fi
    collation_root_dir=$( echo "$buf" | grep 'CollationRootDir: ' \
        | sed --regexp-extended -e 's/  CollationRootDir: //' -e 's/[[:space:]]*$//' \
    )
    [[ $collation_root_dir = '' ]] && \
        emsg="$emsg"$'\n'"  $cfg_fn: CollationRootDir keyword not found or has no value"

    # Parse this script's configuration file
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # emsg is deliberately not reset here; doing so would discard any
    # CollationRootDir error recorded above
    omindex_log_cleaning=$true    # Default; may be overridden by the configuration file
    my_cfg_fn=$cfg_dir/omindex.sh.cfg
    ck_file "$my_cfg_fn" f:r: || finalise 1
    parse_omindex_sh_cfg "$my_cfg_fn" || finalise 1
    if [[ ${index_db_dir:-} = '' ]]; then
        emsg="$emsg"$'\n'"  $my_cfg_fn: keyword 'omega index database directory' missing or has no value"
    fi
    omindex_log_cleaning=${omindex_log_cleaning,,}    # Convert to lowercase
    case $omindex_log_cleaning in
        false )
            ;;
        $true | true )
            omindex_log_cleaning=true    # For config logging; set to $true later
            ;;
        * )
            emsg="$emsg"$'\n'"  $my_cfg_fn: keyword 'omindex log cleaning': invalid value: $omindex_log_cleaning"
            ;;
    esac

    # Incorporate any filters from the configuration file
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # $filters holds one filter per line
    using_unoconv_wrapper=$false
    i=0
    if [[ ${filters:-} = '' ]]; then
        filter_option_argument=
    else
        while read -r
        do
            # An associative array is not used because they were relatively
            # new at the time of writing so many systems would not have a
            # version of bash that supports them
            filter_option_argument[i]=$REPLY
            i=$(( i + 1 ))
        done <<< "$( echo "${filters}" )"    # Command substitution drops trailing newlines
        randomise_filter_choice
    fi

    # Check directory permissions
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Any output from ck_file is treated as an error message
    buf=$( ck_file "$collation_root_dir" d:rx: 2>&1 )
    [[ $buf != '' ]] && emsg="$emsg"$'\n'"  $cfg_fn: keyword CollationRootDir "$'\n'"    $buf"
    buf=$( ck_file "$index_db_dir" d:wx: 2>&1 )
    if [[ $buf != '' ]]; then
        # The index database directory is not usable; accept that if the
        # second check passes.
        # NOTE(review): the %/*/ pattern only removes the last path component
        # when index_db_dir ends with "/"; otherwise the same directory is
        # checked again -- confirm the configuration guarantees a trailing "/"
        buf2=$( ck_file "${index_db_dir%/*/}/" d:wx: 2>&1 )
        if [[ $buf2 != '' ]]; then
            emsg="$emsg"$'\n'"  $my_cfg_fn: keyword 'omega index database directory': "$'\n'"    $buf"$'\n'"    $buf2"
        fi
    fi

    # Report any configuration errors
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    if [[ $emsg != '' ]]
    then
        msg E "$my_nam: configuration file(s) errors:$emsg"
        finalise 1
    fi

    # Set environment variables for called programs
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    [[ ${java_opts:-} != '' ]] && export JAVA_OPTS=$java_opts
    [[ $using_unoconv_wrapper ]] \
        && export UNOCONV_WRAPPER_LOG=$log_dir/unoconv_wrapper.$now.log

    # Log configuration values
    # ~~~~~~~~~~~~~~~~~~~~~~~~
    extra_log_text=
    [[ ${java_opts:-} != '' ]] && extra_log_text+=$'\n'"  JAVA_OPTS: $JAVA_OPTS"
    if [[ ${filter_option_argument:-} != '' ]]; then
        for (( i=0; i<${#filter_option_argument[*]}; i++ ))
        do
            extra_log_text+=$'\n'"  Filter: ${filter_option_argument[i]}"
        done
    fi
    [[ $using_unoconv_wrapper ]] \
        && extra_log_text+=$'\n'"  \$UNOCONV_WRAPPER_LOG for unoconv_wrapper.sh: $UNOCONV_WRAPPER_LOG"
    msg I "$my_nam: configuration values:
  Configuration directory: $cfg_dir $( my_readlink "$cfg_dir" )
  Log directory: $log_dir $( my_readlink "$log_dir" )
  Collation root directory: $collation_root_dir $( my_readlink "$collation_root_dir" )
  Index database directory: $index_db_dir $( my_readlink "$index_db_dir" )$extra_log_text
  Omindex log cleaning: $omindex_log_cleaning"

    # Normalise after logging
    # ~~~~~~~~~~~~~~~~~~~~~~~
    # Convert the logged text "true"/"false" to the $true/$false values that
    # the [[ $omindex_log_cleaning ]] tests elsewhere rely on
    if [[ $omindex_log_cleaning = false ]]; then
        omindex_log_cleaning=$false
    else
        omindex_log_cleaning=$true
    fi

}  # end of function initialise

#--------------------------
# Name: parse_cmdline
# Purpose: parses the command line
# Arguments: the script's command line arguments
# Options recognised: -c cfg_dir, -h, -l log_file, -t tmp_dir
# Global variables set: cfg_dir, emsg, log_dir, log_fn, tmp_dir
#   (directory names have any trailing "/"s removed)
# Exits: 0 after printing help for -h; 1 after reporting all command line
#   errors found
#--------------------------
function parse_cmdline {

    # Logging is not set up yet so any error messages are written to stderr.
    # This script is normally run by run_scripts.sh which directs stdout and stderr
    # to its own log.

    local buf opt OPTIND=1

    cfg_dir=unset
    emsg=
    log_dir=
    log_fn=unset
    tmp_dir=unset
    # The leading ":" selects getopts' silent error reporting so the offending
    # option letter is available in OPTARG for the error messages below
    while getopts :c:hl:t: opt
    do
        case $opt in
            c )
                cfg_dir=$OPTARG
                ;;
            h )
                usage verbose
                exit 0
                ;;
            l )
                log_fn=$OPTARG
                ;;
            t )
                tmp_dir=$OPTARG
                ;;
            : )
                emsg+=$'\n'"  Option -$OPTARG requires an argument"
                ;;
            * )
                emsg+=$'\n'"  Invalid option '-$OPTARG'"
                ;;
        esac
    done

    # Test for extra arguments
    # ~~~~~~~~~~~~~~~~~~~~~~~~
    # "$*" is tested, not $#, because the caller passes a single empty
    # argument when the script was given none
    shift $(( OPTIND - 1 ))
    if [[ $* != '' ]]; then
        emsg+=$'\n'"  Invalid extra argument(s) '$*'"
    fi

    # Test for mandatory options not set and directory permissions
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Any output from ck_file is treated as an error message.  Errors are
    # appended to emsg so that all of them are reported, not only the last.
    if [[ $cfg_dir != 'unset' ]]; then
        buf=$( ck_file "$cfg_dir" d:rx: 2>&1 )
        [[ $buf != '' ]] && emsg+=$'\n'"  Configuration directory '$cfg_dir': $buf"
    else
        emsg+=$'\n'"  Mandatory option -c not given"
    fi
    # Remove any trailing "/"s (done with a loop so extglob is not required)
    while [[ $cfg_dir == */ ]]; do cfg_dir=${cfg_dir%/}; done

    if [[ $log_fn != 'unset' ]]; then
        if [[ $log_fn != /dev/tty ]]; then
            log_dir=${log_fn%/*}
            buf=$( ck_file "$log_dir" d:rx: 2>&1 )
            [[ $buf != '' ]] && emsg+=$'\n'"  Log directory '$log_dir': $buf"
        else
            # Logging to screen; other log files go in the temporary directory
            log_dir=${tmp_dir:-}
        fi
    else
        emsg+=$'\n'"  Mandatory option -l not given"
    fi
    while [[ $log_dir == */ ]]; do log_dir=${log_dir%/}; done

    if [[ $tmp_dir != 'unset' ]]; then
        buf=$( ck_file "$tmp_dir" d:rx: 2>&1 )
        [[ $buf != '' ]] && emsg+=$'\n'"  Temporary directory '$tmp_dir': $buf"
    else
        emsg+=$'\n'"  Mandatory option -t not given"
    fi
    while [[ $tmp_dir == */ ]]; do tmp_dir=${tmp_dir%/}; done

    # Report any command line errors
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    if [[ $emsg != '' ]]; then
        echo "$emsg" >&2
        usage
        exit 1
    fi

}  # end of function parse_cmdline

#--------------------------
# Name: randomise_filter_choice
# Purpose: when multiple filters were given in the configuration file
#    for the same "MIME type", chooses one at random.
# Arguments: none
# Global variables read: filter_option_argument (array of "MIME type:command"
#    strings indexed from 0), true
# Global variables set: filter_option_argument (rebuilt with one member per
#    unique MIME type, in order of first appearance), using_unoconv_wrapper
#    (set to $true when a chosen filter mentions unoconv_wrapper.sh)
#--------------------------
function randomise_filter_choice {

    local buf command commands i indexes j mime_type mime_types mime_types_seen
    local unique_mime_type unique_mime_types

    # Get list of unique MIME types
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # mime_types_seen is a ":"-delimited list, for example ":a/b:c/d:"
    mime_types_seen=':'
    for (( i=0; i<${#filter_option_argument[*]}; i++ ))
    do
        mime_type=${filter_option_argument[i]%%:*}
        mime_types[i]=$mime_type
        # A quoted pattern gives a literal comparison.  A regular expression
        # must not be used: MIME types can contain regex metacharacters,
        # for example the "+" in image/svg+xml
        [[ $mime_types_seen != *":$mime_type:"* ]] && mime_types_seen+="$mime_type:"
        commands[i]=${filter_option_argument[i]#*:}
        command=${commands[i]%% *}    # First word of the filter command
        buf=$( ck_cmd "$command" 2>&1 )
        [[ $buf != '' ]] && msg E "${filter_option_argument[i]}: $buf"
    done
    mime_types_seen=${mime_types_seen%:}
    mime_types_seen=${mime_types_seen#:}

    # Rebuild filter_option_argument with single member for each unique MIME type
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    unset filter_option_argument
    # IFS is changed for the read command only so the caller's IFS is preserved
    IFS=':' read -r -a unique_mime_types <<< "$mime_types_seen"
    for (( i=0; i<${#unique_mime_types[*]}; i++ ))
    do
        unique_mime_type=${unique_mime_types[i]}
        # Collect the positions of every filter given for this MIME type
        indexes=()
        for (( j=0; j<${#mime_types[*]}; j++ ))
        do
            [[ ${mime_types[j]} = "$unique_mime_type" ]] && indexes+=( "$j" )
        done
        # Choose one of them at random; there is always at least one
        j=${indexes[RANDOM % ${#indexes[*]}]}
        filter_option_argument[i]="$unique_mime_type:${commands[j]}"
        [[ ${filter_option_argument[i]} =~ unoconv_wrapper\.sh ]] && using_unoconv_wrapper=$true
    done

}  # end of function randomise_filter_choice

#--------------------------
# Name: usage
# Purpose: prints usage message
# Arguments:
#   $1: optional.  When it is "verbose" the option descriptions are printed
#       after the synopsis; otherwise a hint to use -h is printed
# Output: everything is written to stderr
#--------------------------
function usage {

    local detail=${1:-}

    # Synopsis, always shown
    printf '%s\n' "usage: ${0##*/} -c cfg_dir [-h] -l log_file -t tmp_dir" >&2

    if [[ $detail = 'verbose' ]]; then
        # One argument per output line; the final empty argument produces
        # the trailing blank line
        printf '%s\n' \
            '  where:' \
            '    -c names the directory containing the configuration files' \
            '    -h prints this help and exits' \
            '    -l names the log file.  Use /dev/tty to get logging on screen' \
            '    -t names the directory for temporary files' \
            '' >&2
        return
    fi

    printf '%s\n' '(use -h for help)' >&2

}  # end of function usage

#--------------------------
# Name: main
# Purpose: where it all happens
# Runs the three top-level steps shown in the function call tree, in order
#--------------------------
# "${@:-}" expands to the script's arguments or, when there are none, to a
# single empty string
initialise "${@:-}"
# Run omindex and analyse its log
index
# Final logging and exit; 0 is the requested return code, which finalise
# raises to 1 if a warning or error flag is set
finalise 0

