#! /usr/bin/ruby -wEUTF-8:UTF-8

# Copyright (C) 2012 Charles Atkinson
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA

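# Purpose: copies lines from the input file to the output file, discarding
#   any line that matches one of the LinesToDiscard regular expressions (by
#   default, messages written by omindex' default filters).  Keeps waiting for
#   more input until the line "All done now. You can flush and exit" is read,
#   then logs line counts and, optionally, how often each regex matched.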

# Object call tree
#    +
#    |
#    +-- Initialise
#    |   |
#    |   +-- InitialiseParameters
#    |   |
#    |   +-- ParseConfigFile (library method)
#    |   |
#    |   +-- ParseCommandLine
#    |   |   |
#    |   |   +-- Usage
#    |   |
#    |   +-- NormaliseParameters
#    |   |
#    |   +-- LogParameters
#    |   |
#    |   +-- CheckParameters
#    |
#    +-- Clean
#    |
#    +-- Finalise

require 'English'
require 'getoptlong'

require './Log'
require './ruby_lib'
# Undefine the library methods that this script redefines
undef CheckParameters
undef InitialiseParameters
undef LogParameters
undef NormaliseParameters


# Method definitions
# ~~~~~~~~~~~~~~~~~~
# (in alphabetical order)

# Purpose: checks the parameters in $parameters
# Global variables read: $parameters ("LogLevel", "LogToFile", "LogPath")
# Return: '' when no problem was found, otherwise one "\n  ..." entry per
#   problem (the caller prefixes the heading)
def CheckParameters( )
  error_msg = ''

  # The pattern is anchored so the whole value must be a level name; an
  # unanchored match would accept values such as "NOT_DEBUG"
  if $parameters[ "LogLevel" ] !~ /\A(?:DEBUG|INFO|WARN|ERROR|FATAL)\z/
    error_msg += "\n  Invalid LogLevel #{ $parameters[ "LogLevel" ] }"
  end

  if $parameters[ "LogToFile" ]
    # Strip the file name, leaving the directory part (with its trailing /)
    dir = $parameters[ "LogPath" ].gsub( %r|[^/]*$|, '' )
    # CheckDir is a library method; presumably it returns '' or an error
    # message entry -- confirm against ruby_lib
    error_msg += CheckDir( dir, 'w' )
  end

  return error_msg
end


# Purpose: copies lines from the input file to the output file, dropping any
#   validly encoded line that matches one of the LinesToDiscard regexps.
#   Lines that are not validly encoded are copied without being matched.
# Global variables read: $parameters ("InputFile", "OutputFile",
#   "LinesToDiscard")
# Global variables updated: $num_lines_read, $num_lines_discarded,
#   $LinesToDiscardProfile
# Return: does not return normally; the loop ends when Finalise is called on
#   reading the line "All done now. You can flush and exit"
def Clean( )

  # The block forms close both files however the loop is left, including when
  # Finalise exits the program
  File.open( $parameters[ "InputFile" ], "r" ) \
  do | input_file |
    File.open( $parameters[ "OutputFile" ], "wb" ) \
    do | output_file |
      loop \
      do
        line = input_file.gets
        if line.nil?
          # It is not possible to set a maximum wait time because omindex and the
          # filters it runs may run for an arbitrarily long time without writing
          # anything to stderr
          sleep 1
          # There is no line to write; go back and try reading again
          next
        end

        $num_lines_read += 1
        line.chomp!
        if line.valid_encoding?
          if line == "All done now. You can flush and exit"; Finalise 0 end

          # Only the first matching regexp is counted
          idx = $parameters[ "LinesToDiscard" ].index { | regexp | line =~ regexp }
          if idx
            $num_lines_discarded += 1
            $LinesToDiscardProfile[ idx ] += 1
            next
          end
        end

        output_file.print line + "\n"
      end
    end
  end
end


# Purpose: logs the optional message, the line statistics, the regex usage
#   profile and any warning and error counts, closes the log and exits.
# Arguments:
#   exitcode: the exit status to use.  When logging is initialised and any
#     warnings or errors were logged, 0 is changed to 1.
#   msg (optional): only the first value is used; it is logged at INFO level
# Global variables read: $logging_initialised, $log, $num_lines_read,
#   $num_lines_discarded, $parameters, $LinesToDiscardProfile
# Return: does not return; always calls exit
def Finalise( exitcode, *msg )
  # TODO: might be nice to have a quiet option, eg for use with command line errors

  if $logging_initialised
    # OS printf is used to get locale-specific large integer separators.
    # width is '' for no padding or a minimum field width.
    with_separators = lambda { | number, width | ` /usr/bin/printf "%'#{ width }d" #{ number } ` }

    # Log any optional message
    if msg.length > 0; $log.write( Log::INFO, msg[ 0 ] ) end

    # Log lines statistics
    if $num_lines_read > 0
      $log.write( Log::INFO, "Summary:\n" + \
        "  Lines read: #{ with_separators.call( $num_lines_read, '' ) }\n" + \
        "  Lines discarded: #{ with_separators.call( $num_lines_discarded, '' ) }\n" \
      )
    end

    # Log LinesToDiscard regex usage
    # $LinesToDiscardProfile is still nil when Initialise calls Finalise
    # before creating it (for example on a command line error), so there is
    # nothing to report in that case
    if $parameters[ "LinesToDiscardProfiling" ] && $LinesToDiscardProfile
      # Find the usage count maximum field width
      num_max = ( $LinesToDiscardProfile + [ 0 ] ).max
      fw = with_separators.call( num_max, '' ).length

      # Generate and log the message: most used first, equal counts in
      # LinesToDiscard order
      by_usage = $LinesToDiscardProfile.each_with_index.sort_by { | num, idx | [ -num, idx ] }
      usage_lines = by_usage.map \
      do | num, idx |
        count = with_separators.call( num, fw )
        # Regexp#to_s gives "(?-mix:source)"; keep what is between the first
        # colon and the final parenthesis
        regex = "#{ $parameters[ "LinesToDiscard" ][ idx ] }"
        regex = regex[ ( regex.index( ':' ) + 1 )..-2 ]
        "  #{ count }: " + regex + "\n"
      end
      $log.write( Log::INFO, "$LinesToDiscard regex usage:\n" + \
        usage_lines.join \
      )
    end

    # Log any warnings and errors
    if $log.n_warnings > 0
      if $log.n_warnings == 1
        $log.write( Log::WARN, "There was one warning" )
      else
        $log.write( Log::WARN, "There were #{ $log.n_warnings } warnings" )
      end
      if exitcode == 0; exitcode = 1 end
    end
    if $log.n_errors > 0
      if $log.n_errors == 1
        $log.write( Log::ERROR, "There was one error" )
      else
        $log.write( Log::ERROR, "There were #{ $log.n_errors } errors" )
      end
      if exitcode == 0; exitcode = 1 end
    end

    # Final log message
    $log.write( Log::INFO, "#{ File.basename( $0 ) }: exiting with exitcode #{ exitcode }" )
    $log.close
  end

  # Bye!
  exit exitcode
end


# Purpose: sets up everything the rest of the script needs: default
#   parameters, config file and command line values, logging, signal traps
#   and the counters and profile array used by Clean and Finalise.
# Global variables set: $logging_initialised, $log, $num_lines_read,
#   $num_lines_discarded, $LinesToDiscardProfile (and $parameters via the
#   methods called)
# Return: not meaningful.  Calls Finalise (which exits) on a command line
#   error.
def Initialise
  # Disable common traps until have set up Finalise( ) requirements
  trap( "INT" ) { }
  trap( "HUP" ) { }
  trap( "QUIT" ) { }
  trap( "TERM" ) { }

  # Set default parameters
  InitialiseParameters( )

  # Parse the config file
  # Must do now so any LogLevel set in the config file can be overridden by the
  # command line option
  # Both the long and the short option are looked for because
  # ParseCommandLine accepts either
  x = ARGV.index { | word | word == "--config" || word == "-c" }
  if x != nil && ARGV[ x + 1 ] != nil
    config_file_error_msg = ParseConfigFile( ARGV[ x + 1 ], $parameters.keys )
  else
    config_file_error_msg = ''
  end

  # Parse command line
  # Must do now in case "--help" or "--loglevel" is given
  # Save the options and arguments because GetoptsLong in ParseCommandLine will empty ARGV :-(
  $logging_initialised = false
  opts_and_args = ARGV.join( ' ' )
  cmd_line_error_msg = ParseCommandLine( )

  # Set up logging
  if $parameters[ "LogToFile" ]
    # TODO: pass path to $Log.new when it can accept
    # TODO: error trap the File.open (which would be better in the Log class anyway)
    log_fd = File.open( $parameters[ "LogPath" ], 'w' )
    timestamps = true
  else
    log_fd = $stdout
    timestamps = false
  end
  $log = Log.new( log_fd, $parameters[ "LogLevel" ], timestamps )
  $logging_initialised = true

  # Set up Finalise( ) requirements
  # (logging FATAL messages calls Finalise)
  $num_lines_read = 0
  $num_lines_discarded = 0

  # Report any command line or config errors
  if cmd_line_error_msg != ''
    $log.write( Log::ERROR, cmd_line_error_msg )
    Usage( "not verbose" )
    Finalise( 1 )
  end
  if config_file_error_msg != ''
    $log.write( Log::FATAL, config_file_error_msg )
  end

  # Set common traps
  trap( "HUP" ) { Finalise( 129, "Received signal HUP" ) }
  trap( "INT" ) { Finalise( 130, "Received signal INT" ) }
  trap( "QUIT" ) { Finalise( 131, "Received signal QUIT" ) }
  trap( "TERM" ) { Finalise( 143, "Received signal TERM" ) }

  # Log startup message
  now = "#{ Time.now.strftime( '%y-%m-%d@%H:%M:%S' ) }"
  $log.write( Log::INFO, \
    "#{ File.basename( $0 ) } started at #{ now } by " + \
    "#{ $0 } #{ opts_and_args }" \
  )

  # Normalise, log and check the parameters
  error_msg = NormaliseParameters( )
  LogParameters( )
  error_msg << CheckParameters( )
  if error_msg != ''
    $log.write( Log::FATAL, "Parameter error(s):" + error_msg )
  end

  # Initialise array for counting LinesToDiscard matches: one zero per regexp
  $LinesToDiscardProfile = Array.new( $parameters[ "LinesToDiscard" ].length, 0 )

end


# Purpose: (re)creates $parameters holding the default value of every
#   parameter.  Logging goes to a file by default only when stdout is not a
#   terminal.
# Global variables set: $parameters
def InitialiseParameters
  # These are for omindex' default filters.
  # The ones matched > 50 times when indexing a 49,000+ document collation
  # are in descending usage order.
  default_lines_to_discard = [
    %r|^Error \([[:digit:]]*\): Illegal character <[^>]*> in hex string$|,
    %r|^Format a4 is redefined$|,
    %r|^Error \([[:digit:]]*\): Missing 'endstream'$|,
    %r|^Error \([[:digit:]]*\): Command token too long$|,
    %r|^Error \([[:digit:]]*\): Illegal character '[^']*'$|,
    %r|^Skipping an unmatched table row$|,
    %r|^Error: Bad annotation destination$|,
    %r|^Error \([[:digit:]]*\): Inline image dictionary key must be a name object$|,
    %r|^BOF when current sheet is not flushed$|,
    %r|^no text extracted from document body, but indexing metadata anyway$|,
    %r|^Error: PDF file is damaged - attempting to reconstruct xref table\.\.\.$|,
    %r|^Error: Couldn't find trailer dictionary$|,
    %r|^Error: Couldn't read xref table$|,
    %r|^Error: Invalid Font Weight$|,
    %r|^Error \([[:digit:]]*\): Illegal digit in hex char in name$|,
    %r|^Error \([[:digit:]]*\): Internal: got 'EI' operator$|,
    %r|^Error \([[:digit:]]*\): No font in show$|,
    %r|^Error \([[:digit:]]*\): Too few \([[:digit:]]*\) args to '[^']*' operator$|,
    %r|^Error \([[:digit:]]*\): Unknown operator '[^']*'$|,
    %r|^Error read MSAT!$|,
    %r|^Error: Arg #[[:digit:]]* to '[^']*' operator is wrong type \([^)]*\)$|,
    %r|^Error: Corrupted memory profile$|,
    %r|^Error: Couldn't read page catalog$|,
    %r|^Error: End of file inside array$|,
    %r|^Error: End of file inside dictionary$|,
    %r|^Error: Expected the default config, but wasn't able to find it, or it isn't a Dictionary$|,
    %r|^Error: Expected the optional content group list, but wasn't able to find it, or it isn't an Array$|,
    %r|^Error: Kids? object \(page [[:digit:]]*\) is wrong type \([[:alpha:]]*\)$|,
    %r|^Error: Leftover args in content stream$|,
    %r|^Error: Page count in top-level pages object is incorrect$|,
    %r|^Error: Top-level pages object is wrong type \([^)]*\)$|,
    %r|^Error: Unknown character collection '[^']*'$|,
    %r|^Error: Unknown operator '[^']*'$|,
    %r|^Error: Unterminated string$|,
    %r|^Error: XObject '[^']*' is unknown$|,
    %r|^Error: read ICCBased color space profile error$|,
    %r|^caution: filename not matched: |
  ]

  # In alphabetical order ...
  $parameters = {
    "ConfigFile" => "",
    "LinesToDiscard" => default_lines_to_discard,
    "LinesToDiscardProfiling" => true,
    "LogLevel" => "INFO",
    "LogPath" => ""
  }

  # Log to file unless stdout is a terminal
  $parameters[ "LogToFile" ] = ! $stdout.tty?
end


# Purpose: writes a single INFO log message listing every parameter name and
#   its current value, one per line
# Global variables read: $parameters, $log
def LogParameters( )
  report = $parameters.inject( "Parameters:" ) \
  do | text, ( name, value ) |
    text + "\n  #{ name }: #{ value }"
  end
  $log.write( Log::INFO, report )
end


# Purpose: converts parameter values that came from the config file as
#   strings into the forms the rest of the script uses
#   * ConfigFile: passed through NormalisePath (library method) when set
#   * LinesToDiscard: each String entry is compiled to a Regexp
#   * LinesToDiscardProfiling: "true"/"false" (any case) becomes true/false
# Global variables updated: $parameters
# Return: '' if all values were valid, otherwise one "\n  ..." entry per
#   invalid value.  An invalid value is left unchanged.
def NormaliseParameters( )

  error_msg = ""
  if $parameters[ "ConfigFile" ] != ''
    $parameters[ "ConfigFile" ] = NormalisePath( $parameters[ "ConfigFile" ] )
  end

  # Entries that are already regexps (the built-in defaults) are left alone
  $parameters[ "LinesToDiscard" ].each_with_index \
  do | pattern, idx |
    next unless pattern.class == String
    begin
      # Compiling is what can fail so it must be inside the begin block
      $parameters[ "LinesToDiscard" ][ idx ] = Regexp.new( pattern )
    rescue RegexpError => e
      error_msg += "\n  Invalid regular expression: " + pattern + " (" + e.message + ")"
    end
  end

  if $parameters[ "LinesToDiscardProfiling" ].class == String
    # The value has come from the config file; convert to a logical value
    case $parameters[ "LinesToDiscardProfiling" ].downcase
      when 'true'
        $parameters[ "LinesToDiscardProfiling" ] = true
      when 'false'
        $parameters[ "LinesToDiscardProfiling" ] = false
      else
        error_msg += "\n  Invalid LinesToDiscardProfiling value: " \
          + $parameters[ "LinesToDiscardProfiling" ]
    end
  end

  return error_msg
end


# Purpose: parses the command line options in ARGV into $parameters
# Global variables updated: $parameters ("ConfigFile", "InputFile",
#   "LogToFile", "LogPath", "LogLevel", "OutputFile"); ARGV is consumed
# Return: '' if the command line was valid, otherwise
#   "Command line error(s):" followed by one "\n  ..." entry per problem
# Note: "--help" displays the usage and exits with status 0
def ParseCommandLine( )
  # Options that require an argument are marked OPTIONAL_ARGUMENT so this
  # script can handle missing arguments itself
  opts = GetoptLong.new(
    [ '--config', '-c', GetoptLong::OPTIONAL_ARGUMENT ],
    [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
    [ '--input', '-i', GetoptLong::OPTIONAL_ARGUMENT ],
    [ '--log', '-l', GetoptLong::OPTIONAL_ARGUMENT ],
    [ '--loglevel', '-k', GetoptLong::OPTIONAL_ARGUMENT ],
    [ '--output', '-o', GetoptLong::OPTIONAL_ARGUMENT ]
  )
  # GetoptLong's own messages are collected below rather than printed by it
  opts.quiet = true

  # Option arguments that may later be changed are duplicated
  # (strings in ARGV[ ] are frozen)
  error_msg = ''
  getopt_failed = false
  begin
    opts.each \
    do |opt, arg|
      case opt
      when "--config"
        if arg != ''
          $parameters[ "ConfigFile" ] = arg.dup
        else
          error_msg += "\n  '--config' argument missing"
        end
      when "--help"
        Usage( "verbose" )
        exit( 0 )
      when "--input"
        if arg != ''
          $parameters[ "InputFile" ] = arg.dup
        else
          error_msg += "\n  '--input' argument missing"
        end
      when "--log"
        $parameters[ "LogToFile" ] = true
        if arg != ''
          $parameters[ "LogPath" ] = arg.dup
        end
      when "--loglevel"
        case arg
          when 'D'
            $parameters[ "LogLevel" ] = "DEBUG"
          when 'I'
            $parameters[ "LogLevel" ] = "INFO"
          when 'W'
            $parameters[ "LogLevel" ] = "WARN"
          when 'E'
            $parameters[ "LogLevel" ] = "ERROR"
          when 'F'
            $parameters[ "LogLevel" ] = "FATAL"
          when ''
            error_msg += "\n  '--loglevel' argument missing"
          else
            error_msg += "\n  Invalid '--loglevel' argument: '#{ arg }'"
        end
      when "--output"
        if arg != ''
          $parameters[ "OutputFile" ] = arg.dup
        else
          error_msg += "\n  '--output' argument missing"
        end
      else
        error_msg += "\n  Invalid option, '#{ opt }'"
      end
    end
  rescue GetoptLong::Error => e
    # GetoptLong raises for unrecognised and ambiguous options; report them
    # like any other command line error instead of ending with a backtrace
    error_msg += "\n  #{ e.message }"
    getopt_failed = true
  end

  # When parsing stopped early ARGV still holds unparsed options, which are
  # not stray arguments
  if ! getopt_failed && ARGV.length != 0
    error_msg += "\n  Invalid argument(s) after options and their arguments: '#{ ARGV.join( ' ' ) }'"
  end

  # to_s makes the checks work whether the parameter has never been set (nil)
  # or is set to the empty string
  if $parameters[ "ConfigFile" ].to_s == ""
    error_msg += "\n  option --config (or -c) is required"
  end
  if $parameters[ "InputFile" ].to_s == ""
    error_msg += "\n  option --input (or -i) is required"
  end
  if $parameters[ "OutputFile" ].to_s == ""
    error_msg += "\n  option --output (or -o) is required"
  end
  if error_msg != ''
    error_msg = "Command line error(s):" + error_msg
  end
  return error_msg
end
  

# Purpose: logs the usage message at INFO level
# Argument:
#   verbosity: "verbose" adds a description of each option after the one
#     line synopsis; any other value logs the synopsis only
# Global variables read: $logging_initialised, $parameters ("LogLevel")
# Global variables set: $log, when logging has not been initialised
def Usage( verbosity )
  # If logging not set up, set up default logging
  # This is required when "--help" is given on the command line
  if ! $logging_initialised
    log_fd = $stdout
    timestamps = false
    $log = Log.new( log_fd, $parameters[ "LogLevel" ], timestamps )
  end

  # Display usage
  $log.write( Log::INFO, "Usage: #{File.basename( $0 )} " + \
    "--config config_file [--help] --input input_file [--log [log_file]] [--loglevel level] --output output_file" \
  )
  if verbosity == "verbose"
    option_help = [
      "  --config (or -c): names the configuration file",
      "  --help (or -h): print this help message and exit",
      "  --input (or -i): names the input file",
      "  --log (or -l)",
      "    log_file given: specify the log_file",
      "    log_file not given: log to the default log file",
      "  --loglevel (or -k): set lowest log level messages to log.  In order:",
      "    D for debug",
      "    I for information",
      "    W for warning",
      "    E for error",
      "    F for fatal",
      "  --output (or -o): names the output file"
    ]
    $log.write( Log::INFO, option_help.join( "\n" ) )
  end
end


# Execute
# ~~~~~~~
# Set up, then filter the input.  Clean loops until it calls Finalise itself,
# so the last call below is only a fallback.
Initialise()
Clean()
Finalise 0
