#!/usr/bin/perl -W -I /usr/lib/

#     This file is part of asd.
    
#     asd is free software; you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation; either version 2 of the License, or
#     (at your option) any later version.

#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.

#     You should have received a copy of the GNU General Public License
#     along with this program; if not, write to the Free Software
#     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

#     asd 0.2 Copyright 2004 Antonini Daniele <arpeda@gmail.com>


use strict;
use warnings;

use ASD::Function ':all';
use ASD::Indexer ':all';
use ASD::Store ':all';
use ASD::Statistics ':all';
use ASD::Uncompress ':all';
use ASD::Indexer::Parsing ':all';
use Getopt::Long;
use File::stat;
#use Devel::Size qw(size total_size);

## Directory that receives every index file written by this script.
my $path = "/var/cache/man/asd";

## Index files, keyed by the kind of data each one holds.
my %index_file = ( 'man_page_title' => "$path/man_page_title.asd",
                   'lessico'        => "$path/lessico.asd",
                   'document'       => "$path/document.asd",
                   'occurrence'     => "$path/occurrence.asd",
                   'fast'           => "$path/fast.asd" );

##
## Check path
##
## Create the index directory (and any missing parent) when it is not there.
## mkdir is started without a shell (list form of system) and judged by its
## exit status, so its own diagnostic reaches STDERR directly.  Testing -d
## rather than -e also rejects a $path that exists but is not a directory:
## mkdir -p fails on it and the script stops here.
##
unless ( -d $path ) {
    if ( system( 'mkdir', '-p', $path ) != 0 ) {
        print STDERR "Can't create directory $path\n";
        exit( 1 );
    }
}

##
## Pre elaboration for bash expansion
##
## expand_parameter comes from one of the ASD modules; it is handed @ARGV and
## a scratch array, both by reference.  Presumably it rewrites @ARGV so that
## a shell-expanded list of files survives option parsing -- TODO confirm.
##
my @expansion = ();

expand_parameter( \@ARGV, \@expansion );

##
##   Parameter
##
my $dir_get_opt        = undef;    # value of --dir
my $occurrence_get_opt = undef;    # set by --occurrence
my $help               = undef;    # set by --help
my $fast_get_opt       = undef;    # set by --fast-search

## Has to start out empty: Getopt::Long pushes every --file value onto this
## array, and the rest of the script tests the array itself for truth to know
## whether --file was given.
my @file_get_opt = ();

## GetOptions reports a bad option on STDERR and returns false; stop there
## instead of starting an indexing run from a half-understood command line.
GetOptions( 'file=s' => \@file_get_opt,
            'dir=s' => \$dir_get_opt,
            'occurrence' => \$occurrence_get_opt,
            'fast-search' => \$fast_get_opt,
            'help' => \$help )
    or exit( 1 );

#### --help
## NOTE(review): help is printed on STDOUT but the script exits with status 1;
## confirm whether callers rely on that before switching to 0.
if ( $help ) {
    print "Usage: create_asdb [option]\n";
    print "Create inverted file for asd\n\n";
    print "You can use only one between --file --dir\n";
    print "\n";
    print "\t--file path_to_file: indexing of file.\n";
    print "\t--dir path_to_dir: indexing of all man page in path_to_dir\n";
    print "\t--occurrence: store word position. This option create a BIG file. Use only for testing\n";
    print "\t--fast-search: create an index for fast search\n";
    print "\t--help: show this help\n";
    print "\n";
    print "Report bugs to <antonini.daniele\@gmail.com>\n";
    exit( 1 );
}

### Check --file --dir
## The two options are mutually exclusive.
if ( $dir_get_opt && @file_get_opt ) {
    print "You can only use --file or --dir\n";
    exit( 1 );
}

##
## Check for needed command
##
## Every helper program listed here must be reachable through PATH.  The test
## relies on the exit status of the POSIX shell builtin `command -v`, not on
## the text printed by `which`: that text differs between implementations and
## commonly goes to STDERR, which backticks never capture, so a missing
## command could go unnoticed.
##
my @needed_command = qw( manpath cat zcat bzcat );

foreach my $command ( @needed_command ) {
    # The redirections force Perl to run this through /bin/sh, where the
    # `command` builtin lives.  The names are the fixed words above, so
    # nothing untrusted is interpolated.
    system( "command -v $command >/dev/null 2>&1" ) == 0
        or die "Check that $command is in your PATH!\n";
}

##
## Get dir of man pages
##
my @manpath_dir = get_man_page_dir();

##
## Section not much important
##
## Names of the man page sections handed to remove_man_page_section before
## indexing.  'copyright' used to be misspelled here, so that section could
## never be matched.
##
my @man_page_section_to_delete = (
    'synopsis',
    'copyright',
    'see also',
    'example',
    'author',
    'reporting bug',
    'keywords',
    'warning',
);

##
## Calculate number man pages and size total
##
## Element 0 of the returned list is used as the number of man pages still to
## be processed; the whole list is passed on to print_initial_statistics.
##
my $num_man_pages = 0;

my $text;

my @statistics_man_pages_to_analize =
    stat_man_page_to_analize( \@manpath_dir, $dir_get_opt, \@file_get_opt );

$num_man_pages = $statistics_man_pages_to_analize[0];

##
## simple visualization of indexing
##
## The returned text is printed once the indexing loop has finished.
##
$text = print_initial_statistics( \@statistics_man_pages_to_analize );

## Begin to analize man pages
##
## Walks the man page directories (or the --dir / --file targets), cleans and
## indexes every page, and merges each result into %inverted_list.
##
my $duplicate = 0;          # pages whose id differed from $docID after insertion
my $empty_man_pages = 0;    # pages for which get_title returned nothing
my $true_man_pages = 0;     # pages for which get_title returned a title

my %inverted_list = ();
my %man_page_hash_title = ();

my $docID = 0;
my $ref_man_page_indexed;

## NOTE(review): the occurrence file is created in the current working
## directory, not at $index_file{'occurrence'} -- confirm which is intended.
if ( $occurrence_get_opt ) {
    open OCCURRENCE, ">", "occurrence.asd" or die "Can't open file: ".$!;
}

## Returns the sorted, non-hidden entries of a directory, i.e. what a plain
## `ls` lists, without handing the (possibly user-supplied) path to a shell.
## An unreadable or missing directory yields an empty list.
my $list_dir = sub {
    my ( $dir ) = @_;

    opendir( my $dh, $dir ) or return;
    my @entries = sort grep { !/^\./ } readdir $dh;
    closedir $dh;

    return @entries;
};

foreach my $man_page_dir ( @manpath_dir ) {

    next if ( -l $man_page_dir );

    # Only the entries whose name contains "man" are treated as sections.
    my @man_page_section = grep { /man/ } $list_dir->( $man_page_dir );

    foreach my $section ( @man_page_section ) {

        my $current_dir = $dir_get_opt ? $dir_get_opt : $man_page_dir."/".$section;
        next if ( -l $current_dir );

        # Work list of [ directory, file name ] pairs.  Every --file argument
        # keeps its own directory, so files from different directories can be
        # mixed, and a bare file name is looked up in the current directory.
        my @man_page_to_examine = ();

        if ( @file_get_opt ) {

            foreach my $file ( @file_get_opt ) {

                my @parts = split( /\//, $file );
                my $name = pop @parts;
                next unless ( defined $name && length $name );

                my $dir = @parts ? join( '/', @parts ) : '.';
                push @man_page_to_examine, [ $dir, $name ];
            }
        }
        else {
            @man_page_to_examine = map { [ $current_dir, $_ ] } $list_dir->( $current_dir );
        }

        foreach my $page ( @man_page_to_examine ) {

            my ( $page_dir, $current_man_page ) = @$page;

            #now can elaborate the man page
            my $man_page_path = $page_dir."/".$current_man_page;

            next if ( -l $man_page_path );

            $num_man_pages --;
            print_updated_statistics( \$num_man_pages );

            #FIXME: check the date of the dictionary!!!

            my $ref_man_page = unzip_man_page( $man_page_path );

            next unless ( $ref_man_page );

            #now can analyze man page:

            #1) get section and basename of man page
            my $man_page_section = get_section( $current_man_page );
            my $man_page_basename = get_basename( $current_man_page );

            #1.1) clean man page; get_clean_function returns a code reference
            my $clean_function = get_clean_function( $current_man_page );
            $clean_function->( $ref_man_page );

            #2) get title of man page and ....
            my $man_page_title = get_title( $ref_man_page, $man_page_basename, $man_page_section );

            if ( $man_page_title ) {
                $true_man_pages ++;

                #3) ...  insert it into hash title
                my $man_page_id = insert_into_hash_title( \%man_page_hash_title, $man_page_title, \$docID );

                # man page can be already analized: it is indexed only when
                # the returned id equals the current $docID
                if ( $man_page_id == $docID ) {

                    #4) remove section not much important
                    remove_man_page_section( $ref_man_page, \@man_page_section_to_delete );

                    if ( $occurrence_get_opt ) {

                        #5) indexing of man page
                        $ref_man_page_indexed = indexing_of_man_page_with_occurrence( $ref_man_page, $man_page_title );

                        #6) to limit memory usage write list occurrence on file
                        write_occurrence_list_and_set_pointer( $ref_man_page_indexed );
                    }
                    else {

                        #5) indexing of man page
                        $ref_man_page_indexed = indexing_of_man_page_without_occurrence( $ref_man_page, $man_page_title );
                    }

                    #7) merge
                    merge( \%inverted_list, $ref_man_page_indexed, $man_page_section, $man_page_id );
                }
                else {
                    $duplicate ++;
                }
            }
            else {
                $empty_man_pages ++;
            }

            # empty the page text before moving on to the next page
            @$ref_man_page = ();
        }

        # --file and --dir are handled in a single pass
        last if ( @file_get_opt || $dir_get_opt );
    }
    last if ( @file_get_opt || $dir_get_opt );
}
print "$text\n";

## Buffered write errors only show up when the handle is closed.
if ( $occurrence_get_opt ) {
    close OCCURRENCE or die "Can't close file: ".$!;
}

##
## Store man_page_title, lessico and document
##
## Nothing is stored when --occurrence is in effect; otherwise --fast-search
## decides which of the two store routines writes the index files.
##
if ( !$occurrence_get_opt ) {
    if ( $fast_get_opt ) {
        store_without_occurrence_fast_search( \%man_page_hash_title, \%inverted_list, \%index_file );
    }
    else {
        store_without_occurrence( \%man_page_hash_title, \%inverted_list, \%index_file );
    }
}

##
## Some statistics
##
print_end_statistics(
    $statistics_man_pages_to_analize[0],
    \%index_file,
    $duplicate,
    $empty_man_pages,
    $true_man_pages,
    $occurrence_get_opt,
);

#print "Debug information iL: " . total_size( \%inverted_list )/1024 . "\n";
#print "Debug information mP: " . total_size( \%man_page_hash_title )/1024 . "\n";

#####################
#### End Program ####
#####################

## Writes every occurrence list of an indexed man page to the OCCURRENCE
## filehandle and replaces the list, in place, with the file offset at which
## it was written.
##
## Parameter: reference to a hash whose values are array references.
## For each key one line is written: every array element followed by a single
## space, then a newline.  The value is then overwritten with the position
## that `tell` reported just before that line was written.
##
## The OCCURRENCE handle must already be open for writing.
sub write_occurrence_list_and_set_pointer {

    my ( $occurrence_of ) = @_;

    for my $word ( keys %{$occurrence_of} ) {

        # remember where this list starts inside the file
        my $offset = tell OCCURRENCE;

        print OCCURRENCE join( '', map { "$_ " } @{ $occurrence_of->{$word} } ), "\n";

        # the array is no longer needed in memory: keep only its file offset
        $occurrence_of->{$word} = $offset;
    }

    return;
}
