#!/usr/bin/perl -w

###############################################################################
# cvsplot: Copyright (c) 2001, 2002 David Sitsky.  All rights reserved.
#
# cvsplot is a perl script which is used to extract information from CVS and
# plots the total number of lines and number of files in a selected file set
# against time.
#
# File sets can be specified using regular expressions.
# The start and end dates may also be specified.
#
# This program is free software; you can redistribute it and modify it under
# the terms of the GPL.

use Cwd;
use Date::Manip;

# Whether debugging output is enabled (set by the -debug option).
$debug = 0;

# The start date from which to gather statistics, as returned by ParseDate.
# An empty string means no lower bound.
$start_date = "";

# The final date up to which to gather statistics, as returned by ParseDate.
# An empty string means no upper bound.
$end_date = "";

# The directory in which to gather the cvs statistics (where the cvs log
# command is run from), or the directory of the CVS repository, if the
# -rlog option is used.
$cvsdir = "";

# The module to run cvs rlog over, if the -rlog option is specified.
$rlog_module = "";

# The branch that we are collecting statistics from.  By default, the main
# branch is used.
$branch_tag = "";

# Parallel arrays describing the file patterns: $pattern_include[N] is true
# for an -include pattern and false for an -exclude pattern, and
# $pattern_regexp[N] is the corresponding regular expression.
@pattern_include = ();
@pattern_regexp = ();

# The name of the file where the number of lines statistics will be stored.
$linedata = "";

# The name of the file where the number of files statistics will be stored.
$filedata = "";

# A hash (by date) of a hash (by filename) of lines added.
%line_stats = ();

# A hash (by date) of a hash (by filename) of the status.
%state_stats = ();

# A hash (by date) of a hash (by filename) of the revision.
%revision_stats = ();

# A hash (by filename) of a hash (by version) of lines added.
%file_version_delta = ();

# A hash (by filename) of a hash (by version) of the file state.
%file_version_state = ();

# A hash (by filename) of the magic branch number.
%file_branch_number = ();

# A hash (by filename) of the number of branch revisions made.
%file_number_branch_revisions = ();

# A hash (by date) of the total number of lines.
%total_lines = ();

# A hash (by date) of the total number of files.
%total_files = ();

# Flag to indicate if gnuplot is to be used.
$use_gnuplot = 0;

# The gnuplot output filename to write the plot of the line data to.
$gnuplot_linedata = "";

# The gnuplot output filename to write the plot of the file data to.
$gnuplot_filedata = "";

# The gnuplot "set term" expression to use when generating output.
# By default, generate colour png files.
$gnuplot_setterm = "png color";

# A general gnuplot command that can be executed to change some
# aspect of the plotting command, such as the format of the x values.
$gnuplot_command = "";

# Main program: parse the options, collect the raw data from CVS, turn it
# into per-date totals, write the data files and finally (optionally) plot
# them.
process_command_line_arguments();
get_cvs_statistics();
analyse_statistics();
generate_data_files();
generate_plots();

###############################################################################
# Check whether the supplied file is to be examined or not depending on what
# the user set for the -include and -exclude options.
#
# Parameters:
#   $filename - the name of the file to test.
#
# Returns true if the file is to be included.  The patterns are tried in the
# order they were given on the command line and the first one that matches
# decides the outcome.  If no -include or -exclude options have been set by
# the user, true is returned; if patterns exist but none matches, false is
# returned.
#
sub include_file
{
    my ($filename) = @_;

    # If there are no settings, include everything.
    return 1 if @pattern_regexp == 0;

    # Walk the patterns in command-line order.  The index is lexically
    # scoped so that calling this routine never disturbs a caller's
    # (global) loop counter.
    foreach my $index (0 .. $#pattern_regexp)
    {
        if ($filename =~ /$pattern_regexp[$index]/)
        {
            # Got a match, return whether or not the file should be
            # included.
            return $pattern_include[$index];
        }
    }

    # No matches, don't include this file.
    return 0;
}

###############################################################################
# Using "cvs log" (or "cvs rlog" when -rlog was given) and a few other
# commands, gather all of the necessary statistics.
#
# The log output is parsed with a small state machine: while $search_file is
# true we are looking for the "RCS file:" line that starts the next file's
# entry; otherwise we are inside the entry of $working_file and collect its
# branch number and revisions.
#
# Fills in %file_branch_number, %file_number_branch_revisions,
# %file_version_delta, %file_version_state, %line_stats, %state_stats and
# %revision_stats.  Dies if the working directory cannot be changed or the
# cvs command cannot be started.
#
sub get_cvs_statistics
{
    my $working_file = "";
    my $relative_working_file = "";
    my $working_cvsdir = "";
    my $saved_cwd = "";
    my $revision = "";
    my $file_on_branch = 0;

    # Change to the directory nominated by $cvsdir, and save the current
    # directory, only if we aren't using the -rlog option.
    if ($rlog_module eq "")
    {
        $saved_cwd = cwd();
        chdir($cvsdir)
            || die "Couldn't change to directory \"$cvsdir\": $!";
    }
    else
    {
        # Remove the accessor part, and just get the pathname.
        if ($cvsdir =~ /([^:]+)$/)
        {
            $working_cvsdir = $1;
        }
        print "Got working_cvsdir as $working_cvsdir\n" if $debug;
    }

    # Flag to indicate what the state is when parsing the output from cvs log.
    # true indicates that the parser is waiting for the start of a cvs log
    # entry.
    my $search_file = 1;

    # Build up the command string appropriately, depending on what options
    # have been set.
    my $command =
        ($rlog_module ne "") ? "cvs -d $cvsdir rlog $rlog_module" : "cvs log";
    print "Executing \"$command\"\n" if $debug;

    my $cvslog;
    open($cvslog, "-|", $command)
        || die "Couldn't execute \"$command\": $!";
    while (<$cvslog>)
    {
        if ($search_file == 1)
        {
            # Need to locate the name of the working file
            if (/^RCS file: (.*),v$/)
            {
                $working_file = $1;
                $working_file =~ s/Attic\///g;
                $relative_working_file = "";

                # Check if this file is to be included or not.
                if (include_file($working_file))
                {
                    # Yep, search for more details on this file.
                    $search_file = 0;

                    if ($branch_tag eq "")
                    {
                        # Main branch to be investigated only.
                        $file_branch_number{$working_file} = "1";
                        $file_number_branch_revisions{$working_file} = 0;
                    }
                    print "Including file \"$working_file\"\n" if $debug;
                }
                else
                {
                    print "Excluding file \"$working_file\"\n" if $debug;
                }
            }
        }
        else
        {
            # Collect the relative part for those runs that don't use
            # -rlog.
            if (/^Working file: (.*)$/)
            {
                $relative_working_file = $1;
            }
            # If we are collecting statistics on a branch, determine the magic
            # branch number for this file.  The tag is quoted so that it is
            # always matched literally.
            elsif ( (! defined $file_branch_number{$working_file}) &&
                    (/^\s*\Q${branch_tag}\E: ([\d\.]+)\.0\.(\d+)$/) )
            {
                $file_branch_number{$working_file} = "${1}.${2}";
                $file_number_branch_revisions{$working_file} = 0;
                if ($debug)
                {
                    print "Got branch $file_branch_number{$working_file}";
                    print " for file \"$working_file\"\n";
                }
            }
            elsif (/^keyword substitution: b$/)
            {
                # This is a binary file, ignore it.  The entries are
                # deleted (rather than set to undef) so that no key for
                # this file is left behind for later passes over these
                # hashes.
                delete $file_branch_number{$working_file};
                delete $file_number_branch_revisions{$working_file};
                $search_file = 1;
                print "Excluding binary file \"$working_file\"\n" if $debug;
            }
            elsif (/^=============================================================================$/)
            {
                # End of the log entry for this file, start parsing for the
                # next file.
                $search_file = 1;
                next;
            }
            elsif (/^----------------------------$/)
            {
                # Matched the description separator.  If a branch has been
                # specified, but this file doesn't exist on it, skip this file.
                if (($branch_tag ne "") &&
                    (! defined $file_branch_number{$working_file}))
                {
                    if ($debug)
                    {
                        print "File \"$working_file\" not on branch\n";
                    }
                    $search_file = 1;
                    next;
                }

                # Read the revision line, and record the appropriate
                # information.  Stop if the log ends unexpectedly.
                $_ = <$cvslog>;
                last unless defined $_;

                if (/^revision ([\d\.]+)$/)
                {
                    # Record the revision, and whether it is part of the tag
                    # of interest.  The branch number is matched literally;
                    # its dots must not act as regexp wildcards.
                    $revision = $1;
                    my $branch_number = $file_branch_number{$working_file};
                    if ($revision =~ /^\Q$branch_number\E\.\d+$/)
                    {
                        $file_on_branch = 1;
                        $file_number_branch_revisions{$working_file}++;
                    }
                    else
                    {
                        $file_on_branch = 0;
                    }
                    if ($debug)
                    {
                        print "Got branch number: $file_branch_number{$working_file} rev $revision on branch: $file_on_branch\n";
                    }
                }
                else
                {
                    # Problem in parsing, skip it.
                    print "Couldn't parse line: $_\n";
                    $search_file = 1;
                    next;
                }

                # Read the "date" line.
                $_ = <$cvslog>;
                last unless defined $_;

                if (/^date: (\d\d\d\d\/\d\d\/\d\d \d\d:\d\d:\d\d); .* state: (.*);.*lines: \+(\d+) \-(\d+)$/)
                {
                    # Ordinary revision with a line delta.  Note that some
                    # CVS clients also report state dead in this form.
                    my $date = $1;
                    my $state = $2;
                    my $lines_added = $3;
                    my $lines_removed = $4;
                    my $number_lines = $lines_added - $lines_removed;

                    $file_version_delta{$working_file}{$revision} =
                        $number_lines;
                    $file_version_state{$working_file}{$revision} = $state;

                    if ($file_on_branch)
                    {
                        # This revision lives on the branch of interest.
                        $line_stats{$date}{$working_file} += $number_lines;
                        $state_stats{$date}{$working_file} = $state;
                        $revision_stats{$date}{$working_file} = $revision;
                    }
                }
                elsif (/^date: (\d\d\d\d\/\d\d\/\d\d \d\d:\d\d:\d\d); .* state: dead;$/)
                {
                    # File has been removed.
                    my $date = $1;

                    $file_version_delta{$working_file}{$revision} = 0;
                    $file_version_state{$working_file}{$revision} = "dead";

                    if ($file_on_branch)
                    {
                        $line_stats{$date}{$working_file} = 0;
                        $state_stats{$date}{$working_file} = "dead";
                        $revision_stats{$date}{$working_file} = $revision;
                    }
                }
                elsif (/^date: (\d\d\d\d\/\d\d\/\d\d \d\d:\d\d:\d\d); .* state: Exp;$/)
                {
                    my $date = $1;

                    # Unfortunately, cvs log doesn't indicate the number of
                    # lines an initial revision is created with, so find this
                    # out by printing that revision and counting its lines.
                    my $path = $relative_working_file;
                    if ($rlog_module ne "")
                    {
                        # Strip the repository root to get the module
                        # relative path.  The root is matched literally.
                        if ($working_file =~ /^\Q${working_cvsdir}\E\/(.*)$/)
                        {
                            $path = $1;
                        }
                        else
                        {
                            $path = $working_file;
                        }
                    }

                    # The path goes inside double quotes on a shell command
                    # line, so escape the characters that are still special
                    # there.
                    (my $quoted_path = $path) =~ s/(["\\\$`])/\\$1/g;

                    my $lccmd = "";
                    if ($rlog_module ne "")
                    {
                        $lccmd = "cvs -d $cvsdir co -r $revision -p \"$quoted_path\"";
                    }
                    else
                    {
                        $lccmd = "cvs update -r $revision -p \"$quoted_path\"";
                    }
                    print "Executing $lccmd\n" if $debug;
                    my $number_lines = `$lccmd 2>/dev/null | wc -l`;
                    $number_lines = "" unless defined $number_lines;
                    chomp $number_lines;
                    $number_lines =~ s/\s+//g;
                    # Fall back to zero if the count could not be obtained.
                    $number_lines = 0 unless $number_lines =~ /^\d+$/;
                    print "$working_file $revision = $number_lines lines\n"
                        if $debug;

                    $file_version_delta{$working_file}{$revision} =
                        $number_lines;
                    $file_version_state{$working_file}{$revision} = "Exp";

                    if ($file_on_branch)
                    {
                        $line_stats{$date}{$working_file} += $number_lines;
                        $state_stats{$date}{$working_file} = "Exp";
                        $revision_stats{$date}{$working_file} = $revision;
                    }
                }
                else
                {
                    # Nothing was recorded for this revision, so there is
                    # nothing further to report about it.
                    print "Couldn't parse: $_";
                    next;
                }
                if ($debug)
                {
                    print "File \"$working_file\" rev $revision ";
                    print "delta $file_version_delta{$working_file}{$revision} ";
                    print "state $file_version_state{$working_file}{$revision}\n";
                }
            }
        }
    }

    # Closing the pipe reaps the cvs process; report (but tolerate) a
    # failure, since whatever was parsed is still usable.
    close($cvslog)
        || warn "warning: \"$command\" did not complete successfully\n";

    # Go back to the original directory if we aren't using the -rlog option.
    if ($rlog_module eq "")
    {
        chdir($saved_cwd)
            || die "Couldn't change back to directory \"$saved_cwd\": $!";
    }
}

# Cache of the line counts computed so far: a hash (by filename) of a hash
# (by revision) of the number of lines.
%memorise_line_count = ();

###############################################################################
# Return the number of lines that constitute a particular revision of a file.
# The value is worked out by get_line_count_inner and remembered in
# %memorise_line_count so later calculations can build upon it.
#
sub get_line_count
{
    my $filename = shift;
    my $revision = shift;

    my $total = get_line_count_inner($filename, $revision);

    # Remember the answer for this exact revision.
    $memorise_line_count{$filename}{$revision} = $total;

    print "get_line_count($filename, $revision) = $total\n" if $debug;

    return $total;
}

###############################################################################
# Work out the number of lines in a revision of a file by walking back
# through its ancestors and summing the recorded line deltas.
#
# Parameters:
#   $filename - the file, as keyed in %file_version_state/%file_version_delta.
#   $revision - the revision number, e.g. "1.7" or "1.4.2.3".
#
# Returns the accumulated line count.  The walk stops as soon as it reaches
# a revision with a memorised count, revision 1.1, or a revision that was
# never seen in the log (which contributes nothing).  A revision number that
# cannot be interpreted terminates the program with a failure status.
#
sub get_line_count_inner
{
    my ($filename, $revision) = @_;
    my $count = 0;
    my $finished = 0;

    while (!$finished)
    {
        my $state = $file_version_state{$filename}{$revision};

        if (defined $memorise_line_count{$filename}{$revision})
        {
            # Already known: add it and stop.
            $count += $memorise_line_count{$filename}{$revision};
            $finished = 1;
        }
        elsif (! defined $state)
        {
            # Case where we are looking for a revision that hasn't
            # been found in the output of the CVS log command. This is
            # usually because a developer decided to start the file
            # revision at something other than 1.1.
            $memorise_line_count{$filename}{$revision} = 0;
            $finished = 1;
        }
        elsif ($revision eq "1.1")
        {
            # Base case where the revision is 1.1
            $memorise_line_count{$filename}{$revision} =
                $file_version_delta{$filename}{$revision};
            $count += $memorise_line_count{$filename}{$revision};
            $finished = 1;
        }
        elsif ($state eq "dead" && $revision =~ /^([\d\.]+)\.(\d+)$/)
        {
            # Case where file has been removed.  The file count is
            # effectively the previous version's count.
            my $previous_subrevision = $2 - 1;
            $revision = "${1}.${previous_subrevision}";
        }
        elsif ($revision =~ /^([\d\.]+)\.\d+\.1$/)
        {
            # Case where we need to descend down to the branch point and
            # find the contributions made there.
            my $branch_point_revision = $1;
            my $delta = $file_version_delta{$filename}{$revision};
            if (! defined $delta)
            {
                print "file_version_data not defined for $filename $revision\n";
                $delta = 0;
            }
            $count += $delta;
            $revision = $branch_point_revision;
        }
        elsif ($revision =~ /^([\d\.]+)\.(\d+)$/)
        {
            # Need to determine previous revision number + this revision's
            # contribution.
            my $previous_subrevision = $2 - 1;
            my $previous_revision = "${1}.${previous_subrevision}";
            my $delta = $file_version_delta{$filename}{$revision};
            if (! defined $delta)
            {
                print "[2] file_version_data not defined for $filename $revision\n";
                $delta = 0;
            }
            $count += $delta;
            $revision = $previous_revision;
        }
        else
        {
            # This is a fatal condition, so report failure to the caller
            # of the script.
            print "Unhandled case for file $filename revision $revision\n";
            exit 1;
        }
    }

    return $count;
}
	

###############################################################################
# Sum up those entries with the same date, and add up the line count.
# When a file has been removed, its contribution to the total file
# count must be removed completely.
#
# Reads %line_stats, %state_stats, %revision_stats, %file_branch_number and
# %file_version_state, and fills in %total_lines and %total_files (both keyed
# by date).  Entries outside the interval given by $start_date / $end_date
# are removed again at the end.
#
sub analyse_statistics
{
    # Keep a record of what files are present when gathering statistics.
    my %files_present = ();

    # Keep a record of the current revision a file has when gathering
    # statistics.  Their initial revisions will be the revision that
    # they branched from if we are doing statistics on a branch.
    my %file_revision = ();
    if ($branch_tag ne "")
    {
        foreach my $file ( keys %file_branch_number )
        {
            # Skip entries without a usable branch number; otherwise a
            # failed match would leave $1 holding a previous file's value.
            my $branch_number = $file_branch_number{$file};
            next unless defined $branch_number;
            next unless $branch_number =~ /^([\d\.]+)\.\d+$/;

            my $base_revision = $1;
            $file_revision{$file} = $base_revision;

            # A base revision with no recorded state counts as present.
            my $base_state = $file_version_state{$file}{$base_revision};
            if (! defined $base_state || $base_state ne "dead")
            {
                $files_present{$file} = 1;
            }
        }
    }

    # Go through the records in order of earliest to latest.  The dates
    # are "YYYY/MM/DD HH:MM:SS" strings, so a plain sort is chronological.
    foreach my $date ( sort keys %line_stats )
    {
        # Investigate what CVS operations occurred on this date.
        foreach my $file ( keys %{ $line_stats{$date} } )
        {
            # Update the current revision a file has.
            $file_revision{$file} = $revision_stats{$date}{$file};

            # Record if a file was removed or not.
            if ($state_stats{$date}{$file} eq "dead")
            {
                delete $files_present{$file};
            }
            else
            {
                $files_present{$file} = 1;
            }
        }

        # Calculate the total number of files present on this date.
        $total_files{$date} = scalar keys %files_present;

        # Count the total number of lines present for the current file set.
        my $total_line_count = 0;
        foreach my $file ( keys %file_revision )
        {
            if (defined $files_present{$file})
            {
                $total_line_count +=
                    get_line_count($file, $file_revision{$file});
            }
        }
        $total_lines{$date} = $total_line_count;
    }

    # Filter out those entries to only contain what the user specified
    # in the date interval.  Nothing needs parsing when no interval was
    # requested.
    if ($start_date ne "" || $end_date ne "")
    {
        foreach my $date ( keys %total_lines )
        {
            my $current_date = ParseDate($date);
            if (($start_date ne "" &&
                 Date_Cmp($current_date, $start_date) < 0) ||
                ($end_date ne "" &&
                 Date_Cmp($current_date, $end_date) > 0))
            {
                # This date is before the start date specified by the
                # user, or this date is after the end date specified by
                # the user.  Delete it.
                delete $total_lines{$date};
                delete $total_files{$date};
            }
        }
    }
}

###############################################################################
# Write the collected totals to the LINEDATA and FILEDATA handles, one
# "<date> <value>" line per date in chronological order, and close both
# files.  %total_lines and %total_files are expected to hold only the
# entries that are to be written.
#
# Dies if either file cannot be completely written, since buffered write
# errors (for example a full disk) only surface when the handle is closed.
#
sub generate_data_files
{
    # Write out the line and file data.
    foreach my $date ( sort keys %total_lines )
    {
        print LINEDATA "$date $total_lines{$date}\n";
        print FILEDATA "$date $total_files{$date}\n";
    }

    close(LINEDATA)
        || die "Failed to write file \"$linedata\": $!";
    close(FILEDATA)
        || die "Failed to write file \"$filedata\": $!";
}

###############################################################################
# Generate the gnuplot output files in the specified locations.  Does
# nothing unless a gnuplot option was given on the command line.
#
# The plotting script is written straight to gnuplot's standard input, so
# quotes, dollar signs and backslashes in the user's settings reach gnuplot
# unchanged.  Failures are reported as warnings only, as the data files have
# already been written by this point.
#
sub generate_plots
{
    return unless $use_gnuplot;

    # Generate a gnuplot command script to build the necessary images.
    # Each data line is "<date> <time> <value>", hence "using 1:3".
    my $script = <<EOF;

set xdata time
set timefmt '%Y/%m/%d %H:%M:%S'
set format x '%m/%y'
set xlabel 'Date'
set ylabel 'Number of lines'
set nokey
set terminal $gnuplot_setterm
set output '$gnuplot_linedata'
$gnuplot_command
plot '$linedata' using 1:3
set ylabel 'Number of files'
set output '$gnuplot_filedata'
plot '$filedata' using 1:3
EOF

    # If gnuplot cannot be started the write below would otherwise raise
    # SIGPIPE and kill this script.
    local $SIG{PIPE} = 'IGNORE';

    # Pipe this script into gnuplot, discarding anything it prints on
    # standard output.
    my $gnuplot;
    unless (open($gnuplot, "|-", "gnuplot > /dev/null"))
    {
        warn "warning: Couldn't execute gnuplot: $!\n";
        return;
    }
    print $gnuplot $script;
    close($gnuplot)
        || warn "warning: gnuplot did not complete successfully\n";
}

###############################################################################
# Process the command line arguments and perform sanity checks.
#
# Sets the global option variables from @ARGV and opens the LINEDATA and
# FILEDATA output handles.  Any problem with the arguments (unknown option,
# missing option value, unparseable date, missing mandatory option, ...) is
# reported and terminates the program with a failure status.
#
sub process_command_line_arguments
{
    # Options whose value is stored verbatim.
    my %plain_option = (
        "-branch"   => \$branch_tag,
        "-cvsdir"   => \$cvsdir,
        "-rlog"     => \$rlog_module,
        "-linedata" => \$linedata,
        "-filedata" => \$filedata,
    );

    # Options whose value is stored verbatim and which also enable gnuplot.
    my %gnuplot_option = (
        "-gnuplotlinedata" => \$gnuplot_linedata,
        "-gnuplotfiledata" => \$gnuplot_filedata,
        "-gnuplotsetterm"  => \$gnuplot_setterm,
        "-gnuplotcommand"  => \$gnuplot_command,
    );

    # Options whose value is a date.
    my %date_option = (
        "-start" => \$start_date,
        "-end"   => \$end_date,
    );

    # Pattern options, mapped to the flag stored in @pattern_include.
    my %pattern_option = (
        "-include" => 1,
        "-exclude" => 0,
    );

    my $index = 0;
    while ($index <= $#ARGV)
    {
        my $option = $ARGV[$index++];

        # The only option that takes no value.
        if ($option eq "-debug")
        {
            $debug = 1;
            next;
        }

        if (! (exists $plain_option{$option} ||
               exists $gnuplot_option{$option} ||
               exists $date_option{$option} ||
               exists $pattern_option{$option}))
        {
            print "Unrecognized option: $option\n";
            usage();
        }

        # Every remaining option requires a value.
        if ($index > $#ARGV)
        {
            print "error: Option $option requires an argument.\n\n";
            usage();
        }
        my $value = $ARGV[$index++];

        if (exists $plain_option{$option})
        {
            ${ $plain_option{$option} } = $value;
        }
        elsif (exists $gnuplot_option{$option})
        {
            $use_gnuplot = 1;
            ${ $gnuplot_option{$option} } = $value;
        }
        elsif (exists $date_option{$option})
        {
            # An empty date means "no limit" to the rest of the script, so
            # a date that cannot be parsed must be rejected here rather
            # than silently ignored.
            my $parsed = ParseDate($value);
            if (! defined $parsed || $parsed eq "")
            {
                print "error: Unable to parse date \"$value\" ";
                print "given for option $option.\n\n";
                usage();
            }
            ${ $date_option{$option} } = $parsed;
        }
        else
        {
            # -include / -exclude: keep the two parallel arrays in step.
            push @pattern_include, $pattern_option{$option};
            push @pattern_regexp, $value;
        }
    }

    # If any -include or -exclude options have been specified, check that
    # there is at least one -include option, otherwise the resulting
    # file set will be empty.
    if (@pattern_include > 0 && ! grep { $_ } @pattern_include)
    {
        print "error: empty file set specified: ";
        print "missing -include option\n";
        print " You probably want to add \"-include \'.*\'\"";
        print " to the end of your command.\n";
        exit 1;
    }

    # Check the mandatory arguments have been set.
    if ($cvsdir eq "" || $linedata eq "" || $filedata eq "")
    {
        print "error: Not all mandatory arguments specified.\n\n";
        usage();
    }

    # The line and file gnuplot options need to be set if a gnuplot option is
    # used.
    if ($use_gnuplot &&
        ($gnuplot_linedata eq "" || $gnuplot_filedata eq ""))
    {
        print "error: Both the -gnuplotlinedata and -gnuplotfiledata ";
        print "options must be specified if using gnuplot.\n\n";
        usage();
    }

    # If both the start and end dates are specified, check that the start date
    # occurs before the end date.
    if ($start_date ne "" && $end_date ne "" &&
        Date_Cmp($start_date, $end_date) >= 0)
    {
        print "error: Start date specified must occur before the end date.\n\n";
        usage();
    }

    # If the -rlog option has been specified, need to make sure that the
    # CVS version installed is >= 1.11.1, as it is not supported in earlier
    # versions.
    if ($rlog_module ne "" &&
        system("cvs rlog 2>&1 | grep deprecated > /dev/null") == 0)
    {
        print "error: -rlog option requires CVS version >= 1.11.1\n\n";
        exit 1;
    }

    # Open the specified output files.  The three-argument form of open
    # makes sure the names are never interpreted as anything but file names.
    open(LINEDATA, ">", $linedata)
        || die "Failed to create file \"$linedata\": $!";

    open(FILEDATA, ">", $filedata)
        || die "Failed to create file \"$filedata\": $!";
}
	    
###############################################################################
# Print the version banner and the command line synopsis on standard output,
# then terminate the program with exit status 1.
#
sub usage
{
    # The whole message is assembled first and written in one go.
    my $message = join("",
        "cvsplot version 1.6.1 - ",
        "Copyright David Sitsky: sits\@users.sourceforge.net\n\n",
        "cvsplot collects statistics from CVS controlled files.\n\n",
        "usage: cvsplot.pl -cvsdir <dir> [-rlog <module>]\n",
        "                  -linedata <file> -filedata <file>\n",
        "                  [-branch <branch name>] [-start <date>] [-end <date>]\n",
        "                  {-include <regexp> | -exclude <regexp>}\n",
        "                  [-gnuplotlinedata <gnuplot output file for line data>\n",
        "                   -gnuplotfiledata <gnuplot output file for file data>\n",
        "                   [-gnuplotsetterm <gnuplot set terminal expression>]\n",
        "                   [-gnuplotcommand <general gnuplot command>]]\n\n",
        "See http://cvsplot.sourceforge.net for updates.\n");

    print $message;
    exit 1;
}
