######################################################################
##  This module is copyright (c) 2001-2006 Bruce Ravel
##  <bravel@anl.gov>
##  http://cars9.uchicago.edu/~ravel/
##
## -------------------------------------------------------------------
##     All rights reserved. This program is free software; you can
##     redistribute it and/or modify it under the same terms as Perl
##     itself.
##
##     This program is distributed in the hope that it will be useful,
##     but WITHOUT ANY WARRANTY; without even the implied warranty of
##     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##     Artistic License for more details.
## -------------------------------------------------------------------
######################################################################
## Code:

package Chemistry::Formula;

require Exporter;

@ISA       = qw(Exporter);
@EXPORT    = qw();
@EXPORT_OK = qw(parse_formula formula_data);

use strict;
use File::Spec;
use Ifeffit::FindFile;
use vars qw(@ISA $cvs_info $VERSION $version $date $install_dir);
$cvs_info = '$Id: Formula.pm,v 1.1 2001/03/17 01:35:44 bruce Exp bruce $ ';
$VERSION  = "0.04";
$version  = (split(' ', $cvs_info))[2];
$date     = (split(' ', $cvs_info))[3];
$install_dir = &identify_self;

my $is_windows = (($^O eq 'MSWin32') or ($^O eq 'cygwin'));

my $elem_match = '([BCFHIKNOPSUVWY]|A[cglrstu]|B[aeir]|C[adelorsu]|Dy|E[ru]|F[er]|G[ade]|H[efgo]|I[nr]|Kr|L[aiu]|M[gno]|N[abdeip]|Os|P[abdmortu]|R[abehnu]|S[bceimnr]|T[abcehilm]|Xe|Yb|Z[nr])';
my $num_match  = '\d+\.?\d*|\.\d+';

my $debug = 0;
my ($ok, $count);

## input: string and reference to a hash
## return: 1 if string was parsed, 0 if an error was encountered
## this throws an error after the *first* error encountered
sub parse_formula {
  my $in = $_[0];
  $count = $_[1];
  $ok    = 1;
  ## pre-process the string: (1) remove spaces and underscores -- also
  ## remove $ and curly braces in an attempt to deal with TeX, (2)
  ## translate square braces to parens, (3) remove /sub#/ in an
  ## attempt to deal with INSPEC (4) count number of open and close
  ## parens
  $in    =~ s/[ \t_\$\{\}]+//g; # (1)
  $in    =~ tr/[]/()/;		# (2)
  $in    =~ s&/sub(\d+)/&$1&g;	# (3), note that spaces have already
                                # been removed
  my @chars = split(//, $in);	  #
  my $open  = grep /\(/g, @chars; #  (4)
  my $close = grep /\)/g, @chars; #
  if ($open != $close) {
    $$count{error} =  "$open opening and $close closing parentheses.\n";
    return 0;
  } else {
    &parse_segment($in, 1); # this fills the %count hash
    ## &normalize_count;
    return $ok;
  };
};


## This works by recursion.  Pluck off the first segment of the
## string, interpret that segment, and pass the rest of the string to
## this routine for further processing.
##
## So, for Pb(TiO3)2, interpret Pb and recurse (TiO3)2.
## Then recurse TiO3 with a scaling factor of 2.
sub parse_segment {
  my ($in, $mult) = @_;
  return unless $ok;
  return if ($in =~ /^\s*$/);
  printf(":parse_segment: \"%s\" with multiplier %d\n", $in, $mult) if ($debug);
  my ($end, $scale) = (0, 1);
  $end = ($in =~ /\(/g) ? pos($in) : length($in)+1; # look for next open paren
  if ($end > 1) {
    unit(substr($in, 0, $end-1), $mult);
    --$end;
  } else {
    matchingbrace($in);
    $end = pos($in);
    if (substr($in, $end) =~ /^($num_match)/o) { # handle number outside parens
      $scale = $1;
      $end += length($1);
      pos($in) = $end;
      parse_segment(substr($in, 1, $end-2-length($1)), $mult*$scale);
    } else {
      parse_segment(substr($in, 1, $end-2), $mult*$scale);
    };
  };
  return unless $ok; # parse remaining bit after last paren
  ($end < length($in)) and parse_segment(substr($in, $end), $mult);
};


## interpret an unparenthesized segment
sub unit {
  my ($string, $multiplier) = @_;
  while ($string) {
    print ":unit: ", $string, $/ if ($debug);
    if ($string =~ /^([A-Z][a-z]?)/) {
      my $el = $1;
      unless ($el =~ /^($elem_match)$/o) {
	$$count{error}  = "\"$el\" is not a valid element symbol\n";
	print ":unit: ", $$count{error}, $/ if ($debug);
	$ok = 0;
	return;
      };
      $string = substr($string, length($el));
      if ($string =~ /^($num_match)/o) {
	$$count{$el} += $1*$multiplier;
	$string = substr($string, length($1));
      } else {
	$$count{$el} += $multiplier;
      };
    } else {
      $$count{error}  =
	"\"$string\" begins with something that is not an element symbol\n";
      $$count{error} .= "\telements must be first letter capitalized\n";
      print ":unit: ", $$count{error}, $/ if ($debug);
      $ok = 0;
      return;
    };
  };
};

## Swiped from C::Scan, found on CPAN, and written (I think) by
## Hugo van der Sanden (hv@crypt0.demon.co.uk)
sub matchingbrace {
  # pos($_[0]) is after the opening brace now
  my $n = 0;
  while ($_[0] =~ /([\{\[\(])|([\]\)\}])/g) {
    $1 ? $n++ : $n-- ;
    return 1 if $n < 0;
  }
  # pos($_[0]) is after the closing brace now
  return;				# false
}


##
## sub normalize_count {
##   my $sum = 0;
##   map { $sum += $$count{$_} } (keys %$count);
##   map { $$count{$_} /= $sum } (keys %$count);
## };


sub identify_self {
  my @caller = caller;
  use File::Basename qw(dirname);
  return dirname($caller[1]);
};


sub formula_data {
  my ($formula, $density) = @_;
  my $datafile = ($is_windows) ?
    Ifeffit::FindFile->find("other", "formula_dat") :
	File::Spec -> catfile($install_dir, "formula.dat");
  open FORMULA, $datafile or die "could not open $datafile for reading: $!\n";
  while (<FORMULA>) {
    next if (/^\s*$/);
    next if (/^\s*\#/);
    chomp;
    my @list = (split(/\|/, $_));
    foreach (0..2) {
      $list[$_] =~ s/^\s+//;
      $list[$_] =~ s/\s+$//;
    };
    $$formula{$list[0]} = $list[1];
    $$density{$list[0]} = $list[2];
  };
  close FORMULA;
};



=head1 NAME

Chemistry::Formula - enumerate elements in a chemical formula

=head1 SYNOPSIS

   use Chemistry::Formula qw(parse_formula);
   parse_formula('Pb (H (TiO3)2 )2 U [(H2O)3]2', \%count);

That is obviously not a real compound, but it demonstrates the
capabilities of the routine.  This returns

  %count = (
	    'O' => 18,
	    'H' => 14,
	    'Ti' => 4,
	    'U' => 1,
	    'Pb' => 1
	   );

=head1 DESCRIPTION

This module provides a function which parses a string containing a
chemical formula and returns the number of each element in the string.
It can handle nested parentheses and square brackets and correctly
computes stoichiometry given numbers outside the (possibly nested)
parentheses.

Only one function is exported, C<parse_formula>.  This takes a string
and a hash reference as its arguments and returns 0 or 1.

    $ok = parse_formula('PbTiO3', \%count);

If the formula was parsed without trouble, C<parse_formula> returns
1. If there was any problem, it returns 0 and $count{error} is filled
with a string describing the problem.  It throws an error afer the
B<first> error encountered without testing the rest of the string.

If the formula was parsed correctly, the %count hash contains element
symbols as its keys and the number of each element as its values.

Here is an example of a program that reads a string from the command
line and, for the formula unit described in the string, writes the
weight and absorption in barns.

    use Data::Dumper;
    use Xray::Absorption;
    Xray::Absorption -> load("elam");
    use Chemistry::Formula qw(parse_formula);

    parse_formula($ARGV[0], \%count);

    print  Data::Dumper->Dump([\%count], [qw(*count)]);
    my ($weight, $barns) = (0,0);
    foreach my $k (keys(%$count)) {
      $weight +=
	Xray::Absorption -> get_atomic_weight($k) * $count{$k};
      $barns  +=
	Xray::Absorption -> cross_section($k, 9000) * $count{$k};
    };
    printf "This weighs %.3f amu and absorbs %.3f barns at 9 keV.\n",
      $weight, $barns;

Pretty simple.

The parser is not brilliant.  Here are the ground rules:

=over 4

=item 1.

Element symbols must be first letter capitalized.

=item 2.

Whitespace is unimportant -- it will be removed from the string.  So
will dollar signs, underscores, and curly braces (in an attempt to
handle TeX).  Also a sequence like this: '/sub 3/' will be converted
to '3' (in an attempt to handle INSPEC).

=item 3.

Numbers can be integers or floating point numbers.  Things like 5,
0.5, 12.87, and .5 are all acceptible.

=item 4.

Uncapitalized symbols or unrecognized symbols will flag an error.

=item 5.

An error will be flagged if the number of open parens is different
from the number of close parens.

=item 6.

An error will be flagged if any unusual symbols are found in the
string.

=back

=head1 ACKNOWLEDGMENTS

This was written at the suggestion of Matt Newville, who tested early
versions.

The routine C<matchingbrace> was swiped from the C::Scan module, which
can be found on CPAN.  C::Scan is maintained by Hugo van der Sanden.

=head1 AUTHOR

  Bruce Ravel <ravel@phys.washington.edu>
  http://cars9.uchicago.edu/~ravel/software/


=cut


1;
