#!/usr/bin/perl -w
# Author:   $Author: merkosh $
# Revision: $Rev: 754 $
############################################################################
#    Copyright (C) 2005 by Uwe Mayer                                       #
#    merkosh@hadiko.de                                                     #
#                                                                          #
#    This program is free software; you can redistribute it and/or modify  #
#    it under the terms of the GNU General Public License as published by  #
#    the Free Software Foundation; either version 2 of the License, or     #
#    (at your option) any later version.                                   #
#                                                                          #
#    This program is distributed in the hope that it will be useful,       #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of        #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         #
#    GNU General Public License for more details.                          #
#                                                                          #
#    You should have received a copy of the GNU General Public License     #
#    along with this program; if not, write to the                         #
#    Free Software Foundation, Inc.,                                       #
#    51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA          #
############################################################################

#-----------------------------------------------------------#
# scan Amazon for a number of fields on a movie title       #
#-----------------------------------------------------------#

#-- imports --------------------------------------------------------------------
use URI::Escape;
use LWP::UserAgent;
use HTTP::Request;
use HTML::Entities;
use File::Basename;
use HTML::TreeBuilder;
use POSIX qw(ceil);

use LMCTools;


#-- display help screen --------------------------------------------------------
# Print the usage text to STDERR and stop when help is requested or when no
# argument was given at all.
#
# The option pattern is anchored on both ends so that only the exact switches
# "-h" and "--help" count.  An unanchored /--help|-h/ also fires on every
# title or URL that merely contains "-h" somewhere in the middle (for example
# "http://www.amazon.com/The-Hobbit/dp/..."), which made such arguments
# impossible to look up.
if (!@ARGV || (grep { /\A(?:--help|-h)\z/ } @ARGV)) {
  print STDERR <<HELP;
amazon-en.pl  \$Rev\$  (c)  2005-01-04  by Uwe Mayer

Search Amazon.com for picture on a movie title.

Synopsis: amazon-en.pl [-h|--help] <title>|<URL>

     -h     --help     this screen
     <title>           search for <title> on Amazon.com and return a
                       list of matches
     <URL>             get information from this URL

The URL is distinguished from the title by the prefix \'http://\'. If
your title happens to have this prefix you\'re busted. ;)

If your internet connection needs a proxy server set the environment
variable "http_proxy" to the appropriate url 
(i.e. http_proxy=http://proxy.somehost.com:8080)

HELP
  exit();

}


#-- parse arguments ------------------------------------------------------------
# Classify the first command-line argument: a value starting with the literal
# prefix 'http://' is taken as a product URL, anything else as a title to
# search for.  The variable that does not apply is set to the empty string.
my ($title, $URL) = ($ARGV[0] =~ m{\Ahttp://})
                  ? ("",       $ARGV[0])
                  : ($ARGV[0], "");


#-- scan amazon ----------------------------------------------------------------
#$baseURL = 'http://www.amazon.com';
# Base of the Amazon.com DVD title search; the URI-escaped title is appended.
my $searchURL = 'http://www.amazon.com/exec/obidos/search-handle-url/index=dvd&field-title=';


#-- title ----------------------------------------------------------------------
# Title mode: run the DVD search for $title, walk over every result page and
# collect the product links (URLs containing "/dp/") whose anchor wraps a
# <span> holding a single piece of text.  The matches are printed as
#
#   status: list
#   section: <section name>
#   title: <link text>
#   url: <product URL>
#   <empty line>
#
# and the script exits afterwards.
# NOTE(review): getPage() comes from LMCTools; it is assumed to return an
# object with a content() method (e.g. HTTP::Response) - confirm how it
# reports failed requests.
if ($title) {
  # default variables
  my $section = "Amazon.com Pictures:";
  my %tlinks  = ();                       # link text => product URL
  my $query   = $searchURL.uri_escape($title);

  # get first page of the list of titles
  my $page = getPage($query)->content();

  # Derive the number of result pages from "Showing <first> - <last> of
  # <total> Results".  The page size is <last> - <first> + 1, hence the page
  # count is ceil(<total> / page size).  Without that banner (no or only a
  # few hits) a single page is assumed.  Thousands separators are tolerated.
  my $pageCount = 1;
  if ($page =~ /Showing ([\d,]+) - ([\d,]+) of ([\d,]+) Results/) {
    my ($first, $last, $total) = ($1, $2, $3);
    tr/,//d for ($first, $last, $total);
    my $perPage = $last - $first + 1;
    $pageCount = ceil($total / $perPage) if ($perPage > 0);
  }

  # process all pages
  for my $pg (1 .. $pageCount) {
    # page 1 is already loaded, every further page is downloaded on demand
    $page = getPage($query."&pg=".$pg)->content() if ($pg > 1);

    # search for links which look like our target
    my $tree = HTML::TreeBuilder->new();
    $tree->parse($page);
    $tree->eof();                         # finish parsing before reading the tree

    # need all <a href="..."> tags; each entry is [link, element, attr, tag]
    foreach my $anchor (@{$tree->extract_links('a')}) {
      my ($href, $element) = @{$anchor};

      # the href must have the form: http://www.amazon.com/.../dp/
      next unless ($href =~ m{http://www\.amazon\.com/.+?/dp/});

      # the child of the a-tag must be a span-tag
      foreach my $child ($element->content_list()) {
        next if (ref($child) ne "HTML::Element");
        next if ($child->tag() ne "span");

        # the (only) child of the span-tag is the text description; nested
        # elements are skipped so that no stringified reference ends up as
        # a title
        my @names = $child->content_list();
        $tlinks{$names[0]} = $href if (@names == 1 && !ref($names[0]));
      }
    }

    # release the parse tree explicitly, one is built per result page
    $tree->delete();
  }


  #-- output Results (sorted by title to get a reproducible order)
  print "status: list\n";
  print "section: $section\n";

  foreach my $text (sort keys %tlinks) {
    print "title: $text\n";
    print "url: $tlinks{$text}\n";
    print "\n";
  }
  exit;
}


#-- URL ------------------------------------------------------------------------
# URL mode: fetch the product page to learn its final URL, take the product
# ID from the "/dp/<ID>" path segment, load the matching image page and print
# the source of the element with id "prodImage" as
#
#   status: details
#   picture: <image URL>
#   <empty line>
#
# If no ID can be found in the URL, "status: error" is printed and the script
# stops with a non-zero exit status instead of requesting a bogus image page.
# NOTE(review): getPage() comes from LMCTools; it is assumed to return an
# object offering base() and content() (e.g. HTTP::Response) - confirm.
if ($URL){
  #-- get main page
  # when querying for a list and receiving details the url
  # of the page has changed: update this
  # (only the final URL is needed from this request, not its content)
  my $response = getPage($URL);
  $URL = $response->base();

  # fields we are looking out for:
  my %data = ();
  # - picture
  
  # the following are not available
  # - originalTitle
  # - year
  # - director
  # - category
  # - actors
  # - rating
  # - url
  # - length
  # - country
  # - language
  # - description
  # - comments
  # - translated title
  # - producer

  # picture: the ID is the single path segment following "/dp/"; anything
  # after it (further path parts, query string, fragment) is not part of it
  my ($ID) = ($URL =~ m{/dp/([^/?\#]+)});
  if (!defined $ID) {
    print "status: error\n";
    print STDERR "url: $URL\n";
    print STDERR "Could not extract picture ID from URL\n";
    exit 1;
  }

  # double-link image
  my $URL2 = "http://www.amazon.com/gp/product/images/".$ID;
  my $page = getPage($URL2)->content();

  my $tree = HTML::TreeBuilder->new();
  $tree->parse($page);
  $tree->eof();                           # finish parsing before reading the tree

  # first element carrying id="prodImage", if any
  my ($image) = $tree->find_by_attribute('id', 'prodImage');
  if ($image && defined $image->attr('src')) {
    $data{picture} = $image->attr('src');
  }
  $tree->delete();

  
  #-- output
  print "status: details\n";

  foreach my $k (sort keys %data){
    if ($k =~ /description|actors/){
      # multi-valued fields hold an array reference: one line per value
      foreach my $value (@{$data{$k}}){
        print "$k: $value\n";
      }
    }
    else {
      print "$k: $data{$k}\n";
    }
    print "\n";
  }


}
