# Copyright (C) 2007 Marco Costantini
# based on ibs_it.rb by Claudio Belotti
#
# Alexandria is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Alexandria is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with Alexandria; see the file COPYING.  If not,
# write to the Free Software Foundation, Inc., 51 Franklin Street,
# Fifth Floor, Boston, MA 02110-1301 USA.

# http://en.wikipedia.org/wiki/WorldCat
# See http://www.oclc.org/worldcat/policies/terms/

require 'cgi'
require 'fileutils'
require 'net/http'
require 'open-uri'

module Alexandria
  class BookProviders
    class WorldcatProvider < GenericProvider
      # NOTE(review): presumably supplies the `log` helper used throughout this
      # class -- confirm against the Logging module.
      include Logging
      # Root of the WorldCat site; search, item and cover URLs are built from it.
      BASE_URI = "http://worldcat.org"
      # Directory holding downloaded cover images ("<isbn>.tmp" files), which
      # clean_cache removes.
      CACHE_DIR = File.join(Alexandria::Library::DIR, '.worldcat_cache')
      # Value sent as the HTTP "Referer" header when downloading a cover image.
      REFERER = BASE_URI
      # Registers the provider under the name "Worldcat", makes sure the cover
      # cache directory exists and schedules clean_cache to run when the
      # process exits.
      def initialize
        super("Worldcat", "Worldcat")
        # File.exists? is deprecated and was removed in Ruby 3.2; File.exist?
        # is the supported spelling on every Ruby version.
        FileUtils.mkdir_p(CACHE_DIR) unless File.exist?(CACHE_DIR)
        # no preferences for the moment
        at_exit { clean_cache }
      end

      # Queries WorldCat and returns the matching books.
      #
      # criterion:: the text to search for (an ISBN, a title, author names or
      #             keywords, depending on +type+)
      # type::      one of SEARCH_BY_ISBN, SEARCH_BY_TITLE, SEARCH_BY_AUTHORS
      #             or SEARCH_BY_KEYWORD
      #
      # For an ISBN search the return value is the array built by to_book
      # ([book] or [book, cover_file]). For every other search type it is an
      # array holding one such to_book array per hit on the results page.
      #
      # Raises InvalidSearchTypeError for an unknown +type+ and NoResultsError
      # when a non-ISBN search fails for any reason. Errors raised by to_book
      # during an ISBN search are logged and re-raised unchanged.
      def search(criterion, type)
        path = case type
               when SEARCH_BY_ISBN    then "isbn/"
               when SEARCH_BY_TITLE   then "search?q=ti%3A"
               when SEARCH_BY_AUTHORS then "search?q=au%3A"
               when SEARCH_BY_KEYWORD then "search?q="
               else raise InvalidSearchTypeError
               end

        req = "#{BASE_URI}/#{path}#{CGI.escape(criterion)}"
        p req if $DEBUG
        data = transport.get(URI.parse(req))

        if type == SEARCH_BY_ISBN
          isbn = Library.canonicalise_isbn(criterion)
          begin
            book_rslt = to_book(data, isbn)
          rescue StandardError => ex
            log.info { "WorldCat: failed to build a book from the item page: #{ex}" }
            raise
          end

          begin
            book = book_rslt[0]
            if book.isbn.nil?
              log.info { "Re-setting isbn on WorldCat book" }
              ## this often happens because the html for a single book
              ## lists multiple ISBNs, so we add the one we searched by here.
              ## A new book is created so that the broken saved_ident value
              ## doesn't persist.
              book_rslt[0] = Book.new(book.title, book.authors, isbn,
                                      book.publisher, book.publishing_year,
                                      book.edition)
            end
          rescue StandardError => ex
            # Re-setting the ISBN is best effort: keep the parsed result
            # rather than silently returning nil to the caller.
            log.error { "WorldCat: could not re-set isbn: #{ex}" }
          end
          book_rslt
        else
          begin
            results = []
            # each_book_page yields the regexp captures of one hit; the first
            # capture is the OCLC number of the item page.
            each_book_page(data) do |code, _title|
              results << to_book(transport.get(URI.parse("#{BASE_URI}/oclc/#{code}")))
            end
            results
          rescue StandardError => ex
            log.debug { "WorldCat: search failed: #{ex}" }
            raise NoResultsError
          end
        end
      end

      # Returns the address of the WorldCat page for +book+
      # (BASE_URI + "/isbn/" + the book's ISBN), or nil when the book has no
      # ISBN to build the address from.
      def url(book)
        isbn = book.isbn
        return nil if isbn.nil?
        "#{BASE_URI}/isbn/#{isbn}"
      end

      #######
      private
      #######

      # Parses the HTML of a single WorldCat item page into a Book.
      #
      # data::       HTML source of the item page
      # given_isbn:: ISBN used when none can be extracted from the page
      #              (may be nil)
      #
      # Returns [book], or [book, cover_file] when a non-empty cover image
      # could be downloaded into CACHE_DIR.
      #
      # Raises NoResultsError when the page is WorldCat's "not found" page and
      # RuntimeError when no item title can be located.
      def to_book(data, given_isbn=nil)
        raise NoResultsError if data =~ /<br><p>The page you tried was not found\./

        md = /<h1 class="item-title"> ?(<div class=vernacular lang="[^"]+">)?([^<]+)/.match(data)
        raise "WorldCat: no item title found in page" unless md
        # The page text is HTML-escaped, not URL-encoded, so entities are
        # decoded with unescapeHTML (CGI.unescape would turn "+" into " ").
        title = CGI.unescapeHTML(md[2].strip)

        authors = data.scan(/title="Search for more by this author">([^<]+)/).map { |match|
          CGI.unescapeHTML(match[0].strip)
        }

        # FIXME: The provider returns the first ISBN matched by this regexp.
        # When searching by ISBN, it should instead return the ISBN searched
        # Example: http://worldcat.org/isbn/9780805335576
        md = /<strong>ISBN: <\/strong>\w+\W+(\d+)\D/.match(data)
        isbn = md ? md[1] : given_isbn

        # The provider returns
        # City : Publisher[ ; City2 : Publisher2], *year? [&copy;year]
        # currently the match is not good in case of City2 : Publisher2 and in case of &copy;year

        # FIXME: if the field 'Publisher' contains "| Other Editions ..." (as for 9788441000469), then this regexp doesn't match;
        # if not (as for 9785941454136), it is OK.
        publisher = nil
        publish_year = nil
        if md = /<td class="label">Publisher:<\/td><td>[^:<]+ : ([^<]+), [^,<]*(\d\d\d\d).?<\/td>/.match(data)
          publisher = CGI.unescapeHTML(md[1].strip)
          publish_year = md[2].to_i
          publish_year = nil if publish_year == 0
        end

        edition = nil ## urr... too hard to try just now (CathalMagus)

        book = Book.new(title, authors, isbn, publisher, publish_year, edition)

        # The cover is cached under the ISBN, so without one there is no file
        # name to store it under and the download is skipped.
        if isbn && (md = /<div id="div-cover"><img src="([^"]+)/.match(data))
          log.debug { "got image: #{md[1]}" }
          medium_cover = fetch_worldcat_cover(md[1].strip, isbn)
          return [ book, medium_cover ] if medium_cover
        end

        [ book ]
      end

      # Downloads the cover image found at BASE_URI + +src+ into
      # CACHE_DIR/<isbn>.tmp.
      #
      # Returns the path of the cached file, or nil when the download failed
      # (the error is logged) or produced an empty file (which is removed).
      def fetch_worldcat_cover(src, isbn)
        cover_file = File.join(CACHE_DIR, "#{isbn}.tmp")
        # OpenURI::OpenRead#read works on every Ruby version, unlike
        # Kernel#open on a URL string (removed in Ruby 3.0).
        image = URI.parse(BASE_URI + src).read("Referer" => REFERER)
        # Binary mode: image bytes must not go through newline translation.
        File.open(cover_file, "wb") { |file| file.write(image) }

        if File.size(cover_file) > 0
          puts cover_file + " has non-0 size" if $DEBUG
          return cover_file
        end

        puts cover_file + " has 0 size, removing ..." if $DEBUG
        File.delete(cover_file)
        nil
      rescue StandardError => ex
        log.error { "Couldn't download image from WorldCat: #{ex}" }
        nil
      end

      # Yields, for every book link on a search results page, the array of
      # regexp captures for that link; its only element is the OCLC number.
      #
      # data:: HTML source of the results page
      #
      # Raises RuntimeError when the page contains no book link. String#scan
      # returns its receiver when given a block, so the matches are collected
      # first; testing the block form's result would only detect an empty page.
      def each_book_page(data)
        matches = data.scan(/<div class="name"><a href="\/oclc\/(\d+)&/)
        raise "WorldCat: no book links found in results page" if matches.empty?
        matches.each { |captures| yield captures }
        nil
      end

      # Deletes every "*.tmp" file in CACHE_DIR (the covers cached by to_book).
      # Run from the at_exit hook registered in initialize, so failures are
      # logged rather than raised. The files are addressed by full path, so the
      # process working directory is left alone.
      def clean_cache
        Dir.glob(File.join(CACHE_DIR, "*.tmp")) do |file|
          puts "removing " + file if $DEBUG
          File.delete(file)
        end
      rescue StandardError => ex
        log.error { "Error cleaning WorldCat cache: #{ex}" }
      end
    end
  end
end
