# -*- coding: ascii -*-

###########################################################################
# clive, video extraction utility
# Copyright (C) 2007-2008 Toni Gundogdu
#
# clive is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
###########################################################################

## Classes for parsing video page HTML.

import os
import sys
import urlparse
import urllib
import formatter
import time
import md5
import re
import string

from htmllib import HTMLParser

from clive.error import CliveError

__all__ = ['PageParser']


## The class for host specific video page HTML parsing.
class PageParser:

	## Constructor
	def __init__(self):
		self._supported_hosts = [
			('youtube.com', 'ytube', self._parse_ytube),
			('video.google.', 'vgoogle', self._parse_vgoogle),
			('dailymotion.', 'dmotion', self._parse_dmotion),
			('guba.com', 'guba', self._parse_guba),
			('stage6.', 'stage6', self._parse_stage6),
			('metacafe.', 'metac', self._parse_metacafe),
		]

	## Parses a video page data (HTML).
	#
	# \param self The object pointer
	# \param data The object containing the video page data (HTML)
	# \param url The visited video page URL
	# \param opts The current program options
	# \param callb_query_video_length A callback function for querying file length
	# \param say A callback function for printing out stdout messages
	def parse(self, data, url, opts, callb_query_video_length, say):
		self._say = say

		fmt = formatter.AbstractFormatter(formatter.NullWriter())
		p = HTMLParser(fmt)
		p.feed(data)
		p.close()

		video_url = ''

		for (site, video_host, func) in self._supported_hosts:
			if url.lower().find(site) != -1:
				(video_url, video_id) = func(url, data)
				break

		if len(video_url) == 0:
			raise CliveError('error: extraction url not found')

		video_details = (url, video_url)
		video_details += callb_query_video_length(video_url)
		video_details += self._get_video_filename(
			p.title, url, opts, video_details, video_id, video_host
		)

		# (url, video_extraction_url, human_readable_length,
		# length_bytes, filename, reget)
		return video_details

	def _parse_ytube(self, url, data):
		try:
			vid = url.split('watch?v=',1)[1].split('&',1)[0]
		except IndexError:
			vid = md5.new(str(time.time())).hexdigest()[:8]

		ldata = data.lower()

		if ldata.find('please verify you are 18') != -1:
			raise CliveError('error: age verification')
		elif ldata.find('no longer available') != -1 or \
			ldata.find('has been removed') != -1 or \
			ldata.find('this video is unavailable') != -1:
			raise CliveError('error: video is unavailable')
		elif ldata.find('url contained a malformed video id') != -1:
			raise CliveError('error: url contained a malformed video id')
	
		try:
			video_id = \
				self._parse_from_to(data, 'video_id=', '&', skip_from=1)
			video_id = video_id.replace("'", "")
			if len(video_id) == 0:
				raise CliveError()
		except:
			raise CliveError('error: extraction url (&video_id) not found')
		
		try:
			t = self._parse_from_to(data, '&t=', '&', skip_from=1)
			t = t.replace("'", "")
			if len(t) == 0:
				raise CliveError()
		except:
			raise CliveError('error: extraction url (&t) not found')

		url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (
			video_id, t
		)

		return (url, vid)

	def _parse_vgoogle(self, url, data):
		try:
			vid = url.split('docid=',1)[1].split('&',1)[0]
		except IndexError:
			vid = md5.new(str(time.time())).hexdigest()[:8]

		url = self._parse_from_to(data, 'googleplayer.swf', '"')
		try:
			url = urllib.unquote(url.split('=',1)[1])
		except IndexError:
			url = ''
		return (url, vid)
	
	def _parse_dmotion(self, url, data):
		try:
			vid = url.rsplit('/',1)[1].split('_',1)[0]
		except IndexError:
			vid = md5.new(str(time.time())).hexdigest()[:8]

		url = self._parse_from_to(data, 'url=', '"')

		if len(url) == 0:
			ldata = data.lower()
			if ldata.find('content deleted.') != -1:
				raise CliveError('error: video has been removed')
			elif ldata.find('explicit content.') != -1:
				raise CliveError('error: inappropriate content')
			else:
				raise CliveError('error: extraction url not found')

		try:
			url = urllib.unquote(url.split('url=',2)[2])
		except IndexError:
			url = ''
		return (url, vid)
	
	def _parse_guba(self, url, data):
		try:
			vid = url.split('watch/',1)[1].split('?',1)[0]
		except IndexError:
			vid = md5.new(str(time.time())).hexdigest()[:8]

		url = self._parse_from_to(data,
			'http://free.guba.com/uploaditem/', '"')

		return (url, vid)

	def _parse_stage6(self, url, data):
		try:
			vid = url.split('/video/',1)[1].split('/')[0]
		except IndexError:
			vid = md5.new(str(time.time())).hexdigest()[:8]

		url = self._parse_from_to(data,
			'http://video.stage6.com/', '&')

		return (url, vid)

	def _parse_metacafe(self, url, data):
		try:
			vid = url.split('/watch/',1)[1].split('/')[0]
		except:
			vid = md5.new(str(time.time())).hexdigest()[:8]

		url = self._parse_from_to(data, 'mediaURL=', '&', skip_from=1)

		return (url, vid)

	def _parse_from_to(self, data, _from, to, skip_from=0):
		start = data.find(_from)

		if skip_from:
			start += len(_from)

		end = data.find(to, start)

		text = ''
		if start != -1 and end != -1:
			text = data[start:end]

		return text

	def _get_video_filename(self, title, url, opts, \
			video_details, vid, host):
		reget = None
		exists = 0
		(url, vurl, hlen, length) = video_details

		ext = (['flv', 'avi'][url.find('stage6.') != -1])
		title = title.replace('YouTube -', '')
		title = title.replace('GUBA -', '')
		title = title.replace(' Video', '') # metac

		try:
			if url.lower().find('dailymotion.com') != -1:
				title = title.lstrip('Video ').split('-')[0].rstrip()

			if url.lower().find('stage6.') != -1:
				# '\xa0'=&nbsp;
				# '\xb7'=&middot;

				a = title.replace('\xa0', '').split('\xb7')
				title = a[2]

				if url.lower().find('/user/') != -1:
					title = a[1]

				title = title.rsplit('-', 1)[0]

		except IndexError:
			pass

		if opts.output_mask == '<userdef>':
			if not opts._full_rcdir in sys.path:
				sys.path.append(opts._full_rcdir)
			try:
				from userdef import userdef_output_mask
				title = userdef_output_mask(title)
			except ImportError, err:
				self._say('warn: %s in ~/.clive' % err[0])
				self._say('warn: using default --output-mask instead')
				title = re.sub('[^A-Za-z0-9]', '', title)
		else:
			if opts.output_mask != 'off':
				title = re.sub('[^%s]' % opts.output_mask, '', title)

		title = title.lstrip().rstrip()

		if len(title) == 0:
			title = self._random_string(insert_dash=0)

		if len(title) > 64:
			title = title[:64]

		filename = opts.output_fmt.replace('%','$')
		d = {'t':title,'i':vid,'h':host,'e':ext}
		filename = string.Template(filename).substitute(d)

		if opts.output_file:
			filename = opts.output_file

		if opts.prefix:
			filename = os.path.join(opts.prefix, filename)

		if os.path.exists(filename) and not opts.emit:
			if os.path.getsize(filename) < length:
				a = ['dmotion', 'guba', 'stage6', 'metac']
				if host in a:
					reget = 'simple' # Try resuming
				else:
					if not opts.overwrite: # Force --rename for all others
						self._say('warn: host does not support resuming; ' +
							'forcing `--rename`')
						filename = self._rename_file(filename)
			else:
				if not opts.overwrite:
					if opts.rename:
						filename = self._rename_file(filename)
					else:
						exists = 1
						self._say('warn: %s of same size exists already; ' \
						'will not dl video.' % filename)
				else:
					pass # Overwrite existing file

		return (filename, reget, exists)
	
	def _rename_file(self, filename):
		(root, ext) = os.path.splitext(filename)
		return root + self._random_string() + ext

	def _random_string(self, insert_dash=1):
		s = ''
		if insert_dash: s = '-'
		s += md5.new(str(time.time())).hexdigest()[:8]
		s += time.strftime('-%Y-%m-%dT%H:%M:%S').replace(':', '_')
		return s


