# -*- shell-script -*-
# gfunc83100-extract-files.sh -- Represents all the code necessary to download images from 'A-MANGA-SITE' or videos from 'watchcartoononline'
# Copyright © 2015-2016 Michael Pagan
#
# Author: Michael Pagan
# E-Mail: michael.pagan@member.fsf.org
# Jabber: pegzmasta@member.fsf.org
#
# This file is part of Genshiken.
#
# Genshiken is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Genshiken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Genshiken. If not, see http://www.gnu.org/licenses/.
#===================================================================
#-------------------------------------------------------------------
# _Extract_Files_Is_Group_Page [FILE...]
#
# Succeeds (status 0) when the '<title>' line(s) of FILE contain a
# digit-dash-digit episode range (e.g. "12-13"), unless such a range
# sits in the very first word after '<title>' -- that exception only
# exists because of an anime called "009-1".
# Fails when no FILE is given, instead of letting grep wait on stdin.
#-------------------------------------------------------------------
function _Extract_Files_Is_Group_Page
{
  local titles
  (( $# ))                         || return 1
  titles=$(grep '<title>' "$@")    || return 1

  [[ -z $(gawk '{ print $1 }' <<< "$titles" | gawk -F'>' '{ print $2 }' | grep '[0-9]-[0-9]') ]] &&
  [[ -n $(grep '[0-9]-[0-9]' <<< "$titles") ]]
}

#-------------------------------------------------------------------
# _Extract_Files_Server_Path POSITION FILTER
#
# Prints the POSITION-th (1-based) candidate video address found in
# $episode_file, or nothing when there is none.
#   FILTER = embedded : keep only lines mentioning 'embed_js'
#   FILTER = plain    : drop lines mentioning 'embed_js'
#   anything else     : no 'embed_js' filtering
# Lines mentioning 'itemprop' are always dropped.
# Reads the globals: episode_file, python_domain, selected_domain and
# BROWSER_READABLE (a command string that is eval'ed as a filter).
#-------------------------------------------------------------------
function _Extract_Files_Server_Path
{
  local position=$1 filter=$2
  local -a sed_script=(-e 's_.*src="\(.*\)"_\1_' -e 's_".*__' -e 's_>__' -e 's_<.*>__g')

  # When python_domain is set, additionally keep only what follows "file="
  [[ -n ${python_domain:-} ]] && sed_script+=(-e 's_.*file=\(.*\)_\1_')

  # $episode_file is left unquoted on purpose: the caller may have stored
  # several `ls` matches in it, and the original pipelines split it too.
  grep -E "${python_domain:-file}|${selected_domain:-}" $episode_file |
  sed "${sed_script[@]}" | eval $BROWSER_READABLE |
  case $filter in
    embedded) grep 'embed_js' | grep -v 'itemprop' ;;
    plain)    grep -E -v 'embed_js|itemprop'       ;;
    *)        grep -v 'itemprop'                   ;;
  esac |
  grep -E 'file|embed' | sed -n "$position p" | sed 's/%26videoAutoPlay=0//'
}

#-------------------------------------------------------------------
# _Extract_Files_Save_Resume_Point EPISODE_NAME SERVER_PATH
#
# Rewrites ./.last_video so that line 1 is EPISODE_NAME and line 2 is
# SERVER_PATH; this file is what allows us to continue interrupted
# downloads.  The file is removed first, as the original code did.
#-------------------------------------------------------------------
function _Extract_Files_Save_Resume_Point
{
  [[ -f .last_video ]] && rm .last_video
  printf '%s\n' "$1"  > .last_video
  printf '%s\n' "$2" >> .last_video
}

#-------------------------------------------------------------------
# Extract_Files LINK CATEGORY TITLE ARG4 ARG5 ANIME_TYPE
#
# Works out where an episode belongs on disk, downloads its web-page,
# extracts the address(es) of the video(s) that page serves and hands
# each one over to Next_Episode.
#
#   $1  "<URL>Episode_Name" (shape inferred from the sed expressions
#       below -- TODO confirm against the caller)
#   $2  category, used as a directory name
#   $3  series title, passed through $HUMAN_READABLE
#   $4  forwarded untouched to Next_Episode
#   $5  forwarded untouched to Next_Episode
#   $6  ANIME_TYPE provided on the command-line by the user
#
# Globals read   : GET_DOMAIN DOMAIN1 DOMAIN2 DIR_ANIM HUMAN_READABLE
#                  BROWSER_READABLE GNU STREAM TYPE ext upload_path
#                  and the colour variables N R G C M Y
# Globals written: tmp category title episode_link episode_file
#                  episode_name[] server_path[] episodes episode
#                  episode_number episode_title group j
#                  (selected_domain and python_domain are unset here)
# Side effects   : changes the 'pwd', creates/renames directories and
#                  writes ./.last_video
# Returns        : 1 when a directory cannot be entered; otherwise the
#                  status of the last command that was run.
#
# The MANGA-PROCEDURE here-documents are disabled code fed to ':'; they
# are kept verbatim.  Arguments handed to other project functions stay
# unquoted so that their positional parameters do not shift.
#-------------------------------------------------------------------
function Extract_Files
{
  # This variable will dictate the flow control
  local option=$(eval $GET_DOMAIN <<< $1)
  local lost_episode suffix

  # Determine the initial download path, and make it the 'pwd'
  if   [[ $option = $DOMAIN1 ]]; then :
:<<'MANGA-PROCEDURE'
----[[ -d $DIR_PICS/ ]] && cd $DIR_PICS/ || { mkdir -p $DIR_PICS/; cd $DIR_PICS/; }
MANGA-PROCEDURE

  elif [[ $option = $DOMAIN2 ]]; then
    # Parameter 6 is the ANIME_TYPE provided on the command-line by the user
    mkdir -p "$DIR_ANIM/$6"
    cd "$DIR_ANIM/$6" || return 1
  fi
  # Determine the category of this work
  if [[ $option = $DOMAIN1 ]]; then :
:<<'MANGA-PROCEDURE'
----declare -g category=$(sed -e 's_.*com/__' -e 's_/.*__' <<< $1) tmp=Gallery 1> /dev/null
----echo -e "${N}${R}[${FUNCNAME[0]}] ${G}The Category of the URL ${R}<${C}$1${R}>  \b${G}is: \"${M}$category${G}\"  \b"
MANGA-PROCEDURE

  elif [[ $option = $DOMAIN2 ]]; then
    tmp=Episode category=$2 title=$(eval $HUMAN_READABLE <<< $3 | sed 's/_English_[DS]ubbed//')
    episode_link=$(sed -e 's_<\(.*\)>.*_\1_' -e 's_<__' <<< $1)
    episode_file=$(sed -e 's_.*\..*/\(.*\)>.*_\1_' -e 's_<__' <<< $1)

    # The episode name is one of the hardest things to determine, for there may be several other episodes on the same page; hence, a global array ...
    declare -g episode_name[0]=$(tr \` \' <<< $1 | sed -e 's_.*>\(.*[A-Za-z0-9]\).*_\1_' -e 's/_English_[DS]ubbed//' | eval $HUMAN_READABLE) &> /dev/null

  fi
  # Enter a new directory, $category, as the 'pwd'
  [[ -d $category/ ]] || mkdir "$category/"
  cd "$category/" || return 1

  # Enter a new directory, $tmp, as the 'pwd' for the first download; $title directory for the rest
  if [[ $option = $DOMAIN2 ]]; then
    if   [[ -d $title/ ]]; then cd "$title/"               || return 1
    elif [[ -d $tmp/   ]]; then cd "$tmp/"                 || return 1
    else                        mkdir "$tmp/" && cd "$tmp/" || return 1
    fi
  elif
    [[ $option = $DOMAIN1 ]]; then :
:<<'MANGA-PROCEDURE'
----[[ -d $tmp/ ]] && cd $tmp/ || { mkdir $tmp/; cd $tmp/; }
MANGA-PROCEDURE

  fi
  # Determine if this episode was already downloaded/incomplete, and handle it accordingly
  if [[ $option = $DOMAIN2 ]]; then

    #  At this point: We don't know what extension the episode |
    #+ is, so let's check for several different types.         |
    #  'episode' itself is no longer read in this function; it is still assigned
    #  because it is a global that other code may read -- TODO confirm.
    episode=$(echo ${episode_name[0]}.{flv,mp4,mkv,avi,ogg,webm})

    # Every listed extension is probed (the old {1..4} loop never reached ogg/webm)
    for suffix in flv mp4 mkv avi ogg webm; do
      lost_episode=${episode_name[0]}.$suffix
      if [[ -f $lost_episode && -z ${STREAM:-} ]]; then
        Interrupted_Downloads $lost_episode $episode_file
        return
      fi
    done

    # This episode hasn't been downloaded, yet
    echo -e "${N}${R}[${FUNCNAME[0]}] ${G}The Category of the URL ${R}<${C}$episode_link${R}>  \b${G}is: \"${M}$category${G}\" ${N}  \b"
    echo -e "${N}${R}[${FUNCNAME[0]}] ${G}Captured Video Title: \"${M}$title${G}\" ${N}  \b"
  fi
  # Download the [gallery/video] cover page
  if [[ $option = $DOMAIN1 ]]; then :
:<<'MANGA-PROCEDURE'
----echo -e "${N}${R}[${FUNCNAME[0]}] ${G}The Present Working Directory is: ${C}$PWD$ ${N}"
----echo -e "${N}${R}[${FUNCNAME[0]}] ${G}Downloading ${M}$tmp${G} Cover-page: ${R}<${C}$1${R}>${Y}   \b\b"
----Download_File index.html $1 -source
MANGA-PROCEDURE

  elif [[ $option = $DOMAIN2 ]]; then
    echo -e "${N}${R}[${FUNCNAME[0]}] ${G}The Present Working Directory is: ${C}$PWD ${N}"
    echo -e "${N}${R}[${FUNCNAME[0]}] ${G}Downloading ${M}$tmp${G} Cover-page: ${R}<${C}$episode_link${R}>${Y}   \b\b"
    Download_File $episode_file $episode_link -source

    # For some reason, the name of the file may have some extra words appended to it
    [[ ! -f $episode_file ]] && episode_file=$(ls ${episode_file}* 2> /dev/null)

    # Discover whether we can parse the video links with free software; provide as much detail as possible
    unset selected_domain python_domain
    Extract_Information $episode_file
    if [[ -z ${selected_domain:-} && -z ${python_domain:-} ]]; then
      # Only episode_name[0] has been assigned so far; 'j' is not set by this call yet
      Whats_Missing $6 ${episode_name[0]}; return
    fi
  fi
  # Download the actual gallery page
  if [[ $option = $DOMAIN1 ]]; then :
:<<'MANGA-PROCEDURE'
----local gallery=$(               grep 'View Gallery' index.html | sed -e 's|<.*><a href="||' -e 's|".*||' -e 's|\(.*album\)&amp;\(.*\)|\1\&\2|')
----local gallery_index=index.html`grep 'View Gallery' index.html | sed -e 's|<.*><a href="||' -e 's|".*||' -e 's|\(.*album\)&amp;\(.*\)|\1\&\2|'
                                                                        -e 's|.*y/\(?.*\)|\1|'`
----# Downloading Gallery webpage
----echo -e "${N}${R}[${FUNCNAME[0]}] ${G}Downloading Gallery Archive: ${R}<${C}$gallery${R}>${Y}   \b\b"
----Download_File $gallery_index $gallery -source

----# Determine the name of the work
----local title=$(grep '<title>' $gallery_index | sed -e 's|<.*>\[.*\][ ]||' -e 's|\(.*:\).*|\1|' -e 's|\(.*\)[ ]:|\1|')
----echo -e "${N}${R}[${FUNCNAME[0]}] ${G}Captured Gallery Title: \"${M}$title${G}\" ${N}  \b"; title=$(sed 's| |_|g' <<< $title)
MANGA-PROCEDURE

  # Download the actual video page
  elif [[ $option = $DOMAIN2 ]]; then

    # Determine if the web-page is serving multiple episodes on the same page, and-- if so, capture all of their addresses
    if _Extract_Files_Is_Group_Page $episode_file; then

      # Check if there are official episode names (delimiter is ";" or "/") in the title strings; if so, we're dealing with DUAL episodes; if not-- MULTI
      # NOTE(review): a '<title>' line that also holds '</title>' always contains "/" -- confirm that the "/" test is intended.
      # The "Episode N and Episode M" test used to call grep without any input (it read stdin); it now reads the '<title>' line.
      if [[ -n $(grep '<title>' $episode_file | grep '[0-9]-[0-9]' | grep '[;/]') ]] ||
         [[ -n $(grep '<title>' $episode_file | grep 'Episode [0-9]* and Episode [0-9]*') ]]
      then
        # The web-page is serving DUAL episodes (2 on one page)
        for j in {0..1}; do
          # Extract the episode address on the server: 'embed_js' lines first, any other line as the fall-back
          server_path[$j]=$(_Extract_Files_Server_Path $(( j + 1 )) embedded)

          [[ -z ${server_path[$j]:-} ]] &&
          server_path[$j]=$(_Extract_Files_Server_Path $(( j + 1 )) plain)

          # This is rare, but sometimes a dual episode web-link, may have both episodes combined inside a single video. |
          # The assumption here is that it would be weird to have only 1 video on a DUAL video web-page.                |
          if [[ $j = 1 && -z ${server_path[1]:-} ]] && Retrieve_Video_Link 0 0; then

            # Ethical hosts usually provide a back-up link to their videos, so 2 episodes means there are 4 links (DUAL); 1 episode-- 2 links (Single)
            if [[ $(grep 'file' $episode_file | sed -e 's_.*src="\(.*\)"_\1_' -e 's_".*__' -e "s_\(.*$ext\).*_\1_" | grep "${selected_domain:-$python_domain}" |
                    grep 'http' | uniq | wc -l) -eq 2 ]]
            then
              _Extract_Files_Save_Resume_Point "${episode_name[0]}" "${server_path[0]}"
              episodes=0; Next_Episode "Single 2-Parter $TYPE Episode" $4 $5 ${episode_name[0]} ${upload_path[0]}
              return
            fi
          fi
        done
        episode_name[0]=$(grep '<title>' $episode_file | tr \` \' |
                          sed -e 's_<title>__' -e 's_[ ]|.*__' -e 's_\(.*[0-9]*\)-[0-9]*[ ]\(.*\)[/,;].*_\1 \2_' -e 's:[ ]:_:g' -e 's/_English_[DS]ubbed//' |
                          eval $HUMAN_READABLE)
        episode_name[1]=$(grep '<title>' $episode_file | tr \` \' |
                          sed -e 's_<title>__' -e 's_[ ]|.*__' -e 's_\(.*\)[ ][0-9]*-\([0-9]*\)[ ].*[/,;]\(.*\)_\1 \2 \3_' -e 's:[ ]:_:g' -e 's/_English_[DS]ubbed//' |
                          eval $HUMAN_READABLE)

        # Two episodes on this page: the last base 0 index is 1
        episodes=1
      else
        # The web-page is serving Multiple (MULTI) episodes.  How many episodes is this page serving?                                              |
        # NOTE: I'm subtracting 2, because: 1-- these `for` loops deal with base 0 arrays (minus 1); 2-- `wc` counts an extra character (minus 2). |
        # NOTE(review): counting characters (and the last sed below) looks like it assumes single-digit episode numbers -- confirm.
        episodes=$(( $(grep '<title>' $episode_file | sed -n '1 p' | sed -e 's_.*Episode.\(.*\)_\1_' -e 's_\([^ ]*\).*_\1_' -e 's_-__g' | wc -m) - 2 ))

        # Collect episode information
        for (( j=$GNU; j<=$episodes; j++ )); do

          # Extract the episode name
          episode_number=$(grep '<title>' $episode_file | sed -n '1 p' | gawk -F'-' -v field="$(( j + 1 ))" '{ print $field }' | sed 's_.*\([0-9][0-9]*\).*_\1_')
          episode_title=$( grep '<title>' $episode_file | sed -n '1 p' | sed -e 's_<title>\(.*Episode.\).*_\1_' -e 's:[ ]:_:g' -e 's/_English_[DS]ubbed//' |
                           eval $HUMAN_READABLE)
          episode_name[$j]=${episode_title}${episode_number}

          # Extract the episode address on the server: 'embed_js' lines first, any other line as the fall-back
          server_path[$j]=$(_Extract_Files_Server_Path $(( j + 1 )) embedded)

          [[ -z ${server_path[$j]:-} ]] &&
          server_path[$j]=$(_Extract_Files_Server_Path $(( j + 1 )) plain)
        done
      fi
      # Begin processing every episode on the webpage
      # 'episodes' is the last base 0 index set by the branch above.  It used to be recomputed here with
      # `wc -l` on a here-string of the names, which is a single line, so the result was always 0.
      for (( j=$GNU; j<=$episodes; j++ )); do

        #  If the episode does not already exist, then find the video path on the server; if it does exist: Determine |
        #+ if the episode is a part of an interrupted download.  If it was interrupted, then handle it accordingly.   |
        episode=$(echo ${episode_name[$j]}.{flv,mp4,mkv,avi,ogg,webm})
        for suffix in flv mp4 mkv avi ogg webm; do
          lost_episode=${episode_name[$j]}.$suffix
          if [[ -f $lost_episode && -z ${STREAM:-} ]]; then
            Interrupted_Downloads $lost_episode
            return
          fi
        done

        # If the server_path exists: Fetch the upload path
        [[ -z ${server_path[$j]:-} ]] && { Whats_Missing $6 ${episode_name[$j]}; return; }
        Retrieve_Video_Link $GNU $episodes

        # This is rare, but sometimes the uploader will post the previous episode address instead of the current one. |
        # This is a website administrator error (NOT my fault); to solve it: I simply add one to the episode number.  |
        if [[ ${server_path[$j]} = $(sed -n '2 p' .last_video 2> /dev/null) ]]; then
          : # NEEDS WORK!                                                                                          |
            #-$current_number=$(expr $(sed -e 's_.*\(-[0-9]*\).*_\1_' -e 's_-__g' <<< "${upload_path[$j]}") '+' 1) |
            #-$upload_path[$j]=$(sed "s_\(.*\)\(-[0-9]*\)\(.*\)_\1-$current_episode\3_" <<< "${upload_path[$j]}")  |
        fi
        # Allow us a way to continue interrupted downloads
        _Extract_Files_Save_Resume_Point "${episode_name[$j]}" "${server_path[$j]}"
      done

    # The web-page is serving only one episode
    else
      # Determine the server path to the video, and if we can't: Report back to the user
      server_path[0]=$(_Extract_Files_Server_Path 1 any)

      if [[ -z ${server_path[0]:-} ]]; then
        Whats_Missing $6 ${episode_name[0]}; return
      fi
      # Fetch the upload path
      Retrieve_Video_Link 0 0

      # This is rare, but sometimes the uploader will post the previous episode address instead of the current one. |
      # This is a website administrator error (NOT my fault); to solve it: I simply add one to the episode number.  |
      if [[ ${server_path[0]} = $(sed -n '2 p' .last_video 2> /dev/null) ]]; then
        : # NEEDS WORK!                                                                                          |
          #-$current_number=$(expr $(sed -e 's_.*\(-[0-9]*\).*_\1_' -e 's_-__g' <<< "${upload_path[$j]}") '+' 1) |
          #-$upload_path[0]=$(sed "s_\(.*\)\(-[0-9]*\)\(.*\)_\1-$current_episode\3_" <<< "${upload_path[$j]}")   |
      fi
      # Allow us a way to continue interrupted downloads
      _Extract_Files_Save_Resume_Point "${episode_name[0]}" "${server_path[0]}"
    fi
  fi
  # Before we create our new title directory, where we will download this anime series into, let's check if any previous genshiken sessions need to be updated
  # NOTE(review): the directories above are nested as $DIR_ANIM/$6/$category/$title, while this test omits $category -- confirm.
  [[ -z ${STREAM:-} ]] && [[ ! -d $DIR_ANIM/$6/$title/ ]] && Update_Previous_Sessions $DIR_ANIM/$6/$title/ $6

  # Move into the parent directory, and determine if the child directory we were in needs to be renamed; afterwards: Enter the title-dir & commence downloading
  cd .. || return 1
  if [[ -d $tmp/ ]]; then
    mv "$tmp/" "$title/"
    cd "$title/" || return 1
    echo -e "${N}${R}[${FUNCNAME[0]}] ${G}Renamed present directory as: ${C}$PWD ${N}"
  else
    cd "$title/" || return 1
  fi
  case $option in

    # Download procedure for anime
    watchcartoononline)

      # Download the next episode (Group)
      if _Extract_Files_Is_Group_Page $episode_file; then
        for (( j=$GNU; j<=$episodes; j++ )); do
          if [[ $episodes -eq 1 ]]; then group='DUAL'; else group='MULTI'; fi
          [[ ! -f ${episode_name[$j]} ]] &&
          Next_Episode "Group $TYPE Episode-$(( j + 1 )) ($group)" $4 $5 ${episode_name[$j]} ${upload_path[$j]}
        done

      # Download the next episode (Single)
      else Next_Episode "Single $TYPE Episode" $4 $5 ${episode_name[0]} ${upload_path[0]}
      fi
    ;;
    # Download procedure for mangas
    NOT-DEFINED-YET) :
:<<  'MANGA-PROCEDURE'
------# Determine if there is more than 1 gallery page
------local          gallery_pages=$(grep 'page_link' $gallery_index)
------if [[ ! -z   ${gallery_pages:-} ]]; then
------local  total_gallery_pages=$(grep 'page_link' $gallery_index | sed 's_.*>\(.\)</a>.*pagNext.*_\1_')
--------echo -e "${N}${R}[${FUNCNAME[0]}] ${G}There's a total of [${M}$total_gallery_pages${G}] Gallery pages in ${R}<${C}$gallery${R}>${Y}   \b\b"
------else
----------local total_gallery_pages=1
------fi; local page_count=1 jpeg_count=1

------# Count how many galleries there are, and loop through the below script for each gallery
------for (( i=$cover_page; i<=$total_gallery_pages; i++ )); do

--------# If there's more than one gallery page, then loop through each page.
--------if [[ $total_gallery_pages != 1 ]]; then
----------if [[ $i != 1 ]]; then
------------if [[ $i = 2 ]] # Determine the gallery names
------------then local gallery="$gallery&plog_page=$i"                 gallery_index="$gallery_index&plog_page=$i"
------------else local gallery=$(sed "s/page=./page=$i/" <<< $gallery) gallery_index=$(sed "s/page=./page=$i/" <<< $gallery_index)
------------fi
------------# Download the next page in the gallery
------------Download_File $gallery_index $gallery -source
------------fi
----------fi
--------# Count how many pictures there are, and loop through the below script for each picture
--------for image in $(grep '[0-9][0-9]*.*.[jp][pn]g' $gallery_index | sed -e 's|src="\(http://.*[0-9][0-9]*.*.[jp][pn]g\).*|\1|' \
                                                                         -e 's|\(.*s/\)\(.*\)|\1lrg-\2|') # add string 'lrg-' for BIG pics...
--------do
--------# Determine what the current page is (necessay after the 1st iteration of the outer 'for' loop)
--------[[ -f current_page_ ]] && jpeg_count=$(< current_page_)

----------# Determine how many pictures there are in the gallery
----------if [[ $page_count = 1 ]]; then
----------if [[ ! -z ${gallery_pages:-} ]]; then
--------------page_total=$(grep '[0-9][0-9]*.*.[jp][pn]g' $gallery_index | sed 's|src="\(http://.*[0-9][0-9]*.*.[jp][pn]g\).*|\1|' | wc -l)
--------------image_total=$page_total
--------------[[ -f current_page_ ]] && { page_total=$(expr $jpeg_count + $page_total - 1); rm current_page_; }
--------------echo -e "${N}${R}[${FUNCNAME[0]}] ${G}The total # of images in \"${M}$title${G}\" for Gallery [${M}$i${G}] is: ${M}$image_total ${N}"
------------else
--------------local page_total=$(grep '[0-9][0-9]*.*.[jp][pn]g' $gallery_index | sed 's|src="\(http://.*[0-9][0-9]*.*.[jp][pn]g\).*|\1|' | wc -l)
--------------echo -e "${N}${R}[${FUNCNAME[0]}] ${G}The total # of images in \"${M}$title${G}\" is: ${M}$page_total ${N}"
------------fi
----------fi
----------# Ensure the image can be read, no matter what the extension
----------local ext=$(grep '[0-9][0-9]*.*.[jp][pn]g' "$gallery_index" | sed 's|src="\(http://.*[0-9][0-9]*.*.[jp][pn]g\).*|\1|' |\
----------------------sed -n "$page_count s/http/&/p" | sed 's|.*\.\(.*\)|\1|'); [[ $ext = 'png' ]] && ext=$ext.jpeg

----------# Create a new file to store the image data; Download each picture-- one-by-one-- into the new file; then, increment the counts
----------touch $title-$jpeg_count.$ext
----------echo -e "${N}${R}[${FUNCNAME[0]}] ${G}Downloading image ${M}$jpeg_count${G} of ${M}$page_total ${G}|| File='${M}$title-$jpeg_count.$ext${G}' ${N} \b"
----------Download_File $title-$jpeg_count.$ext $image; eval $ADD_PAGE $ADD_JPEG
----------done

--------# Finished downloading all pictures from the Galler[y,ies]
--------if [[ $total_gallery_pages = 1 ]]; then
----------echo -e "${N}${R}[${FUNCNAME[0]}] ${G}Finished downloading all pictures in \"${M}$title${G}\". ${N}  \b\n"
--------else
----------echo -e "${N}${R}[${FUNCNAME[0]}] ${G}Finished downloading all pictures from Gallery [${M}$i${G}]. ${N}  \b"
----------echo $jpeg_count > current_page_; page_count=1
----------if [[ $i = $total_gallery_pages ]]
----------then echo -e "${N}${R}[${FUNCNAME[0]}] ${G}Stored all images from all [${M}$i${G}] Galleries for \"${M}$title${G}\"! ${N}  \b\n"; rm current_page_
----------fi
--------fi; Clean_Up directory
------done
MANGA-PROCEDURE
    ;;
  esac
}

# End:
# gfunc83100-extract-files.sh ends here
