# -*- shell-script -*-
# gfunc7-extract-series-list.sh -- Extract an alphabetical series of lists for all the anime hosted on an anime website (only one supported as of right now)
# Copyright © 2015-2016 Michael Pagan
#
# Author: Michael Pagan
# E-Mail: michael.pagan@member.fsf.org
# Jabber: pegzmasta@member.fsf.org
#
# This file is part of Genshiken.
#
# Genshiken is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Genshiken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Genshiken. If not, see http://www.gnu.org/licenses/.
#===================================================================
function Extract_Series_List
{
  #######################################################################################################
  # Build one category file per leading character ('#', A..Z) for the anime list named by $1.
  #
  # Arguments: $1 - the ANIME-TYPE; when it starts with 'anime' (e.g. anime-dub) the page path
  #                 becomes '<suffix>bed-anime', otherwise $1 is used verbatim (e.g. movie, ova)
  # Globals:   DIR_DET, HTML_PARSER, PROGNAME, WAIT and the N/R/G/C/M/B/Y message variables are read;
  #            page_path and Anime_List are written and deliberately NOT made local
  #            (NOTE(review): other functions may read them after this one returns -- confirm)
  # Calls:     Clean_Up, Download_File, Update_Previous_Sessions (all defined elsewhere)
  # Files:     works inside $DIR_DET/$1; writes series_1.txt ('#') and series_<LETTER>.txt,
  #            plus $DIR_DET/.extraction_interrupted
  # Returns:   1 if $DIR_DET/$1 cannot be created/entered; otherwise the status of the final `rm *.old'
  #######################################################################################################
  local list_dir="$DIR_DET/$1" marker="$DIR_DET/.extraction_interrupted"
  local i count old_list title_start numbered_file movie_rows ova_row anime_title Series_Size

  # Keep all of our temporary data inside $DIR_DET/* (our 'PWD' until the user selects their series) |
  # NOTE: All the category files are renamed to '.old', for later analysis                           |
  # Everything below renames and deletes files relative to the PWD, so a failed `cd' must abort.     |
  if [[ -d $list_dir ]]; then
    cd "$list_dir" || return 1
    Clean_Up directory
    if [[ -n $(ls) ]]; then
      # A plain glob + mv does not depend on which dialect of rename(1) is installed
      for old_list in *.txt; do
        [[ -e $old_list ]] && mv -f -- "$old_list" "${old_list%.txt}.old"
      done
      touch "new_releases-$1.log" "removed_show-$1.log"
    fi
  else
    mkdir -p "$list_dir"
    cd "$list_dir" || return 1
  fi

  # Ensure the 'Early_Termination' function will clean-up our PWD if we terminate early
  if [[ ! -f $marker || $(< "$marker") != "$1" ]]; then
    printf '%s\n' "$1" > "$marker"
  fi

  # The $page_path varies, depending on whether the word 'anime' is contained in the 1st parameter
  if [[ ${1:0:5} = 'anime' ]]; then
    page_path=${1:6}bed-${1:0:5}
    Anime_List=${1:6}bed
    Anime_List=${Anime_List^?}
  else
    page_path=$1
    Anime_List=${1^?}
  fi

  # Download the website that contains a list of whatever ANIME-TYPE the user provided on the command-line
  local Anime_Site="http://www.watchcartoononline.com/$page_path-list"
  echo -e "\n${N}${N}${R}[${FUNCNAME[0]}] ${G}The Present Working Directory is: ${C}$PWD ${N}"
  echo -e "${N}${N}${R}[${FUNCNAME[0]}] ${G}Downloading ${M}${Anime_List/Ova/Specials} ${G}Anime-List: ${R}<${C}$Anime_Site${R}> ${N} \b"
  Download_File "$page_path-list" "$Anime_Site" -source

  # We're going to create a category file for every element in this array (unless said category has no anime titles for it)
  local Series=('#' A B C D E F G H I J K L M N O P Q R S T U V W X Y Z)

  #  Extract the specified list of anime on the site (as defined by the user, parameter 1 represents this), |
  #+ starting from "#" to "Z", into category files =========================================================/
  for i in {0..26}
  do
    # Every category is first collected under its numeric name; all but '#' are renamed further down
    numbered_file="series_$(( i + 1 )).txt"

    # Only pull a new category out of the page once the previous one has been fully consumed
    if [[ ! -f series-data.log || -z $(< series-data.log) ]]; then
      echo -e "${N}${N}${R}[${FUNCNAME[0]}] ${G}Building Category List: ${M}${Series[$i]} ${N}  \b"

      # Movies and OVAs require a slightly different set of REGEX, due to a different HTML scheme
      if [[ $page_path != 'movie' && $page_path != 'ova' ]]; then
        # The '#' category holds every title that does not start with a capital letter
        if [[ ${Series[$i]} = '#' ]]; then title_start='[^A-Z]'; else title_start=${Series[$i]}; fi

        # Parse the HTML, capturing only the anime links listed under the current category heading.
        # The sed scripts must be double-quoted so that the category character is actually expanded.
        (sed "s,.*\(sep.>${Series[$i]}<.*\),\1," | sed -n "/sep.>${Series[$i]}</,/<.ul>/ p" |
         sed -n "s,<a.href=.*/anime/.*>${title_start}.*</a>,&,p" | grep 'href' |
         eval $HTML_PARSER) < "$page_path-list" >> series-data.log
      else
        if [[ ${Series[$i]} = '#' ]]; then title_start='[^A-Za-z]'; else title_start=${Series[$i]}; fi

        # NOTE: We need the list item element `<li>' in order to easily separate movie links
        # NOTE(review): the pipeline binds tighter than && / ||, so the text of $HTML_PARSER is piped
        #               into [[ ]] and discarded; what gets eval'ed is the literal sed command below
        #               (movie) or `tee' (ova).  Kept exactly as found, because changing it depends
        #               on the definition of HTML_PARSER, which lives outside this file -- confirm.
        (sed -n "/sep.>${Series[$i]}</,/<.ul>/ p" | sed -n "s,<a.href=.*>${title_start}.*</a>,&,p" | grep 'href' |
         eval $(echo $HTML_PARSER | [[ $page_path = 'movie' ]] && echo "sed 's,-e..s_<li>__g..,,'" || echo tee)) < "$page_path-list" >> series-data.log
      fi
    fi

    # Several series may share one row of series-data.log; separate them until we have 1 series per row
    if [[ $page_path != 'movie' ]]; then
      #  Parse the data in reverse order, until there is no more data left to parse =================\
      #                                                                                              |
      #  NOTE: This is done in reverse order, because the HTML tags at the end of the series-data    |
      #+ stream were easier to parse.                                                                |
      Series_Size=$(wc -c < series-data.log)
      until [[ $Series_Size -eq 0 ]]
      do
        # Append the last anime [parsed data] of each row to the category file; afterwards, remove the parsed data from the original file
        sed -e 's_.*\(<a.*>\).*_\1_g' -e 's/\xe2\x80\x8b//g' -e 's_\&amp;_\&_g' -e 's_</a>__' -e 's_&#215;_x_' -e "s_&#8217;_'_" \
            -e 's_&#8211;_--_' < series-data.log | grep -v '<a href=""></a>' >> "$numbered_file"
        sed -i -n -e 's_\(.*\)<a.*</a>_\1_p' series-data.log
        Series_Size=$(wc -c < series-data.log)  # How much data do we have left to parse?
      done
    else
      #  The `movie' routine requires a different strategy, since each movie is stored in only 1 video on the `Title Page'; whereas if this was NOT      |
      #+ a movie, there would be a series of videos stored on separate pages, and these pages would be referenced as an episode list on the `Title Page' |
      #+ NOTE: The `Title Page' is referenced as the "$Anime_Page" in the [Download_Anime] function. ====================================================/
      #  Walk the `<li>'-separated columns (starting at the 2nd) until a column is empty on every row.
      count=2
      while :
      do
        movie_rows=$(gawk -F'<li>' -v col="$count" '{ print $col }' series-data.log)
        [[ -z $movie_rows ]] && break
        printf '%s\n' "$movie_rows" >> "$numbered_file"
        (( count += 1 ))
      done
      rm series-data.log
    fi

    # Alphabetize and remove any duplicates from this new Category File.  If not '#', rename the current series after the current element of "Series[@]"
    if [[ -f $numbered_file ]]; then
      # Ensure only links and their associated titles are inside these text files (this also drops the empty rows)
      grep 'http' "$numbered_file" | sort -u -o "$numbered_file"

      #  If we're dealing with OVA/Specials, then we have some serious editing to do.  Our [View_Anime_List] procedure =======================================\
      #+ will be way too chaotic and out-of-whack, unless we remove all these duplicates.  The OVA Anime-List is currently a per episode list, instead of     |
      #+ a per series list (which is what we really want).                                                                                                    |
      #                                                                                                                                                       |
      #  NOTE: This is a single command list.  The goal is to make only the first episode of each anime available on this list; one link to an anime          |
      #+       (episode 1) is enough of a base to reach the other episodes.  Series whose page on the site refers to the entire anime series, and not just    |
      #+       the ovas, are removed by name.                                                                                                                  |
      if [[ $1 = 'ova' ]]; then
        sed -e 's_Episode.[0-9][^0-9\.]_Episode 1 _' -e 's_Episode[ ]1[ ]__' -e 's_\(.*\)\(episode-\).\(-english-\)\(.*\)_\1\21\3\4_' \
            -e 's_episode-.-.-english-dubbed_episode-1-2-english-dubbed_' -e 's_part-.-english-dubbed_part-1-english-dubbed_' \
            -e 's_episode-[0-9]"_episode-1"_' -e 's_\(.*\)Part[ ]..*_\1Part 1_' -e 's_[ ]Episode.1-[0-9]__' -e 's_\(.*\):[ ]\(English.*\)_\1 \2_' \
            -e 's_.Episode.1..English_ English_' -e 's_[ ][ ]_ _' -e 's_Episode.[0-9]$__' -e 's_Ova_OVA_' -e 's_[ ]sis_ Sis_' -e 's_Pure.[0-9]_Pure 1_' \
            -e 's_Dokuro-chan.*_Dokuro-chan_' -e 's_\(.*UFO.*Valkyrie.\)\(.\).*_\1Season \2_' -e 's_.*memory-episode-1[0-9].*__' -e 's_.*pure-[^1].*__' \
            -e 's_.*x-sis.*-10__' < "$numbered_file" | sed -e 's_Episode [0-9]*[ ]__' -e 's_[Oo][Vv][Aa] [0-9]*[ ]__' | uniq |
        grep -E -v '^-|13a|dokuro-chan-2|Komugi.*2.5|Korumi E|Fairy.Tail.E.*|Full.*Alchemist|Munto 2 E|Ranma|Ryo-Ohki.*20|Yakuindomo S' |
        sort -u -o "$numbered_file"

        # The above removed most of the duplicates, but there will be a few on the right side of the `>', as well
        ova_row=0
        while read -r anime_title
        do
          (( ova_row += 1 ))
          if [[ $ova_row -eq 1 ]]; then
            printf '%s\n' "$anime_title" > ova-fix.log

          # If the title (the text after the first `>') is not yet inside `ova-fix.log', then this is a unique string-- not a duplicate.
          # The title is matched as a fixed string, so characters such as `[', `.' or `*' in a title cannot act as a regex.
          elif [[ -z $(grep -F -- "$(gawk -F'>' '{ print $2 }' <<< "$anime_title")" ova-fix.log) ]]; then
            printf '%s\n' "$anime_title" >> ova-fix.log
          fi
        done < "$numbered_file"
        rm "$numbered_file"
        grep -v '^$' ova-fix.log > "$numbered_file"
      fi
      rm -f ova-fix.log

      # Next we'll rename the category file based off of the current element of the array "Series[@]"
      if [[ $i -ne 0 ]]; then
        mv "$numbered_file" "series_${Series[$i]}.txt"
      fi

      #  Analyze the difference between the category file we just created and our old version of it, ========================\
      #+ in order to determine what new animes have been released since the last time Genshiken updated its category files.  |
      #                                                                                                                      |
      #  NOTE: The purpose of this test is to meet the conditions below (i.e. the Update_Previous_Sessions procedure)        |
      #        If you have a saved '--get|--stream' session on the filesystem, newly discovered Anime_Titles will affect it. |
      #        SEE THE FUNCTION: Update_Previous_Sessions ===================================================================/
      #  NOTE(review): `|| diff ... series_1 ...' only fires when gawk itself fails, and the series_A.txt guard is false      |
      #                while the '#' category (i = 0) is being processed, so '#' is never compared.  Kept as found, since    |
      #                the purpose of the series_A.txt guard is not visible from this file -- confirm.                       |
      if [[ -f new_releases-$1.log && -f series_A.txt ]]; then
        diff -p "series_${Series[$i]}.old" "series_${Series[$i]}.txt" | grep '^+' | grep -v '^$' | gawk -F'>' '{ print $2 }' >> "new_releases-$1.log" 2> /dev/null ||
        diff -p series_1.old               series_1.txt               | grep '^+' | grep -v '^$' | gawk -F'>' '{ print $2 }' >> "new_releases-$1.log" 2> /dev/null
      fi
      if [[ -f removed_show-$1.log && -f series_A.txt ]]; then
        diff -p "series_${Series[$i]}.old" "series_${Series[$i]}.txt" | grep '^-' | grep -v '^$' | gawk -F'>' '{ print $2 }' >> "removed_show-$1.log" 2> /dev/null ||
        diff -p series_1.old               series_1.txt               | grep '^-' | grep -v '^$' | gawk -F'>' '{ print $2 }' >> "removed_show-$1.log" 2> /dev/null
      fi
    fi
  done

  # Ensure that our diff files (e.g. new_releases-$1.log, removed_show-$1.log) are not empty (rows holding nothing but newlines count as empty)
  [[ -f new_releases-$1.log ]] && [[ -z $(< "new_releases-$1.log") ]] && rm "new_releases-$1.log"
  [[ -f removed_show-$1.log ]] && [[ -z $(< "removed_show-$1.log") ]] && rm "removed_show-$1.log"

  #  If there are new releases or removed shows, then update any previous Genshiken session files that contain an  |
  #+ anime series that belongs to the same Anime-Type and series category as the new release[s] or removed show[s] |
  #  The pauses are skipped when $WAIT is set to a value of 1 or less.                                             |
  if [[ -s new_releases-$1.log ]]; then
    Update_Previous_Sessions "new_releases-$1.log" "$1"
    echo -e "${N}${R}[${FUNCNAME[0]}] ${B}$PROGNAME ${G}discovered newly released ${M}${1//-*/}${G}:${Y}   \b\b"
    [[ -n ${WAIT:-} && $WAIT -le 1 ]] || sleep 2
    cat "new_releases-$1.log"
    [[ -n ${WAIT:-} && $WAIT -le 1 ]] || sleep 4
    rm "new_releases-$1.log"
  fi
  if [[ -s removed_show-$1.log ]]; then
    Update_Previous_Sessions "removed_show-$1.log" "$1"
    echo -e "${N}${R}[${FUNCNAME[0]}] ${B}$PROGNAME ${G}discovered recently removed series ${M}${1//-*/}${G}:${Y}   \b\b"
    [[ -n ${WAIT:-} && $WAIT -le 1 ]] || sleep 2
    cat "removed_show-$1.log"
    [[ -n ${WAIT:-} && $WAIT -le 1 ]] || sleep 4
    rm "removed_show-$1.log"
  fi
  rm *.old 2> /dev/null  # We don't need these anymore...
}

# End:
# gfunc7-extract-series-list.sh ends here
