! Srend.F90 : 2016jul04

! NOTE: 2016sep01 : fixed error in pt-sphere clip

! --------------------------------------------------------------------
! Copyright (c) 2016 Ted Wetherbee
!
! Permission is hereby granted, free of charge, to any person 
! obtaining a copy of this software and associated documentation files 
! (the "Software"), to deal in the Software without restriction, 
! including without limitation the rights to use, copy, modify, merge,
! publish, distribute, sublicense, and/or sell copies of the Software,
! and to permit persons to whom the Software is furnished to do so, 
! subject to the following conditions:
!
! The above copyright notice and this permission notice shall be
! included in all copies or substantial portions of the Software.
!
! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
! EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
! MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
! NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
! BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
! ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 
! CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 
! SOFTWARE.
! ----- recent changes -----------------------------------------------

! 2016aug02
! 1) changed _IN variables to real, cast within to REAL*4 for use in the
!    key vectorizable sections.  Also allow integer*8 for _IN parameters
!    though this is probably a programming mistake to compile a CFD app
!    with integer*8.  Some of the heavy counting & sorting codes do this
!    but mostly in biology, genetics, data crunching, not CFD where
!    volume rendering is obviously useful.
! 2) changed alpha scheme to using Valpha from cotab
!    from a "256 Valpha" entry trailing each "255 #" entry
! 3) in progress - changing MPI-specific INTEGER
!    variables so that Srend behaves well with integer*8.  Unknown as
!    to what codes would use integer*8 for MPI.  Have not seen one
!    yet that does.
! 4) Fixed a bug in multiple shells, also redid the finish routines
!    where tiles and multiple shells are montaged and reprojected
!    to perspective.  Unsure that "montage" shuld be internal for
!    tiling; there are reasons to keep this external (assynchronous).
!    Decided that multiple shells will always ignore perspective flag as
!    this makes no sense.
! 5) Examining 3-part full-spherical routine for multiple shells
!    and tiling.  For tiling, each shell for poles must be montaged,
!    then reprojected, then written to the correct position, and the
!    Npole, equator, and Spole image chunks must be written to the
!    same file in correct positions, else montaged from 3 pieces.
!    This is probably too much, too easy to get broken.
!    We have: tiling + shells + full spherical (one view with srendtype=0)
!    We have: 3-part full spherical + shells
!    We do NOT have: 3-part full spherical + tiling

! 2016jun06
! 1) removed Voff: This has not been used, app codes (and users) must
!    assume that the low corner of data is at (0,0,0). 
! 2) removed passing thread id = th and number of threads = nth:
!    There are no subsets of teams for "subset barriers" as I had
!    originally thought possible, so one might as well assume that the
!    entire thread team calls srend_render* within a parallel region.
! 3) simplified the alpha compositing options:
!    The user now has to define code for alpha compositing options for
!    AMR and schemes beyond the default.  This adds flexibility.  The
!    scheme options (1-5) were confusing and caused caused clutter.
! 4) Removed the srend_init() function: This was used only when a
!    separate Srend communicator was created and used for Srend MPI, but,
!    instead, Srend calls which might use MPI will each call MPI_dup()
!    once on first call when SREND_COMM_DUP is defined and SREND_NOMPI is not
!    defined.
! --------------------------------------------------------------------
! MPI Gnu:
!   gfortran sleep.c -c
!   mpif90 Srend.F90 -DSRENDERING -O3 -c
!   mpif90 app.F90 Srend.o sleep.o -O3 -o app ! for example app
!   mpirun -np 4 ./app 
! Serial Gnu:
!   gfortran Srend-F90 -DSRENDERING -DSREND_NO_MPI -O3 -c
!   gfortran app.F90 Srend.o -O3 -o app       ! for example app
!   ./app
! openmp:
!   add this flag for Gnu: -fopenmp
!   add this for ifort: -openmp
!   Cray does openmp by default
!   also, may need to do this in shell before running: ulimit -s unlimited 
! vectorize, encourage:
!   gfortran:  -O3 -ffast-math -march=native
!   ifort: -O3, and use SIMD directives for 16-pack loops, or vectorize always for Cray
! useful Gnu flags:
!   debugging flags: -fbacktrace -fbounds-check
!   default real*8 : -fdefault-real-8
!   default integer*8 : -fdefault-integer-8
!   flag unused variables: -Wunused
! performance (things that usually work well)
!   gfortran: -Ofast (includes: -O3 + -ffast-math, -fno-protect-params -fstack-arrays)
!   ifort: -fast (includes: -ipo, -O3, -no-prec-div, -static, and -xHost)
! --------------------------------------------------------------------
! ifort:
!   mpiifort srend.F -openmp -O3 -c
! BW & Daint Cray: (default OpenMP, but: -h omp)
!   ftn srend.F -O3 -c
! More than 199 MPI processes on a cpu (testing), edit
!    /etc/security/limits.conf 
!    add these lines:
!      * soft nofile 8196
!      * hard nofile 8196
!      default 1024 allows about 199 MPI processes
! ------------------------------------------------------------------
! Show vectorization, ifort:
!    mpif90 srend.F -O3 -openmp -vec-report1 -c
! Show vectorization gfortran
!    -fopt-info-optimized ! shows lots of info
!    -ftree-vectorizer-verbose=1   ! 0-6: 0 nothing, 6 everything, 1 just loops vectorized
! mic:
!    $ echo $LD_LIBRARY_PATH
!      /opt/intel/composer_xe_2013/lib/mic:/opt/intel/impi/4.1.0/mic/lib
!    $ echo $PATH
!      /usr/bin:/bin:/usr/sbin:/sbin:/opt/intel/impi/4.1.0/bin
!    $ icc sleep.c -c
!    $ mpiifort srend.F90 -O3 -mmic -openmp -align array64byte -c
!    $ mpiifort srend_app.F90 Srend.o sleep.o -O3 -o srend_app
!    $ mpirun -host mic0 -np 2 ./srend_app
! --------------------------------------------------------------------
!
! y is up in this right-handed scheme: x right, z toward you
!           ^
!           y
!           y                 data: (x,y,z)
!           y
!           + x x x x x x >
!         z
!       z
!     L
!
! However, there is no special orientation for vectors so long as
! Up x V != 0 (cross product of Up vector and View vector is non-zero,
! hence non-colinear),  so z can be up if desired.
!           ^
!           z
!           z                 data: (x,y,z)
!           z
!           + y y y y y y >
!         x
!       x
!     L
! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc

! undefine this if SRENDERING is not going to be set externally for build convenience
#define SRENDERING
! ---- this SRENDERING definition block encloses everything following, leave defined
#ifdef SRENDERING


! #### srend definitions for cpp #####################################

! ---- SREND_VECTORIZE : define at most ONE of these --------------
! gfortran, a comment only, as vectorization doesn't use directives
#define SREND_VECTORIZE !gfortran
! fortran openmp 4.0
!#define SREND_VECTORIZE !$OMP SIMD
!     ifort vectorization directive for 16-pack key loops
!#define SREND_VECTORIZE !DIR$ SIMD
!     must only use this one (below) for Cray (Blue Waters & Piz Daint) cause SIMD won't work
!#define SREND_VECTORIZE !DEC$ VECTOR ALWAYS


! ---- MPI --- MPI --- MPI --- MPI --- MPI --- MPI --- MPI --- MPI ---
! ----- MPI on or off ------------------------------------------------
!     turn off MPI: for desktop/SMP, for srend_finish_dump
!#define SREND_NOMPI

! ---- SREND communicator --------------------------------------------
! for COMM_COMM communicator off of MPI_COMM_WORLD ranks
#define SREND_COMM_DUP

! ---- unique MPI tags -----------------------------------------------
! tag for data MPI messages, TAG2 for setup MPI messages.
! Each has to be the same for all processes within a job.  The view 
! index nV is added to these tag values for use.
#define SREND_TAG2  1000

! ---- srend yeilding within MPI_test loop; uncomment exactly one ----
!    A pause/context switch during test loops. Uncomment ONE of these.
#define SREND_YIELDING call sleep_n1() !one nanosecond
!                       ifort only for sleepqq()
!#define SREND_YIELDING call sleepqq(1) !one millisecond

! ---- synchronous issend, if not defined then MPI_isend
!#define SREND_ISSEND

! ---- Views ---------------------------------------------------------
! These are the number of distinct views that can be made within
! 1 process.  These are the number of VC structures created.
! These are not very large as they mostly contain data in 
! allocatable arrays which are empty until actually used.
#define SREND_MAX_V 8

! ---- blocks that can be loaded within one view ---------------------
! 1 is enough if srend_load will not be used.  Bump up as needed.
! These do not have to be powers of 2.  Various values have
! been tested up to 2^19 on small workstations, though
! <= 2^18 is about the practical limit.
#define SREND_MAX_LOAD 32768

! ---- file unit number ---------------------------------------------
#define SREND_FILE_UNIT 2300

! ---- .ppm header length --------------------------------------------
! The image format PPM allows comments in a variable length header.  
! For ease in locating data after the header (just for srend), we 
! fix the length here for our purposes.  Don't change this!
#define SREND_PPM_HEADER_LEN 20

! ---- stereo clipping spheres ---------------------------------------
! Needed if volume to render intersects clipping spheres.
! Otherwise, stereo views will each use clipping spheres centered
! on left and right eyes.  SREND_STEREO defined uses additional
! calculations which are unecessary otherwise even if correct when
! not stereo left & right shifted views.
!#define SREND_STEREO
! Adjust for sample offsets between clip0 and central clip00.
! The stereo option above must be defined for this to be on and do 
! something right.
!#define SREND_TOFF

! ---- debug options -------------------------------------------------
! timing tests, in compose/finish
!#define SREND_TIMING
! timing tests in render
!#define SREND_RENDER_TIMING
! debug dumps, check arguments at entry of functions
!#define SREND_DEBUG
!trace progress in compose/finish
!#define SREND_FTRACE

! ---- performance tests
! this just turns flop count on, meant to be used only to count flops
! which must be done serial, no vectorization, perhaps -O0
! this counts arithmetic and function calls each as 1, just for floating pt work, not index arith, etc.
! Only the ray bounds checking and sampling loops are counted as this is 99% of the FLOP count.
! These both are done by all threads and are where vectorization matters, so vectorization=OFF
! when making this count.
!#define SREND_FLOP
! define how flop will be incremented
! - - - - - - - - - - - - - - - -
! uncoment exactly 1 of the following regarding SREND_FLOP:
! just place this alone on a line, say, to add 7: FLOP(7), and uncomment this line following
!#define FLOP(n) flop = flop + n
! always leave this defined in order to undefine FLOP(n) within code
#define FLOP(n)

! ---- color table: --------------------------------------------------
! Uncomment at most one of these, leave undefined for default HARDCODED
!#define SREND_COTAB_FILENAME 'cotab.txt'
!#define SREND_COTAB_PASSNAMELIST

! - - - options for some special cases
! Create Color Table Legend in SVG --------------------
!#define SREND_SVG_LEGEND

! rebuild cotab each pass, also writes legend each pass if SREND_SVG_LEGEND
!#define SREND_COTAB_EACH_PASS

! adjust alpha values for AMR, takes a bit more work in sampling loop
!
! Undefined, this assumes that the cotab alpha entries are set for the
! uniform grid sampling increment dt.
!
! Defined, a "256 Valpha" entry is attempted to be read during cotab
! setup.  If none is read, then Valpha=Vdim.  Also, whether Valpha is read
! or not, alpha = 1 - (1 - alpha_entry)**(dt * Valpha/Vdim).  The
! cotab alpha entries are assumed to apply to one cell of width Valpha, so
! smaller dt will scale up transparency and alpha when alpha is between
! 0 and 1.
!#define SREND_AMR

! set this to overide/provide a Valpha for SREND_AMR enabled,
! effectively provides Valpha value for SREND_AMR enabled scheme
! which otherwise is Vdim
!#define SREND_VALPHA 512

! -----------------------------------
! - - - - - - these are set internally, don't change - - - - - - 
#if  !( defined(SREND_COTAB_FILENAME) || defined(SREND_COTAB_PASSNAMELIST) )
#define SREND_COTAB_HARDCODED
! The max number of key strings hardcoded, rgb and alpha keys.
! increase/adjust as needed
#define SREND_COTAB_KEYS 128
#endif
!
! For cluster use, it is probably best to hardcode the table.  The
! passed values in offset_cotab_in(), say (/2,3/) for nR=2 are for
! shared indices to the hardcoded arrays rgb_knot and alpha_knot
! where entries begin.
!
! For a file 'cotab.txt' option, the offset_cotab_in() arguments, say
! (/2,3/), are for scanning the 'cotab.txt' file for the key phrases
! 'alpha 2', 'rgb 2', 'alpha 3', and 'rgb 3' where the tables of
! values immediately follow each key phrase.  This is currently
! case sensitive.  It is important that each color table starts off
! with 0 for the first row and 255 for the last row.
! 
! The SREND_PASSNAMELIST option is meant for WRF and CM1 so that
! srend parameters can be set in the namelist.input file which is
! already used.  However, color table arrays passed to srend_render()
! don't have to be set this way.

! ---- openmp for compositing ----------------------------------------
! NOTE: multiple threads has not shown any perf increase 
! in the compositing area for older 4-core machines.
! Define this to use openmp in compositing.
! choose just 1 scheme by undefining it

! this has a parallel do about each compositing loop, thus openmp
! distributes work: likely best for fewer and larger blocks
!#define SREND_COMPOSITE_OPENMP

! this partitions the target array into lines and distributed among threads
! and one parallel section surrounding all, likely best for more blocks and
! evenly distributed across target array
!#define SREND_PARALLEL_COMPOSE

! this partitions each source array into lines and distributed among threads
! and one parallel section surrounding all, likely best for more blocks but
! not well distributed acrosss target array
!#define SREND_PARALLEL2_COMPOSE

! statistics for testing load sharing for openmp rendering
!#define SREND_THREAD_LOAD

!---- shells to render -----------------------------------------------
! The maximum number of shells, to partition [clip0, clip1].
! There is no limit on this but for memory, disk space, etc.
! Yet, a practical number would surely be <= 64
#define SREND_MAX_SHELLS 4

! ---- buffer deallocation -------------------------------------------
! Enable this to deallocate finish buffers after each image.
! Ths might be important if a node does a lot of different views and
! gets close to using up RAM.
!#define SREND_CLEAR_FBUFS 

! -------------- montage for tiling, to have rank 0 merge tiles to an output image
!     The rank need not be zero for this, just usual.
!#define SREND_TILE_MONTAGE 0

! -------------- solid rendering -------------------------------------
! This option has been used to render solid terrain within a fluid, a much
! slower operation at this time.  It is experimental and not as well tested.
!#define SREND_SOLID_RENDERING

! -------------- clipping switches --------------------------------------
! The parameters always need to be passed, some values to ignore if not used,
! but the clipping checks are skipped if these are commented out--making 
! the passed parameters useless yet essentially harmless.  Internal 
! variables will be set from passed but ignored parameters to prevent compiler 
! warnings about unused variables, but this takes very little time.
#define SREND_XYZ_AXIS_CLIP
#define SREND_POLAR_CLIP
#define SREND_POINT_RADIUS_CLIP

! #### end: srend #define cpp ########################################
! ********************************************************************
! ********************************************************************

module srend

! performance counting, only enabled for counting FLOPs
#ifdef SREND_FLOP
   integer*8 :: flop
#endif
     
! data structure used by rendering and compositing routines ----------
type cube_type ! these are the target arrays for rendering
   REAL*4,dimension(:,:,:,:,:,:),allocatable :: im
end type cube_type

type block_type
   integer :: load_pass = 1                                        ! increment during each flush, so init as 1
   logical :: used=.FALSE., mpi=.FALSE., load=.FALSE., flush=.FALSE. ! mode flags
! test completion of isend in render
   integer :: nreq = -1, nreq2 = -1
   integer,dimension(:),allocatable :: MYreq                       ! outgoing data
   integer,dimension(:),allocatable :: MYreq2                      ! outgoing setup information for data
   logical,dimension(:),allocatable :: isendDONE                     ! outgoing data 
   logical,dimension(:),allocatable :: isendDONE2                    ! outgoing setup
! test completion of isend in compose, just 1
   integer :: MYreq3                                               ! outgoing data
   logical :: isendDONE3 = .TRUE.
   integer :: MYreq4                                               ! outgoing setups
   logical :: isendDONE4 = .TRUE.
! tiling MPI
   integer, dimension(:,:),allocatable :: Tload        ! num blocks for each tile to composite, load increments
   integer, dimension(:,:),allocatable :: TARGid       ! target rank for each tile to send a particular tile
! for loading, there will be n blocks 1:n         
   integer :: n=0 ! initial value
   character*1,dimension(:,:),allocatable :: mpi_header  ! these to send, (:,0) sometimes to store common parameters
   integer,dimension(:,:),allocatable :: wh            ! for NOMPI use in compositing sort
   type(cube_type),dimension(:),allocatable :: cube      ! the array of target arrays: block(nV)%cube(:)%im
! testing thread load sharing in rendering loop
#ifdef SREND_THREAD_LOAD
   integer*8 :: th_hit(0:32) = 0                           ! init to 0, reset to zero on flush
#endif
end type block_type
      
type(block_type),dimension(1:SREND_MAX_V),save :: block ! one per view
            
#ifndef SREND_NOMPI
! these are global values.
logical, save :: srend_MPI_INIT = .FALSE. ! initial value, check this to set communicator
integer,save :: srend_COMM ! its own if SREND_comm_dup is called, MPI_COMM_WORLD otherwise
#endif        
 
   CONTAINS
      
#ifndef SREND_NOMPI      
   subroutine srend_comm_init() ! communicator for srend off MPI_COMM_WORLD
! This routine either duplicates MPI_COMM_WORLD to a new communicator
! srend_COMM (#define SREND_COMM_DUP), or sets srend_COMM = MPI_COMM_WORLD.
! It is important to MPI_COMM_dup() for a new communicator if there is a chance
! that the application using MPI_COMM_WORLD would clash.  On the downside, duplicating
! communicators adds overhead.  This overhead is not much for "small" MPI runs, but
! it becomes significant for "large" jobs, say .5 million MPI ranks.
      integer :: MYer
      include 'mpif.h'
      if(.NOT. srend_MPI_INIT) then
#ifdef SREND_COMM_DUP
         call MPI_comm_dup(MPI_COMM_WORLD, SREND_COMM, MYer)
#else
         srend_COMM = MPI_COMM_WORLD
#endif
         srend_MPI_INIT = .TRUE.
      end if
   end subroutine srend_comm_init
#endif

! ********************************************************************
! ---- srend_render_load() ----
! MPI and SMP use, must follow a sequence of these with srend_render_flush
! ********************************************************************
   subroutine srend_render_load(                                    &
      nV, nV_out,                                                   &
      Vdim_in,                                                      &
      E_in,                                                         &
      Ev_in,                                                        &
      Up_in,                                                        &
      Alpha_in,Beta_in,                                             &
      EyeRight_in,                                                  &
      clipx0_in,clipx1_in,clipy0_in,clipy1_in,clipz0_in,clipz1_in,  &
      pclip1_in,                                                    &
      pt_in, pt_r_in,                                               &
      clip0_in,clip1_in, nsh_in,                                    &
      perspective_in,                                               &
      srendtype_in,                                                 &
      dt_in,                                                        &
      bytedata,                                                     &
      XN_in,YN_in,ZN_in,                                            &
      Bd,                                                           &
      iNX_in,iNY_in,iNZ_in,                                         &
      W_in,H_in,                                                    &
      nR_in,                                                        &
      offset_cotab,                                                 &
#ifdef SREND_COTAB_PASSNAMELIST
      n_alpha_knot, alpha_knot, n_rgb_knot, rgb_knot,               &
#endif
#ifdef SREND_SOLID_RENDERING
      Sdim, Sbounds, S, Srgb,                                       &
#endif
      filenames_in,                                                 &
      tiles_right_in, tiles_down_in,                                &
      TARGid)
     
      implicit NONE
      
#ifndef SREND_NOMPI 
      include 'mpif.h'
#endif
      
      integer,intent(IN) :: nV, nV_out
      integer,intent(IN) :: Vdim_in
      real,dimension(3),intent(IN) :: E_in, Ev_in, Up_in
      real,intent(IN) :: Alpha_in, Beta_in
      real,intent(IN) :: EyeRight_in
      real,intent(IN) :: clipx0_in,clipx1_in, clipy0_in,clipy1_in, clipz0_in,clipz1_in
      real,intent(IN) :: pclip1_in
      real,dimension(3),intent(IN) ::pt_in
      real,intent(IN) :: pt_r_in
      real,intent(IN) :: clip0_in, clip1_in
      integer,intent(IN) :: nsh_in
      integer,intent(IN) :: perspective_in,srendtype_in
      real,intent(IN) :: dt_in
      integer,intent(IN) :: XN_in,YN_in,ZN_in
      integer,intent(IN) :: Bd ! boundary around incoming data
      character*1,dimension(1-Bd:XN_in+Bd,1-Bd:YN_in+Bd,1-Bd:ZN_in+Bd,1:nR_in),intent(IN) :: bytedata
      integer,intent(IN) :: iNX_in,iNY_in,iNZ_in ! offsets for cube
      integer,intent(IN) :: W_in, H_in
      integer,intent(IN) :: nR_in
      integer,dimension(1:nR_in),intent(IN) :: offset_cotab
#ifdef SREND_COTAB_PASSNAMELIST
      integer,intent(IN) :: n_alpha_knot
      character*256,intent(IN)  :: alpha_knot(1:n_alpha_knot)
      integer,intent(IN) :: n_rgb_knot
      character*256,intent(IN)  :: rgb_knot(1:n_rgb_knot)
#endif
#ifdef SREND_SOLID_RENDERING
      integer,intent(IN) :: Sdim ! 0 = off, 2 = 2D, (later: 3 = 3D)
      integer,intent(IN) :: Sbounds(0:1,1:3) ! 0=min:1=max,1=x 2=y 3=z
      character*1,intent(IN) ::        S(1-Bd:XN_in+Bd,1-Bd:YN_in+Bd,1-Bd:ZN_in+Bd) ! 1.0 = air, 0.0 = solid
      character*1,intent(IN) :: Srgb(1:3,1-Bd:XN_in+Bd,1-Bd:ZN_in+Bd)
#endif
      character*200,intent(IN) :: filenames_in
      integer,intent(IN) :: tiles_right_in, tiles_down_in
      integer,dimension(1:tiles_right_in,1:tiles_down_in),intent(IN) :: TARGid

! ---------- end var declarations ------------------------------------


!$OMP SINGLE

      if(.NOT. block(nV)%used ) then ! will always use this many
#ifndef SREND_NOMPI
         allocate( block(nV)%MYreq(0:SREND_MAX_LOAD) ); block(nV)%MYreq = MPI_REQUEST_NULL
         allocate( block(nV)%MYreq2(0:SREND_MAX_LOAD) ); block(nV)%MYreq2 = MPI_REQUEST_NULL
         allocate( block(nV)%isendDONE(0:SREND_MAX_LOAD) );block(nV)%isendDONE = .TRUE.
         allocate( block(nV)%isendDONE2(0:SREND_MAX_LOAD) ); block(nV)%isendDONE2 = .TRUE.
         block(nV)%nreq = 0
         block(nV)%nreq2 = 0
#endif
         allocate( block(nV)%Tload(1:tiles_right_in,1:tiles_down_in) ) ! used to share load among ranks for each tile
         block(nV)%Tload = 0 ! zero it
         allocate( block(nV)%TARGid(1:tiles_right_in,1:tiles_down_in) ) ! used to share rank per tile
         block(nV)%TARGid = TARGid
         block(nV)%used  = .TRUE.
      end if

      if( block(nV)%n == 0) block(nV)%Tload = 0 ! clear, not within loading a sequences

! set behavior
      block(nV)%load  = .TRUE.
      block(nV)%flush = .FALSE.
#ifndef SREND_NOMPI
      block(nV)%mpi   = .TRUE. ! this could be set at compile time in block structure to default values
#else
      block(nV)%mpi   = .FALSE.
#endif      

!$OMP END SINGLE

      call srend_render(                                               &
         nV, nV_out,                                                   & ! only nV different, neg
         Vdim_in,                                                      &
         E_in,                                                         &
         Ev_in,                                                        &
         Up_in,                                                        &
         Alpha_in,Beta_in,                                             &
         EyeRight_in,                                                  &
         clipx0_in,clipx1_in,clipy0_in,clipy1_in,clipz0_in,clipz1_in,  &
         pclip1_in,                                                    &
         pt_in, pt_r_in,                                               &
         clip0_in,clip1_in, nsh_in,                                    &
         perspective_in,                                               &
         srendtype_in,                                                 &
         dt_in,                                                        &
         bytedata,                                                     &
         XN_in,YN_in,ZN_in,                                            &
         Bd,                                                           &
         iNX_in,iNY_in,iNZ_in,                                         &
         W_in,H_in,                                                    &
         nR_in,                                                        &
         offset_cotab,                                                 &
#ifdef SREND_COTAB_PASSNAMELIST
         n_alpha_knot, alpha_knot, n_rgb_knot, rgb_knot,               &
#endif
#ifdef SREND_SOLID_RENDERING
         Sdim, Sbounds, S, Srgb,                                       &
#endif
         filenames_in,                                                 &
         tiles_right_in, tiles_down_in,                                &
         TARGid)
      
   end subroutine srend_render_load
! ********************************************************************
      
      

      
      
! ********************************************************************
! ---- srend_render_flush() ----
! call this after a sequence of srend_render_load(), called by all ranks, 1 thread (master)
! ********************************************************************
   subroutine srend_render_flush(nV)
      implicit NONE
#ifndef SREND_NOMPI 
      include 'mpif.h'
#endif
      integer,intent(IN) :: nV

      integer :: mhe_tiles_right, mhe_tiles_down, mhe_nV_out, mhe_TARGid
      character*1,dimension(1:400) :: mpi_header ! 1:400   ! for tiling & MPI flush
      equivalence (mpi_header(317),mhe_tiles_right )  ! 317:320
      equivalence (mpi_header(321),mhe_tiles_down )   ! 321:324 
      equivalence (MPI_header(357),mhe_nV_out)    ! 357:360 ! put in header for render_flush and render_flush_write
      equivalence (MPI_header(361),mhe_TARGid)    ! 361:364 ! put in header for render_flush and render_flush_write

#ifndef SREND_NOMPI
      integer :: MYer, MYid=0
      integer :: srendMPIcount ! for MPI_INTEGER portability ! notused: srendMPItag, srendMPIrank,
#endif
! --------- end var declarations -------------------------------------


#ifndef SREND_NOMPI
!      call srend_comm_init() ! already called by srend_render_load() if MPI, never here first
      call MPI_comm_rank(srend_COMM, MYid, MYer)
#endif

! testing thread load sharing in rendering loop
#ifdef SREND_THREAD_LOAD
      print *,'FFFFlush thread load: th_hit(0)=',block(nV)%th_hit(0), &
              'th_hit(1)=',block(nV)%th_hit(1)
      block(nV)%th_hit = 0 ! reset
#endif


! check if loaded for & tiling
#ifndef SREND_NOMPI
      if( block(nV)%used) then! .AND. block(nV)%n > 0) then
        mpi_header(:) = block(nV)%mpi_header(:,0)
        if(mhe_tiles_right*mhe_tiles_down > 1) then
          srendMPIcount = mhe_tiles_right*mhe_tiles_down ! portability with MPI_INTEGER = INTEGER
          call MPI_allreduce(MPI_IN_PLACE, block(nV)%Tload, srendMPIcount, MPI_INTEGER, &
                             MPI_SUM, srend_COMM, MYer)
          return
        end if
      end if
#endif

! set behavior
      block(nV)%used  = .TRUE. ! not needed?
      block(nV)%load  = .FALSE.
      block(nV)%flush = .TRUE.
#ifndef SREND_NOMPI
      block(nV)%mpi   = .TRUE. ! could remove
#else
      block(nV)%mpi   = .FALSE.
#endif

      call srend_base(nV,mhe_nV_out,block(nV)%n,mhe_TARGid,2) ! no tiling
! reset/clear because completed flush
      block(nV)%n = 0
      block(nV)%Tload = 0 ! clear
      block(nV)%load_pass = block(nV)%load_pass + 1 ! for the next pass

   end subroutine srend_render_flush
! ********************************************************************




! ********************************************************************
! ---- srend_render_write() ----
! call this after a sequences of srend_render_load() : called in SMP, by 1 thread (master)
! ********************************************************************
   subroutine srend_render_flush_write(nV)
      implicit NONE      
      integer,intent(IN) :: nV 
      
      integer :: mhe_tiles_right, mhe_tiles_down, mhe_nV_out, mhe_TARGid
      character*1,dimension(1:400) :: mpi_header ! 1:400  ! for tiling & MPI flush,
      equivalence (mpi_header(317),mhe_tiles_right )  ! 317:320
      equivalence (mpi_header(321),mhe_tiles_down )   ! 321:324 
      equivalence (MPI_header(357),mhe_nV_out)    ! 357:360
      equivalence (MPI_header(361),mhe_TARGid)    ! 361:364
      
! --------- end var declarations -------------------------------------

! set behavior
      block(nV)%used  = .TRUE. ! not needed?
      block(nV)%load  = .FALSE.
      block(nV)%flush = .TRUE.
#ifndef SREND_NOMPI
      block(nV)%mpi   = .TRUE. ! could remove
#else
      block(nV)%mpi   = .FALSE.
#endif
       
      mpi_header(:) = block(nV)%mpi_header(:,0) ! get needed info from mpi_header(:,0)

      call srend_base(nV,mhe_nV_out,block(nV)%n,mhe_TARGid,3)
! reset/clear because completed flush & write
      block(nV)%n = 0     ! reset
      block(nV)%Tload = 0 ! reset
      block(nV)%load_pass = block(nV)%load_pass + 1 ! for next pass
   end subroutine srend_render_flush_write
! ********************************************************************
      
      
      
      
      
! SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
! SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
! Spherical Rendering     (1)                   Wetherbee 2014  FDLTCC
! SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
! SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS

   subroutine srend_render(                                         &
      nV, nV_out,                                                   &
      Vdim_in,                                                      &
      E_in,                                                         &
      Ev_in,                                                        &
      Up_in,                                                        &
      Alpha_in,Beta_in,                                             &
      EyeRight_in,                                                  &
      clipx0_in,clipx1_in,                                          &
      clipy0_in,clipy1_in,                                          &
      clipz0_in,clipz1_in,                                          &
      pclip1_in,                                                    &
      pt_in, pt_r_in,                                               &
      clip0_in,clip1_in, nsh_in,                                    &
      perspective_in,                                               &
      srendtype_in,                                                 &
      dt_in,                                                        &
      bytedata,                                                     &
      XN_in,YN_in,ZN_in,                                            &
      Bd,                                                           &
      iNX_in,iNY_in,iNZ_in,                                         &
      W_in,H_in,                                                    &
      nR_in,                                                        &
      offset_cotab,                                                 &
#ifdef SREND_COTAB_PASSNAMELIST
      n_alpha_knot, alpha_knot, n_rgb_knot, rgb_knot,               &
#endif
#ifdef SREND_SOLID_RENDERING
      Sdim, Sbounds, S, Srgb,                                       &
#endif
      filenames_in,                                                 &
      tiles_right_in, tiles_down_in,                                &
      TARGid)


#ifdef _OPENMP
      USE OMP_LIB
#endif

      implicit NONE

#ifndef SREND_NOMPI 
      include 'mpif.h'
#endif

! ----------- incoming parameters ------------------------------------
! The "_in0" parameters are altered by the AMR parameter.
! The "_in" parameters are perhaps altered by stereo or pole
!   adjustments, but always copied over.
! ----- view parameters - - - - ---- - - - - --- - - - - - - - - - - -
      integer,intent(IN) :: nV
      integer,intent(IN) :: nV_out ! view target
      INTEGER*4,save :: mhe_nV_out
      integer,intent(IN) :: Vdim_in
      INTEGER*4,save :: mhe_Vdim

      real,dimension(3),intent(IN) :: E_in
      REAL*4, dimension(3) :: E_in2
      REAL*4, dimension(3),save :: mhe_E
      real, dimension(3),intent(IN) :: Ev_in
      REAL*4,dimension(3),save :: mhe_Ev
      real, dimension(3),intent(IN) :: Up_in
      REAL*4,dimension(3),save :: mhe_Up
      real,intent(IN) :: Alpha_in, Beta_in
      REAL*4,save :: mhe_Alpha, mhe_Beta
      real,intent(IN) :: EyeRight_in

      real,intent(IN) :: clipx0_in, clipx1_in
      REAL*4,save :: clipx0, clipx1
      real,intent(IN) :: clipy0_in, clipy1_in
      REAL*4,save :: clipy0, clipy1
      real,intent(IN) :: clipz0_in, clipz1_in
      REAL*4,save :: clipz0, clipz1
      real,intent(IN) :: pclip1_in
      REAL*4,save :: pclip1
      real,dimension(3),intent(IN) ::pt_in
      REAL*4,dimension(3),save ::pt
      real,intent(IN) :: pt_r_in
      REAL*4,save :: pt_r

      real,intent(IN) :: clip0_in
      REAL*4,save :: clip0
      real,intent(IN) :: clip1_in
      REAL*4,save :: clip1
      integer,intent(IN) :: nsh_in
      INTEGER*4,save :: mhe_nsh

      integer,intent(IN) :: perspective_in
      INTEGER*4,save :: mhe_perspective ! 0 = sph, 1 = prsp
      integer,intent(IN) :: srendtype_in
      INTEGER*4,save :: mhe_srendtype!0=norm,1=npole,2=equ,3=spole
      real,intent(IN) :: dt_in
      REAL*4,save :: dt
! ----- data volume parameters - - - - - - - - - - - - - - - - - - - -
      integer,intent(IN) :: XN_in,YN_in,ZN_in
      INTEGER*4,save :: mhe_XN,mhe_YN,mhe_ZN ! cells in each dim
      integer,intent(IN) :: Bd ! boundary around incoming data
!      INTEGER*4,intent(IN) :: Nvar ! number concatenated in bytedata
      character*1,dimension(1-Bd:XN_in+Bd,1-Bd:YN_in+Bd,1-Bd:ZN_in+Bd,1:nR_in),intent(IN) :: bytedata
      integer,intent(IN) :: iNX_in,iNY_in,iNZ_in ! offsets for cube
      INTEGER*4,save :: mhe_iNX, mhe_iNY, mhe_iNZ

! ----- image parameters - - - - - - - - - - - - - - - - - - - - - - -
      integer,intent(IN) :: W_in, H_in
      INTEGER*4,save :: mhe_W, mhe_H ! resulting out put image dimensions
      INTEGER*4,save :: mhe_Wim, mhe_Him ! rendering dimensions in finisher (different for polar
      integer,intent(IN) :: nR_in
      INTEGER*4,save :: mhe_nR ! number cotab_offsets, and variable arrays
      integer,dimension(1:nR_in),intent(IN) :: offset_cotab
#ifdef SREND_COTAB_PASSNAMELIST
      integer,intent(IN) :: n_alpha_knot
      character*256,intent(IN)  :: alpha_knot(1:n_alpha_knot)
      integer,intent(IN) :: n_rgb_knot
      character*256,intent(IN)  :: rgb_knot(1:n_rgb_knot)
#endif
#ifdef SREND_SOLID_RENDERING
      integer,intent(IN) :: Sdim ! 0 = off, 2 = 2D, (later: 3 = 3D)
      integer,intent(IN) :: Sbounds(0:1,1:3) ! 0=min:1=max,1=x 2=y 3=z
      character*1,intent(IN) ::        S(1-Bd:XN_in+Bd,1-Bd:YN_in+Bd,1-Bd:ZN_in+Bd) ! 1.0 = air, 0.0 = solid
      character*1,intent(IN) :: Srgb(1:3,1-Bd:XN_in+Bd,1-Bd:ZN_in+Bd)
#endif
! ----- image file parameters - - - -- --- ---- - - - - - - - - - - --
      character*200,intent(IN) :: filenames_in
! arrays for assignment: can't be string here to pack in mpi_header, so copy over
      character*1,dimension(1:200),save :: mhe_filenames
      
      integer,intent(IN) :: tiles_right_in, tiles_down_in
      INTEGER*4,save :: mhe_tiles_right, mhe_tiles_down
! ----- MPI target rank - - - -- - - - - - -- - -- - - -- - - - - - --
      integer,dimension(1:tiles_right_in,1:tiles_down_in),intent(IN) :: TARGid
      INTEGER*4,save :: mhe_TARGid
      
      
! ----- OMP threads - - -- --- - -- - - - - -- - --- - - - -  - -- ---
      INTEGER*4 :: th ! thread number, 0:SREND_OMP_THREADS-1
      INTEGER*4 :: nth ! number of OMP threads calling, 0 or 8 


! --------------------------------------------------------------------
! nV : The view index is an integer in 1:SREND_MAX_V .
! nV_out : target nV for issend
!
! Vdim : The number of cells spanning the volume, usually the largest
!        dimension if not cubic.  The real dimensions like E, eyeRight,
!        and clipping have "volume units" so 1.0 = Vdim cells across.
!
! E  : The eye position in x,y,z coordinates.  The unit is the cell
!      length for all values in the volume space.
! Ev : The vector direction of the eye view.  This vector points to
!      the center of the image result.  This need not be unit.
! Up : The up direction.  This cannot be parallel to Ev, but it need
!      not be orthogonal.  An orthogonal basis V,U,R will be formed
!      where V X U = R and V=Ev/|Ev| (after some adjustment from Ev
!      if a stereo view).  This need not be unit.
!
! Alpha : The angle span across in the V-R plane or around U. 
!      (degrees)
! Beta  : The angle up and down in the V-U plane or around R. 
!      (degrees)
!
! EyeRight    : The shift in the R direction from original Eye E. 
!
! clipx0, clipx1,clipy0,clipy1,clipz0,clipz1 :
!  Near and far clipping planes perpendicular to each axis.  To
!  "turn off", set these beyond, e.g.: clipx0 = -1000000, clipx1 = 1000000
!
! pclip1 : Far polar clip cone.  Rays whose angle made from V
!      exceed pclip1 are excluded.  This results in a circular
!      rendered image about the center, and it is used for polar
!      views which can be remapped to lon/lat. (degrees)
!
! pt,pt_r : center and radius of sphere to clip outside of
!
! clip0  : Near clipping sphere.  No samples are taken less than this
!      distance from the eye E. (degrees)
! clip1  : Far clipping sphere.  No samples are taken more than this
!      distance from the eye E. (degrees)
! nsh    : number of shells partitioned/rendered in [clip0, clip1]
!
!
! perspective: If 0, then spherical; if 1, then perspective.  All
!      rendering is done with a spherical lon/lat view, then
!      this rendering is resampled and converted to
!      perspective if 1.  Alpha < 180.0 and Beta < 180.0 .
!
! srendtype : 0 normal
!             1 north polar
!             2 equatorial
!             3 south polar
!      The polar and equatorial views are used together for
!      efficient full spherical views.  Otherwise, srendtype
!      should be 0.
!
! dt : The sampling increment in cell lengths, typically .5, .25,
!      and similar fractions.  If dt < 0, this flags an 
!      outside to in rendering: from infinity to the eye.
!
! bytedata : the 3D volume data in scaled 0-255 bytes
!      (1-Bd:mhe_XN+Bd,1-Bd:mhe_YN+Bd,1-Bd:mhe_ZN+Bd) <char*1>
! XN,YN,ZN : the bytedata array dimensions without boundaries
! Bd : the boundary of the bytedata array.
! iNX,iNY,iNZ : The offset of the data volume in cell widths.
!      A volume with a corner at the origin has offset
!      (0,0,0).
!
! W,H : Width and height of rendering plane in pixels.
!
! nR : Number of Rendered images.  One can concatenate data volumes
!      having exactly the same array shape and render them at
!      once in the same pass.
! offset_cotab : An array of integers with dimension (1:mhe_nR)
!      These values are indices to the colortable used for
!      each data volume, and they point to the row
!      of hardcoded key values or to the numbered alpha
!      and rgb key values in a colortable file.
!
! n_alpha_knot, alpha_knot, n_rgb_knot, rgb_knot :
! INTEGER*4     char*256    INTEGER*4   char*256
!      These are meant to be obtained using a namelist as
!      in WRF and CM1.
!
! Sdim : flag : 0 = off, 2 = 2D, (later: 3 = 3D)
! Sbounds(0:1,1:3) ! 0=min:1=max,1=x 2=y 3=z) min and max values in array where one tries to solid render
! S(1-Bd:XN_in+Bd,1-Bd:YN_in+Bd,1-Bd:ZN_in+Bd) ! 1.0 = air, 0.0 = solid, this is used in absorption calc
! Srgb(1:3,1-Bd:XN_in+Bd,1-Bd:ZN_in+Bd) : RGB 24-bit plane values to extrude (something else for 3D in mind)
!
! filenames : A string of comma separated file names.  The commas
!      must be between each filename and none follow the last
!      filename.  There must be mhe_nR filesnames.  For example:
!      'd######.ppm,p######.ppm,v######.ppm' with mhe_nR = 3.
!      There are masks.  The string '######' will be replaced
!      by the pass number srend tracks for each view:
!      ######,000001,000002, etc.
!      The extension '.ppm' selects Portable PixMap format
!      with a fixed header size, and anything else results in 
!      raw 24 bit
!      RGB format.  <character(len=*)>, but len <= 200
!      for fixed mpi_header.
!
! tiles_right, tiles_down: split output image.  This works
!      using a call to Imagemagick convert, after the file
!      has been written to disk.
!
! TARGid : Target MPI rank.
! --------------------------------------------------------------------


! -------------- variables needed to pass up and keep state here -----
! These are mostly the same for all threads, but only one thread
! sets mhe_wh_out(:).  mpi_header is copied to the view cache then sent up
! to compose/finish.  There are ways to do the same thing with
! derived types and MPI derived types, but equivalence is simple
! and effective here.
      logical,save :: mhe_outin  ! if dt_in < 0
      
! bounds actually used: 1=w0 2=w1 3=h0 4=h1
      INTEGER*4,dimension(1:4),save :: mhe_wh_out
! this pass index, passed to compose/finish 
      INTEGER*4, save :: mhe_Vpass
! --------- shells
! shells by index for this rendered subvolume
      INTEGER*4,save :: mhe_sh0, mhe_sh1
      
! for AMR tiling
      INTEGER*4,save :: mhe_nBlock, mhe_TRi, mhe_TDi
      
! magic numbers, not used as yet
! These might (We hope not!) be used to scan corrupted dump files.
      character*4 :: mhe_magic0 = 'LCSE'
      character*4 :: mhe_magic1 = 'ESCL'

! NOTE: all "mhe_" variables are in mpi_header
      
! mpi_header array : equivalenced to and passed using MPI 
      character*1,dimension(1:400),save :: mpi_header ! 1:400
!DIR$ ATTRIBUTES ALIGN:64 :: mpi_header

      equivalence (mpi_header(1), mhe_magic0)  ! 1:4
      equivalence (mpi_header(5), mhe_outin)   ! 5:8
      equivalence (mpi_header(9),mhe_wh_out(1))! 9:24 w0 w1 h0 h1
      equivalence (mpi_header(25), mhe_iNX )   ! 25:28 iNX
      equivalence (mpi_header(29), mhe_iNY )   ! 29:32 iNY
      equivalence (mpi_header(33), mhe_iNZ )   ! 33:36 iNZ
      equivalence (mpi_header(37), mhe_XN )    ! 37:40 XN
      equivalence (mpi_header(41), mhe_YN )    ! 41:44 YN
      equivalence (mpi_header(45), mhe_ZN )    ! 45:48 ZN
      equivalence (mpi_header(49), mhe_nR )    ! 49:52
      equivalence (mpi_header(53), mhe_Ev(1) ) ! 53:64
      equivalence (mpi_header(65), mhe_E(1) )  ! 65:76 ! aligned 64 byte
      equivalence (mpi_header(77), mhe_Up(1) ) ! 77:88
      equivalence (mpi_header(89), mhe_Alpha)  ! 89:92
      equivalence (mpi_header(93), mhe_Beta)   ! 93:96
      equivalence (mpi_header(97), mhe_W)      ! 97:100
      equivalence (mpi_header(101), mhe_H)     ! 101:104
!     unused                                   ! 105:108
      equivalence (mpi_header(109),mhe_filenames(1) ) ! 109:308
      equivalence (mpi_header(309),mhe_perspective )  ! 309:312
      equivalence (mpi_header(313),mhe_srendtype )    ! 313:316
      equivalence (mpi_header(317),mhe_tiles_right )  ! 317:320
      equivalence (mpi_header(321),mhe_tiles_down )   ! 321:324 
      equivalence (mpi_header(325),mhe_Wim)           ! 325:328
      equivalence (mpi_header(329),mhe_Him)           ! 329:332
      equivalence (MPI_header(333),mhe_nsh)           ! 333:336
      equivalence (MPI_header(337),mhe_sh0)           ! 337:340
      equivalence (MPI_header(341),mhe_sh1)           ! 341:344
!      unused                                         ! 345:348
      equivalence (MPI_header(349),mhe_Vdim)          ! 349:352
!     unused                                      ! 353:356
      equivalence (MPI_header(357),mhe_nV_out)    ! 357:360 ! put in header for render_flush and render_flush_write
      equivalence (MPI_header(361),mhe_TARGid)    ! 361:364 ! put in header for render_flush and render_flush_write
      equivalence (MPI_header(365),mhe_Vpass)     ! 365:368
!     unused                                      ! 369:372
!     unused                                      ! 373:384
      equivalence (MPI_header(385),mhe_nBlock)    ! 385:388
      equivalence (MPI_header(389),mhe_TRi)       ! 389:392
      equivalence (MPI_header(393),mhe_TDi)       ! 393:396
      equivalence (MPI_header(397),mhe_magic1)    ! 397:400

! --------------------------------------------------------------------
! counters
      INTEGER*4 i,j,k
      INTEGER*4 ii
! setup first pass : a counter for each view
      INTEGER*4,dimension(1:SREND_MAX_V),save :: Vpass ! each view

! ------- color table ------------------------------------------------
! this is formed by renderer below on first pass and saved in
! VC(nV)%cotab.  On following passes, the first threads copies
! VC(nV)%cotab to cotab for all threads.
! color table (Alpha=0, R=1, G=2, B=3,index in 0:255)
      REAL*4,dimension(:,:,:),allocatable,save :: cotab
      INTEGER*4,save :: Valpha
!DIR$ ATTRIBUTES ALIGN:64 :: cotab
! color table data : to form color table : these are done once for all
! these are for hard-coded color table values, adjust as needed

#ifdef SREND_COTAB_HARDCODED
      character*64,save :: alpha_knot(1:SREND_COTAB_KEYS)
      character*64,save :: rgb_knot(1:SREND_COTAB_KEYS)
#endif

! IO : vars to read in color table knots and fill out color table
      INTEGER*4 cotab_pix,cotab_pix2
      REAL*4 cotab_a,cotab_a2,cotab_r,cotab_r2
      REAL*4 cotab_g,cotab_g2,cotab_b,cotab_b2
 
! a shortcut calculation in sampling
#ifdef SREND_AMR
      REAL*4 dt_Valpha_Vdim
#endif
      
! These are for parsing color table file to form color table
#ifdef SREND_COTAB_FILENAME
      character (len=300) :: cotab_line ! max length, way much here
      INTEGER*4 :: cotab_pos
      INTEGER*4 :: cotab_ferror ! test on read of file
#endif

! ------------ variables for finding bounds ----------
      REAL*4,save :: PI ! set this once for all views the first call
      REAL*4 :: a2,b2 ! alpha/2 and beta/2 in radians
      REAL*4,save :: dW, dH  ! increments for W and H
! pre-processing variables to form the view basis vectors
      REAL*4, dimension(1:3) :: E0, V0, U0, R0
      REAL*4,dimension(1:3),save :: V, U, R
!DIR$ ATTRIBUTES ALIGN:64 :: V, U, R
! computing arrays, checking bounding cubes
      REAL*4 C(1:3)  ! point in space, center of cube/volume
      REAL*4 EC(1:3)  ! ray EP
      REAL*4 dEC ! distance from E to C
      REAL*4 uEC(1:3) ! unitized EC
      REAL*4 radius ! radius of bounding sphere
      REAL*4 uEC2VR(1:3) ! unitized EC - Proj_U (EC)
      REAL*4 acos_arg  ! intermediate result, to check
      REAL*4 aa,bb ! lon/lat coordinate
      REAL*4 gam ! angle span from EC to touch bounding sphere
      REAL*4 aax ! expansion of gam with lat inc/dec
      INTEGER*4 ww0, ww1 ! intermediate bounds for w0, w1
! coordinates for render plane: (1:nx,1:ny,1:nz)
      INTEGER*4,save :: w0,w1,h0,h1
! vars to adjust dims to multiple of 4, also reused for dim of 16packs
      INTEGER*4,save :: w4, h4  
! this is needed for allocation
      INTEGER*4,save :: w4h4 ! number of 4x4=16 packs
! --------------------------------------------------------------------
! View cache : things to save for each view.  cotab is calculated
! once on the first pass for a view, then loaded afterwards by
! first thread.  mpi_header_out and im are buffers sent by mpi issend.
! and must be saved for MPI_test completion during the next call.
      type VC_type
        REAL*4,dimension(:,:,:),allocatable :: cotab
        INTEGER*4 :: Valpha
      end type VC_type
      type(VC_type),dimension(:),allocatable,save :: VC

      
      
! working bounds actually used: 1=w0 2=w1 3=h0 4=h1
      INTEGER*4,dimension(1:4),save :: wh
! needed to be shared : E is altered by AMR for sending info up,
! so E is restored with this after its altered value is sent up
      REAL*4, dimension(3),save :: E_AMR_SAVE, E_AMR_PASS

! -------------- shells ----------------------------------------------
! the actual distance t for each shell, shell(0) = clip0
! and shell(nsh) = clip1
      REAL*4,dimension(0:SREND_MAX_SHELLS),save :: shell
! the increment between shells: (clip1-clip0)/nsh
      REAL*4 :: shdt

!! ! for calculating shells hit, corners of subvolume here
!!      REAL*4 :: dcorner_max, dcorner_min
!!      REAL*4,dimension(0:1,0:1,0:1) :: dcorner

! counter for looping over sh0 to sh1
      INTEGER*4 :: ish
! determine bounding shells for data, temp to set sh0 and sh1
      INTEGER*4 :: sh0_bot, sh1_top

! ------- for finding ray hits ---------------------------------------
! the saved variables here are set by 1 thread in setup, then the 
! variabled with SAVE are used by all threads for ray casting
      INTEGER*4, dimension(:,:,:), allocatable,save :: rr
      INTEGER*4, dimension(:,:,:), allocatable,save :: cc
      REAL*4, dimension(:,:,:,:), allocatable,save :: Q
      REAL*4, dimension(1:16) :: a, b
!DIR$ ATTRIBUTES ALIGN:64 :: rr, cc, Q, a, b
      REAL*4, dimension(1:16) :: cosb, cosbcosa
      REAL*4, dimension(1:16) :: cosbsina, sinb
      REAL*4, dimension(1:16) :: acos_arg16 ! used for pclip1 and srendtype 1,3 (N and S poles)

!DIR$ ATTRIBUTES ALIGN:64 :: cosb, cosbcosa, cosbsina, sinb
! bounds of ray and pack in cube
      REAL*4, dimension(:,:,:), allocatable,save :: tmin, tmax
      REAL*4, dimension(:,:), allocatable,save :: tpackmax, tpackmin
!DIR$ ATTRIBUTES ALIGN:64 :: tmin, tmax, tpackmax, tpackmin
      INTEGER*4,dimension(:,:),allocatable,save :: itpackmax,itpackmin
!DIR$ ATTRIBUTES ALIGN:64 :: itpackmax,itpackmin
! testing ray intersection with cube and setting ray lengths
      REAL*4,dimension(1:16) ::  t0 ! bounds
      REAL*4, dimension(1:16) :: x,y,z  ! cell coord
!DIR$ ATTRIBUTES ALIGN:64 :: t0, x, y, z
! flag to shift and unshift rendering across dateline
      INTEGER*4,dimension(:,:),allocatable,save :: shiftW
      INTEGER*4,dimension(:,:),allocatable,save :: hit
!DIR$ ATTRIBUTES ALIGN:64 :: shiftW, hit
! to remember packs that actually hit cube : save for OMP threads
      INTEGER*4,dimension(:),allocatable,save :: numhits
      INTEGER*4,save :: numhits_all
!DIR$ ATTRIBUTES ALIGN:64 :: numhits, numhits_all
! optimizations to reduce arithmetic : save for OMP threads
      INTEGER*4,save :: iNXN,iNYN,iNZN,iNXNp1,iNYNp1,iNZNp1
!DIR$ ATTRIBUTES ALIGN:64 :: iNXN,iNYN,iNZN,iNXNp1,iNYNp1,iNZNp1
      INTEGER*4,dimension(:,:),allocatable,save :: im_cc, im_rr ! mapping sampling results to target im() array
!DIR$ ATTRIBUTES ALIGN:64 :: im_cc,im_rr

#ifdef SREND_STEREO
      REAL*4, dimension(1:16) :: dstQ,dstQ2
      REAL*4, dimension(1:16) :: clip00, clip11
!DIR$ ATTRIBUTES ALIGN:64 :: dstQ,dstQ2, clip00, clip11
! fix clipping sphere for stereo
      REAL*4,dimension(1:3),save :: dst ! vector shift right
      REAL*4,save :: dst2               ! = dot(dst,dst)
      REAL*4,dimension(1:3) :: EC_in
      REAL*4 :: dEC_in
#endif
#ifdef SREND_TOFF
! fix offset from clip0 and clip00
      REAL*4,dimension(:,:,:),allocatable,save :: toff
!DIR$ ATTRIBUTES ALIGN:64 :: toff
#endif

! experiment
#ifdef SREND_POINT_RADIUS_CLIP
      REAL*4,dimension(1:16) :: pta,ptb,ptc,ptdiscr,pt0,pt1
#endif

! threads in setting tmin and tmax, testing each ray in volume
      INTEGER*4,dimension(:,:),allocatable,save :: th_numhits
      INTEGER*4,dimension(:,:,:),allocatable,save :: th_hit
      INTEGER*4 :: ith
      INTEGER*4,dimension(:,:),allocatable,save :: th_wh

! -------------------------------------------------------------------
      INTEGER*4  ic, ir ! loop vars to set rr,cc for 16-pack
! volume dimensions used to set ray lengths before casting
      REAL*4,save :: x0,x1,y0,y1,z0,z1

! -------- computing variables: sampling -----------------------------
      INTEGER*4 tt ! steps through ray pack
#ifdef SREND_TOFF
      REAL*4,dimension(1:16) :: t ! ray increment, with stereo adjust
!DIR$ ATTRIBUTES ALIGN:64 :: t
#else
      REAL*4 :: t ! ray increment
#endif

      INTEGER*4, dimension(1:16) :: ix, iy, iz
!DIR$ ATTRIBUTES ALIGN:64 :: ix, iy, iz

! hold values during casting
      INTEGER*4, dimension(1:16) :: cotabi
!DIR$ ATTRIBUTES ALIGN:64 :: cotabi
! RGBA values at a sample
      REAL*4,dimension(1:16) :: dR,dG,dB  ! tracer val
      REAL*4,dimension(1:16) :: dalpha  ! alpha val
!DIR$ ATTRIBUTES ALIGN:64 :: dR,dG,dB, dalpha

! solid rendering
#ifdef SREND_SOLID_RENDERING
      REAL*4,dimension(1:16) :: s00,s10,s01,s11
!DIR$ ATTRIBUTES ALIGN:64 :: S00,S10,S01,S11
      INTEGER*4 :: Sminmax_xyz(0:1,1:3) ! 0=min:1=max,1=x 2=y 3=z
#endif

! optimize
      REAL*4,dimension(1:16) :: q1Mtval
      REAL*4,dimension(1:16) :: transparency
!DIR$ ATTRIBUTES ALIGN:64 :: q1Mtval, transparency
! accumulated in i-pass from ray
      REAL*4,dimension(1:16) :: tval ! scaling RGB
      REAL*4,dimension(1:16) :: ddR  ! tracer value
      REAL*4,dimension(1:16) :: ddG  ! tracer value
      REAL*4,dimension(1:16) :: ddB  ! tracer value
      REAL*4,dimension(1:16) :: daa  ! alpha value
!DIR$ ATTRIBUTES ALIGN:64 :: tval,ddR,ddG,ddB,daa
! mask out for t values outside of ray hit
      REAL*4, dimension(1:16) :: tmask ! change from INTEGER*4
!DIR$ ATTRIBUTES ALIGN:64 :: tmask
! for trilinear inerpolation
      REAL*4,dimension(1:16) :: dx0,dy0,dz0,dx1,dy1,dz1
!DIR$ ATTRIBUTES ALIGN:64 :: dx0,dy0,dz0,dx1,dy1,dz1
! pre-align data
      REAL*4, dimension(1:16,1:8) :: dat8
!DIR$ ATTRIBUTES ALIGN:64 :: dat8

! ---------- thread work partitions ----------------------------------
! partition work among threads, simply so using 16 packs
      INTEGER*4 :: part, Wpart, Rpart ! to form partitions by rows
      INTEGER*4 :: th0, th1 ! bounds for each thread's compositing

! ----- MPI ----------------------------------------------------------
! these must be saved for state between calls, but thread safe
! as calls are made by different views and only by master thread
#ifndef SREND_NOMPI
      integer,save :: MYid
      integer,save :: MYer
      integer :: srendMPItag, srendMPIrank ! for MPI_INTEGER portability ! notused: srendMPIcount
#endif

      
#ifdef SREND_RENDER_TIMING
! for testing .. collect statistics
      integer*8,save :: it1, it2, itc
      REAL*4,save :: rc
#endif

! for tiling AMR
      INTEGER*4 :: rendi ! selector for block(nV)%cube(rendi)%im in sampling loop, set before entry
#ifndef SREND_NOMPI
      INTEGER*4 :: tileW, tileH ! width and height of each tile in pixels
      INTEGER*4 :: jj, kk,iii,itile ! counters
#endif
      
! for legend_SVG
#ifdef SREND_SVG_LEGEND
      logical,save,dimension(1:SREND_MAX_V) :: create_legend = .TRUE.
      character*256 :: flegend
      character*4 :: legend_i, legend_y
      character*8 :: legend_r
      character*2 :: legend_argb(0:3)
      character*256 :: f_str, f_dir
      integer*8 :: legend_time_i
      character*30 :: legend_time_str
#endif

! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
!  declarations done 
! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc

! the first thread of this view does this, the wait after OMP SINGLE (default) is needed
!$OMP SINGLE

#ifdef SREND_FLOP
   flop = 0 ! initialize for rendering
#endif

      if(.NOT. block(nV)%used ) then ! will always use this many
#ifndef SREND_NOMPI
         allocate( block(nV)%MYreq(0:SREND_MAX_LOAD)); block(nV)%MYreq = MPI_REQUEST_NULL
         allocate( block(nV)%MYreq2(0:SREND_MAX_LOAD)); block(nV)%MYreq2 = MPI_REQUEST_NULL
         allocate( block(nV)%isendDONE(0:SREND_MAX_LOAD));block(nV)%isendDONE = .TRUE.
         allocate( block(nV)%isendDONE2(0:SREND_MAX_LOAD)); block(nV)%isendDONE2 = .TRUE.
         block(nV)%nreq = 0
         block(nV)%nreq2 = 0
#endif
         block(nV)%used  = .TRUE.
      end if

! set behavior
#ifndef SREND_NOMPI
      block(nV)%mpi   = .TRUE.
#else
      block(nV)%mpi   = .FALSE.
#endif


#ifdef SREND_RENDER_TIMING
      call system_clock(it1,itc); rc = 1.0_4 / itc
#endif


#ifndef SREND_NOMPI
      call srend_comm_init() ! sets srend_COMM communicator, only does it once on 1st call
      call MPI_comm_rank(srend_COMM, MYid, MYer)
#endif


#ifdef SREND_DEBUG
      print *,'RRRRRRRRRRRRRRR enter: srend_render() RRRRRRRRRRRRRRRRRRR'
#ifndef SREND_NOMPI
      print *,'MPI MYid=',MYid
#endif
      print *,'nV=',nV, 'nV_out='
      print *,'E_in=',E_in
      print *,'Ev_in=',Ev_in
      print *,'Up_in=',Up_in
      print *,'Alpha_in=',Alpha_in, 'Beta_in=',Beta_in, 'EyeRight_in=',EyeRight_in
      print *,'plane clip in:x0=',clipx0_in, 'x1=',clipx1_in
      print *,'plane clip in:y0=',clipy0_in, 'y1=',clipy1_in
      print *,'plane clip in:z0=',clipz0_in, 'z1=',clipz1_in
      print *,'spherical clip in:clip0=',clip0_in, 'clip1=',clip1_in
      print *,'nsh_in=',nsh_in
      print *,'polar clip: pclip1_in=', pclip1_in
      print *,'perspective_in=',perspective_in
      print *,'srendtype_in=',srendtype_in
      print *,'dt_in=',dt_in
      print *,'data XYZ dim in: XN_in=',XN_in,'YN_in=',YN_in,'ZN_in=',ZN_in,'Bd=',Bd
      print *,'XYZ offsets in: iNX_in=',iNX_in,'iNY_in=',iNY_in,'iNZ_in=',iNZ_in
      print *,'Vdim_in=',Vdim_in
      print *,'W_in=',W_in,'H_in=',H_in,'nR_in=',nR_in
      print *,'offset_cotab=',offset_cotab
      print *,'filenames_in=',filenames_in
      print *,'tiles_right_in=',tiles_right_in,'tiles_down_in=',tiles_down_in
      print *,'TARGid=',TARGid
      print *,'RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR'
#endif
      

! --------------------------------------------------------------------
! first pass; some things set when an MPI process starts
! these are all saved, common for all threads if OMP called
      if( .NOT. allocated(VC) ) then
        allocate( VC(1:SREND_MAX_V) )     
        Vpass = 0 ! set all to 0
        PI = 4.0_4 * atan(1.0_4)
      end if

!$OMP END SINGLE


! -------- test for setup message sent previous pass
! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
#ifndef SREND_NOMPI
!$OMP MASTER

! wait for previous data isend tag
! if already completed using MPI_test at beginning of sub, then 
! outMPIreq = MPI_REQUEST_NULL, and MPI_wait returns immediately

#ifdef nothing
      do while( ANY( block(nV)%isendDONE2(0:block(nV)%nreq2) .eqv. .FALSE. ) )
         do i=0,block(nV)%nreq2
         if( .NOT. block(nV)%isendDONE2(i) ) then
            call MPI_TEST(block(nV)%MYreq2,block(nV)%isendDONE2(i),MPI_STATUS_IGNORE,MYer)
            if(.NOT. block(nV)%isendDONE2(i) ) SREND_YIELDING
         end if
         end do
      end do
#endif       
      if( block(nV)%n == 0 .AND. block(nV)%load ) then
         do while( .NOT. block(nV)%isendDONE2(block(nV)%nreq2) )
            call MPI_TESTALL(1+block(nV)%nreq2,                     &
                              block(nV)%MYreq2(1:block(nV)%nreq2),   &
                              block(nV)%isendDONE2(block(nV)%nreq2), &
                              MPI_STATUSES_IGNORE,MYer)
            if(.NOT. block(nV)%isendDONE2(block(nV)%nreq2) ) SREND_YIELDING
         end do
         block(nV)%isendDONE2(0:block(nV)%nreq2) = .TRUE.
         block(nV)%nreq2 = 0
      else ! regular render using (0)
         do while( .NOT. block(nV)%isendDONE2(0) )
            call MPI_TEST(   block(nV)%MYreq2(0),   &
                              block(nV)%isendDONE2(0), &
                              MPI_STATUS_IGNORE,MYer)
            if(.NOT. block(nV)%isendDONE2(0) ) SREND_YIELDING
         end do
         block(nV)%nreq2 = 0
      end if

!$OMP END MASTER
#endif
! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$


! ----------- test of data message sent previous pass
! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
#ifndef SREND_NOMPI
!$OMP MASTER
      if( block(nV)%n == 0 .AND. block(nV)%load ) then
         do while( .NOT. block(nV)%isendDONE(block(nV)%nreq) )
            call MPI_TESTALL(1+block(nV)%nreq,                     &
                              block(nV)%MYreq(1:block(nV)%nreq),   &
                              block(nV)%isendDONE(block(nV)%nreq), &
                              MPI_STATUSES_IGNORE,MYer)
            if(.NOT. block(nV)%isendDONE(block(nV)%nreq) ) SREND_YIELDING
         end do
         block(nV)%isendDONE(0:block(nV)%nreq) = .TRUE.
         block(nV)%nreq = 0
      else ! regular render using (0)
         do while( .NOT. block(nV)%isendDONE(0) )
            call MPI_TEST(   block(nV)%MYreq(0),   &
                              block(nV)%isendDONE(0), &
                              MPI_STATUS_IGNORE,MYer)
            if(.NOT. block(nV)%isendDONE(0) ) SREND_YIELDING
         end do
         block(nV)%nreq = 0
      end if

!$OMP END MASTER
#endif
! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$      
   

#ifdef SREND_RENDER_TIMING
!$OMP MASTER
      call system_clock(it2); print *,'RENDER: After setup,before MPI isend test =:',(it2-it1)*rc; it1 = it2
!$OMP END MASTER
#endif



!$OMP SINGLE
! ------------ logic for render, render_flush, render_flush_write ----
      if(.NOT. allocated(block(nV)%mpi_header) ) allocate( block(nV)%mpi_header(1:400,0:SREND_MAX_LOAD) )!once
      if(.NOT. allocated(block(nV)%cube) ) allocate( block(nV)%cube(0:SREND_MAX_LOAD) )!each time
      
! an ordinary render if !flush && !load
      if( block(nV)%load ) then! srend_render_load:
         if(block(nV)%n .eq. 0) then ! setup if 1st time
            if(Vpass(nV) < block(nV)%load_pass) then
               Vpass(nV) = block(nV)%load_pass ! increment for start of loading
            end if
            block(nV)%Tload = 0 ! set all entries to zero
            if(.NOT. allocated(block(nV)%wh) ) allocate( block(nV)%wh(1:20,0:SREND_MAX_LOAD) )!once
         end if
         block(nV)%n = block(nV)%n + 1! the block index starts at 1 for _render_load
      else
        Vpass(nV) = Vpass(nV) + 1 ! 1 for each view on its 1st pass
      end if

      mhe_nR = nR_in ! set number variables
      if(tiles_right_in*tiles_down_in == 1) mhe_TARGid = TARGid(1,1) ! for load & flush in block()%mpi_header
      mhe_nV_out = nV_out
      mhe_nsh = nsh_in ! set shells
      mhe_Vdim = Vdim_in ! set Vdim
      
! this switches the back2front to front2back ordering, but actually
! done back to front in both cases.  For "front2back" we are viewing
! from infinity toward the eye.
      if(dt_in .lt. 0.0) then
         dt = -dt_in
         mhe_outin = .TRUE.
      else
         dt = dt_in
         mhe_outin = .FALSE.
      end if
      
!$OMP END SINGLE


      
#ifdef SREND_RENDER_TIMING
!$OMP MASTER
      call system_clock(it2); print *,'Render: before color table setup =:',(it2-it1)*rc; it1 = it2
!$OMP END MASTER
#endif 
      
      

!$OMP SINGLE
! --------------------------------------------------------------------
! only first thread does color table setup on first pass then
! stores in VC(nV)%cotab, then only first thread moves this stored
! color table back into cotab for following threads on later
! passes 
!
! It is known that an improper color table WILL break this code, but it
! will break on first pass in that case.  Indices must start at 0 and 
! end at 255 to build all 256 indices.  The indices cannot repeat, and 
! they must increase.

! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
! color table setup 
! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
!      if(Vpass(nV) .eq. 1) then ! do this just once
 
#ifdef SREND_COTAB_EACH_PASS
      if( .NOT. allocated( VC(nV)%cotab) ) allocate( Vc(nV)%cotab(0:3,0:255,1:mhe_nR) )
      if(.TRUE.) then ! do colortable every pass
#else
      if( .NOT. allocated( VC(nV)%cotab) ) then ! just once
         allocate( Vc(nV)%cotab(0:3,0:255,1:mhe_nR) )
#endif

         if(.NOT. allocated(cotab) ) then
            allocate( cotab(0:3,0:255,1:mhe_nR) )
         else if( size(cotab) .ne. size(Vc(nV)%cotab) ) then
            deallocate( cotab )
            allocate( cotab(0:3,0:255,1:mhe_nR) )
         end if

#ifdef SREND_COTAB_HARDCODED
! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
! hard-coded color table for renderer; read in once
! offset_cotab : where to start reading tables
! the table reading/forming ends with a '255 ...' entry
! -------- write alpha knots here -------     
         alpha_knot(1) = '0 0.0'
         alpha_knot(2) = '5 0.0'
         alpha_knot(3) = '36 0.0627'
         alpha_knot(4) = '72 0.148'
         alpha_knot(5) = '105 0.536'
         alpha_knot(6) = '133 1.0'
         alpha_knot(7) = '158 1.0'
         alpha_knot(8) = '196 0.328'
         alpha_knot(9) = '236 0.302'
         alpha_knot(10) = '244 0.105'
         alpha_knot(11) = '250 0.0'
         alpha_knot(12) = '255 0.0'
! skull MRI
         alpha_knot(13) = '0 0.0'
         alpha_knot(14) = '128 1.0'
         alpha_knot(15) = '255 1.0'
! linear
         alpha_knot(16) = '0 0.0'
         alpha_knot(17) = '255 1.0'
! zero clip
         alpha_knot(18) = '0 0.0'
         alpha_knot(19) = '1 0.01'
         alpha_knot(20) = '50 0.1'
         alpha_knot(21) = '100 0.3'
         alpha_knot(22) = '150 0.8'
         alpha_knot(23) = '255 1.0'
! linear blue-red
         alpha_knot(24) = '0 1.0'
         alpha_knot(25) = '255 1.0'
! lower alpha
         alpha_knot(27) = '0 0.0'
         alpha_knot(28) = '5 0.0'
         alpha_knot(29) = '36 0.03135'! 0.0627'
         alpha_knot(30) = '72 0.074'! 0.148'
         alpha_knot(31) = '105 0.268'! 0.536'
         alpha_knot(32) = '133 0.5'! 1.0'
         alpha_knot(33) = '158 0.5'! 1.0'
         alpha_knot(34) = '196 0.164'! 0.328'
         alpha_knot(35) = '236 0.151'! 0.302'
         alpha_knot(36) = '244 0.05025'! 0.105'
         alpha_knot(37) = '250 0.0'
         alpha_knot(38) = '255 0.0'
! fire
         alpha_knot(40) = '0 0.0'
         alpha_knot(41) = '16 0.05'
         alpha_knot(42) = '220 0.15'
         alpha_knot(43) = '255 1.0'
         
         alpha_knot(44) = '0 0.0'
         alpha_knot(45) = '16 0.03'
         alpha_knot(46) = '250 0.06'
         alpha_knot(47) = '255 0.2'
! disvovery ! ^ spike at middle to pick out something
         alpha_knot(48) = '0 0.0'
         alpha_knot(49) = '1 0.0'
         alpha_knot(50) = '4 0.0'
         alpha_knot(51) = '138 0.0002'
         alpha_knot(52) = '140 0.08'
         alpha_knot(53) = '142 0.002'
         alpha_knot(54) = '255 0.0'
! highlighting AMR
         alpha_knot(55) = '0 0.0 '
         alpha_knot(56) = '40 0.003'
         alpha_knot(57) = '41 0.003'
         alpha_knot(58) = '79 0.003'
         alpha_knot(59) = '80 0.003'
         alpha_knot(60) = '81 0.003'
         alpha_knot(61) = '119 0.003'
         alpha_knot(62) = '120 0.006'
         alpha_knot(63) = '121 0.006'
         alpha_knot(64) = '159 0.006'
         alpha_knot(65) = '160 0.02'
         alpha_knot(66) = '161 0.02'
         alpha_knot(67) = '199 0.009'
         alpha_knot(68) = '200 0.05'
         alpha_knot(69) = '201 0.05'
         alpha_knot(70) = '239 0.02'
         alpha_knot(71) = '240 0.1'
         alpha_knot(72) = '241 0.1'
         alpha_knot(73) = '249 0.01'
         alpha_knot(74) = '250 0.8'
         alpha_knot(75) = '251 1.0'
         alpha_knot(76) = '255 0.0'
! red up, blue down
         alpha_knot(80) = '0 1.0'
         alpha_knot(81) = '1 1.0'
         alpha_knot(82) = '32 0.04'
         alpha_knot(83) = '64 0.01'
         alpha_knot(84) = '128 0.0'
         alpha_knot(85) = '192 0.01'
         alpha_knot(86) = '224 0.04'
         alpha_knot(87) = '255 1.0'
! orf4 manual sample
         alpha_knot(90) = '0    .427'
         alpha_knot(91) = '53   .0034'
         alpha_knot(92) = '123 0.0'
         alpha_knot(93) = '172  .0026'
         alpha_knot(94) = '255  .444'
! xray
         alpha_knot(100) = '0 0.0'
         alpha_knot(101) = '50 0.0'
         alpha_knot(102) = '100 0.0001'
         alpha_knot(103) = '150 0.001'
         alpha_knot(104) = '200 0.01'
         alpha_knot(105) = '255 0.1'
      
! ---------- write rgb knots here
         rgb_knot(1) = '0 0.0 0.0 0.0'
         rgb_knot(2) = '5 0.0 0.0 0.0'
         rgb_knot(3) = '36 0.0 0.0 0.2509804'
         rgb_knot(4) = '71 0.0 0.1176471 0.3137255'
         rgb_knot(5) = '105 0.0 0.7843137 1.0'
         rgb_knot(6) = '131 1.0 1.0 1.0'
         rgb_knot(7) = '158 1.0 1.0 0.0'
         rgb_knot(8) = '196 1.0 0.0 0.0'
         rgb_knot(9) = '244 0.5019608 0.0 0.0'
         rgb_knot(10) = '255 0.5019608 0.0 0.0'
! skull MRI
         rgb_knot(13) = '0 0.0 0.0 0.0'
         rgb_knot(14) = '128 1.0 1.0 1.0'
         rgb_knot(15) = '255 1.0 1.0 1.0'
! linear
         rgb_knot(16) = '0 0.0 0.0 0.0'
         rgb_knot(17) = '255 1.0 1.0 1.0'
! zero clip
         rgb_knot(18) = '0 0.0 0.0 0.0'
         rgb_knot(19) = '1 0.0 0.0 0.0'
         rgb_knot(20) = '50 0.0 0.0 0.15'
         rgb_knot(21) = '100 0.0 0.1 0.4'
         rgb_knot(22) = '150 0.9 0.1 0.0'
         rgb_knot(23) = '255 1.0 0.0 0.0'
!linear blue-red
         rgb_knot(24) = '0 0.0 0.0 1.0'
         rgb_knot(25) = '255 1.0 0.0 0.0'
! lower alpha
         rgb_knot(27) = '0 0.0 0.0 0.0'
         rgb_knot(28) = '5 0.0 0.0 0.0'
         rgb_knot(29) = '36 0.0 0.0 0.2509804'
         rgb_knot(30) = '71 0.0 0.1176471 0.3137255'
         rgb_knot(31) = '105 0.0 0.7843137 1.0'
         rgb_knot(32) = '131 1.0 1.0 1.0'
         rgb_knot(33) = '158 1.0 1.0 0.0'
         rgb_knot(34) = '196 1.0 0.0 0.0'
         rgb_knot(35) = '244 0.5019608 0.0 0.0'
         rgb_knot(36) = '255 0.5019608 0.0 0.0'
! fire
         rgb_knot(40) = '0 1.0 0.0 0.0'
         rgb_knot(41) = '128 1.0 0.5 0.0'
         rgb_knot(42) = '250 1.0 1.0 0.0'
         rgb_knot(43) = '255 1.0 1.0 0.2'
      
         rgb_knot(44) = '0 1.0 0.0 0.0'
         rgb_knot(45) = '16 1.0 1.0 0.0'
         rgb_knot(46) = '250 1.0 1.0 0.2'
         rgb_knot(47) = '255 1.0 1.0 0.9'
! discovery : solid colors 0-255 to pick out something
         rgb_knot(48) = '0 0.0 0.0 0.0'
         rgb_knot(49) = '1 0.5 0.0 0.0'
         rgb_knot(50) = '4 0.5 0.0 0.0'
         rgb_knot(51) = '138 1.0 0.0 0.0'
         rgb_knot(52) = '140 0.0 1.0 0.0'
         rgb_knot(53) = '142 0.0 0.0 1.0'
         rgb_knot(54) = '255 0.0 0.0 0.0'
! highlighting AMR levels
! highlighting AMR
         rgb_knot(55) = '0 0.0 0.0 0.0'
         rgb_knot(56) = '40 1.0 0.2 0.2'
         rgb_knot(57) = '41 0.9 0.0 0.0'
         rgb_knot(58) = '79 0.0 0.0 1.0'
         rgb_knot(59) = '80 0.2 0.2 1.0'
         rgb_knot(60) = '81 0.0 0.0 0.9'
         rgb_knot(61) = '119 0.0 1.0 0.0'
         rgb_knot(62) = '120 0.2 1.0 0.2'
         rgb_knot(63) = '121 0.0 0.9 0.0'
         rgb_knot(64) = '159 1.0 0.0 1.0'
         rgb_knot(65) = '160 1.0 0.2 1.0'
         rgb_knot(66) = '161 0.9 0.0 0.9'
         rgb_knot(67) = '199 1.0 1.0 0.0'
         rgb_knot(68) = '200 1.0 1.0 0.2'
         rgb_knot(69) = '201 0.9 0.9 0.0'
         rgb_knot(70) = '239 1.0 1.0 1.0'
         rgb_knot(71) = '240 0.2 1.0 1.0'
         rgb_knot(72) = '241 0.0 0.9 0.9'
         rgb_knot(73) = '249 0.01 0.8 0.8'
         rgb_knot(74) = '250 1.0 1.0 1.0'
         rgb_knot(75) = '251 1.0 0.5 0.1'
         rgb_knot(76) = '255 0.0 0.0 0.0'
! blue down, red up from 0
         rgb_knot(80) = '0 0.5 0.5 1.0'
         rgb_knot(81) = '1 0.5 0.5 1.0'
         rgb_knot(82) = '64 0.2 0.2 1.0'
         rgb_knot(83) = '128 0.0 0.0 0.0'
         rgb_knot(84) = '192 1.0 0.2 0.2'
         rgb_knot(85) = '255 1.0 0.5 0.5'
! orf4 manual sample
         rgb_knot(90) = '0    .925 0.0 0.0'
         rgb_knot(91) = '123  .965 1.0  .969'
         rgb_knot(92) = '255 1.0   0.0 1.0'
!xray
         rgb_knot(100) = '0 1.0 1.0 1.0'
         rgb_knot(101) = '255 1.0 1.0 1.0'
      
#endif


#if defined(SREND_COTAB_PASSNAMELIST) || defined(SREND_COTAB_HARDCODED)
! -------- create color tables -------------------
         do ii=1,mhe_nR
! read alpha knots and form table
            j = offset_cotab(ii)
            read (alpha_knot(1),*) cotab_pix, cotab_a ! this must be 0
            cotab(0,cotab_pix,ii) = cotab_a
11          j = j+1
            read (alpha_knot(j),*) cotab_pix2, cotab_a2
            do i=cotab_pix+1,cotab_pix2
               cotab(0,i,ii) =                                            &
               1.0_4*(cotab_pix2-i)/(cotab_pix2-cotab_pix)*cotab_a        &
                     + 1.0_4*(i-cotab_pix)/(cotab_pix2-cotab_pix)*cotab_a2
            end do
            cotab_pix = cotab_pix2
            cotab_a = cotab_a2
            if(cotab_pix .ne. 255) goto 11 ! must have last line 255 # # #
      
! check for Valpha line: "256  512" meaning Valpha = 512, the basis for alpha entries
            Valpha = mhe_Vdim ! default
#ifdef SREND_AMR
            if( len(alpha_knot) >= j+1) then
               if( index(alpha_knot(j+1),'256 ') == 1) then
                  read(alpha_knot(j+1),*) cotab_pix, Valpha ! cotab_pix not used, but Valpha is
               end if
            end if
#ifdef SREND_VALPHA
            Valpha = SREND_VALPHA ! overide/provide Valpha
#endif
#endif
            VC(nV)%Valpha = Valpha ! save

! read RGB knots and form table
            j = offset_cotab(ii)
            read (rgb_knot(j),*) cotab_pix, cotab_r, cotab_g, cotab_b ! 0
            cotab(1,cotab_pix,ii) = cotab_r
            cotab(2,cotab_pix,ii) = cotab_g
            cotab(3,cotab_pix,ii) = cotab_b
13          j = j + 1
            read (rgb_knot(j),*) cotab_pix2, cotab_r2, cotab_g2, cotab_b2
            do i=cotab_pix+1,cotab_pix2
               cotab(1,i,ii) =                                             &
               1.0_4*(cotab_pix2-i)/(cotab_pix2-cotab_pix)*cotab_r         &
                     + 1.0_4*(i-cotab_pix)/(cotab_pix2-cotab_pix)*cotab_r2
               cotab(2,i,ii) =                                             &
               1.0_4*(cotab_pix2-i)/(cotab_pix2-cotab_pix)*cotab_g         &
                     + 1.0_4*(i-cotab_pix)/(cotab_pix2-cotab_pix)*cotab_g2
               cotab(3,i,ii) =                                             &
               1.0_4*(cotab_pix2-i)/(cotab_pix2-cotab_pix)*cotab_b         &
                     + 1.0_4*(i-cotab_pix)/(cotab_pix2-cotab_pix)*cotab_b2
            end do
            cotab_pix = cotab_pix2
            cotab_r = cotab_r2
            cotab_g = cotab_g2
            cotab_b = cotab_b2
            if(cotab_pix .ne. 255) goto 13 ! must have last line 255 # # #

         end do ! ii  1:mhe_nR
! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
#endif


#ifdef SREND_COTAB_FILENAME
! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
! read color table from: cotab.txt : example below for 
! mhe_nR=1, cotab_offset = (/8/) as passed to srend_render():
! ------------------
!  alpha 8   
!  0 0.0
!  128 1.0 1.0
!  255 1.0 1.0
!
!  rgb 8
!  0 0.0 0.0 0.0
!  128 1.0 1.0 1.0
!  255 1.0 1.0 1.0
! ------------------
         open (1000,file=SREND_COTAB_FILENAME,status='old', action='read')
      
         do ii=1,mhe_nR
! read alpha knots and form table
            rewind 1000 ! here for next mhe_nR if more than 1
1001        read(1000,'(A)') cotab_line
            if( cotab_line(1:5) .eq. 'alpha') then
               read(cotab_line(6:300),*) cotab_pos
               if(cotab_pos .ne. offset_cotab(ii)) goto 1001
            else
               goto 1001
            end if

            read (1000,*) cotab_pix, cotab_a ! cotab_pix must be 0
            cotab(0,cotab_pix,ii) = cotab_a
11          read (1000,*) cotab_pix2, cotab_a2
               do i=cotab_pix+1,cotab_pix2
            cotab(0,i,ii) =                                        &
               1.0_4*(cotab_pix2-i)/(cotab_pix2-cotab_pix)*cotab_a  &
               + 1.0_4*(i-cotab_pix)/(cotab_pix2-cotab_pix)*cotab_a2
            end do
            cotab_pix = cotab_pix2
            cotab_a = cotab_a2
            if(cotab_pix .ne. 255) goto 11 ! must have last line 255 # # #
      
! check for Valpha line: "256  512" meaning Valpha = 512, the basis for alpha entries
            Valpha = mhe_Vdim ! default
#ifdef SREND_AMR
            read(1000,*,IOSTAT=cotab_ferror,END=1020) cotab_pix, Valpha
1020        continue
            if( cotab_ferror /= 0 .OR. cotab_pix /= 256) Valpha = mhe_vdim
#ifdef SREND_VALPHA
            Valpha = SREND_VALPHA ! overide/provide Valpha
#endif
#endif

            VC(nV)%Valpha = Valpha

! read RGB knots and form table
            rewind 1000
1002        read(1000,'(A)') cotab_line
            if( cotab_line(1:3) .eq. 'rgb') then
               read(cotab_line(4:300),*) cotab_pos
               if(cotab_pos .ne. offset_cotab(ii)) goto 1002
            else
               goto 1002
            end if

            read (1000,*) cotab_pix, cotab_r, cotab_g, cotab_b ! 0
            cotab(1,cotab_pix,ii) = cotab_r
            cotab(2,cotab_pix,ii) = cotab_g
            cotab(3,cotab_pix,ii) = cotab_b
13          read (1000,*) cotab_pix2, cotab_r2, cotab_g2, cotab_b2
            do i=cotab_pix+1,cotab_pix2
               cotab(1,i,ii) =                                             &
               1.0_4*(cotab_pix2-i)/(cotab_pix2-cotab_pix)*cotab_r         &
                     + 1.0_4*(i-cotab_pix)/(cotab_pix2-cotab_pix)*cotab_r2
               cotab(2,i,ii) =                                             &
               1.0_4*(cotab_pix2-i)/(cotab_pix2-cotab_pix)*cotab_g         &
                     + 1.0_4*(i-cotab_pix)/(cotab_pix2-cotab_pix)*cotab_g2
               cotab(3,i,ii) =                                             &
               1.0_4*(cotab_pix2-i)/(cotab_pix2-cotab_pix)*cotab_b         &
                     + 1.0_4*(i-cotab_pix)/(cotab_pix2-cotab_pix)*cotab_b2
            end do
            cotab_pix = cotab_pix2
            cotab_r = cotab_r2
            cotab_g = cotab_g2
            cotab_b = cotab_b2
            if(cotab_pix .ne. 255) goto 13 ! must have last line 255 # # #
         end do ! ii  1:mhe_nR

         close(1000)
! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
#endif

         Vc(nV)%cotab = cotab ! keep in cache
      
! - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - --
         else ! not Vpass = 1, load in color table for view
            if( size(cotab) .ne. size(Vc(nV)%cotab) ) then
               deallocate( cotab )
               allocate( cotab(0:3,0:255,1:mhe_nR) )
            end if
            cotab = VC(nV)%cotab ! copy from cache
            Valpha = VC(nV)%Valpha
      end if ! Vpass = 1 ------ end setting up cotab from keys
! ------------- done forming color table -----------------------------




! -------------------- SVG_LEGEND ------------------------------------
! create a legend for the data, preliminary
#ifdef SREND_SVG_LEGEND


#ifndef SREND_NOMPI
      if(MYid == 0) then
#endif
      if( create_legend(nV) ) then ! do it only on first pass for each view
#ifndef SREND_COTAB_EACH_PASS
      create_legend(nV) = .FALSE. ! turn off, will turn on for next 1st pass
#endif     
     
      f_str = filenames_in ! copy original over
      
      do ii=1,mhe_nR
! get the fielnames directory to write legend: f_dir
         j = index(f_str,',')
         f_dir = f_str(1:j-1) ! pick out filename string for this var
         f_str = f_str(j+1:) ! copy rest over for next ii in 1:nR
         j = index(f_dir,'/',BACK=.TRUE.)
         if(j > 0) then ! make the directory as needed, use flegend as string for this         
           f_dir = f_dir(1:j)
           flegend = 'mkdir -p ' // trim(f_dir)
           call system( flegend )
         end if
         flegend = '' ! clear, just in case
      
! just write legend.svg if defined
#ifdef SREND_SVG_EACH_PASS_LEGEND
         flegend = 'legend.svg'
#else
         write(flegend,'("legend_V",I2.2,"_R",I2.2,".svg")') nV, ii
#endif
         flegend = trim(f_dir) // trim(flegend)
         
         open(unit=123,status='unknown', file=flegend,access='stream')
         write(123) '<svg width="200" height="640">',char(10)
         
         legend_time_i = time8()
         call ctime(legend_time_i,legend_time_str)
         flegend = '<!--Rendering Date: ' // trim(legend_time_str) // ' -->'
         write(123) trim(flegend), char(10)
         
         write(flegend,'("<!--(0 means spherical) Perspective=",I1.1," alpha=",F6.2," beta=",F6.2," -->")') &
               perspective_in, Alpha_in, Beta_in
         write(123) trim(flegend), char(10)
         
         write(123) '<!--The RGB value in the following line is applied to all text.-->', char(10)
         write(123) '   <defs><style type="text/css">text{fill:#FEDCBA;}</style></defs>', char(10) ! text color
! annotate bars
         write(123) '<!--These annotate the RGB and RGB-alpha color bars-->',char(10)
         write(123) '   <text x="6" y="570" transform="rotate(90 6,570)">RGB</text>', char(10)
         write(123) '   <text x="27" y="570" transform="rotate(90 27,570)">Opacity</text>', char(10)
         
! print titles on top, placeholders
         write(123) '<!--This is title 1--><text x="0" y="15">_title1_</text>', char(10)
         write(123) '<!--This is title 2--><text x="0" y="30">_title2_</text>', char(10)
         write(123) '<!--This is title 3--><text x="0" y="45">_title3_</text>', char(10)

         do i=0,255 ! write keys in some subset of 0-256
           write (legend_y,'(I0)') 54 + 2*i ! y position to write text
           flegend='' ! clear
           write(legend_i,'(I0)') i
!           if(mod(i,32) == 0 .OR. i == 255) then ! only write every 32 numbers
!             flegend = '<text x="45" y="' // trim(legend_y) // '">' // trim(legend_i) // '</text>'
!           else
             flegend = '<!--_' // trim(legend_i) // '_text x="45" y="' // trim(legend_y) // '">_' // &
                       trim(legend_i) // '_</text_' // trim(legend_i) // '_-->'
!           end if
           write(123) trim(flegend),char(10)
         end do
         
         do i=0,255 ! write color bar: RGB and RGB+alpha
           write (legend_y,'(I0)') 50 + 2*i ! y position to write bar
           flegend='' ! clear
           write(legend_argb(1),'(Z2.2)') floor( 255.0_4 * cotab(1,i,ii) )
           write(legend_argb(2),'(Z2.2)') floor( 255.0_4 * cotab(2,i,ii) )
           write(legend_argb(3),'(Z2.2)') floor( 255.0_4 * cotab(3,i,ii) )
           flegend = '   <rect x="0" y="' // trim(legend_y) // '" width="20" height="2" fill="#' // &
                     legend_argb(1) // legend_argb(2) // legend_argb(3) // '"/>'
           write(123) trim(flegend),char(10) ! opaque
           flegend='' ! clear
           write (legend_r,'(F7.5)') cotab(0,i,ii)
           flegend = '   <rect x="20" y="' // trim(legend_y) // '" width="20" height="2" fill="#' // &
                     legend_argb(1) // legend_argb(2) // legend_argb(3) // '" fill-opacity="' //  &
                     trim(legend_r) // '"/>'
           write(123) trim(flegend),char(10) ! with opacity
         end do
         
! finish off         
         write(123) '</svg>',char(10)
         close(123)        
      end do ! nR
           
      end if ! create_legend(nV) == T
#ifndef SREND_NOMPI
      end if ! MYid == 0
#endif


#endif
! ------------------------------- end SVG_LEGEND ---------------------


! ccccccccccccc end color table setup and setting cccccccccccccccccccc
! all wait on OMP END SINGLE
!$OMP END SINGLE
      
      
      
#ifdef SREND_RENDER_TIMING
!$OMP MASTER
      call system_clock(it2); print *,'Render: after color table setup =:',(it2-it1)*rc; it1 = it2
!$OMP END MASTER
#endif
      


! the first thread does this, all wait at end
!$OMP SINGLE
! SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
! initial setup
! SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
! --------- copy arguments into mpi_header for passing----------------

      do i=1,200
        mhe_filenames(i) = filenames_in(i:i)
      end do

      mhe_XN = XN_in; mhe_YN=YN_in; mhe_ZN = ZN_in

      mhe_perspective = perspective_in
      mhe_srendtype = srendtype_in
      
! ordinary rendering, default  
      mhe_W = W_in
      mhe_H = H_in

      mhe_Wim = W_in ! output target
      mhe_Him = H_in
      
#ifdef SREND_DESKTOP
      mhe_Wim = mhe_W
      mhe_Him = mhe_H
#endif
      
      mhe_Alpha = Alpha_in
      mhe_Beta = Beta_in
      
! for spherical rendering, we always have a W:H = 2:1 ratio; W better be divisible by 16, and H by 8
! Npole and Spole are the top and bottom quarters; equatorial is the middle half--of resulting image
      if(mhe_srendtype .eq. 1 .OR. mhe_srendtype .eq. 3) then ! polar, Npole or Spole, 
! assume W:H = 2:1 ratio, say 800x400, then W=200, H=200, Wim=800, Him=100
         mhe_W = W_in / 4 ! renderig itself
         mhe_H = W_in / 4 
  
         mhe_Wim = W_in ! map to this image dimensions
         mhe_Him = H_in / 4
         
         mhe_Alpha = 95.0_4
         mhe_Beta = 95.0_4         
      else if(mhe_srendtype .eq. 2) then ! equatorial
! assume W:H = 2:1 ratio, say 800x400, then W=800, H=200, Wim=800, Him=200
         mhe_W = W_in ! rendering target
         mhe_H = H_in / 2 

         mhe_Wim = W_in ! map to this target
         mhe_Him = H_in / 2
         
         mhe_Alpha = 360.0_4
         mhe_Beta = 90.0_4 
      end if
      
      
      mhe_tiles_right = tiles_right_in
      mhe_tiles_down = tiles_down_in
      mhe_TDi = 1 ! default
      mhe_TRi = 1 ! default

      mhe_iNX = iNX_in
      mhe_iNY = iNY_in
      mhe_iNZ = iNZ_in

! --------------- prep for stereo shift ------------------------------ 
      E_in2 = E_in !useless now, removed voff

      if(EyeRight_in .ne. 0.0_4) then
! > make orthogonal basis
        R0(1) = Ev_in(2)*Up_in(3) - Ev_in(3)*Up_in(2) ! R = Ev x Up
        R0(2) = Ev_in(3)*Up_in(1) - Ev_in(1)*Up_in(3)
        R0(3) = Ev_in(1)*Up_in(2) - Ev_in(2)*Up_in(1)
        U0(1) = R0(2)*Ev_in(3) - R0(3)*Ev_in(2) ! U = R x Ev
        U0(2) = R0(3)*Ev_in(1) - R0(1)*Ev_in(3)
        U0(3) = R0(1)*Ev_in(2) - R0(2)*Ev_in(1)
        V0 = Ev_in ! copy of Ev
! unitize vectors ! none should be zero
        V0 = V0 / sqrt( V0(1)**2 + V0(2)**2 + V0(3)**2 ) ! unitize
        R0 = R0 / sqrt( R0(1)**2 + R0(2)**2 + R0(3)**2 )
        U0 = U0 / sqrt( U0(1)**2 + U0(2)**2 + U0(3)**2 )
! shift eye
        E0 = E_in2 + EyeRight_in * R0

      else ! not stereo
        E0 = E_in2
        V0 = Ev_in
        U0 = Up_in
      end if
      
      E_AMR_PASS = E0
      mhe_E = E0 * mhe_Vdim

#ifdef SREND_STEREO
      dst = (E_in2 - E0) * mhe_Vdim
      dst2 = dot_product(dst,dst)
#endif


! E, clipx0,clipx1, clipy0,clipy1, clipz0,clipz1, clip0,clip1, EyeRight
! example: xform shift = 1/2, vol_dim = 448
! [-1/2,1/2]^3 -->[0,448]^3
! example: xform_shift = 0, vol_dim = 448
! [0,1]^3 --> [0,448]^3
! ---- coordinates, all tranformed the same: [x + xform_shift]*xform_vol_dim
#ifdef SREND_XYZ_AXIS_CLIP
      clipx0 = clipx0_in*mhe_Vdim
      clipx1 = clipx1_in*mhe_Vdim
      clipy0 = clipy0_in*mhe_Vdim
      clipy1 = clipy1_in*mhe_Vdim
      clipz0 = clipz0_in*mhe_Vdim
      clipz1 = clipz1_in*mhe_Vdim
#else
      clipx0 = clipx0_in ! use it here only, but avoid calculation
      clipx1 = clipx1_in
      clipy0 = clipy0_in
      clipy1 = clipy1_in
      clipz0 = clipz0_in
      clipz1 = clipz1_in
#endif
! ---- distances, all transformed the same: x * xform_vol_dim
      clip0 = clip0_in * mhe_Vdim
      clip1 = clip1_in * mhe_Vdim

! point radius clip
#ifdef SREND_POINT_RADIUS_CLIP
      pt = pt_in*mhe_Vdim
      pt_r = pt_r_in*mhe_Vdim
#else
      pt = pt_in      ! unused, but supress compiler warnings
      pt_r = pt_r_in
#endif



! --------------- prep for pole rotation -----------------------------
      if( mhe_srendtype .eq. 1 .OR. mhe_srendtype .eq. 3) then ! n/s pole
! > make orthogonal basis: V,U,R used for scratch here
         R(1) = V0(2)*U0(3) - V0(3)*U0(2) ! R = Ev x Up
         R(2) = V0(3)*U0(1) - V0(1)*U0(3)
         R(3) = V0(1)*U0(2) - V0(2)*U0(1)
         U(1) = R(2)*V0(3) - R(3)*V0(2) ! U = R x Ev
         U(2) = R(3)*V0(1) - R(1)*V0(3)
         U(3) = R(1)*V0(2) - R(2)*V0(1)
         V = V0
! unitize vectors
         V = V / sqrt( V(1)**2 + V(2)**2 + V(3)**2 ) ! unitize
         R = R / sqrt( R(1)**2 + R(2)**2 + R(3)**2 )
         U = U / sqrt( U(1)**2 + U(2)**2 + U(3)**2 )
! rotate view 90 degrees to the pole, North or South      
         if( mhe_srendtype .eq. 1) then ! npole
            V0 = U
            U0 = -V
         else if( mhe_srendtype .eq. 3) then ! spole
            V0 = -U
            U0 = V
         end if
      end if ! 'pole'

      
! load modified view variables
      mhe_Ev = V0
      mhe_Up = U0

! some care on pclip1_in
      if(mhe_srendtype .eq. 1 .OR. mhe_srendtype .eq. 3) then ! Npole or Spole
         ! in this case pclip1_in is ignored for 3-part full spherical
         pclip1 = PI / 180.0_4 * 95.0_4 ! a reasonable range for this, some padding for bilinear interp later
      else if(mhe_srendtype == 2) then ! equatorial
         pclip1 = PI / 180.0_4         ! override whatever pclip1_in is entered
      else
#ifdef SREND_POLAR_CLIP
         pclip1 = PI / 180.0_4 * pclip1_in
#else
         pclip1 = pclip1_in ! use it to supress unused warning, but not needed
#endif
      end if
      
      
#ifdef SREND_DEBUG
      print *,'2222222222222222222 enter: srend_render() 222222222222222222'
      print *,'nV=',nV, 'nV_out=',nV_out
      print *,'E_in=',E_in
      print *,'Ev_in=',Ev_in
      print *,'Up_in=',Up_in
      print *,'Alpha_in=',Alpha_in, 'Beta_in=',Beta_in, 'EyeRight_in=',EyeRight_in
      print *,'plane clip in:x0=',clipx0, 'x1=',clipx1
      print *,'plane clip in:y0=',clipy0, 'y1=',clipy1
      print *,'plane clip in:z0=',clipz0, 'z1=',clipz1
      print *,'spherical clip in:clip0=',clip0, 'clip1=',clip1
      print *,'nsh_in=',nsh_in
      print *,'polar clip: pclip1_in=', pclip1
      print *,'perspective_in=',perspective_in
      print *,'srendtype_in=',srendtype_in
      print *,'dt_in=',dt_in
      print *,'data XYZ dim in: mhe_XN=',mhe_XN,'YN=',mhe_YN,'mhe_ZN=',mhe_ZN,'Bd=',Bd
      print *,'XYZ offsets in: mhe_iNX=',mhe_iNX,'mhe_iNY=',mhe_iNY,'mhe_iNZ=',mhe_iNZ
      print *,'mhe_Vdim=',mhe_Vdim
      print *,'W_in=',W_in,'H_in=',H_in,'nR_in=',nR_in
      print *,'offset_cotab=',offset_cotab
      print *,'filenames_in=',filenames_in
      print *,'tiles_right_in=',tiles_right_in,'tiles_down_in=',tiles_down_in
      print *,'TARGid=',TARGid
      print *,'22222222222222222222222222222222222222222222222222222222222'
#endif      
      
      
! for tile+MPI flush routine to be able to reference mpi_header(0), in case that
! nothing will be put in mpi_header(1+), for reference in mpi_allreduce() call in render_flush() for tiling
      if( block(nV)%load .AND. block(nV)%n .eq. 1) then ! setup if 1st time
         E_AMR_SAVE = mhe_E
         mhe_E = E_AMR_PASS
         block(nV)%mpi_header(:,0) = mpi_header(:) ! set something
         mhe_E = E_AMR_SAVE
      end if
      
      
      
! SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
! determine bounding array
! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc

! viewing increments, assume W and H are divisible by 4
      a2 = mhe_Alpha * PI/360.0_4  ! half span
      b2 = mhe_Beta  * PI/360.0_4  ! half span
      dW = a2 * 2.0_4 /mhe_W
      dH = b2 * 2.0_4 /mhe_H

! don't alter incoming parameters Ev,Up: make orthogonal basis
      R(1) = mhe_Ev(2)*mhe_Up(3) - mhe_Ev(3)*mhe_Up(2) ! R = Ev x Up
      R(2) = mhe_Ev(3)*mhe_Up(1) - mhe_Ev(1)*mhe_Up(3)
      R(3) = mhe_Ev(1)*mhe_Up(2) - mhe_Ev(2)*mhe_Up(1)
      U(1) = R(2)*mhe_Ev(3) - R(3)*mhe_Ev(2) ! U = R x Ev
      U(2) = R(3)*mhe_Ev(1) - R(1)*mhe_Ev(3)
      U(3) = R(1)*mhe_Ev(2) - R(2)*mhe_Ev(1)
      V = mhe_Ev ! copy of Ev
      V = V / sqrt( V(1)**2 + V(2)**2 + V(3)**2 ) ! unitize
      R = R / sqrt( R(1)**2 + R(2)**2 + R(3)**2 )
      U = U / sqrt( U(1)**2 + U(2)**2 + U(3)**2 )

! --------------------------------------------------------------------
! Set an initial sufficient spanning array before testing rays.
! This array is w0:w1,h0:h1
! set this array to something if missed: 1:4,1:4

      radius = sqrt( 1.0_4*(mhe_XN**2 + mhe_YN**2 + mhe_ZN**2) ) * .5_4
      C(1) = mhe_iNX + .5_4*mhe_XN ! center of cubes
      C(2) = mhe_iNY + .5_4*mhe_YN
      C(3) = mhe_iNZ + .5_4*mhe_ZN
        
      EC = C - mhe_E
      dEC = sqrt( EC(1)**2 + EC(2)**2 + EC(3)**2 )

#ifdef SREND_STEREO
      EC_in = C - mhe_E
      dEC_in = sqrt( EC_in(1)**2 + EC_in(2)**2 + EC_in(3)**2 )
#endif

! check if embedded, really just too close to know for sure not        
      if( dEC .le. radius + 1.0_4) then
         w0=1; w1=mhe_W; h0=1; h1=mhe_H
         goto 107 ! done with this one
      end if
! near clip check
#ifdef SREND_STEREO
      if( dEC_in + radius .lt. clip0) then
#else
      if( dEC + radius .lt. clip0) then
#endif
         w0=1; w1=4; h0=1; h1=4 ! values to reject
         goto 107
      end if
! far clip check
#ifdef SREND_STEREO
      if( dEC_in - radius .gt. clip1) then
#else
      if( dEC - radius .gt. clip1) then
#endif
         w0=1; w1=4; h0=1; h1=4 ! values to reject
         goto 107
      end if
        
      uEC = EC / dEC ! make a unit vector, dEC > 0 assured
      
      acos_arg = dot_product(uEC,U)
      if( acos_arg .ge. 1.0_4) acos_arg = 1.0_4
      if( acos_arg .le. -1.0_4) acos_arg = -1.0_4
      bb = PI*.5_4 - acos( acos_arg )
!      bb = PI*.5 - acos( dot_product(uEC,U) ) ! angle up to center! replaced by above
!      possible trouble with acos() due to numerical roundoff

      gam = asin( radius / dEC) ! angle off center to sphere surface
!     we know radius + 1.0 < dEC and radius > 0
        
! now, setup span across for w0,W1
      if(bb + gam .ge. PI*.5_4 - .05_4) then ! North pole
         w0 = 1
         w1 = mhe_W
      else if(bb - gam .le. -PI*.5_4 + .05_4) then ! South pole
         w0 = 1
         w1 = mhe_W
      else ! calculations for w0,w1
         uEC2VR = uEC - dot_product(uEC,U)*U ! uEC - projection to U
         uEC2VR = uEC2VR / sqrt( uEC2VR(1)**2 + uEC2VR(2)**2 + uEC2VR(3)**2 )
!       fine, as uEC cannot be even close to Up=U
         
         acos_arg = dot_product(uEC2VR,V)
         if( acos_arg .ge. 1.0_4) acos_arg = 1.0_4
         if( acos_arg .le. -1.0_4) acos_arg = -1.0_4
         aa = acos( acos_arg )
!        aa = acos( dot_product(uEC2VR,V) ) ! this is replaced by above
!        there was trouble - i.e. acos(1.00000) = NaN

         if( dot_product(uEC2VR,R) .gt. 0.0_4) aa = -aa  

! calculation expansion due to elev/decl from equator
! we already know that the angle will not cause cos = 0
         aax = 1.0_4/cos( min( abs(bb)+gam, b2) )

         ww0 = floor( (a2 -(aa+gam*aax)-.501_4*dW)/dW )
         ww1 = ceiling( (a2-(aa-gam*aax)+.501_4*dW)/dW )     
     
         if( mhe_Alpha .lt. 360.0_4) then
            w0 = max(1,ww0 )
            w1 = min(mhe_W,ww1 )
         else ! Alpha = 360.0
            if(ww1 - ww0 .ge. mhe_W) then
               w0 = 1
               w1 = mhe_W
            else if(ww1 .gt. mhe_W) then
               w0 = ww0 - mhe_W
               w1 = ww1 - mhe_W   
               if( w0 .le. -mhe_W/2) then
                  w0 = 1
                  w1 = mhe_W
               end if
            else
               w0 = ww0
               w1 = ww1
            end if
         end if

         if(w1-w0 .gt. mhe_W .OR. w0 .lt. -mhe_W/2 .OR. w1 .gt. mhe_W) then
            w0=1
            w1=mhe_W
         end if

      end if ! set w0, w1

      if(w1-w0 .lt. 0) then
         w0=1; w1=4; h0=1; h1=4 ! values to reject
         goto 107
      end if
      
! set h0, h1
      h0 = max(1,  floor( (b2-(bb+gam)-.501_4*dH)/dH) )
      h1 = min(mhe_H,ceiling( (b2-(bb-gam)+.501_4*dH)/dH) )
        
      if(h1 .lt. h0 ) then
         w0=1; w1=4; h0=1; h1=4 ! values to reject
         goto 107
      end if
            
! expand to multiple of 4, extend right and down first
! assume: W and H are divisible by 4, really 8 for W
! but, ensure that in spherical case we also have multiples of 4
! on either side of the 0:1 splitting dateline
      h4 = mod(h1,4) 
      if(h4 .ne. 0) h1 = h1 + 4-h4
      h4 = mod(h0-1,4)
      if(h4 .ne. 0) h0 = h0 - h4

      w4 = mod(w1,4)
      if(w4 .ne. 0) w1 = w1 + 4-w4
      w4 = mod(mhe_W + w0-1,4)
      if(w4 .ne. 0) w0 = w0 - w4 

      if( 1+w1-w0 .ge. mhe_W) then ! spans anyway
         w0 = 1
         w1 = mhe_W
      end if

107   continue

! this is for the smp mode where renderer calls the finisher just for processing and writing
! out the image from rendering one block.
      if(.NOT. block(nV)%mpi .AND. .NOT. block(nV)%load) then !render NOMPI, must be full dimensions
         w0 = 1; w1 = mhe_W; h0 = 1; h1 = mhe_H
      end if

! sssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss


#ifdef SREND_RENDER_TIMING
      call system_clock(it2); print *,'Render: after bounding span det =:',(it2-it1)*rc; it1 = it2
#endif


! determine number of shells needed and set clipping distances
! for each shell for setting ray casting limits
      shdt = (clip1 - clip0) / mhe_nsh
      shell(0) = clip0
      do i=1,mhe_nsh
         shell(i) = clip0 + shdt * i
      end do
      
      mhe_sh0 = 1
      mhe_sh1 = mhe_nsh

! alternate
      if( allocated (numhits)) then
         if( lbound(numhits,DIM=1) /= mhe_sh0 .OR. ubound(numhits,DIM=1) /= mhe_sh1) then
            deallocate( numhits )
         end if
      end if
      if( .NOT. allocated (numhits)) then
         allocate( numhits(mhe_sh0:mhe_sh1) ) ! all of them
      end if
      
!      if( allocated (numhits) ) 
!      allocate( numhits(mhe_sh0:mhe_sh1) ) ! all of them

! SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
! adjust w4h4 for allocation if needed
      if(w1-w0 > 0) then
         w4h4 = (w1-w0+1)*(h1-h0+1)/16 ! number of 4x4=16 packs
      else
         numhits_all = 0 ! running total
         numhits(:) = 0 ! for each shell
         wh(1) = 1000000!1
         wh(2) = -1000000!0
         wh(3) = 1000000!1
         wh(4) = -1000000!0
      end if

      
!$OMP END SINGLE
      
      
! determine thread id for load sharing scheme: id = th, num threads = nth
! must do here before allocations below by thread for th_ vars
#ifdef _OPENMP
      th = OMP_get_thread_num()
      nth = OMP_get_num_threads()
#else
      th = 0
      nth = 1
#endif

      
      
! SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
! find hits
! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
      if(w1-w0 <= 0 ) goto 6666 ! nothing to test
      
      
!$OMP SINGLE
! reduce arithmetic
      iNXN = mhe_iNX
      iNYN = mhe_iNY
      iNZN = mhe_iNZ
      iNXNp1 = iNXN!+1 !used for mask in sampling loop
      iNYNp1 = iNYN!+1
      iNZNp1 = iNZN!+1

! set volume boundaries : NX=NY=NZ=1
      x0 = iNXN 
      x1 = iNXN + mhe_XN
      y0 = iNYN
      y1 = iNYN + mhe_YN
      z0 = iNZN
      z1 = iNZN + mhe_ZN

#ifdef SREND_XYZ_AXIS_CLIP
! adjust if XYZ plane clipping is enabled
      if(clipx0 .gt. x0) x0 = clipx0
      if(clipx1 .lt. x1) x1 = clipx1
      if(clipy0 .gt. y0) y0 = clipy0
      if(clipy1 .lt. y1) y1 = clipy1
      if(clipz0 .gt. z0) z0 = clipz0
      if(clipz1 .lt. z1) z1 = clipz1
#endif

! will set during ray casting bounds checks to minimum span (wo:w1,h0:h1)
      wh(1) =  1000000000 ! w0
      wh(2) = -1000000000 ! w1
      wh(3) =  1000000000 ! h0
      wh(4) = -1000000000 ! h1

! reuse w4 and h4 to be number of 16-packs in each direction
      w4 = (1+w1-w0)/4
      h4 = (1+h1-h0)/4
      
      numhits = 0     ! zero all shells

!$OMP END SINGLE
      
! check if clipping planes remove volume to render
      if( x1-x0<=0.0_4 .OR. y1-y0<=0.0_4 .OR. z1-z0<=0.0_4) goto 6666 ! nothing to test

      
!$OMP SINGLE

! --------- allocate arrays based on rendering plane -----------------
! check and see if arrays are already large enough.  The unused ends
! of arrays will do no harm.

      if(allocated(rr)) then ! just test one of the following, all saved
         if( w4h4 > size(rr,DIM=2) .OR. mhe_sh0 < lbound(rr,DIM=3) .OR. mhe_sh1 > ubound(rr,DIM=3)) then
            deallocate(rr)
            deallocate(cc)
            deallocate(Q)
            deallocate(tmin)
            deallocate(tmax)
            deallocate(tpackmin)
            deallocate(tpackmax)
            deallocate(itpackmin)
            deallocate(itpackmax)
            deallocate(shiftW)
            deallocate(hit)
            deallocate(im_cc)
            deallocate(im_rr)
            
            deallocate(th_numhits)
            deallocate(th_hit)
            deallocate(th_wh)
#ifdef SREND_TOFF
            deallocate(toff)
#endif
         end if
      end if

      if( .NOT. allocated(rr)) then
         allocate(    rr(1:16,w4h4,mhe_sh0:mhe_sh1) )
         allocate(    cc(1:16,w4h4,mhe_sh0:mhe_sh1) )
         allocate( Q(1:16,1:3,w4h4,mhe_sh0:mhe_sh1) )
         allocate(  tmin(1:16,w4h4,mhe_sh0:mhe_sh1) )
         allocate(  tmax(1:16,w4h4,mhe_sh0:mhe_sh1) )
         allocate(   tpackmax(w4h4,mhe_sh0:mhe_sh1) )
         allocate(   tpackmin(w4h4,mhe_sh0:mhe_sh1) )
         allocate(  itpackmax(w4h4,mhe_sh0:mhe_sh1) )
         allocate(  itpackmin(w4h4,mhe_sh0:mhe_sh1) )
         allocate(     shiftW(w4h4,mhe_sh0:mhe_sh1) )
         allocate(        hit(w4h4,mhe_sh0:mhe_sh1) )
         allocate(      im_cc(w4h4,mhe_sh0:mhe_sh1) )
         allocate(      im_rr(w4h4,mhe_sh0:mhe_sh1) )
         
         allocate(         th_numhits(mhe_sh0:mhe_sh1,0:nth-1) )
         allocate(        th_hit(w4h4,mhe_sh0:mhe_sh1,0:nth-1) )
         allocate(                          th_wh(1:4,0:nth-1) )
#ifdef SREND_TOFF
         allocate(  toff(1:16,w4h4,mhe_sh0:mhe_sh1) )
#endif
      end if
      
      
      th_numhits = 0  ! zero all threads and shells
      
! set bounds for all threads, will consolodate later with 1 thread
      do ith=0,nth-1
         th_wh(1,ith) = wh(1)
         th_wh(2,ith) = wh(2)
         th_wh(3,ith) = wh(3)
         th_wh(4,ith) = wh(4)
      end do
         
!$OMP END SINGLE




! --------- test rays, all threads --------------------
! entry test, same for all

      th0 = 1 + th * (w4h4 / nth)
      th1 = (th+1) * (w4h4 / nth) +  w4h4 - nth*(w4h4/nth)

      do ish = mhe_sh0,mhe_sh1

         do k=th0, th1 !>>>>>>>>>>> each thread takes a piece

! ccccccccccccccccccccccccccccccccccccccccccccccccccccc
! We form 16-packs of pixels/rays

! load in indices for 4x4=16 pack
! rr(i,k) and cc(i,k) values are the im grid coordinates
            do ir = 0,3
               do ic = 0,3
                  rr(1+ic+4*ir,k,ish) = ir + h0 + 4*((k-1)/w4)
                  cc(1+ic+4*ir,k,ish) = ic + w0 + 4*mod((k-1),w4)
               end do
            end do

! here, we translate the cc pixel coordinates for
! spherical when cc < 1 for purposes of casting
! we translate these back before placing back in im() array,
! in this case.  Note that cc(i) will always be < 1 for
! all i = 1 to 16 for the entire pack.  
! But >>>> if w0=1 and w1 = W, don't unshift afterwards
! We just add W to shift here.
            shiftW(k,ish) = 0
            if( cc(1,k,ish) .lt. 1) then ! all cc(:,k), and really <= -3
            shiftW(k,ish) = mhe_W
            do i=1,16
               cc(i,k,ish) = cc(i,k,ish) + shiftW(k,ish)
            end do
            end if

! for mapping within rendering loop
            im_cc(k,ish) =  (mhe_W + cc(1,k,ish) - shiftW(k,ish) + 3)/4 - mhe_W/4
            im_rr(k,ish) =  (rr(1,k,ish) + 3)/4

            
            
do i=1,16 ! VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV

! make viewing vector Q
a(i) = (mhe_W/2 - cc(i,k,ish))*dW + dW/2.0_4
b(i) = (mhe_H/2 - rr(i,k,ish))*dH + dH/2.0_4
FLOP(10)

cosb(i) = cos(b(i))
cosbcosa(i) = cosb(i)*cos(a(i))
cosbsina(i) = cosb(i)*sin(a(i))
sinb(i) = sin(b(i))

! Q is unit here
Q(i,1,k,ish) = V(1)*cosbcosa(i) - R(1)*cosbsina(i) + U(1)*sinb(i)
Q(i,2,k,ish) = V(2)*cosbcosa(i) - R(2)*cosbsina(i) + U(2)*sinb(i)
Q(i,3,k,ish) = V(3)*cosbcosa(i) - R(3)*cosbsina(i) + U(3)*sinb(i)
FLOP(15)

! find where vector Q intersects volume 
tmin(i,k,ish) =  1.0e30_4 ! some outrageous value
tmax(i,k,ish) = -1.0e30_4

if (Q(i,1,k,ish) .ne. 0.0_4) then
t0(i) = (x0 - mhe_E(1))/Q(i,1,k,ish)
y(i) = t0(i)*Q(i,2,k,ish) + mhe_E(2)
z(i) = t0(i)*Q(i,3,k,ish) + mhe_E(3)
if ( t0(i) .ge. 0.0_4 .AND.        &
y(i) .ge. y0 .AND. y(i) .lt. y1  &
.AND. z(i) .ge. z0 .AND. z(i) .lt. z1) then
   if(t0(i) < tmin(i,k,ish)) tmin(i,k,ish) = t0(i)
   if(t0(i) > tmax(i,k,ish)) tmax(i,k,ish) = t0(i)
end if
end if

if( Q(i,1,k,ish) .ne. 0.0_4) then
t0(i) = (x1 - mhe_E(1) )/Q(i,1,k,ish)
y(i) = t0(i)*Q(i,2,k,ish) + mhe_E(2)
z(i) = t0(i)*Q(i,3,k,ish) + mhe_E(3)
if ( t0(i) .ge. 0.0_4 .AND.         &
y(i) .ge. y0 .AND. y(i) .lt. y1   &
.AND. z(i) .ge. z0 .AND. z(i) .lt. z1) then
   if(t0(i) < tmin(i,k,ish)) tmin(i,k,ish) = t0(i)
   if(t0(i) > tmax(i,k,ish)) tmax(i,k,ish) = t0(i)
end if
end if


if (Q(i,2,k,ish) .ne. 0.0_4) then
t0(i) = (y0-mhe_E(2))/Q(i,2,k,ish)
x(i) = t0(i)*Q(i,1,k,ish) + mhe_E(1)
z(i) = t0(i)*Q(i,3,k,ish) + mhe_E(3)
if ( t0(i) .ge. 0.0_4 .AND.          &
x(i) .ge. x0 .AND. x(i) .lt. x1   &
.AND. z(i) .ge. z0 .AND. z(i) .lt. z1) then
   if(t0(i) < tmin(i,k,ish)) tmin(i,k,ish) = t0(i)
   if(t0(i) > tmax(i,k,ish)) tmax(i,k,ish) = t0(i)
end if
end if

if (Q(i,2,k,ish) .ne. 0.0_4) then 
t0(i) = (y1 - mhe_E(2) )/Q(i,2,k,ish)
x(i) = t0(i)*Q(i,1,k,ish) + mhe_E(1)
z(i) = t0(i)*Q(i,3,k,ish) + mhe_E(3)
if ( t0(i) .ge. 0.0_4 .AND.         &
x(i) .ge. x0 .AND. x(i) .lt. x1  &
.AND. z(i) .ge. z0 .AND. z(i) .lt. z1) then
   if(t0(i) < tmin(i,k,ish)) tmin(i,k,ish) = t0(i)
   if(t0(i) > tmax(i,k,ish)) tmax(i,k,ish) = t0(i)
end if
end if


if (Q(i,3,k,ish) .ne. 0.0_4) then
t0(i) = (z0 - mhe_E(3))/Q(i,3,k,ish)
x(i) = t0(i)*Q(i,1,k,ish) + mhe_E(1)
y(i) = t0(i)*Q(i,2,k,ish) + mhe_E(2)
if ( t0(i) .ge. 0.0_4 .AND.         &
x(i) .ge. x0 .AND. x(i) .lt. x1  &
.AND. y(i) .ge. y0 .AND. y(i) .lt. y1) then
   if(t0(i) < tmin(i,k,ish)) tmin(i,k,ish) = t0(i)
   if(t0(i) > tmax(i,k,ish)) tmax(i,k,ish) = t0(i)
   end if
end if

if (Q(i,3,k,ish) .ne. 0.0_4) then
t0(i) = (z1 - mhe_E(3) )/Q(i,3,k,ish)
x(i) = t0(i)*Q(i,1,k,ish) + mhe_E(1)
y(i) = t0(i)*Q(i,2,k,ish) + mhe_E(2)
if ( t0(i) .ge. 0.0_4 .AND.          &
x(i) .ge. x0 .AND. x(i) .lt. x1   &
.AND. y(i) .ge. y0 .AND. y(i) .lt. y1) then
   if(t0(i) < tmin(i,k,ish)) tmin(i,k,ish) = t0(i)
   if(t0(i) > tmax(i,k,ish)) tmax(i,k,ish) = t0(i)
end if
end if



! ccccccccccccccccccccccccccccccc
! stereo fix for clipping spheres
#ifdef SREND_STEREO

! assume that clip0 and clip1 > |dstereo|
dstQ(i) = dst(1)*Q(i,1,k,ish)+dst(2)*Q(i,2,k,ish)+ dst(3)*Q(i,3,k,ish)
dstQ2(i) = dstQ(i) * dstQ(i)
clip00(i) = sqrt( shell(ish-1)**2 - dst2 + dstQ2(i) )  + dstQ(i)
FLOP(6)

#ifdef SREND_TOFF
! difference between clip0 and clip00 in dt increments
! makes sense only for clip0 = shell(ish-1)
toff(i,k,ish) = ( clip00(i) - shell(ish-1) ) / dt
toff(i,k,ish) = dt*( toff(i,k,ish) - floor(toff(i,k,ish)) )
FLOP(5)
#endif

! near clipping
x(i) = clip00(i)*Q(i,1,k,ish) + mhe_E(1)
y(i) = clip00(i)*Q(i,2,k,ish) + mhe_E(2)
z(i) = clip00(i)*Q(i,3,k,ish) + mhe_E(3)
FLOP(6)
if ( tmax(i,k,ish) > clip00(i)       &
.AND. x(i) .ge. x0 .AND. x(i) .lt. x1  &
.AND. y(i) .ge. y0 .AND. y(i) .lt. y1  &
.AND. z(i) .ge. z0 .AND. z(i) .lt. z1) then
   tmin(i,k,ish) = clip00(i)
end if

clip11(i) = sqrt( shell(ish)**2 - dst2 + dstQ2(i) ) + dstQ(i)

! far clipping
x(i) = clip11(i)*Q(i,1,k,ish) + mhe_E(1)
y(i) = clip11(i)*Q(i,2,k,ish) + mhe_E(2)
z(i) = clip11(i)*Q(i,3,k,ish) + mhe_E(3)
FLOP(6)
if ( tmax(i,k,ish) > clip00(i)       &
.AND. tmax(i,k,ish) > tmin(i,k,ish) &
.AND. x(i) .ge. x0 .AND. x(i) .lt. x1  &
.AND. y(i) .ge. y0 .AND. y(i) .lt. y1  &
.AND. z(i) .ge. z0 .AND. z(i) .lt. z1) then
   tmax(i,k,ish) = clip11(i)
end if

! reject
if( tmax(i,k,ish) .lt. clip00(i)    &
.OR. tmin(i,k,ish) .gt. clip11(i)  &
.OR. tmax(i,k,ish) .eq. tmin(i,k,ish) ) then
tmin(i,k,ish) = 1.0e30_4
tmax(i,k,ish) = -1.0e30_4
end if ! end clip check

#else

#ifdef SREND_TOFF
! safety: this does nothing useful, but in case stereo off so that
! results are correct
!!      toff(i,k,ish) = 0.0
! difference between clip0 and clip00 in dt increments
! makes sense only for clip0 = shell(ish-1)
toff(i,k,ish) = ( clip0 - shell(ish-1) ) / dt
toff(i,k,ish) = dt*( toff(i,k,ish) - floor(toff(i,k,ish)) )
FLOP(5)
#endif


! near clipping
x(i) = shell(ish-1)*Q(i,1,k,ish) + mhe_E(1)
y(i) = shell(ish-1)*Q(i,2,k,ish) + mhe_E(2)
z(i) = shell(ish-1)*Q(i,3,k,ish) + mhe_E(3)
FLOP(6)
if ( tmax(i,k,ish) > shell(ish-1)    & ! fix
.AND. x(i) .ge. x0 .AND. x(i) .lt. x1  &
.AND. y(i) .ge. y0 .AND. y(i) .lt. y1  &
.AND. z(i) .ge. z0 .AND. z(i) .lt. z1) then
   tmin(i,k,ish) = shell(ish-1)
end if

! far clipping
x(i) = shell(ish)*Q(i,1,k,ish) + mhe_E(1)
y(i) = shell(ish)*Q(i,2,k,ish) + mhe_E(2)
z(i) = shell(ish)*Q(i,3,k,ish) + mhe_E(3)
FLOP(6)
if ( tmax(i,k,ish) > shell(ish)           &
.AND. tmax(i,k,ish) .gt. tmin(i,k,ish) &
.AND. x(i) .ge. x0 .AND. x(i) .lt. x1  &
.AND. y(i) .ge. y0 .AND. y(i) .lt. y1  &
.AND. z(i) .ge. z0 .AND. z(i) .lt. z1) then
   tmax(i,k,ish) = shell(ish)
end if

! reject
if( tmax(i,k,ish) .lt. shell(ish-1)  &
.OR. tmin(i,k,ish) .gt. shell(ish)  &
.OR. tmax(i,k,ish) .eq. tmin(i,k,ish) ) then
   tmin(i,k,ish) = 1.0e30_4
   tmax(i,k,ish) = -1.0e30_4
end if ! end clip check

#endif
! cccccccccccccc end clipping spheres' fix for stereo



#ifdef SREND_POLAR_CLIP
if(.TRUE.) then
#else
if(mhe_srendtype == 1 .OR. mhe_srendtype == 3) then
#endif
! polar clipping: will not be used for stereo nor equatorial (srendtype=2)
acos_arg16(i) = Q(i,1,k,ish)*V(1)+Q(i,2,k,ish)*V(2)+Q(i,3,k,ish)*V(3)
if(acos_arg16(i) .gt. 1.0_4) acos_arg16(i) = 1.0_4
if(acos_arg16(i) .lt. -1.0_4) acos_arg16(i) = -1.0_4

if( acos( acos_arg16(i) ) .gt. pclip1) then
tmin(i,k,ish) = 1.0e30_4
tmax(i,k,ish) = -1.0e30_4
end if ! end clip check
end if


#ifdef SREND_POINT_RADIUS_CLIP  
! first, skip useless calculation if [tmin,tmax] cannot hit sphere, just by distance
pta(i) = sqrt( (mhe_E(1)-pt(1))**2 + (mhe_E(2)-pt(2))**2 + (mhe_E(3)-pt(3))**2 )
if( tmax(i,k,ish) < pta(i) - pt_r .OR. tmin(i,k,ish) > pta(i) + pt_r) then ! can't be in sqphere
   tmin(i,k,ish) = 1.0e30_4
   tmax(i,k,ish) = -1.0e30_4
else
! solve quadratic: t = -b/2a += sqrt(b^2 - 4ac)/2a
   pta(i) = Q(i,1,k,ish)**2 + Q(i,2,k,ish)**2 + Q(i,3,k,ish)**2 ! this vector Q should never be zero
FLOP(5)
   ptb(i) = 2.0*(Q(i,1,k,ish)*(mhe_E(1)-pt(1)) + &
                 Q(i,2,k,ish)*(mhe_E(2)-pt(2)) + &
                 Q(i,3,k,ish)*(mhe_E(3)-pt(3)) )
FLOP(6)
   ptc(i) = pt(1)**2+mhe_E(1)**2-2.0*pt(1)*mhe_E(1) + &
            pt(2)**2+mhe_E(2)**2-2.0*pt(2)*mhe_E(2) + &
            pt(3)**2+mhe_E(3)**2-2.0*pt(3)*mhe_E(3) - pt_r**2
FLOP(22)
   ptdiscr(i) = ptb(i)**2 - 4.0*pta(i)*ptc(i) ! discriminant b^2 - 4ac
FLOP(4)
   if( ptdiscr(i) > 0.0) then 
      ptdiscr(i) = sqrt(ptdiscr(i))
FLOP(1)
      pt0(i) = (-ptb(i) - ptdiscr(i))*0.5 / pta(i)
      pt1(i) = (-ptb(i) + ptdiscr(i))*0.5 / pta(i)
FLOP(6)
!   if( pt0(i) < tmax(i,k,ish) .AND. pt0(i) > tmin(i,k,ish)) tmin(i,k,ish) = pt0(i)
!   if( pt1(i) < tmax(i,k,ish) .AND. pt1(i) > tmin(i,k,ish)) tmax(i,k,ish) = pt1(i)
      if( pt1(i) < tmin(i,k,ish) .OR. tmax(i,k,ish) < pt0(i)) then ! current interval not in sphere, missed
         tmin(i,k,ish) = 1.0e30_4
         tmax(i,k,ish) = -1.0e30_4
      else 
         if( pt1(i) < tmax(i,k,ish) ) tmax(i,k,ish) = pt1(i)
         if( tmin(i,k,ish) < pt0(i) ) tmin(i,k,ish) = pt0(i)
      end if
   else ! ray missed, so mark as missed
      tmin(i,k,ish) = 1.0e30_4
      tmax(i,k,ish) = -1.0e30_4
   end if
end if
#endif


end do ! i over 4x4=16 pack : iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii
! vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv



! we now find the span for t to iterate over the pack 
            tpackmax(k,ish) = maxval(tmax(:,k,ish))
            tpackmin(k,ish) = minval(tmin(:,k,ish))

            if(tpackmax(k,ish) .le. shell(ish-1) ) goto 2014
            if(tpackmin(k,ish) .gt. shell(ish)) goto 2014

            itpackmin(k,ish) = floor(tpackmin(k,ish)/dt)
            itpackmax(k,ish) = ceiling(tpackmax(k,ish)/dt)
            
! OK here, so find minimal bounds for wh casting array
! hit() contains indices to 16-packs which have at least 1
! ray hitting cube
! NOTE: we cast in 1:W, but we shift to wh(1):wh(2) to store

            th_numhits(ish,th) = th_numhits(ish,th) + 1   ! threaded
            th_hit(th_numhits(ish,th),ish,th) = k         ! threaded

            if(cc(1 ,k,ish)-shiftW(k,ish) < th_wh(1,th)) th_wh(1,th) = cc(1,k,ish)-shiftW(k,ish)
            if(cc(16,k,ish)-shiftW(k,ish) > th_wh(2,th)) th_wh(2,th) = cc(16,k,ish)-shiftW(k,ish)
            if(rr(1, k,ish) < th_wh(3,th)) th_wh(3,th) = rr(1, k,ish)
            if(rr(16,k,ish) > th_wh(4,th)) th_wh(4,th) = rr(16,k,ish)
2014        continue
         end do ! k k k k k k k k k k k k k k k k k k k k k k k k k k k k
      end do ! ish: sh0 to sh1

! --------- end: tested all rays -------------------------------------



!$OMP BARRIER
!$OMP SINGLE

! first extract the hits for each shell from all threads, may not be well ordered for sampling
      do ish = mhe_sh0, mhe_sh1
         do ith = 0,nth-1
            do k = 1,th_numhits(ish,ith)
                  numhits(ish) = numhits(ish)+1
                  hit( numhits(ish), ish) = th_hit(k,ish,ith)
            end do
         end do
      end do

! now, hit( numhits(ish), ish) has the indices to indices for sampling, but these are not
! ordered.  Order these.
      do ish = mhe_sh0, mhe_sh1
         if(numhits(ish)>1) call heapsort( numhits(ish), hit(1,ish) ) ! heapsort for each shell
      end do

! setup bounds (w0:w1,h0:h1) from threads
      do ith=0,nth-1
         if( th_wh(1,ith) < wh(1)) wh(1) = th_wh(1,ith)
         if( th_wh(2,ith) > wh(2)) wh(2) = th_wh(2,ith)
         if( th_wh(3,ith) < wh(3)) wh(3) = th_wh(3,ith)
         if( th_wh(4,ith) > wh(4)) wh(4) = th_wh(4,ith)
      end do



! now, pack results from arrays 1:w4h4 into arrays 1:numhits
! This fills in unused gaps of these arrays to 1:numhits length
! by moving all entries forward.
      numhits_all = 0
      sh0_bot = mhe_nsh+1
      sh1_top = 0

      do ish = mhe_sh0, mhe_sh1
         if( numhits(ish) .gt. 0) then
            if(ish .lt. sh0_bot) sh0_bot = ish
            if(ish .gt. sh1_top) sh1_top = ish 
            do k=1,numhits(ish)
               rr(:,k,ish) = rr(:,hit(k,ish),ish)
               cc(:,k,ish) = cc(:,hit(k,ish),ish)
               Q(:,:,k,ish) = Q(:,:,hit(k,ish),ish)   
               tmin(:,k,ish) = tmin(:,hit(k,ish),ish)
               tmax(:,k,ish) = tmax(:,hit(k,ish),ish)
               itpackmin(k,ish) = itpackmin(hit(k,ish),ish)
               itpackmax(k,ish) = itpackmax(hit(k,ish),ish)
               shiftW(k,ish) = shiftW(hit(k,ish),ish)
! mapper in sampling loop          
               im_cc(k,ish) =  im_cc(hit(k,ish),ish)
               im_rr(k,ish) =  im_rr(hit(k,ish),ish)
#ifdef SREND_TOFF
               toff(:,k,ish) = toff(:,hit(k,ish),ish)
#endif
            end do
            numhits_all = numhits_all + numhits(ish)
         end if ! numhits(ish) > 0
      end do

   

      mhe_sh0 = sh0_bot ! these will be correct only if there are hits, but only used then
      mhe_sh1 = sh1_top
         
!$OMP END SINGLE

! --------------------------------------------------
6666  continue ! place to jump if w1-w0<0, no ray testing was done

!$OMP SINGLE

! index im by 16 pack: (-3,-2,-1,0,1,2,3,4) -> (0,1)
      wh(1) = ( mhe_W + wh(1) + 3 )/4 - mhe_W/4
      wh(2) = wh(2)/4
      wh(3) = ( wh(3) + 3 )/4
      wh(4) = wh(4)/4

! allocate target rendering array, exit if nothing to render      
      if(.NOT. block(nV)%mpi .AND. .NOT. block(nV)%load) then !render NOMPI, must be full dimensions
! this is for the smp mode where renderer calls the finisher just for processing and writing
! out the image from rendering one block.
         mhe_sh0 = 1
         mhe_sh1 = mhe_nsh
         wh(1) = 1; wh(2) = mhe_W/4; wh(3) = 1; wh(4) = mhe_H/4
         if( allocated( block(nV)%cube(block(nV)%n)%im ) ) deallocate(block(nV)%cube(block(nV)%n)%im)
         allocate(block(nV)%cube(block(nV)%n)%im(1:16,0:3,wh(1):wh(2),wh(3):wh(4),mhe_sh0:mhe_sh1,1:mhe_nR) )
         block(nV)%cube(block(nV)%n)%im = 0.0_4 ! zero array
      else if(mhe_tiles_right*mhe_tiles_down > 1) then
         if(numhits_all > 0) then
            if( allocated(block(nV)%cube(0)%im)) deallocate(block(nV)%cube(0)%im)
            allocate(block(nV)%cube(0)%im(1:16,0:3,wh(1):wh(2),wh(3):wh(4),mhe_sh0:mhe_sh1,1:mhe_nR) )
            block(nV)%cube(0)%im = 0.0_4 ! zero array
         else
            block(nV)%n = block(nV)%n - 1
!           return ! will return outside openmp loop below
         end if
      else if(numhits_all > 0) then
         if( allocated(block(nV)%cube(block(nV)%n)%im)) deallocate(block(nV)%cube(block(nV)%n)%im)
         allocate(block(nV)%cube(block(nV)%n)%im(1:16,0:3,wh(1):wh(2),wh(3):wh(4),mhe_sh0:mhe_sh1,1:mhe_nR) )
         block(nV)%cube(block(nV)%n)%im = 0.0_4 ! zero array
      else if( block(nV)%load ) then ! no hits
         block(nV)%n = block(nV)%n - 1
         if( block(nV)%n == 0) then
            mhe_E = E_AMR_PASS ! for mpi_header
!            mhe_wh_out = wh ! copy these to mpi_header
            mhe_wh_out(1)=1; mhe_wh_out(2)=0
            mhe_nBlock = block(nV)%n ! use for all in tag for composer to recv data: TAG = nV*1000000 + block(nV)%n
            block(nV)%mpi_header(:,0) = mpi_header(:)
         end if
!          return ! will return outside openmp loop below
!      else
!          in this case, numhits_all = 0, regular MPI render and send to compose or finish             
      end if ! no dud message data sends


      if(.NOT. block(nV)%load .OR. numhits_all > 0) then ! --------- numhits_all > 0, else don't bother doing
      
! to adjust back from AMR changes, as needed for compose/finish
         E_AMR_SAVE = mhe_E ! save this exactly for sampling
         mhe_E = E_AMR_PASS ! for mpi_header
         mhe_wh_out = wh ! copy these to mpi_header
         mhe_nBlock = block(nV)%n ! use for all in tag for composer to recv data: TAG = nV*1000000 + block(nV)%n
         block(nV)%mpi_header(:,block(nV)%n) = mpi_header(:) ! copy for sending up
      
         if(block(nV)%n .ge. 1) then ! load
            if(tiles_right_in*tiles_down_in == 1)  mhe_TARGid = TARGid(1,1)
            mhe_nV_out = nV_out
            block(nV)%mpi_header(:,block(nV)%n) = mpi_header(:)
            block(nV)%wh(1:4,block(nV)%n) = wh(1:4)
            block(nV)%wh(5,block(nV)%n) = mhe_iNX
            block(nV)%wh(6,block(nV)%n) = mhe_iNY
            block(nV)%wh(7,block(nV)%n) = mhe_iNZ
            block(nV)%wh(8,block(nV)%n)  = mhe_XN
            block(nV)%wh(9,block(nV)%n)  = mhe_YN
            block(nV)%wh(10,block(nV)%n) = mhe_ZN
            block(nV)%wh(14,block(nV)%n) = mhe_sh0
            block(nV)%wh(15,block(nV)%n) = mhe_sh1
            block(nV)%wh(16,block(nV)%n) = mhe_Vdim
         else if( block(nV)%load ) then ! block(nV)%n == 0
            if(tiles_right_in*tiles_down_in == 1)  mhe_TARGid = TARGid(1,1)
            mhe_nV_out = nV_out
            block(nV)%mpi_header(:,block(nV)%n) = mpi_header(:)
            block(nV)%wh(1:4,block(nV)%n) = wh(1:4)
            block(nV)%wh(5,block(nV)%n) = mhe_iNX
            block(nV)%wh(6,block(nV)%n) = mhe_iNY
            block(nV)%wh(7,block(nV)%n) = mhe_iNZ
            block(nV)%wh(8,block(nV)%n)  = mhe_XN
            block(nV)%wh(9,block(nV)%n)  = mhe_YN
            block(nV)%wh(10,block(nV)%n) = mhe_ZN
            block(nV)%wh(14,block(nV)%n) = mhe_sh0
            block(nV)%wh(15,block(nV)%n) = mhe_sh1
            block(nV)%wh(16,block(nV)%n) = mhe_Vdim
         end if

      end if ! ------------------- numhits_all > 0

! ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss
! sssssssss end setup ssssssssssssssssssssssssssssssssssssssssssssssss

! done here instead of within rendering loop by all threads
      mhe_E = E_AMR_SAVE ! all restore this

! barrier here for all threads, necessary
!$OMP END SINGLE




#ifdef SREND_RENDER_TIMING
!$OMP MASTER
      call system_clock(it2); print *,'Render: after rays setup =:',(it2-it1)*rc; it1 = it2
!$OMP END MASTER
#endif


! This is outside an openmp parallel section, a shared "save" var set by the single thread above
! This test then exit on numhits_all <= 0 must be done outside.


! ++++++++++++++++++++++++++++++++++++++ check this!
      if(block(nV)%load .AND. numhits_all <= 0) return
! +++++++++++++++++++ check on this above, fails in usual mode if no hits with this set


      
! only master does this, the single above must have updated sh mem
! for this setup message to be loaded properly
! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
#ifndef SREND_NOMPI
!$OMP MASTER

      if(.NOT. block(nV)%load .AND. tiles_right_in*tiles_down_in == 1) then
! send bounds and info SREND_SETUP_TAG
         block(nV)%isendDONE2(block(nV)%n) = .FALSE.
         block(nV)%nreq2 = block(nV)%n

         srendMPIrank = mhe_TARGid ! for portability INTEGER
         srendMPItag = SREND_TAG2+nV_out ! for portability INTEGER
#ifdef SREND_ISSEND
         call MPI_issend(block(nV)%mpi_header(1,block(nV)%n),400,MPI_CHARACTER,  &
#else
         call MPI_isend(block(nV)%mpi_header(1,block(nV)%n),400,MPI_CHARACTER,  &
#endif
         srendMPIrank, srendMPItag, srend_COMM, block(nV)%MYreq2, MYer)
      end if
!$OMP END MASTER
#endif
! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$

#ifdef SREND_RENDER_TIMING
      call system_clock(it2); print *,'Render: after header sent, MPI only =:',(it2-it1)*rc; it1 = it2
#endif




! RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR
! Ray Casting loop : all threads do this
! RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR
if( numhits_all > 0) then

#ifdef SREND_AMR
   dt_Valpha_Vdim = dt * Valpha / mhe_Vdim
#endif

! determine thread id for load sharing scheme: id = th, num threads = nth
!#ifdef _OPENMP
!   th = OMP_get_thread_num()
!   nth = OMP_get_num_threads()
!#else
!   th = 0
!   nth = 1
!#endif
      
! AMR tiling
   rendi = block(nV)%n
   if(mhe_tiles_right*mhe_tiles_down > 1) rendi = 0 ! target array for rendering 

! nRnRnR ----- loop over the numver of variables ---------------------
   do ii=1,mhe_nR

      do ish = mhe_sh0, mhe_sh1 ! loop over shells

! Here determine compositing work for thread th by 16pack over k
         part = numhits(ish) ! number of 16packs to do
         Wpart = part / nth ! number of 16packs per thread, coarse
         Rpart = part - Wpart * nth ! remainder, leftover 16packs
! we determine each th thread's span of 16packs to do: Wpart + (1 or 0)
         th0 = 1 + th*Wpart         !all get this many 16packs
         th1 = 1 + (th+1)*Wpart - 1
         if( th .ge.  Rpart) then ! add offset was Rpart was done
            th0 = th0 + Rpart
            th1 = th1 + Rpart      
         else ! th < Rpart
            th0 = th0 + th     ! offset done before
            th1 = th1 + th + 1 ! extra offset within
         end if
         

! ccccccccccc for each k cast a 16pack of rays over i and march them through by tt
         do k = th0, th1

! init results for pack
SREND_VECTORIZE
            do i=1,16
               ddR(i) = 0.0_4
               ddG(i) = 0.0_4
               ddB(i) = 0.0_4
               daa(i) = 0.0_4
            end do


#ifdef SREND_TOFF
            do tt=itpackmax(k,ish), itpackmin(k,ish)-1,-1 ! TTTTTTTT
#else
            do tt=itpackmax(k,ish), itpackmin(k,ish),-1 ! TTTTTTTT
               t = tt*dt
#endif


#ifdef SREND_THREAD_LOAD
               block(nV)%th_hit(th) = block(nV)%th_hit(th) + 1
#endif

SREND_VECTORIZE
               do i=1,16 ! VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV

! these are coordinates in grid plane: [offX,offX+N]
#ifdef SREND_TOFF
                  t(i) = tt*dt + toff(i,k,ish)
                  x(i) = t(i)*Q(i,1,k,ish) + mhe_E(1) + .5_4
                  y(i) = t(i)*Q(i,2,k,ish) + mhe_E(2) + .5_4
                  z(i) = t(i)*Q(i,3,k,ish) + mhe_E(3) + .5_4
FLOP(11)
#else
                  x(i) = t*Q(i,1,k,ish) + mhe_E(1) + .5_4
                  y(i) = t*Q(i,2,k,ish) + mhe_E(2) + .5_4
                  z(i) = t*Q(i,3,k,ish) + mhe_E(3) + .5_4
FLOP(9)
#endif

                  ix(i) = x(i) ! cast, if floor wont vectorize, compiler might whine
                  iy(i) = y(i) ! cast, if floor wont vectorize, compiler might whine
                  iz(i) = z(i) ! cast, if floor wont vectorize, compiler might whine
!                  ix(i) = floor(x(i))
!                  iy(i) = floor(y(i))
!                  iz(i) = floor(z(i))
                  

! mask out bad values
                  tmask(i) = 1.0_4
#ifdef SREND_TOFF
                  if( t(i) .le. tmin(i,k,ish)) tmask(i) = 0.0_4
                  if( t(i) .gt. tmax(i,k,ish)) tmask(i) = 0.0_4
#else
                  if( t .le. tmin(i,k,ish)) tmask(i) = 0.0_4
                  if( t .gt. tmax(i,k,ish)) tmask(i) = 0.0_4
#endif
                  if( tmask(i) .eq. 0.0_4) then
                     ix(i) = iNXNp1 ! exists in data array, does not matter what
                     iy(i) = iNYNp1 ! these are: iNYNp1 = iNY*N + 1
                     iz(i) = iNZNp1
                  end if

! >>> trilinear on color table entry 
                  dx0(i) = x(i) - ix(i)
                  dy0(i) = y(i) - iy(i)
                  dz0(i) = z(i) - iz(i)
                  dx1(i) = 1.0_4 - dx0(i)
                  dy1(i) = 1.0_4 - dy0(i)
                  dz1(i) = 1.0_4 - dz0(i)
FLOP(6)
! correct offset
                  ix(i) = ix(i) - iNXN!*tmask(i,k) : these are: iNX*N
                  iy(i) = iy(i) - iNYN!*tmask(i,k)
                  iz(i) = iz(i) - iNZN!*tmask(i,k)
               end do ! vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
 

! ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss
#ifdef SREND_SOLID_RENDERING
! always back to front
if(.NOT. mhe_outin .AND. Sdim == 2) then ! do solid here

   Sminmax_xyz(0,1) = minval(ix)
   Sminmax_xyz(1,1) = maxval(ix)
   Sminmax_xyz(0,2) = minval(iy)
   Sminmax_xyz(1,2) = maxval(iy)
   Sminmax_xyz(0,3) = minval(iz)
   Sminmax_xyz(1,3) = maxval(iz)

   if( Sminmax_xyz(0,1)<= Sbounds(1,1)+1 .AND. Sminmax_xyz(1,1) >= Sbounds(0,1) .AND. &
      Sminmax_xyz(0,2)<= Sbounds(1,2)+1 .AND. Sminmax_xyz(1,2) >= Sbounds(0,2) .AND. &
      Sminmax_xyz(0,3)<= Sbounds(1,3)+1 .AND. Sminmax_xyz(1,3) >= Sbounds(0,3) ) then

      
SREND_VECTORIZE
      do i=1,16
!#ifdef NOTHING
         S00(i) = dy1(i)*iachar(S(ix(i)  ,iy(i),iz(i)))     + dy0(i)*iachar(S(ix(i)  ,iy(i)+1,iz(i)))
         S10(i) = dy1(i)*iachar(S(ix(i)+1,iy(i),iz(i)))     + dy0(i)*iachar(S(ix(i)+1,iy(i)+1,iz(i)))
         S01(i) = dy1(i)*iachar(S(ix(i)  ,iy(i),iz(i)+1))   + dy0(i)*iachar(S(ix(i)  ,iy(i)+1,iz(i)+1))
         S11(i) = dy1(i)*iachar(S(ix(i)+1,iy(i),iz(i)+1))   + dy0(i)*iachar(S(ix(i)+1,iy(i)+1,iz(i)+1))
         dalpha(i) = ( dx1(i)*dz1(i)*S00(i)+dx0(i)*dz1(i)*S10(i) + &
                        dx1(i)*dz0(i)*S01(i)+dx0(i)*dz0(i)*S11(i) )/255.0_4
         dalpha(i) = max(0.0_4, dalpha(i))
         dalpha(i) = min(1.0_4,dalpha(i))
         dalpha(i) = dalpha(i)*tmask(i) ! zero if outside

         dR(i) = (dx1(i)*dz1(i)*iachar( Srgb(1,ix(i)  ,iz(i)) ) + &
                  dx0(i)*dz1(i)*iachar( Srgb(1,ix(i)+1,iz(i)) ) + &
                  dx1(i)*dz0(i)*iachar( Srgb(1,ix(i)  ,iz(i)+1) ) + &
                  dx0(i)*dz0(i)*iachar( Srgb(1,ix(i)+1,iz(i)+1) )  )/255.0_4
         dR(i) = max(0.0_4,dR(i)); dR(i) = min(255.0_4,dR(i))
         dG(i) = (dx1(i)*dz1(i)*iachar( Srgb(2,ix(i)  ,iz(i)) ) + &
                  dx0(i)*dz1(i)*iachar( Srgb(2,ix(i)+1,iz(i)) ) + &
                  dx1(i)*dz0(i)*iachar( Srgb(2,ix(i)  ,iz(i)+1) ) + &
                  dx0(i)*dz0(i)*iachar( Srgb(2,ix(i)+1,iz(i)+1) )  )/255.0_4
         dG(i) = max(0.0_4,dG(i)); dG(i) = min(255.0_4,dG(i))
         dB(i) = (dx1(i)*dz1(i)*iachar( Srgb(3,ix(i)  ,iz(i)) ) + &
                  dx0(i)*dz1(i)*iachar( Srgb(3,ix(i)+1,iz(i)) ) + &
                  dx1(i)*dz0(i)*iachar( Srgb(3,ix(i)  ,iz(i)+1) ) + &
                  dx0(i)*dz0(i)*iachar( Srgb(3,ix(i)+1,iz(i)+1) )  )/255.0_4
         dB(i) = max(0.0_4,dB(i)); dB(i) = min(255.0_4,dB(i))
!#endif
#ifdef NOTHING
         dalpha(i) = iachar(S(ix(i),iy(i),iz(i)))/255.0_4
         dR(i) = iachar( Srgb(1,ix(i)  ,iz(i)) ) / 255.0_4
         dG(i) = iachar( Srgb(2,ix(i)  ,iz(i)) ) / 255.0_4
         dB(i) = iachar( Srgb(3,ix(i)  ,iz(i)) ) / 255.0_4
#endif
         transparency(i) = (1.0_4 - dalpha(i))
         ddR(i) = transparency(i)*ddR(i) + dalpha(i)*dR(i) ! replaced as these are premultiplied by alpha above
         ddG(i) = transparency(i)*ddG(i) + dalpha(i)*dG(i)
         ddB(i) = transparency(i)*ddB(i) + dalpha(i)*dB(i)
         daa(i) = transparency(i)*daa(i) + dalpha(i)
      end do
      
   end if ! there is solid to render
      
end if ! outine = T && Sdim == 1
#endif
! ssss solid rendering above -----------------------------------------
! ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss
 
 
! obtain and lineup data for SIMD section, probably not useful to vectorize
SREND_VECTORIZE
               do i=1,16
                  dat8(i,1) = ichar(bytedata(ix(i)  ,iy(i)  ,iz(i)  ,ii) )
                  dat8(i,2) = ichar(bytedata(ix(i)+1,iy(i)  ,iz(i)  ,ii) )
                  dat8(i,3) = ichar(bytedata(ix(i)  ,iy(i)+1,iz(i)  ,ii) )
                  dat8(i,4) = ichar(bytedata(ix(i)  ,iy(i)  ,iz(i)+1,ii) )
                  dat8(i,5) = ichar(bytedata(ix(i)+1,iy(i)+1,iz(i)  ,ii) )
                  dat8(i,6) = ichar(bytedata(ix(i)+1,iy(i)  ,iz(i)+1,ii) )
                  dat8(i,7) = ichar(bytedata(ix(i)  ,iy(i)+1,iz(i)+1,ii) )
                  dat8(i,8) = ichar(bytedata(ix(i)+1,iy(i)+1,iz(i)+1,ii) )
               end do

      
      
      
SREND_VECTORIZE
               do i=1,16 ! VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
! interpolate tval(i) from color table byte values on 8 corner points of cell 
                  tval(i)=                           &
                     dat8(i,1)*dx1(i)*dy1(i)*dz1(i)  &
                  + dat8(i,2)*dx0(i)*dy1(i)*dz1(i)  &
                  + dat8(i,3)*dx1(i)*dy0(i)*dz1(i)  &
                  + dat8(i,4)*dx1(i)*dy1(i)*dz0(i)  &
                  + dat8(i,5)*dx0(i)*dy0(i)*dz1(i)  &
                  + dat8(i,6)*dx0(i)*dy1(i)*dz0(i)  &
                  + dat8(i,7)*dx1(i)*dy0(i)*dz0(i)  &
                  + dat8(i,8)*dx0(i)*dy0(i)*dz0(i)
FLOP(31)

! optimize
                  tval(i) = tval(i)*tmask(i) ! will be 0 if the sample was outside of data array
                  if(tval(i) >= 255.0_4) tval(i) = 249.999_4 ! correction, ugly, but OK
!                  cotabi(i) = tval(i) ! cast, lower cotab entry index, compiler may complain: if floor won't vectorize
                  cotabi(i) = floor(tval(i))
                  tval(i) = tval(i) - cotabi(i) ! reuse tval() var, the higher cotab value portion
                  q1Mtval(i) = 1.0_4 - tval(i)    ! the lower cotab value portion
FLOP(3)
               end do ! vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv

! not SIMD here: interpolate RGBA values from color table cotab()
               do i=1,16
                  dR(i) = (cotab(1,cotabi(i),ii)*(q1Mtval(i)) + &
                           cotab(1,cotabi(i)+1,ii)*tval(i) )
                  dG(i) = (cotab(2,cotabi(i),ii)*(q1Mtval(i)) + &
                           cotab(2,cotabi(i)+1,ii)*tval(i) )
                  dB(i) = (cotab(3,cotabi(i),ii)*(q1Mtval(i)) + &
                           cotab(3,cotabi(i)+1,ii)*tval(i) )
                  dalpha(i) = ( (cotab(0,cotabi(i),ii)*(q1Mtval(i)) + &
                           cotab(0,cotabi(i)+1,ii)*tval(i) ) )*tmask(i)
FLOP(14)
               end do

! ------ composite -----------------
               if(.NOT. mhe_outin) then
!     usual: back2front for view from eye
! back2front : inside->out
SREND_VECTORIZE
                  do i=1,16 ! VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV

#ifdef SREND_AMR
                     transparency(i) = (1.0_4 - dalpha(i))**dt_Valpha_Vdim
                     dalpha(i) = 1.0_4 - transparency(i)
FLOP(3)
#else
                     transparency(i) = 1.0_4 - dalpha(i)
FLOP(1)
#endif
                     ddR(i) = transparency(i)*ddR(i) + dalpha(i)*dR(i) ! replaced as these are premultiplied by alpha above
                     ddG(i) = transparency(i)*ddG(i) + dalpha(i)*dG(i)
                     ddB(i) = transparency(i)*ddB(i) + dalpha(i)*dB(i)
                     daa(i) = transparency(i)*daa(i) + dalpha(i)
FLOP(11)
                  end do ! i : vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv

      
      
               else
!     unusual: back2front but viewing from outside to eye
! front2back : outside->in
SREND_VECTORIZE
                  do i=1,16 ! VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV

#ifdef SREND_AMR
                     dalpha(i) = 1.0_4 - (1.0_4 - dalpha(i))**dt_Valpha_Vdim
FLOP(3)
#endif
                     transparency(i) = 1.0_4 - daa(i)
                     ddR(i) = transparency(i)*dR(i)*dalpha(i) + ddR(i) ! replaced as these are premultiplied by alpha above
                     ddG(i) = transparency(i)*dG(i)*dalpha(i) + ddG(i)
                     ddB(i) = transparency(i)*dB(i)*dalpha(i) + ddB(i)
                     daa(i) = transparency(i)*dalpha(i)       + daa(i)
FLOP(12)
                  end do ! i : vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv

               end if ! not mhe_outin
! ---- end composite ---------------

            end do ! tt : tttttttttttttttttttttttttttttttttttttttttttttttttt

! all map to same 4x4 16-pack, so use the first one      
!      im_cc(k,ish) =  (mhe_W + cc(1,k,ish) - shiftW(k,ish) + 3)/4 - mhe_W/4
!      im_rr(k,ish) =  (rr(1,k,ish) + 3)/4

SREND_VECTORIZE
            do i=1,16 ! rendi = 0 if tiles_right*tiles_down > 1, else is block(nV)%n
               block(nV)%cube(rendi)%im(i,0,im_cc(k,ish),im_rr(k,ish),ish,ii) = daa(i) ! a
               block(nV)%cube(rendi)%im(i,1,im_cc(k,ish),im_rr(k,ish),ish,ii) = ddR(i) ! a
               block(nV)%cube(rendi)%im(i,2,im_cc(k,ish),im_rr(k,ish),ish,ii) = ddG(i) ! a
               block(nV)%cube(rendi)%im(i,3,im_cc(k,ish),im_rr(k,ish),ish,ii) = ddB(i) ! a
            end do


         end do ! k : over 16 packs of rays

      end do ! ish : sh0 to sh1 shells

   end do ! ii 1:mhe_nR byte arrays of variable data


! rrrrrrrrrrrrrr end ray casting lop rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr
! rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr
end if ! numhits_all > 0
! all threads must reach this so all data is in VC(nV)%im
!$OMP BARRIER

#ifdef SREND_FLOP
!$OMP MASTER
      print *,'flops =',flop,'M=',1.0e-6*flop,'G=',1.0e-9*flop,'T=',1.0e-12*flop
!$OMP END MASTER
#endif

#ifdef SREND_RENDER_TIMING
!$OMP MASTER
      call system_clock(it2); print *,'Render: after core sampling loop =:',(it2-it1)*rc; it1 = it2
!$OMP END MASTER
#endif


! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ 
#ifndef SREND_NOMPI
!$OMP MASTER
if( numhits_all > 0 .AND. .NOT. block(nV)%load .AND. tiles_right_in*tiles_down_in == 1) then
      block(nV)%isendDONE(block(nV)%n) = .FALSE.
      block(nV)%nreq = block(nV)%n

      srendMPIrank = mhe_TARGid
      srendMPItag = nV_out*10000 + block(nV)%n
#ifdef SREND_ISSEND
      call MPI_issend(block(nV)%cube(block(nV)%n)%im,size(block(nV)%cube(block(nV)%n)%im),MPI_REAL,    &
#else      
      call MPI_isend(block(nV)%cube(block(nV)%n)%im,size(block(nV)%cube(block(nV)%n)%im),MPI_REAL,    &
#endif
      srendMPIrank, srendMPItag, srend_COMM, block(nV)%MYreq, MYer)
end if
!$OMP END MASTER
#endif
! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$




#ifndef SREND_NOMPI
!$OMP MASTER
! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
if(tiles_right_in*tiles_down_in > 1) then
if(numhits_all > 0) then

! AMR tiling
   tileW = mhe_W / mhe_tiles_right / 4
   tileH = mhe_H / mhe_tiles_down / 4

   do i = (mhe_W/4 + wh(1)-1)/tileW + 1 - mhe_tiles_right, (mhe_W/4 + wh(2)-1)/tileW + 1 - mhe_tiles_right
      do j = (wh(3)-1)/tileH + 1, (wh(4)-1)/tileH + 1
         w0 = max(wh(1),1+(i-1)*tileW)
         w1 = min(wh(2),i*tileW)
         h0 = max(wh(3),1+(j-1)*tileH)
         h1 = min(wh(4),j*tileH)

! swing over if neg; and wo and w1 must both be negative here
         if(w0 <= 0) then
            iii = mhe_W/4
         else
            iii = 0
         end if
         
         if( i < 1) then
            itile = mhe_tiles_right
         else
            itile = 0
         end if
            
         if( allocated( block(nV)%cube(block(nV)%n)%im )) deallocate(block(nV)%cube(block(nV)%n)%im)
         allocate( block(nV)%cube(block(nV)%n)%im(1:16,0:3,w0+iii:w1+iii,h0:h1,mhe_sh0:mhe_sh1,1:mhe_nR) )
         do ir = 1,mhe_nR
         do ish = mhe_sh0, mhe_sh1
         do jj=h0,h1
         do ii=w0,w1
         do k=0,3
         do kk=1,16
            block(nV)%cube(block(nV)%n)%im(kk,k,ii+iii,jj,ish,ir) = block(nV)%cube(0)%im(kk,k,ii,jj,ish,ir)
         end do
         end do
         end do
         end do
         end do
         end do
         
         block(nV)%Tload(i+itile,j) = block(nV)%Tload(i+itile,j) + 1 ! increment tile count for this rank

         mhe_wh_out(1) = w0+iii; mhe_wh_out(2) = w1+iii; mhe_wh_out(3) = h0; mhe_wh_out(4) = h1
         mhe_E = E_AMR_PASS ! must restore this for mpi_header, changed to E_SAVE for ray cast loop
         mhe_nBlock = block(nV)%n
         mhe_TRi = i+itile
         mhe_TDi = j
         mhe_TARGid = TARGid(i+itile,j)
         mhe_Vpass = Vpass(nV)

         block(nV)%mpi_header(:,block(nV)%n) = mpi_header(:) ! copy over
! write to mpi_header (as equivalenced), then copy to this header to send up
         block(nV)%isendDONE2(block(nV)%n) = .FALSE.
         block(nV)%nreq2 = block(nV)%n
            
         srendMPIrank = mhe_TARGid ! MPI portability
         srendMPItag = SREND_TAG2+nV_out
#ifdef SREND_ISSEND
         call MPI_issend(block(nV)%mpi_header(1,block(nV)%n), 400,MPI_CHARACTER, &
#else 
         call MPI_isend(block(nV)%mpi_header(1,block(nV)%n), 400,MPI_CHARACTER,  &
#endif
                            srendMPIrank, srendMPItag,                           &
                            srend_COMM, block(nV)%MYreq2, MYer)
! send data                            
         block(nV)%isendDONE(block(nV)%n) = .FALSE.
         block(nV)%nreq = nV_out*10000 + block(nV)%n
         
         srendMPIrank = mhe_TARGid ! MPI portability
         srendMPItag = nV_out*10000 + block(nV)%n
#ifdef SREND_ISSEND
         call MPI_issend(block(nV)%cube(block(nV)%n)%im,size(block(nV)%cube(block(nV)%n)%im),MPI_REAL, &
#else 
         call MPI_isend(block(nV)%cube(block(nV)%n)%im,size(block(nV)%cube(block(nV)%n)%im),MPI_REAL,  &
#endif
                            srendMPIrank, srendMPItag,                                                 & 
                            srend_COMM, block(nV)%MYreq, MYer)
         block(nV)%n = block(nV)%n + 1 ! for next block
      end do
   end do
      
end if ! numhits_all > 0
block(nV)%n = block(nV)%n - 1 ! adjust back from one extra creation, always
end if ! tilesR*tilesD > 0
! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
!$OMP END MASTER
#endif

      
      

#ifdef SREND_RENDER_TIMING
!$OMP MASTER
      call system_clock(it2); print *,'Render: after MPI send data =:',(it2-it1)*rc; it1 = it2
!$OMP END MASTER
#endif
      

! --------------------- SREND_NOMPI = desktop ------------------------
#ifdef SREND_NOMPI
!$OMP MASTER
      if( .NOT. block(nV)%load ) then
         call Srend_base(nV,0,0,0,3)
      end if
!$OMP END MASTER
!$OMP BARRIER
#endif
! -------------------- end : desktop NOMPI ---------------------------


      return ! nothing left to do
! rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr
! rrrrrrrrrrrrrrrr end rendering :ray casting rrrrrrrrrrrrrrrrrrrrrrrr
! rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr

      CONTAINS

! numerical recipes heapsort, just for integer*4; must have n>1
      SUBROUTINE heapsort(N,RA)
      implicit none
      integer*4 :: N, RA(N)
      integer*4 :: L,RRA,IR,I,J
      L=N/2+1
      IR=N
      !The index L will be decremented from its initial value during the
      !"hiring" (heap creation) phase. Once it reaches 1, the index IR 
      !will be decremented from its initial value down to 1 during the
      !"retirement-and-promotion" (heap selection) phase.
1099  continue
      if(L > 1)then
         L=L-1
         RRA=RA(L)
      else
         RRA=RA(IR)
         RA(IR)=RA(1)
         IR=IR-1
         if(IR.eq.1)then
            RA(1)=RRA
            return
         end if
      end if
      I=L
      J=L+L
2099  if(J.le.IR)then
         if(J < IR)then
            if(RA(J) < RA(J+1))  J=J+1
         end if
         if(RRA < RA(J))then
            RA(I)=RA(J)
            I=J
            J=J+J
         else
            J=IR+1
         end if
         goto 2099
      end if
      RA(I)=RRA
      goto 1099
      END subroutine heapsort
      


   end subroutine srend_render ! Srend_render subroutine
! ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss






! SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
! SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
! Spherical Rendering     (2,3)                 Wetherbee 2014  FDLTCC
! SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
! SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
#ifndef SREND_NOMPI
! ====================================================================
! wrapper for composer           
   subroutine srend_compose(   &
      nV,                      &
      nV_out,                  &
      numSources,              &
      TARGid)

      integer,intent(IN)    :: nV
      integer,intent(IN)    :: nV_out
      integer,intent(IN)    :: numSources
      integer,intent(IN)    :: TARGid

      call Srend_base(   &
         nV,             &
         nV_out,         &
         numSources,     &
         TARGid,         &
         2)                ! code for composer
   end subroutine srend_compose
	  
! ====================================================================
! wrapper for finisher    
   subroutine srend_finish(   &
      nV,                     &
      numSources)

      integer,intent(IN)    :: nV
      integer,intent(IN)    :: numSources

      call Srend_base(   &
         nV,             &
         nV,             & ! nv_out, not used
         numSources,     &
         0,              & ! targid, not used
         3)                ! code for finisher
       
   end subroutine srend_finish
	  
! ====================================================================
! wrapper for tile finisher, called by all ranks    

   subroutine srend_tile_finish( nV, nv_out)
      implicit NONE 
      include 'mpif.h'
! same process: nV is where view information is held common to all processes, so available
! nV_out is where the finish routine will store data
      integer,intent(IN)    :: nV, nV_out
      integer    :: numSources

      integer :: MYid, MYer

! for exracting data from source view nV block header
      INTEGER*4 :: mhe_tiles_down, mhe_tiles_right, mhe_TRi, mhe_TDi
      character*1,dimension(1:400) :: mpi_header ! 1:608
      equivalence (mpi_header(317),mhe_tiles_right )  ! 317:320
      equivalence (mpi_header(321),mhe_tiles_down )   ! 321:324 
      equivalence (MPI_header(389),mhe_TRi)         ! 389:392
      equivalence (MPI_header(393),mhe_TDi)         ! 393:396
      
      INTEGER*4 :: col, row, irc
      logical :: MEtile ! = .FALSE. ! if exit because this rank does not composite/write a tile
! ------------ end var declarations ----------------------------------            

      call srend_comm_init() ! needed in here, sometimes
      call MPI_comm_rank(srend_COMM, MYid, MYer)
      
! logic here: if MYid is in TARGid, then call, else exit but always reset Tload=0      
      mpi_header(:) = block(nV)%mpi_header(:,0) ! get tiles_right and tiles_down
   
      MEtile = .FALSE.
      do irc = 1, mhe_tiles_right*mhe_tiles_down
         row = 1 + (irc-1)/mhe_tiles_right ! row index in 1:tiles_down
         col = 1 + mod(irc-1,mhe_tiles_right) ! col index in 1:tiles_right
         if(MYid == block(nV)%TARGid(col,row) ) then
            MEtile = .TRUE.
            exit ! found, no more searching in do loop
         end if
      end do
      
      if( MEtile ) then
         numSources = block(nV)%Tload(col,row)
         if(numSources == 0) then ! in this case, there is no block(nV_out)%mpi_header() sent in for recv, so supply data needed
            mhe_TRi = col
            mhe_TDi = row
            if(.NOT. allocated(block(nV_out)%mpi_header) ) then
               block(nV_out)%used = .TRUE.
               allocate( block(nV_out)%mpi_header(1:400,0:SREND_MAX_LOAD) )
            end if
            block(nV_out)%mpi_header(:,0) = mpi_header(:) ! set this as it is not known otherwise
            call Srend_base(    &
               nV_out,          &
               nV_out,          & ! nv_out, not used
               numSources,      &
               0,               & ! targid, not used
               3) ! code for finisher 
         else ! there is a block(nV_out)%mpi_header as needed
            call Srend_base(    &
               nV_out,          &
               nV_out,          & ! nv_out, not used
               numSources,      &
               0,               & ! targid, not used
               3) ! code for finisher
         end if
#ifdef SREND_TILE_MONTAGE 
      else
         call MPI_barrier(srend_COMM, MYer) ! for montage, called by all finishers
#endif
      end if 
! do this by all ranks though  
      block(nV)%Tload = 0 ! experiment here, should look it up
      block(nV)%n = 0 ! zero here
      block(nV)%load_pass = block(nV)%load_pass + 1

   end subroutine srend_tile_finish
#endif


! =====================================================================
! the subroutine call by compose and finish wrappers above
! =====================================================================
   subroutine Srend_base( &
      nV,                 &
      nV_out,             &
      numSources,         &
      TARGid,             &
      rflag)

#ifdef _OPENMP
      USE OMP_LIB
#endif
      implicit NONE  
#ifndef SREND_NOMPI 
      include 'mpif.h'
#endif

      integer :: nV
      integer,intent(IN) :: nV_out

      integer,intent(IN) :: numSources
      integer,intent(IN) :: TARGid
      integer,intent(IN) :: rflag ! flag to determine action

! --------------------------------------------------------------------
! some things are done only on first pass, others on a following pass
! and per view
      INTEGER*4,dimension(1:SREND_MAX_V),save :: Vpass
! --------------------------------------------------------------------
! counters
      INTEGER*4 i,j,k, kc
      INTEGER*4 :: iNX1,iNY1,iNZ1 ! to calc offset and NX,NY,NZ
! --------------------------------------------------------------------
! global rendering parameters
      REAL*4,save :: PI
      REAL*4 :: dW, dH  ! increments for W and H across viewing 

! --------------------------------------------------------------------
! output imagery
      REAL*4 RGB(1:3)
      character*64 header
      character*200,dimension(:),allocatable :: rfname_root, rfname_ext
      character*200 :: rfname_name
      character*200 :: rfname_command
      character*200 :: rfname_tile
! polar remap
      REAL*4, dimension(1:3) :: R, UU, VV, Q
      REAL*4 :: a, b
      REAL*4 :: sinb, cosb, cosbcosa, cosbsina, dotQU
      REAL*4 :: acos_arg ! intermediate variable
      INTEGER*4 :: Hoffset
! for rendering to perspective, also just writing out in this routine
! converting to perspective from spherical
      REAL*4 aa, bb, aaa, bbb
      INTEGER*4 ii,jj,kk
      REAL*4 dWW, dHH ! for arbitrary angle spans
      REAL*4 a_span, b_span ! span, centered at spherical origin
! bilinear interpolation: sampling spherical to perspective
      REAL*4 ddx0,ddy0,ddx1,ddy1

! --------------------------------------------------------------------
! FILES
      INTEGER*4 :: fun ! file unit number; must be dfferent for threads
      character*100 :: fnum ! file number, for making output file names
#if !defined(SREND_NOMPI) && defined(SREND_TILE_MONTAGE)
      character*100 :: fangles ! fangles argument to ./montage
#endif
! --------------------------------------------------------------------

! ---------- for extracting data from incoming mpi_header
      logical :: mhe_outin     ! if dt_in < 0
      INTEGER*4,dimension(1:4) :: mhe_wh_out
      INTEGER*4 :: mhe_iNX,mhe_iNY,mhe_iNZ
      INTEGER*4 mhe_XN,mhe_YN,mhe_ZN ! cells across in each dim
      REAL*4, dimension(3) :: mhe_E
      REAL*4, dimension(3):: mhe_Ev
      REAL*4, dimension(3) :: mhe_Up
      INTEGER*4 mhe_nR ! number cotab_offsets, and variable arrays
      REAL*4 :: mhe_Alpha
      REAL*4 :: mhe_Beta
      INTEGER*4 :: mhe_W
      INTEGER*4 :: mhe_H
      character*1,dimension(1:200) :: mhe_filenames
      INTEGER*4 :: mhe_tiles_right, mhe_tiles_down
      INTEGER*4 :: mhe_perspective ! 0 = sph, 1 = prsp
      INTEGER*4 :: mhe_srendtype ! 0=norm,1=npole,2=equ,3=sp
      INTEGER*4 :: mhe_Wim, mhe_Him ! target image WxH, for poles
      INTEGER*4 :: mhe_nsh, mhe_sh0, mhe_sh1 ! shells rendered
      
      INTEGER*4 :: mhe_Vdim, Vdim_max, Vdim_temp
! --------- shells
! counter for looping over sh0 to sh1
      INTEGER*4 :: ish
      
! for AMR tiling
      INTEGER*4 :: mhe_nBlock, mhe_TRi, mhe_TDi

! Blue Waters special dumping
      character*4 :: mhe_magic0 = 'LCSE'
      character*4 :: mhe_magic1 = 'ESCL'
      
      INTEGER*4 :: mhe_TARGid, mhe_nV_out, mhe_Vpass ! for load/flush
      
      
! mpi_header array
      character*1,dimension(1:400) :: mpi_header ! 1:608
      
      equivalence (mpi_header(1), mhe_magic0)  ! 1:4
      equivalence (mpi_header(5), mhe_outin)   ! 5:8
      equivalence (mpi_header(9),mhe_wh_out(1))! 9:24 w0 w1 h0 h1
      equivalence (mpi_header(25), mhe_iNX )   ! 25:28 iNX
      equivalence (mpi_header(29), mhe_iNY )   ! 29:32 iNY
      equivalence (mpi_header(33), mhe_iNZ )   ! 33:36 iNZ
      equivalence (mpi_header(37), mhe_XN )    ! 37:40 XN
      equivalence (mpi_header(41), mhe_YN )    ! 41:44 YN
      equivalence (mpi_header(45), mhe_ZN )    ! 45:48 ZN
      equivalence (mpi_header(49), mhe_nR )    ! 49:52
      equivalence (mpi_header(53), mhe_Ev(1) ) ! 53:64
      equivalence (mpi_header(65), mhe_E(1) )  ! 65:76 
      equivalence (mpi_header(77), mhe_Up(1) ) ! 77:88
      equivalence (mpi_header(89), mhe_Alpha)  ! 89:92
      equivalence (mpi_header(93), mhe_Beta)   ! 93:96
      equivalence (mpi_header(97), mhe_W)      ! 97:100
      equivalence (mpi_header(101), mhe_H)     ! 101:104
!     unused                                   ! 105:108
      equivalence (mpi_header(109),mhe_filenames(1) ) ! 109:308
      equivalence (mpi_header(309),mhe_perspective )  ! 309:312
      equivalence (mpi_header(313),mhe_srendtype )    ! 313:316
      equivalence (mpi_header(317),mhe_tiles_right )  ! 317:320
      equivalence (mpi_header(321),mhe_tiles_down )   ! 321:324 
      equivalence (mpi_header(325),mhe_Wim)           ! 325:328
      equivalence (mpi_header(329),mhe_Him)           ! 329:332
      equivalence (MPI_header(333),mhe_nsh)           ! 333:336
      equivalence (MPI_header(337),mhe_sh0)           ! 337:340
      equivalence (MPI_header(341),mhe_sh1)           ! 341:344
!      unused                                         ! 345:348
      equivalence (MPI_header(349),mhe_Vdim)          ! 349:352
!     unused                                      ! 353:356
      equivalence (MPI_header(357),mhe_nV_out)    ! 357:360 ! in mpi_header only for render_flush and render_flush_write
      equivalence (MPI_header(361),mhe_TARGid)    ! 361:364 ! in mpi_header only for render_flush and render_flush_write
      equivalence (MPI_header(365),mhe_Vpass)     ! 365:368
!     unused                                      ! 369:372
!     unused                                      ! 373:380
!     unused                                      ! 381:384
      equivalence (MPI_header(385),mhe_nBlock)    ! 385:388
      equivalence (MPI_header(389),mhe_TRi)       ! 389:392
      equivalence (MPI_header(393),mhe_TDi)       ! 393:396
      equivalence (MPI_header(397),mhe_magic1)    ! 397:400

      character*200 :: filenames_str ! for parsing out names
     
! --------------------------------------------------------------------
      type VC_type
         REAL*4,dimension(:,:,:,:,:),allocatable :: im ! outgoing
!DIR$ ATTRIBUTES ALIGN:64 :: im
! ------- sorting data
         INTEGER*4 :: NS ! number of cubes with data
         INTEGER*4,dimension(:,:),allocatable :: wh
         REAL*4, dimension(:,:,:,:,:),allocatable :: tim

         INTEGER*4,dimension(:),allocatable :: iw0,iw1,ih0,ih1
         INTEGER*4,dimension(:),allocatable :: rank
! --------- finishing images
         character(len=200),dimension(:),allocatable :: rfname
         character(len=200) :: tfname
         character*1, dimension(:,:,:),allocatable :: ppm_buffer
! for perspective, extra boundary padding for interpolation
         character*1, dimension(:,:,:),allocatable :: ppm_buffer2
         REAL*4, dimension(:,:,:),allocatable :: perspective
! --------- MPI
         integer :: MYer
! incoming data
         integer,dimension(:),allocatable :: MYreq
         logical,dimension(:),allocatable :: irecvDONE
! incoming setups
         integer,dimension(:),allocatable :: MYreq2
         logical :: irecvDONE2
! incoming setup info:(MPI_STATUS_SIZE,:)
         integer,dimension(:,:), allocatable :: MYstat2
! for duds
         INTEGER*4 :: dummies
         REAL*4,dimension(:,:),allocatable :: imDUM
         INTEGER*4,dimension(:),allocatable :: dumRANK
         INTEGER*4,dimension(:),allocatable :: dumTAGoff
         INTEGER*4,dimension(:),allocatable :: dumID
! shells to span: sh0 to sh1
         INTEGER*4 :: mhe_sh0, mhe_sh1
       
      end type VC_type

! the table of arrays for incoming render planes, various dimensions
      type(VC_type),dimension(1:SREND_MAX_V),save :: VC

! ------ sorting -----------------------------------------------------
      INTEGER*4,dimension(:,:),allocatable :: wh
      logical,dimension(:,:,:,:),allocatable :: oct

! create table for array dimensions by i
      INTEGER*4,dimension(:),allocatable :: iw0,iw1,ih0,ih1
! table for shell indices
      INTEGER*4,dimension(:),allocatable :: ish0, ish1

! list for sorting
      INTEGER*4,allocatable,dimension(:) :: L
      
      
#ifdef SREND_TIMING
! for testing .. collect statistics
      integer*8 it1, it2, itc
      REAL*4 rc
#endif

#if defined SREND_PARALLEL_COMPOSE || defined SREND_PARALLEL2_COMPOSE  
! threading in compositing
      INTEGER*4 :: th, nth, thRows, thWid, thRem, th0, th1
#endif

! tiling write offset (not WxH indices for im)
      INTEGER*4 :: offi, offj
      
#ifndef SREND_NOMPI
      integer :: MYid=0, MYer
      integer :: srendMPItag, srendMPIrank, srendMPIcount ! for MPI_INTEGER portability
#endif

      
! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
!  declarations done 
! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc


#ifndef SREND_NOMPI
      call srend_comm_init() ! called everytime, but only inits srend_COMM the 1st call
      call MPI_comm_rank(srend_COMM, MYid, MYer)
#endif

#if !defined(SREND_NOMPI) && defined(SREND_FTRACE)
      print *,'BASE MYid=',MYid,'nV=',nV,'Enter srend_base'
#endif


      if(.NOT. block(nV)%used ) block(nV)%used  = .TRUE. ! make sure set


#ifdef SREND_TIMING
      call system_clock(it1,itc)
      rc = 1.0_4 / itc
#endif      
      
      if( .NOT. allocated( VC(nV)%wh) ) then
         Vpass(nV) = 1           ! pass for this view
         allocate( VC(nV)%wh(1:20,0:SREND_MAX_LOAD) )
         PI = 4.0_4 * atan(1.0_4) ! done for every view, but OK once
      else
         Vpass(nV) = Vpass(nV) + 1
      end if

      
      
! SMP special: go to file write/finish **********************************************************************
#ifdef SREND_NOMPI
      if(numSources == 0) then ! NOMPI and called from render to write result in block(nV)%cube(0)
         mpi_header(:) = block(nV)%mpi_header(:,0)
#ifdef SREND_DEBUG
         print *,'CCCCCCCCCCCCCC enter: srend_compose() CCCCCCCCCCC nompi render-in compose'
         print *,'Call arguments::::::::::::::::::::::::::::::::::'
         print *,'nV=',nV, 'nV_out=',nV_out
         print *,'numSources=',numSources,'TARGid=',TARGid,'rflag=',rflag
         print *,'Header arguments::::::::::::::::::::::::::::::::'
         print *,'mhe_outin=',mhe_outin
         print *,'mhe_wh_out(1:4)=',mhe_wh_out(1:4)
         print *,'offset(mhe_iNX,mhe_iNY,mhe_iNZ)=',mhe_iNX,mhe_iNY,mhe_iNZ
         print *,'data dim(mhe_XN,mhe_YN,mhe_ZN)=',mhe_XN,mhe_YN,mhe_ZN
         print *,'mhe_nR=',mhe_nR
         print *,'mhe_Ev=',mhe_Ev
         print *,'mhe_E=',mhe_E
         print *,'mhe_Up=',mhe_Up
         print *,'mhe_Alpha=',mhe_Alpha, 'mhe_Beta=',mhe_Beta, 'mhe_W=',mhe_W,'mhe_H=',mhe_H
         print *,'mhe_filenames=',mhe_filenames
         print *,'mhe_perspective=',mhe_perspective
         print *,'mhe_srendtype=',mhe_srendtype
         print *,'mhe_tiles_right=',mhe_tiles_right,'mhe_tiles_down=',mhe_tiles_down
         print *,'mhe_W=',mhe_W,'mhe_H=',mhe_H
         print *,'mhe_Wim=',mhe_Wim,'mhe_Him=',mhe_Him,'mhe_nsh=',mhe_nsh,'mhe_sh0=',mhe_sh0,'mhe_sh1=',mhe_sh1
         print *,'mhe_Vdim=',mhe_Vdim
         print *,'mhe_nV_out=',mhe_nV_out,'mhe_TARGid=',mhe_TARGid
         print *,'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC nompi render-in compose'
#endif
         goto 9876 ! write file routine
      end if  
#endif
! ***********************************************************************************************************



!special for tiling, no incoming data
#ifndef SREND_NOMPI
      if(numSources ==0 .AND. rflag == 3) then
         mpi_header(:) = block(nV)%mpi_header(:,0) ! should be zero
         if( .NOT. allocated( block(nV)%cube) ) allocate(block(nV)%cube(0:SREND_MAX_LOAD) )
         if( allocated( block(nV)%cube(0)%im ) ) deallocate( block(nV)%cube(0)%im )
         allocate( block(nV)%cube(0)%im(1:16,0:3,1+(mhe_TRi-1)*(mhe_W/mhe_tiles_right/4):mhe_TRi*(mhe_W/mhe_tiles_right/4),&
                                                 1+(mhe_TDi-1)*(mhe_H/mhe_tiles_down/4):mhe_TDi*(mhe_H/mhe_tiles_down/4),&
                                                 1:mhe_nsh, &
                                                 1:mhe_nR))
         block(nV)%cube(0)%im = 0.0_4 ! allocate here
         goto 9876 ! write file routine
      end if
#endif


#if !defined(SREND_NOMPI) && defined(SREND_FTRACE)
      print *,'BASE MYid=',MYid,'nV=',nV,'srend_base: before allocations'
#endif
        

      if( .NOT. allocated( VC(nV)%MYreq) ) then

         allocate( VC(nV)%MYreq(1:SREND_MAX_LOAD) )
         allocate( VC(nV)%MYreq2(1:SREND_MAX_LOAD) )
#ifndef SREND_NOMPI
         allocate( VC(nV)%MYstat2(MPI_STATUS_SIZE,1:SREND_MAX_LOAD) )
#endif
         allocate( VC(nV)%irecvDONE(1:SREND_MAX_LOAD) )

         allocate( VC(nV)%rank(0:SREND_MAX_LOAD))
         allocate( VC(nV)%iw0(0:SREND_MAX_LOAD))
         allocate( VC(nV)%iw1(0:SREND_MAX_LOAD))
         allocate( VC(nV)%ih0(0:SREND_MAX_LOAD))
         allocate( VC(nV)%ih1(0:SREND_MAX_LOAD))

         allocate( VC(nV)%imDUM(1:4,0:SREND_MAX_LOAD) )
         allocate( VC(nV)%dumRANK(1:SREND_MAX_LOAD) )
         allocate( VC(nV)%dumTAGoff(1:SREND_MAX_LOAD) )
         allocate( VC(nV)%dumID(1:SREND_MAX_LOAD) )
      end if

      
      if(.NOT. allocated(block(nV)%mpi_header) ) then
         block(nV)%used = .TRUE.
         allocate( block(nV)%mpi_header(1:400,0:SREND_MAX_LOAD) )
      end if
      
      
      allocate( iw0(0:numSources)) ! local working arrays
      allocate( iw1(0:numSources))
      allocate( ih0(0:numSources))
      allocate( ih1(0:numSources))
      allocate( wh(1:20,0:numSources) )
      allocate( oct(0:2,0:2,0:2,0:numsources) )

      allocate( ish0(0:numsources))
      allocate( ish1(0:numsources))
      
#ifdef SREND_TIMING
      call system_clock(it2); print *,'Compose: after allocations span =:',(it2-it1)*rc,'rflag=',rflag,' nV=',nVl; it1 = it2
#endif



! CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
! CCCCCCCCCCC  composer  CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
! CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC

#if !defined(SREND_NOMPI) && defined(SREND_FTRACE)
      print *,'BASE MYid=',MYid,'nV=',nV,'srend_base: before test isend setup'
#endif

! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
#ifndef SREND_NOMPI
! complete isend of setup data
      if(rflag .eq. 2 .AND. .NOT. block(nV)%isendDONE4 .AND. Vpass(nV) .gt. 1) then
         do while(.NOT. block(nV)%isendDONE4)
            call MPI_TEST(block(nV)%MYreq4,block(nV)%isendDONE4, MPI_STATUS_IGNORE,VC(nV)%MYer)
            if(.NOT. block(nV)%isendDONE4 ) SREND_YIELDING
         end do
      end if
#endif
! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$


#ifdef SREND_TIMING
      call system_clock(it2); it1 = it2
#endif




#if !defined(SREND_NOMPI) && defined(SREND_FTRACE)
      print *,'BASE MYid=',MYid,'nV=',nV,'srend_base: before test isend data'
#endif

! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
#ifndef SREND_NOMPI
      if(rflag .eq. 2 .AND. .NOT. block(nV)%isendDONE3 .AND. Vpass(nV) .gt. 1) then
! wait here if previous isend is not done for data.
! This is only for a composer sending results up after Pass = 1
! if already completed using MPI_test at beginning of sub, then 
! outMPIreq = MPI_REQUEST_NULL, and MPI_wait returns immediately
         do while(.NOT. block(nV)%isendDONE3)
            call MPI_TEST(block(nV)%MYreq3,block(nV)%isendDONE3,MPI_STATUS_IGNORE,VC(nV)%MYer)
            if(.NOT. block(nV)%isendDONE3 ) SREND_YIELDING
         end do
      end if
#endif
! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$

#ifdef SREND_TIMING
      call system_clock(it2)
      print *,'Compose: after test on previous isend data call =:',(it2-it1)*rc,'rflag=',rflag,'Vpass=',Vpass(nV)
      it1 = it2
#endif



! SSSSS Setup SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS

#if !defined(SREND_NOMPI) && defined(SREND_FTRACE)
      print *,'BASE MYid=',MYid,'nV=',nV,'srend_base: before set mpi_header recvs'
#endif

! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
! set recvs for info on cubes & arrays
#ifndef SREND_NOMPI

#ifdef SREND_TIMING
      call system_clock(it2); it1 = it2
#endif

!if(nV_in .gt. 0) then ! if nV < 0, the blocks will already be in memory along with wh
      if( .NOT. block(nV)%flush) then
         do i = 1, numSources
            srendMPItag = SREND_TAG2+nV ! portability
            call MPI_irecv( block(nV)%mpi_header(1,i),400,MPI_CHARACTER,     &
                              MPI_ANY_SOURCE, srendMPItag, srend_COMM,         &
                              VC(nV)%MYreq2(i),VC(nV)%MYer)
         end do
         
         VC(nV)%irecvDONE2 = .FALSE.
         i = 0

         do while(.NOT. VC(nV)%irecvDONE2)
            srendMPIcount = numSources ! portability
            call MPI_testall( srendMPIcount, VC(nV)%MYreq2(1:numSources),                       &
                              VC(nV)%irecvDONE2, VC(nV)%MYstat2(:,1:numSources),VC(nV)%MYer)

            if(.NOT. VC(nV)%irecvDONE2 ) SREND_YIELDING
            i = i+1
         end do
      end if ! .NOT. block(nV)%flush
      
#ifdef SREND_TIMING
      call system_clock(it2)
      print *,'Compose:received headers:',(it2-it1)*rc,'ntests=', i,'Vpass=',Vpass(nV),'rflag=',rflag; it1 = it2
#endif      


#endif
! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$



#if !defined(SREND_NOMPI) && defined(SREND_FTRACE)
      print *,'BASE MYid=',MYid,'nV=',nV,'srend_base: before organize mpi_header info'
#endif

!if( nV_in .gt. 0) then ! regular MPI
      if( .NOT. block(nV)%flush) then
         VC(nV)%NS = 0
         i = 0
         do ii=1,numSources ! get data from headers

            mpi_header(:) = block(nV)%mpi_header(:,ii) ! load header to view data        
            if(mhe_wh_out(2)-mhe_wh_out(1) < 0) cycle ! a dud
            
            i = i+1
            VC(nV)%NS = i ! data
#ifndef SREND_NOMPI
            VC(nV)%rank(i) = VC(nV)%MYstat2(MPI_SOURCE,ii)
#endif
            wh(1,i) = mhe_wh_out(1) ! w0
            wh(2,i) = mhe_wh_out(2) ! w1
            wh(3,i) = mhe_wh_out(3) ! h0
            wh(4,i) = mhe_wh_out(4) ! h1
            wh(5,i) = mhe_iNX
            wh(6,i) = mhe_iNY
            wh(7,i) = mhe_iNZ
            wh(8,i) = mhe_XN
            wh(9,i) = mhe_YN
            wh(10,i) = mhe_ZN
            wh(14,i) = mhe_sh0
            wh(15,i) = mhe_sh1
            wh(16,i) = mhe_Vdim
            wh(17,i) = mhe_Vpass ! for pass debug MPI
            wh(18,i) = mhe_nBlock
            wh(19,i) = mhe_TRi
            wh(20,i) = mhe_TDi
         end do

         mhe_TARGid = TARGid
         mhe_nV_out = nV_out

      else if( block(nV)%flush ) then !a render_flush or render_flush_write
         if(block(nV)%n >=1) then
            wh(:,1:numSources) = block(nV)%wh(:,1:numSources)
         else
            wh(:,0) = block(nV)%wh(:,0)
         end if
         mpi_header(:) = block(nV)%mpi_header(:,block(nV)%n)
! do this here, used to be done later      
         VC(nV)%NS = block(nV)%n
      end if

      
! at this point the last mpi_header is mostly OK for passing on (compose), and rest will
! get updated for sending up except mhe_wh_out(1:4)
#ifdef SREND_DEBUG
      print *,'CCCCCCCCCCCCCC enter: srend_compose() CCCCCCCCCCC'
      print *,'Call arguments::::::::::::::::::::::::::::::::::'
      print *,'nV=',nV, 'nV_out=',nV_out
      print *,'numSources=',numSources,'TARGid=',TARGid,'rflag=',rflag
      print *,'Header arguments::::::::::::::::::::::::::::::::'
      print *,'mhe_outin=',mhe_outin
      print *,'mhe_wh_out(1:4)=',mhe_wh_out(1:4)
      print *,'offset(mhe_iNX,mhe_iNY,mhe_iNZ)=',mhe_iNX,mhe_iNY,mhe_iNZ
      print *,'data dim(mhe_XN,mhe_YN,mhe_ZN)=',mhe_XN,mhe_YN,mhe_ZN
      print *,'mhe_nR=',mhe_nR
      print *,'mhe_Ev=',mhe_Ev
      print *,'mhe_E=',mhe_E
      print *,'mhe_Up=',mhe_Up
      print *,'mhe_Alpha=',mhe_Alpha, 'mhe_Beta=',mhe_Beta, 'mhe_W=',mhe_W,'mhe_H=',mhe_H
      print *,'mhe_filenames=',mhe_filenames
      print *,'mhe_perspective=',mhe_perspective
      print *,'mhe_srendtype=',mhe_srendtype
      print *,'mhe_tiles_right=',mhe_tiles_right,'mhe_tiles_down=',mhe_tiles_down
      print *,'mhe_W=',mhe_W,'mhe_H=',mhe_H
      print *,'mhe_Wim=',mhe_Wim,'mhe_Him=',mhe_Him,'mhe_nsh=',mhe_nsh,'mhe_sh0=',mhe_sh0,'mhe_sh1=',mhe_sh1
      print *,'mhe_Vdim=',mhe_Vdim
      print *,'mhe_nV_out=',mhe_nV_out,'mhe_TARGid=',mhe_TARGid
      print *,'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC'
#endif


#if !defined(SREND_NOMPI) && defined(SREND_FTRACE)
      print *,'BASE MYid=',MYid,'nV=',nV,'srend_base: before set irecv data'
#endif

#ifndef SREND_NOMPI
! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ MPI $$$$$$$$$$$$$$$$$$$$$$$$$
! receive here
      if( .NOT. allocated( block(nV)%cube) ) allocate(block(nV)%cube(0:SREND_MAX_LOAD) )
      
      if(.NOT. block(nV)%flush) then !MPI, not flush or flush_write
         do i=1, VC(nV)%NS
            if( allocated(block(nV)%cube(i)%im) ) deallocate(block(nV)%cube(i)%im)
               allocate(block(nV)%cube(i)%im(1:16,0:3,wh(1,i):wh(2,i),wh(3,i):wh(4,i),wh(14,i):wh(15,i),1:mhe_nR) )
            srendMPIrank = VC(nV)%rank(i)     ! MPI portability
            srendMPItag = nV*10000 + wh(18,i) ! MPI portability
            call MPI_irecv( block(nV)%cube(i)%im,              &
                  size(block(nV)%cube(i)%im), MPI_REAL,        &
                  srendMPIrank,                                &
                  srendMPItag,                                 & ! wh(18,i) = block(nV)%n from setup mpi_header 
                  srend_COMM, VC(nV)%MYreq(i), VC(nV)%MYer)
         end do
      
#ifdef SREND_TIMING
         call system_clock(it2)
         print *,'Compose: after alloc cubes and set irecv for MPI:',(it2-it1)*rc,'rflag=',rflag,'NS=',VC(nV)%NS,&
               'Vpass=',Vpass(nV),'numsources=',numsources
         it1 = it2
#endif

#if !defined(SREND_NOMPI) && defined(SREND_FTRACE)
      print *,'BASE MYid=',MYid,'nV=',nV,'srend_base: before test irecv data NS=',VC(nV)%NS
#endif

         do i=1, VC(nV)%NS
            VC(nV)%irecvDONE(i) = .FALSE.
            do while(.NOT. VC(nV)%irecvDONE(i))
               call MPI_test(                      &
                  VC(nV)%MYreq(i),                &
                  VC(nV)%irecvDONE(i),            &
                  MPI_STATUS_IGNORE,VC(nV)%MYer)
               if(.NOT. VC(nV)%irecvDONE(i) ) SREND_YIELDING
            end do
         end do
      
      end if ! MPI incoming

#ifdef SREND_TIMING
      call system_clock(it2)
      print *,'Compose: after MPI recv all cubes for MPI:',(it2-it1)*rc,'rflag=',rflag
      it1 = it2
#endif
! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
#endif



#if !defined(SREND_NOMPI) && defined(SREND_FTRACE)
      print *,'BASE MYid=',MYid,'nV=',nV,'srend_base: before sorting and prep'
#endif

! find AMR level for compositing: AMR_comp = min(max(AMR not dud),AMR_view)
      Vdim_max = 0 ! start
      do i=1, VC(nV)%NS
          if( wh(16,i) .gt. Vdim_max) Vdim_max = wh(16,i) ! max(AMR not dud)
      end do
      
      
! set AMR for passing up, in MPI_header
      mhe_Vdim = Vdim_max ! load this for passing up, needed only for rflag == 2
      
      
! adjust to AMR_max for sorting
      do i=1, VC(nV)%NS
          Vdim_temp = Vdim_max / wh(16,i) ! multiplier
          wh(5,i) = wh(5,i)  * Vdim_temp ! iNX , optimize these a bit ? worth it ?
          wh(6,i) = wh(6,i)  * Vdim_temp ! iNY
          wh(7,i) = wh(7,i)  * Vdim_temp ! iNZ
          wh(8,i) = wh(8,i)  * Vdim_temp ! XN
          wh(9,i) = wh(9,i)  * Vdim_temp ! YN
          wh(10,i) = wh(10,i)* Vdim_temp ! ZN
      end do
      
      
! adjust E, just needed for sorting    
      mhe_E = mhe_E * Vdim_max
      
      
#ifdef SREND_TIMING
      call system_clock(it2)
      print *,'Compose: after prep time span, adjusted to max(Vdim), (1:initial):',(it2-it1)*rc
      it1 = it2
#endif
      


!DEC$ NO PARALLEL

!bounds to start
      mhe_iNX =   1000000000
      mhe_iNY =   1000000000
      mhe_iNZ =   1000000000
      iNX1 = -1000000000
      iNY1 = -1000000000
      iNZ1 = -1000000000
      
      ish0(0) =  1000000000
      ish1(0) = -1000000000
      
      iw0(0) =  1000000000
      iw1(0) = -1000000000
      ih0(0) =  1000000000
      ih1(0) = -1000000000

      VC(nV)%dummies = 0
      
      do i=1, VC(nV)%NS

! update shell index
         if( wh(14,i) .lt. ish0(0)) ish0(0) = wh(14,i)
         if( wh(15,i) .gt. ish1(0)) ish1(0) = wh(15,i)

! find bounds for all blocks, in AMR_max coordinates
! This is used for the resulting block bounds and for sorting
         if(wh(5,i) .lt. mhe_iNX) mhe_iNX = wh(5,i)
         if(wh(6,i) .lt. mhe_iNY) mhe_iNY = wh(6,i)
         if(wh(7,i) .lt. mhe_iNZ) mhe_iNZ = wh(7,i)
         if(wh(5,i)+wh(8,i) .gt. iNX1) iNX1 = wh(5,i)+wh(8,i)
         if(wh(6,i)+wh(9,i) .gt. iNY1) iNY1 = wh(6,i)+wh(9,i)
         if(wh(7,i)+wh(10,i) .gt. iNZ1) iNZ1 = wh(7,i)+wh(10,i)

! To determine output array to collect all incoming arrays in compositing
! find bounds for all arrays
         if(wh(1,i) .lt. iw0(0)) iw0(0) = wh(1,i)
         if(wh(2,i) .gt. iw1(0)) iw1(0) = wh(2,i)
         if(wh(3,i) .lt. ih0(0)) ih0(0) = wh(3,i)
         if(wh(4,i) .gt. ih1(0)) ih1(0) = wh(4,i)

! Determine far corner offsets
         wh(11,i) = wh(5,i) + wh(8,i) !iNX+XN , far faces
         wh(12,i) = wh(6,i) + wh(9,i) !iNY+YN
         wh(13,i) = wh(7,i) + wh(10,i) !iNZ+ZN
          
! needed for n^2 sort; boolean TRUE if block is in an octant with respect to eye E
! this eliminates some testing in the n2sort
         oct(0,0,0,i) = .FALSE.
         oct(0,0,1,i) = .FALSE.
         oct(0,1,0,i) = .FALSE.
         oct(0,1,1,i) = .FALSE.
         oct(1,0,0,i) = .FALSE.
         oct(1,0,1,i) = .FALSE.
         oct(1,1,0,i) = .FALSE.
         oct(1,1,1,i) = .FALSE.
         
         if( wh(5,i)<mhe_E(1) .AND. wh(6,i)<mhe_E(2) .AND. wh(7,i)<mhe_E(3) ) oct(0,0,0,i) = .TRUE.
         if( wh(5,i)<mhe_E(1) .AND. wh(6,i)<mhe_E(2) .AND. wh(13,i)>mhe_E(3) ) oct(0,0,1,i) = .TRUE.
         if( wh(5,i)<mhe_E(1) .AND. wh(12,i)>mhe_E(2) .AND. wh(7,i)<mhe_E(3) ) oct(0,1,0,i) = .TRUE.
         if( wh(5,i)<mhe_E(1) .AND. wh(12,i)>mhe_E(2) .AND. wh(13,i)>mhe_E(3) ) oct(0,1,1,i) = .TRUE.
         if( wh(11,i)>mhe_E(1) .AND. wh(6,i)<mhe_E(2) .AND. wh(7,i)<mhe_E(3) ) oct(1,0,0,i) = .TRUE.
         if( wh(11,i)>mhe_E(1) .AND. wh(6,i)<mhe_E(2) .AND. wh(13,i)>mhe_E(3) ) oct(1,0,1,i) = .TRUE.
         if( wh(11,i)>mhe_E(1) .AND. wh(12,i)>mhe_E(2) .AND. wh(7,i)<mhe_E(3) ) oct(1,1,0,i) = .TRUE.
         if( wh(11,i)>mhe_E(1) .AND. wh(12,i)>mhe_E(2) .AND. wh(13,i)>mhe_E(3) ) oct(1,1,1,i) = .TRUE.

      end do
      
      
! these are the dimensions of the containing volume
      mhe_XN = iNX1-mhe_iNX
      mhe_YN = iNY1-mhe_iNY
      mhe_ZN = iNZ1-mhe_iNZ
      
#ifdef SREND_TIMING
      call system_clock(it2)
      print *,'Compose: after split blocks and indexed span (2:initial):',(it2-it1)*rc
      print *,'     ish0(0) =',ish0(0),'ish1(0)=',ish1(0)
      it1 = it2
#endif



! prep for sorting: L() is the array of indices of blocks
      allocate( L(1:VC(nV)%NS) )
      do i=1,VC(nV)%NS
         L(i) = i
      end do

      
! ---------------- n2 sort --------------
      call n2sort(L, VC(nV)%NS)

#ifdef SREND_TIMING
      call system_clock(it2)
      print *,'Compose: after n2 sorting, time span (not KDtree):',(it2-it1)*rc
      print *,'number sorted =', VC(nV)%NS
      it1 = it2
#endif
! ------------ n2 sort ------ end ----


! align data block array indices 1:NS neatly !!!!!!! needless now
      do i=1, VC(nV)%NS
         iw0(i) = wh(1,i)
         iw1(i) = wh(2,i)
         ih0(i) = wh(3,i)
         ih1(i) = wh(4,i)
         ish0(i) = wh(14,i)
         ish1(i) = wh(15,i)  
      end do

! check bounds if too large, also set for finisher
      if(rflag .eq. 3) then ! finish out to image, use fill output dimensions
         if(mhe_tiles_right*mhe_tiles_down > 1) then
            iw0(0) = 1 + (mhe_TRi - 1) * mhe_W/ mhe_tiles_right / 4
            iw1(0) = mhe_TRi * mhe_W/ mhe_tiles_right / 4
            ih0(0) = 1 + (mhe_TDi - 1) * mhe_H/ mhe_tiles_down / 4
            ih1(0) = mhe_TDi * mhe_H/ mhe_tiles_down / 4
         else
            iw0(0) = 1
            iw1(0) = mhe_W/4 ! for 16pack
            ih0(0) = 1
            ih1(0) = mhe_H/4 ! for 16pack
         end if
            
         ish0(0) = 1 ! always to this for finisher
         ish1(0) = mhe_nsh
      else if ( 1+iw1(0)-iw0(0) .gt. mhe_W/4 ) then ! use output dimensions
         iw0(0) = 1
         iw1(0) = mhe_W/4 ! for 16pack
      end if
      
! allocate memory for target array
! this must be reallocated if fly-through
      if( VC(nV)%NS .gt. 0 .OR. rflag .eq. 3) then
         if( allocated( block(nV)%cube(0)%im ) ) deallocate( block(nV)%cube(0)%im )
         allocate( block(nV)%cube(0)%im(1:16,0:3,iw0(0):iw1(0),ih0(0):ih1(0),ish0(0):ish1(0),1:mhe_nR))
         block(nV)%cube(0)%im = 0.0_4 ! allocate here
      end if

      
! fill mhe_W and mhe_H are put in header, so these don't match 16-pack dimensions
! load mpi_header
      if( VC(nV)%NS > 0) then
         mhe_wh_out(1) = iw0(0)
         mhe_wh_out(2) = iw1(0)
         mhe_wh_out(3) = ih0(0)
         mhe_wh_out(4) = ih1(0)
      else ! a dud
         mhe_wh_out(1) = 0
         mhe_wh_out(2) = -1
         mhe_wh_out(3) = 0
         mhe_wh_out(4) = -1
      end if
         mhe_sh0 = ish0(0)
         mhe_sh1 = ish1(0)
! save mpi_header to VC(nV)%mpi_header buffer for non-blocking send
         mhe_E = mhe_E / Vdim_max ! adjust back to AMR=0 coordinates
        
! new tag scheme
         mhe_nBlock = 0 ! for compositing
         block(nV)%mpi_header(:,0) = mpi_header(:)
        
! ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss



#ifdef SREND_TIMING
      call system_clock(it2)
      print *,'Compose: after align blocks before compositing, alloc im:',(it2-it1)*rc
      it1 = it2
#endif

#if !defined(SREND_NOMPI) && defined(SREND_FTRACE)
      print *,'BASE MYid=',MYid,'nV=',nV,'srend_base: before isend setup'
#endif



      block(nV)%isendDONE4 = .TRUE. ! make sure this is set

! SSSSSS setup SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
! called only by compose(), header send to next compose/finish rank
! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
#ifndef SREND_NOMPI
      if(rflag .eq. 2) then ! send up
         block(nV)%isendDONE4 = .FALSE.
         srendMPIrank = mhe_TARGid ! MPI portability
         srendMPItag = SREND_TAG2+mhe_nV_out ! MPI portability
#ifdef SREND_ISSEND
         call MPI_issend(block(nV)%mpi_header(1,0), &
#else 
         call MPI_isend(block(nV)%mpi_header(1,0),  &
#endif
                         400,MPI_CHARACTER,         &
                        srendMPIrank,               & 
                        srendMPItag,                &
                        srend_COMM,                 &
                        block(nV)%MYreq4,           &
                        VC(nV)%MYer)
      end if ! rflag = 2
#endif
! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
! ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss



#ifdef SREND_TIMING
      call system_clock(it2)
      print *,'Compose: MPI sent setup info:',(it2-it1)*rc,'rflag=',rflag
      print *,'   nV=',nV,'nV_out=',nV_out
      print *,'   mhe_TARGid=',mhe_TARGid,'mhe_nV_out=',mhe_nV_out
      it1 = it2
#endif




! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
! Now, we have all arrays in, so compose them
! ccccccccccccccccccc compositing loop ccccccccccccccccccccccccccccccc
#if defined SREND_PARALLEL_COMPOSE || defined SREND_PARALLEL2_COMPOSE
!$OMP PARALLEL default(shared) private(kc,k,kk,ish,j,i,ii,th,nth,thRows,thWid,thRem,th0,th1)

#ifdef _OPENMP
      th = OMP_get_thread_num()
      nth = OMP_get_num_threads()
#else
      th = 0
      nth = 1
#endif
#endif



#ifdef SREND_PARALLEL_COMPOSE
      thRows = ih1(0) - ih0(0) +1
      thWid = thRows / nth
      thRem = thRows - thWid * nth
      th0 = ih0(0) + th * thWid
      th1 = ih0(0) + (th + 1) * thWid -1
      if(th >= thRem) then
         th0 = th0 + thRem
         th1 = th1 + thRem
      else
         th0 = th0 + th
         th1 = th1 + th + 1
      end if
#endif



#if !defined(SREND_NOMPI) && defined(SREND_FTRACE)
      print *,'BASE MYid=',MYid,'nV=',nV,'srend_base: before compositing'
#endif

do kc = 1, VC(nV)%NS
   k = L(kc) ! index to block(nV)%cube(kc)%im

#ifdef SREND_PARALLEL_COMPOSE      
   if(th0 <= ih1(k) .AND. th1 >= ih0(k)) then
#endif

#ifdef SREND_PARALLEL2_COMPOSE
   thRows = ih1(k) - ih0(k) +1
   thWid = thRows / nth
   thRem = thRows - thWid * nth
   th0 = ih0(k) + th * thWid
   th1 = ih0(k) + (th + 1) * thWid -1
   if(th >= thRem) then
      th0 = th0 + thRem
      th1 = th1 + thRem
   else
      th0 = th0 + th
      th1 = th1 + th + 1
   end if
#endif

! --------------------------------------------------------------------
! usual: view from eye
   if(.NOT. mhe_outin) then
! --------------------------------------------------------------------

      do kk=1, mhe_nR ! ----------------------------------

         do ish = ish0(k),ish1(k)

            if( iw0(k) .lt. 1 .AND. iw0(0) .ge. 1) then ! write each half

#ifdef SREND_COMPOSITE_OPENMP
!$OMP PARALLEL DO default(shared) private(i,ii)
#endif
#if defined SREND_PARALLEL_COMPOSE || defined SREND_PARALLEL2_COMPOSE
               do j=max(ih0(k),th0), min(ih1(k),th1)
#else

               do j = ih0(k), ih1(k), 1
#endif
      
                  do i=iw0(k),0,1 ! shifted: i -> W+i

! back2front
SREND_VECTORIZE
do ii=1,16
   block(nV)%cube(0)%im(ii,1,mhe_W/4+i,j,ish,kk) = (1.0_4-block(nV)%cube(k)%im(ii,0,i,j,ish,kk) ) &
 * block(nV)%cube(0)%im(ii,1,mhe_W/4+i,j,ish,kk)  +     block(nV)%cube(k)%im(ii,1,i,j,ish,kk)

   block(nV)%cube(0)%im(ii,2,mhe_W/4+i,j,ish,kk) = (1.0_4-block(nV)%cube(k)%im(ii,0,i,j,ish,kk) ) &
 * block(nV)%cube(0)%im(ii,2,mhe_W/4+i,j,ish,kk) +      block(nV)%cube(k)%im(ii,2,i,j,ish,kk)

   block(nV)%cube(0)%im(ii,3,mhe_W/4+i,j,ish,kk) = (1.0_4-block(nV)%cube(k)%im(ii,0,i,j,ish,kk) ) &
 * block(nV)%cube(0)%im(ii,3,mhe_W/4+i,j,ish,kk)  +     block(nV)%cube(k)%im(ii,3,i,j,ish,kk)

   block(nV)%cube(0)%im(ii,0,mhe_W/4+i,j,ish,kk) = (1.0_4-block(nV)%cube(k)%im(ii,0,i,j,ish,kk) ) &
 * block(nV)%cube(0)%im(ii,0,mhe_W/4+i,j,ish,kk)  +     block(nV)%cube(k)%im(ii,0,i,j,ish,kk)
end do ! ii
      
      
                  end do ! i

                  do i=1,iw1(k),1 ! not shifted

! back2front
SREND_VECTORIZE
do ii=1,16
   block(nV)%cube(0)%im(ii,1,i,j,ish,kk) = (1.0_4-block(nV)%cube(k)%im(ii,0,i,j,ish,kk) ) &
 * block(nV)%cube(0)%im(ii,1,i,j,ish,kk)  +     block(nV)%cube(k)%im(ii,1,i,j,ish,kk)

   block(nV)%cube(0)%im(ii,2,i,j,ish,kk) = (1.0_4-block(nV)%cube(k)%im(ii,0,i,j,ish,kk) ) &
 * block(nV)%cube(0)%im(ii,2,i,j,ish,kk)  +     block(nV)%cube(k)%im(ii,2,i,j,ish,kk)

   block(nV)%cube(0)%im(ii,3,i,j,ish,kk) = (1.0_4-block(nV)%cube(k)%im(ii,0,i,j,ish,kk) ) &
 * block(nV)%cube(0)%im(ii,3,i,j,ish,kk)  +     block(nV)%cube(k)%im(ii,3,i,j,ish,kk)

   block(nV)%cube(0)%im(ii,0,i,j,ish,kk) = (1.0_4-block(nV)%cube(k)%im(ii,0,i,j,ish,kk) ) &
 * block(nV)%cube(0)%im(ii,0,i,j,ish,kk)  +     block(nV)%cube(k)%im(ii,0,i,j,ish,kk)
end do ! ii
      
                  end do ! i
               end do ! j
#ifdef SREND_COMPOSITE_OPENMP
!$OMP END PARALLEL DO
#endif

            else ! in range: 1:W,1:H or iw0(0):iw1(0),:
#ifdef SREND_COMPOSITE_OPENMP
!$OMP PARALLEL DO default(shared) private(i,ii)
#endif
#if defined SREND_PARALLEL_COMPOSE || defined SREND_PARALLEL2_COMPOSE
               do j=max(ih0(k),th0), min(ih1(k),th1)
#else
               do j = ih0(k), ih1(k), 1
#endif

                  do i=iw0(k),iw1(k),1

! back2front
SREND_VECTORIZE
do ii=1,16
   block(nV)%cube(0)%im(ii,1,i,j,ish,kk) = (1.0_4-block(nV)%cube(k)%im(ii,0,i,j,ish,kk) ) &
 * block(nV)%cube(0)%im(ii,1,i,j,ish,kk) +      block(nV)%cube(k)%im(ii,1,i,j,ish,kk)

   block(nV)%cube(0)%im(ii,2,i,j,ish,kk) = (1.0_4-block(nV)%cube(k)%im(ii,0,i,j,ish,kk) ) &
 * block(nV)%cube(0)%im(ii,2,i,j,ish,kk) +      block(nV)%cube(k)%im(ii,2,i,j,ish,kk)

   block(nV)%cube(0)%im(ii,3,i,j,ish,kk) = (1.0_4-block(nV)%cube(k)%im(ii,0,i,j,ish,kk) ) &
 * block(nV)%cube(0)%im(ii,3,i,j,ish,kk) +      block(nV)%cube(k)%im(ii,3,i,j,ish,kk)

   block(nV)%cube(0)%im(ii,0,i,j,ish,kk) = (1.0_4-block(nV)%cube(k)%im(ii,0,i,j,ish,kk) ) &
 * block(nV)%cube(0)%im(ii,0,i,j,ish,kk) +      block(nV)%cube(k)%im(ii,0,i,j,ish,kk)
end do ! ii
      
                  end do ! i
               end do ! j
#ifdef SREND_COMPOSITE_OPENMP
!$OMP END PARALLEL DO
#endif

            end if ! in range

         end do ! ish

      end do ! kk 1:mhe_nR

! --------------------------------------------------------------------
! mhe_outin: view coming from outside to eye
   else
! --------------------------------------------------------------------

! Now, we have all arrays in, so compose them

      do kk=1,mhe_nR ! -------------------------------------

         do ish = ish0(k), ish1(k)

            if( iw0(k) .lt. 1 .AND. iw0(0) .ge. 1) then ! write each half

#ifdef SREND_COMPOSITE_OPENMP      
!$OMP PARALLEL DO default(shared) private(i,ii)
#endif
#if defined SREND_PARALLEL_COMPOSE || defined SREND_PARALLEL2_COMPOSE
               do j=max(ih0(k),th0), min(ih1(k),th1)
#else
               do j = ih0(k), ih1(k), 1
#endif

                  do i=iw0(k),0,1 ! shifted: i -> W+i

! inverted (front2back)
SREND_VECTORIZE
do ii=1,16
   block(nV)%cube(0)%im(ii,1,mhe_W/4+i,j,ish,kk) = (1.0_4-block(nV)%cube(0)%im(ii,0,mhe_W+i,j,ish,kk)) &
 * block(nV)%cube(k)%im(ii,1,i,j,ish,kk) +              block(nV)%cube(0)%im(ii,1,mhe_W+i,j,ish,kk) 

   block(nV)%cube(0)%im(ii,2,mhe_W/4+i,j,ish,kk) = (1.0_4-block(nV)%cube(0)%im(ii,0,mhe_W+i,j,ish,kk)) &
 * block(nV)%cube(k)%im(ii,2,i,j,ish,kk) +              block(nV)%cube(0)%im(ii,2,mhe_W+i,j,ish,kk) 

   block(nV)%cube(0)%im(ii,3,mhe_W/4+i,j,ish,kk) = (1.0_4-block(nV)%cube(0)%im(ii,0,mhe_W+i,j,ish,kk)) &
 * block(nV)%cube(k)%im(ii,3,i,j,ish,kk) +              block(nV)%cube(0)%im(ii,3,mhe_W+i,j,ish,kk) 

   block(nV)%cube(0)%im(ii,0,mhe_W/4+i,j,ish,kk) = (1.0_4-block(nV)%cube(0)%im(ii,0,mhe_W+i,j,ish,kk)) &
 * block(nV)%cube(k)%im(ii,0,i,j,ish,kk) +              block(nV)%cube(0)%im(ii,0,mhe_W+i,j,ish,kk) 
end do ! ii
      
                  end do ! i

                  do i=1,iw1(k),1 ! not shifted

! inverted (front2back)
SREND_VECTORIZE
do ii=1,16
   block(nV)%cube(0)%im(ii,1,i,j,ish,kk) = (1.0_4-block(nV)%cube(0)%im(ii,0,i,j,ish,kk)) &
 * block(nV)%cube(k)%im(ii,1,i,j,ish,kk) +      block(nV)%cube(0)%im(ii,1,i,j,ish,kk) 

   block(nV)%cube(0)%im(ii,2,i,j,ish,kk) = (1.0_4-block(nV)%cube(0)%im(ii,0,i,j,ish,kk)) &
 * block(nV)%cube(k)%im(ii,2,i,j,ish,kk) +      block(nV)%cube(0)%im(ii,2,i,j,ish,kk) 

   block(nV)%cube(0)%im(ii,3,i,j,ish,kk) = (1.0_4-block(nV)%cube(0)%im(ii,0,i,j,ish,kk)) &
 * block(nV)%cube(k)%im(ii,3,i,j,ish,kk) +      block(nV)%cube(0)%im(ii,3,i,j,ish,kk) 

   block(nV)%cube(0)%im(ii,0,i,j,ish,kk) = (1.0_4-block(nV)%cube(0)%im(ii,0,i,j,ish,kk)) &
 * block(nV)%cube(k)%im(ii,0,i,j,ish,kk) +      block(nV)%cube(0)%im(ii,0,i,j,ish,kk) 
end do ! ii
      
                  end do ! i
               end do ! j
#ifdef SREND_COMPOSITE_OPENMP
!$OMP END PARALLEL DO
#endif

            else ! in range: 1:W,1:H or iw0(0):iw1(0),:

#ifdef SREND_COMPOSITE_OPENMP      
!$OMP PARALLEL DO default(shared) private(i,ii)
#endif
#if defined SREND_PARALLEL_COMPOSE || defined SREND_PARALLEL2_COMPOSE
               do j=max(ih0(k),th0), min(ih1(k),th1)
#else
               do j = ih0(k), ih1(k), 1
#endif

                  do i=iw0(k),iw1(k),1

! inverted (front2back)
SREND_VECTORIZE
do ii=1,16
   block(nV)%cube(0)%im(ii,1,i,j,ish,kk) = (1.0_4-block(nV)%cube(0)%im(ii,0,i,j,ish,kk)) &
 * block(nV)%cube(k)%im(ii,1,i,j,ish,kk) +      block(nV)%cube(0)%im(ii,1,i,j,ish,kk) 

   block(nV)%cube(0)%im(ii,2,i,j,ish,kk) = (1.0_4-block(nV)%cube(0)%im(ii,0,i,j,ish,kk)) &
 * block(nV)%cube(k)%im(ii,2,i,j,ish,kk) +      block(nV)%cube(0)%im(ii,2,i,j,ish,kk) 

   block(nV)%cube(0)%im(ii,3,i,j,ish,kk) = (1.0_4-block(nV)%cube(0)%im(ii,0,i,j,ish,kk)) &
 * block(nV)%cube(k)%im(ii,3,i,j,ish,kk) +      block(nV)%cube(0)%im(ii,3,i,j,ish,kk) 

   block(nV)%cube(0)%im(ii,0,i,j,ish,kk) = (1.0_4-block(nV)%cube(0)%im(ii,0,i,j,ish,kk)) &
 * block(nV)%cube(k)%im(ii,0,i,j,ish,kk) +      block(nV)%cube(0)%im(ii,0,i,j,ish,kk) 
end do
      
                  end do ! i
               end do ! j
#ifdef SREND_COMPOSITE_OPENMP
!$OMP END PARALLEL DO
#endif

            end if ! in range

         end do ! ish 

      end do ! kk 1:mhe_nR
! --------------------------------------------------------------------
   end if ! NOT mhe_outin
! --------------------------------------------------------------------
! --------------------------------------------------------------------
      
#ifdef SREND_PARALLEL_COMPOSE      
end if ! thread band includes block k
#endif

#ifdef SREND_PARALLEL2_COMPOSE      
!$OMP BARRIER
#endif

end do ! k 

#if defined SREND_PARALLEL_COMPOSE || defined SREND_PARALLEL2_COMPOSE  
!$OMP END PARALLEL
#endif
! ccccccccccccc end compositing loop ccccccccccccccccccccccccccccccccc
! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
! ccccccccccccc end compositing loop ccccccccccccccccccccccccccccccccc





#ifdef SREND_TIMING
      call system_clock(it2); print *,'Compose: compositing time span:',(it2-it1)*rc; it1 = it2
#endif

#if !defined(SREND_NOMPI) && defined(SREND_FTRACE)
      print *,'BASE MYid=',MYid,'srend_base: before data send up'
#endif



! -------------------------------------------------------------------------------------------
! routine just for composer, sends data to another compoer or finisher ----------------------
! -------------------------------------------------------------------------------------------
#ifndef SREND_NOMPI
! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
      if(rflag .eq. 2) then !--if this is a composer, then data has to be sent on to rank(0)
         if( VC(nV)%NS > 0 ) then
            block(nV)%isendDONE3 = .FALSE. ! set for both cases
            srendMPIrank = mhe_TARGid ! MPI portability
            srendMPItag = mhe_nV_out*10000 + 0 ! MPI portability
#ifdef SREND_ISSEND
            call MPI_issend(block(nV)%cube(0)%im,size(block(nV)%cube(0)%im),MPI_REAL, &
#else
            call MPI_isend(block(nV)%cube(0)%im,size(block(nV)%cube(0)%im),MPI_REAL, &
#endif
               srendMPIrank,  &
               srendMPItag,   & ! mhe_nBlock = 0, should be
               srend_COMM,block(nV)%MYreq3,VC(nV)%MYer)
         end if
         
#ifdef SREND_TIMING
         call system_clock(it2)
         print *,'Compose: after compositing, the tail till end rflag2:',(it2-it1)*rc
         it1 = it2
#endif
         
         return ! we are done if this is a composer
      end if ! rflag = 2 -------------------- end if composer -------
! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
#endif
! -------------------------------------------------------------------------------------------
! end: special composer routine -------------------------------------- ----------------------
! -------------------------------------------------------------------------------------------



! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
! ccccccccccccccccccc end composer ccccccccccccccccccccccccccccccccccc
! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc




9876  continue ! target for NOMPI and finish from render call

! WWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWW
! WWWWWWWWWWWWWWWWWWWWW write to disk WWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWW
! WWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWW
! 
! The features for writing out files are for practical uses which came
! to mind, not to address every possible use.  Three image format
! can be written directly.
!
! PPM (Portable PixMap for imagery, not to be confused with Piecewise 
! Parabolic Method) is 24 bit RGB raster with a plain text header. 
! A provided file name with the extension ".ppm. selects this
! format.  An added fixed-length PPM header (here 20 bytes) is the 
! only difference between PPM and raw.
!
!
! Here are the practical restrictions:
! 
! 1) Perspective cannot be used for npole, equatorial, or spole in
! full spherical views.  Angle spans for perspective must be
! less than 180 degrees.
!
! 2) npole, equatorial, and spole full spherical views cannot be 
! tiled.  This is because these 3 views are rendered and written
! separately but to the same file.
! 
! Hopefully, these restrictions above are not high, and it is 
! probably much faster to handle image format conversion and tiling
! separately when these restrictions above apply.
!
! If output images need to be converted to a different format,
! ImageMagick convert & mogrify are very fast.  Install ImageMagick;
! convert and mogrify commands will be installed.
! Example
!   convert all PPM images to PMG: mogrify -format png *.ppm
!
! Tiling and combining images from PPM (if dimensions are
! known) is straightforward array programming.  One can read the 
! PPM header of file "out.ppm" with "head -n 1 out.ppm".  The PPM
! header is exactly 20 bytes--19 characters and one '\r' = achar(10)--
! and the string is "P6  1600   800  255" followed by '\r' for an
! image 1600 pixels wide and 800 pixels tall.  "P6" is required for
! this raw/binary PPM format, and 255 means 8 bits for each RGB
! color.  Without much programming, one could use ImageMagick's 
! convert like this:
!   convert -crop 25%x25% d000008.ppm d00000.png
! This results in 16 images d000008-0.png to d000008-15.png, ordered
! by row with 0-based offset.  There are surely other ways to do this
! with various utilities, but the Fortran or C program to do it
! in a particular way need not be long.  Such a program
! "srend_tile_montage.F90" is provided with Srend.
! --------------------------------------------------------------------

#if !defined(SREND_NOMPI) && defined(SREND_FTRACE)
      print *,'BASE MYid=',MYid,'srend_base: before write setup'
#endif


! set W and H for finishing
      mhe_W = mhe_W / mhe_tiles_right ! width of each tile in pixels
      mhe_H = mhe_H / mhe_tiles_down  ! height of each tile in pixels
      mhe_wim = mhe_W
      mhe_Him = mhe_H

! -------- process filenames ------------------

! the array of mhe_nR filenames for multiple variables
      if( allocated( VC(nV)%rfname) ) then
        if( ubound(VC(nV)%rfname,DIM=1) < mhe_nr) deallocate(VC(nV)%rfname)
      end if
      if( .NOT. allocated( VC(nV)%rfname) ) allocate( VC(nV)%rfname(1:mhe_nR) )

! copy array of char*1 to string char*200
! This character*1 array is more portable than attempting to deal with passed Fortran strings.
! Fortran strings are great, but not so as arguments when calling from C/C++ or Python
! or passing with MPI.
      do i=1,200
         filenames_str(i:i) = mhe_filenames(i)
      end do 

      write (fnum,'(i6.6,A)') Vpass(nV) ! file number, leading 0s, maybe used to replace '######' below

! This expects and trailing comma "," after each filename as well
! as between them.  If '######' is in the filename, it is replaced
! by the zero-padded Vpass(nV) value.
      kk = 1 ! beginning of filenames
      do i=1,mhe_nR
         k = index(filenames_str(kk:200), ',')             ! find the comma--which is after every entry
         VC(nV)%rfname(i) = filenames_str(kk:kk+k-2)       ! before the comma, and offset back 1
         kk = kk + k                                       ! next start position after the comma
         k = index( VC(nV)%rfname(i), '######')            ! find special tag string '#######'
         if( k .gt. 0) VC(nV)%rfname(i)(k:k+5) = fnum(1:6) ! replace ###### with 000001,000002, ...
      end do

      allocate( rfname_root(1:mhe_nR))                     ! for 'path_filename.ext', holds 'path_filename' 
      allocate( rfname_ext(1:mhe_nR))                      ! for 'path_filename.ext', 'holds ext'
      
      
! Imagemagick "convert" must be available in the PATH for this to work.
! check known extensions: .ppm, .png, .jpg, .tif, .bmp, .gif
      do i=1,mhe_nR
         k = index(VC(nV)%rfname(i), '.ppm')
         if(k .gt. 0) then
            rfname_ext(i) = 'ppm'
            rfname_root(i) = VC(nV)%rfname(i)(1:k-1)
         end if
         k = index(VC(nV)%rfname(i), '.jpg')
         if(k .gt. 0) then
            rfname_ext(i) = 'jpg'
            rfname_root(i) = VC(nV)%rfname(i)(1:k-1)
         end if
         k = index(VC(nV)%rfname(i), '.png')
         if(k .gt. 0) then
            rfname_ext(i) = 'png'
            rfname_root(i) = VC(nV)%rfname(i)(1:k-1)
         end if
         k = index(VC(nV)%rfname(i), '.bmp')
         if(k .gt. 0) then
            rfname_ext(i) = 'bmp'
            rfname_root(i) = VC(nV)%rfname(i)(1:k-1)
         end if
         k = index(VC(nV)%rfname(i), '.tif')
         if(k .gt. 0) then
            rfname_ext(i) = 'tif'
            rfname_root(i) = VC(nV)%rfname(i)(1:k-1)
         end if
         k = index(VC(nV)%rfname(i), '.gif')
         if(k .gt. 0) then
            rfname_ext(i) = 'gif'
            rfname_root(i) = VC(nV)%rfname(i)(1:k-1)
         end if
      end do



! create any needed directories in VC(nV)%rfname(1:mhe_nR)
! The filenames string is parsed attempting to create directories -- whether existing or not --
! depending on slash "/" in the string.  If existing, it fails silently with the "-p" option.
! This is more portable than detecting existence of a directory.
      do i=1,mhe_nR
         kk = 1
         k = 2 ! to initialize for entry into do while(found a /)
         do while(k .gt. 1)
            k = index(rfname_root(i)(kk:),'/')!,back=.TRUE.)
            if(k .gt. 1) then ! .gt.1 because will not deal with directories off root /, nor //
               rfname_command = 'mkdir -p ' // trim(rfname_root(i)(1:k+kk-1))
               call system( rfname_command )
               kk = kk+k ! after the / 
            end if
         end do
      end do


! The sampling data was collected with vectors of 16 REAL*4 RGBA values, but we
! will process images by pixel rather than 16-packs
! remap cube%im(1:16,0:3,...) to VC(nV)%im(0:3,...)
      if( allocated(VC(nV)%im) ) then
         if( ubound(VC(nV)%im,DIM=2) /= mhe_W .OR. &
               ubound(VC(nV)%im,DIM=3) /= mhe_H .OR. &
               ubound(VC(nV)%im,DIM=4) /= mhe_nsh .OR. &
               ubound(VC(nV)%im,DIM=5) /= mhe_nR ) then
            deallocate( VC(nV)%im)
         end if
      end if
      if(.NOT. allocated(VC(nV)%im )) allocate( VC(nV)%im(0:3,mhe_W,mhe_H,1:mhe_nsh,1:mhe_nR) )
      do ii=1,mhe_nR
         do jj = 1,mhe_nsh
            do j = 1,mhe_H/4
            offj = j + (mhe_TDi - 1) * mhe_H/4 ! =j if tiles_down == 1
            do i = 1,mhe_W/4
            offi = i + (mhe_TRi - 1) * mhe_W/4 ! =i if tiles_right == 1 
               do k=0,3
               do kk=1,16
                  VC(nV)%im(k,(i-1)*4+ mod(kk-1,4)+1,(j-1)*4+ (kk-1)/4+1,jj,ii) =  &
                     block(nV)%cube(0)%im(kk,k,offi,offj,jj,ii)
               end do
               end do
            end do
            end do
         end do
      end do


      
! --------------------------------------------------------------------
! ----- remap/copy im to tim : north or south pole, nsh = 1 ----------
! This should not be done for tiling.
! --------------------------------------------------------------------
      if ( mhe_srendtype .eq. 1 .OR. mhe_srendtype .eq. 3) then ! n/s pole

! don't alter incoming parameters V,U: make orthogonal basis
! this is for npole or s pole:               V,  U, R
! for npole, the spherical target basis is: -U,  V, R
! for spole, the spherical target basis is:  U, -V, R 
         R(1) = mhe_Ev(2)*mhe_Up(3) - mhe_Ev(3)*mhe_Up(2) ! R = V x U
         R(2) = mhe_Ev(3)*mhe_Up(1) - mhe_Ev(1)*mhe_Up(3)
         R(3) = mhe_Ev(1)*mhe_Up(2) - mhe_Ev(2)*mhe_Up(1)
         mhe_Up(1) = R(2)*mhe_Ev(3) - R(3)*mhe_Ev(2) ! U = R x V
         mhe_Up(2) = R(3)*mhe_Ev(1) - R(1)*mhe_Ev(3)
         mhe_Up(3) = R(1)*mhe_Ev(2) - R(2)*mhe_Ev(1)
         mhe_Ev = mhe_Ev / sqrt( mhe_Ev(1)**2 + mhe_Ev(2)**2 + mhe_Ev(3)**2 ) ! unitize
         R = R / sqrt( R(1)**2 + R(2)**2 + R(3)**2 )
         mhe_Up = mhe_Up / sqrt( mhe_Up(1)**2 + mhe_Up(2)**2 + mhe_Up(3)**2 )

! if polar (srendtype = 1 npole or srendtype = 3 spole), than map
!  im(0:3,1:W  ,1:H  ,1,nsh,1:mhe_nR)     to 
! tim(0:3,1:Wim,1:Him,1:nsh,1:mhe_nR)
         if( allocated(VC(nV)%tim) ) then
            if( ubound(VC(nV)%tim,DIM=2) /= mhe_Wim .OR. &
                ubound(VC(nV)%tim,DIM=3) /= mhe_Him .OR. &
                ubound(VC(nV)%tim,DIM=4) /= mhe_nsh .OR. &
                ubound(VC(nV)%tim,DIM=5) /= mhe_nR ) then
               deallocate( VC(nV)%tim)
            end if
         end if
         if(.NOT. allocated(VC(nV)%tim) ) allocate(VC(nV)%tim(0:3,1:mhe_Wim,1:mhe_Him,1:mhe_nsh,1:mhe_nR))
         VC(nV)%tim = 0.0_4 ! zero all shells, no need to copy 0 shells

         dW = 360._4 * PI/180.0_4 / mhe_Wim
         dH = 180.0_4 * PI/180._4 / (mhe_Him * 4.0_4)

         dWW = mhe_Alpha * PI/180.0_4 / mhe_W
         dHH = mhe_Beta  * PI/180.0_4 / mhe_H

! this is for npole or s pole:               V,  U, R
! for npole, the spherical target basis is: -U,  V, R
! for spole, the spherical target basis is:  U, -V, R 
         if ( mhe_srendtype .eq. 3) then ! spole
            Hoffset = 3*mhe_Him ! spole
            VV = mhe_Up
            UU = -mhe_Ev
         else                   ! npole
            Hoffset = 0 
            VV = -mhe_Up
            UU = mhe_Ev
         end if

         do j=1,mhe_Him
            b = (mhe_Him*2 - j-Hoffset)*dH + dH * .5_4
            cosb = cos(b)
            sinb = sin(b)

            do i=1,mhe_Wim
               a = (mhe_Wim/2 - i)*dW + dW * .5_4
               cosbcosa = cosb*cos(a)
               cosbsina = cosb*sin(a)
      ! form unit vector Q off spherical target pixel
               Q = VV*cosbcosa - R*cosbsina + UU*sinb
      ! find angles with respect to pole basis
               dotQU = dot_product(Q,mhe_Up)
               if( dotQU .gt. 1.0_4) dotQU = 1.0_4
               if( dotQU .lt. -1.0_4) dotQU = 1.0_4
               bb = acos( dotQU ) ! declination off U
               bb = PI/2.0_4 - bb ! elevation off V, latitude
               Q = Q - mhe_Up*dotQU ! remove projection of Q onto U
               Q = Q / sqrt( Q(1)**2 + Q(2)**2 + Q(3)**2 ) ! unitize Q
               acos_arg = dot_product(mhe_Ev,Q)
               if( acos_arg .gt. 1.0_4) acos_arg = 1.0_4
               if( acos_arg .lt. -1.0_4) acos_arg = -1.0_4
               aa = acos( acos_arg ) ! angle in VR plane
               if( dot_product(Q,R) .gt. 0.0_4) aa = -aa ! pos left
      ! we have lon,lat aa,bb; now find pixel in pole rendering
      ! interpolation: bilinear

               ddx0 = mhe_W/2.0_4 - aa/dWW + .5_4
               ii = floor(ddx0)
               ddx0 = ddx0 - ii
               ddx1 = 1.0_4 - ddx0

               ddy0 = mhe_H/2.0_4 - bb/dHH + .5_4
               jj = floor(ddy0)
               ddy0 = ddy0 - jj
               ddy1 = 1.0_4 - ddy0

      ! interpolation is same for each shell and variable in 1:mhe_nR
               do kk = 1, mhe_nR
                  do ish = 1,mhe_nsh
                     do k=0,3
                        VC(nV)%tim(k,i,j,ish,kk) =                    & 
                           VC(nV)%im(k,ii,jj,ish,kk)     *ddx1*ddy1 + &
                           VC(nV)%im(k,ii+1,jj,ish,kk)   *ddx0*ddy1 + &
                           VC(nV)%im(k,ii,jj+1,ish,kk)   *ddx1*ddy0 + &
                           VC(nV)%im(k,ii+1,jj+1,ish,kk) *ddx0*ddy0
                     end do
                  end do ! ish
               end do ! kk
            end do ! i
         end do ! j    
      end if ! remap for npole and spole 
! ----------------- done with map im to tim --------------------------




! --------------------------------------------------------------------
! multiple shells (nsh > 1) and a .ppm image
! --------------------------------------------------------------------
! this is mostly a test to see what is going into shells
      if(mhe_nsh > 1) then ! .ppm

         if( allocated(VC(nV)%ppm_buffer) ) then
            if( ubound(VC(nV)%ppm_buffer,DIM=2) /= mhe_Wim .OR. &
                ubound(VC(nV)%ppm_buffer,DIM=3) /= mhe_Him) then
                deallocate(VC(nV)%ppm_buffer)
            end if
         end if
         if(.NOT. allocated(VC(nV)%ppm_buffer) ) then
            allocate( VC(nV)%ppm_buffer(1:3,1:mhe_Wim,1:mhe_Him) )
         end if

         do kk=1,mhe_nR
            do ish = 1,mhe_nsh

               if ( mhe_srendtype .eq. 1 .OR. mhe_srendtype .eq. 3) then
                  do j=1,mhe_Him
                     do i=1,mhe_Wim
                        RGB(1) = 255.0_4 * VC(nV)%tim(1,i,j,ish,kk)
                        RGB(2) = 255.0_4 * VC(nV)%tim(2,i,j,ish,kk)
                        RGB(3) = 255.0_4 * VC(nV)%tim(3,i,j,ish,kk)  

                        VC(nV)%ppm_buffer(1,i,j) = char( floor(RGB(1)) ) ! R
                        VC(nV)%ppm_buffer(2,i,j) = char( floor(RGB(2)) ) ! G
                        VC(nV)%ppm_buffer(3,i,j) = char( floor(RGB(3)) ) ! B
                     end do
                  end do
               else ! not a pole
                  do j=1,mhe_Him
                     do i=1,mhe_Wim
                        RGB(1) = 255.0_4 * VC(nV)%im(1,i,j,ish,kk)
                        RGB(2) = 255.0_4 * VC(nV)%im(2,i,j,ish,kk)
                        RGB(3) = 255.0_4 * VC(nV)%im(3,i,j,ish,kk)  

                        VC(nV)%ppm_buffer(1,i,j) = char( floor(RGB(1)) ) ! R
                        VC(nV)%ppm_buffer(2,i,j) = char( floor(RGB(2)) ) ! G
                        VC(nV)%ppm_buffer(3,i,j) = char( floor(RGB(3)) ) ! B
                     end do
                  end do
               end if ! polar remapped

               write (fnum,'(i3.3,A)') ish ! shell number 001-999
               rfname_tile = '' ! multiple tiles only make sense without 3-part full spherical, must be srendtype=0
               if( mhe_tiles_right*mhe_tiles_down > 1) write(rfname_tile,"('_',I3.3,'_',I3.3)") mhe_TDi, mhe_TRi
               VC(nV)%tfname = trim(rfname_root(kk)) // '.' // trim(fnum) // trim(rfname_tile) // '.ppm'
               fun = SREND_FILE_UNIT + nV

               if( mhe_srendtype .eq. 0) then ! single .ppm in one write, must be this if tiling
                  open(unit=fun,status='unknown', file=VC(nV)%tfname,access='stream')
                  write(header,"('P6',I5,1X,I5,'   255')") mhe_Wim, mhe_Him
                  write (fun) trim(header(1:19)),achar(10) ! 20
               else if (mhe_srendtype .eq. 1) then ! npole
                  open(unit=fun,status='unknown', file=VC(nV)%tfname,access='stream')
                  write(header,"('P6',I5,1X,I5,'   255')") mhe_Wim, mhe_Him*4
                  write (fun) trim(header(1:19)),achar(10) ! 20
               else ! equatorial or spole
                  open(unit=fun,status='unknown', file=VC(nV)%tfname,access='stream',position='append')
               end if
               
               write (fun) VC(nV)%ppm_buffer
               close (fun)

            end do ! ish 
         end do ! kk 1:mhe_nR
         

! ------ NOTE: srend_tile_montage.F90 must be compiled to "montage" and be in working directory
! ------ NOTE: Imagemagick "convert" must be available for concersion to non- .ppm imagery
! --------------------- called by all ranks to work, but only rank SREND_TILE_MONTAGE does it
!!#define SREND_TILE_MONTAGE 0
! Imagemagick method:  montage tile_*.ppm -tile 4x4 -geometry +0+0 -depth 8 out.ppm
! ./motage method: ./montage tile.sss tile.sss.ppm [alpha beta]  !! angles if perspective
#ifndef SREND_NOMPI
#ifdef SREND_TILE_MONTAGE
         if( (mhe_srendtype == 0 .OR. mhe_srendtype == 3) .AND. mhe_tiles_right*mhe_tiles_down > 1) then
            call MPI_barrier(srend_COMM, MYer) ! all must be complete and written to disk
            if(MYid == SREND_TILE_MONTAGE) then ! one rank to do this, say defined as: SREND_TLE_MONTAGE 0
               do kk=1,mhe_nR
                  do ish = 1,mhe_nsh
                     write (fnum,'(i3.3,A)') ish ! shell number 001-999
                     rfname_command = './montage ' // trim(rfname_root(kk)) // '.' // trim(fnum) // ' ' // &
                                                      trim(rfname_root(kk)) // '.' // trim(fnum) // '.ppm'
                     rfname_command = trim(rfname_command) // '; rm ' // trim(rfname_root(kk)) // '.' // trim(fnum) // '_*'
                     call system( trim(rfname_command) ) ! if below fails
   !                  call execute_command_line(rfname_command,wait=.FALSE.) ! asynchronous

                     if( rfname_ext(kk) /= 'ppm') then
                        rfname_command = 'convert ' // trim(rfname_root(kk)) // '.' // trim(fnum) // '.ppm ' //           &
                                         trim(rfname_root(kk)) // '.' // trim(fnum) // '.'     // trim(rfname_ext(kk)) // &
                                         ';rm ' // trim(rfname_root(kk)) //  '.' // trim(fnum) // '.ppm'
                        call system( trim(rfname_command) )
                     end if
                  end do
               end do
            end if
#ifdef SREND_CLEAR_FBUFS
            call clear_finish_buffs()
#endif
            return ! from rflag = 3
         end if
#endif
#endif

! -------- convert with Imagemagick if not .ppm
         if( (mhe_srendtype .eq. 0 .OR. mhe_srendtype .eq. 3) .AND. mhe_tiles_right*mhe_tiles_down == 1) then ! don't do .ppm
            do kk=1,mhe_nR
               do ish = 1,mhe_nsh
                  if(rfname_ext(kk)(1:3) /= 'ppm') then
                     write (fnum,'(i3.3,A)') ish ! shell number 001-999
                     rfname_command = 'convert ' // trim(rfname_root(kk)) // '.' // trim(fnum) // '.ppm ' //                  &
                                             trim(rfname_root(kk)) // '.' // trim(fnum) // '.'     // trim(rfname_ext(kk)) // &
                                           ';rm ' // trim(rfname_root(kk)) //  '.' // trim(fnum) // '.ppm'
                     call system( trim(rfname_command) )
!                     call execute_command_line(rfname_command,wait=.FALSE.) ! asynchronous
                  end if
               end do
            end do
         end if ! an image .ppm, .jpg, .png, regular=0 or spole=3, don't do equatorial or npole

         
#ifdef SREND_CLEAR_FBUFS
         call clear_finish_buffs()
#endif
         return ! from rflag = 3 dddddddddddddddd done dddddddddddddddddd
      end if ! ppm and nsh > 1
! --------------------------------------------------------------------
! end multiple shells routine
! --------------------------------------------------------------------



#ifdef SREND_TIMING
      call system_clock(it2); print *,'Compose: in write, before if-persp:',(it2-it1)*rc; it1 = it2
#endif

#if !defined(SREND_NOMPI) && defined(SREND_FTRACE)
      print *,'BASE MYid=',MYid,'srend_base: before write for nsh=1'
#endif



! below here nsh = 1 -------------------------------------------------

! the buffer for all, .ppm
      if( allocated(VC(nV)%ppm_buffer) ) then
         if( ubound(VC(nV)%ppm_buffer,DIM=2) /= mhe_Wim .OR. &
               ubound(VC(nV)%ppm_buffer,DIM=3) /= mhe_Him) then
               deallocate(VC(nV)%ppm_buffer)
         end if
      end if
      if(.NOT. allocated(VC(nV)%ppm_buffer) ) allocate( VC(nV)%ppm_buffer(1:3,1:mhe_Wim,1:mhe_Him) )

! WWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWW
! write out a file for each variable/color map
      do kk=1,mhe_nR ! --------------------------------------------

! --------------------------------------------------------------------
         if( mhe_perspective .eq. 1 .AND. mhe_tiles_right*mhe_tiles_down == 1) then
! this will never be a pole, nor equatorial, always use VC(nV)%im
! further, perspective should not be done when tiling; only after montage

! 1) convert REAL*4 in VC(nV)%im to byte in VC(nV)%ppm_buffer2
! 1b) pad VC(nV)%ppm_buffer2 for interpolating close to edges
! 2) interpolate VC(nV)%ppm_buffer2 to REAL*4 VC(nV)%perspective 
! 3) convert REAL*4 VC(nV)%perspective to byte VC(nV)%ppm_buffer

! extra boundary padding for source of interpolation
            if( allocated(VC(nV)%ppm_buffer2) ) then
               if( ubound(VC(nV)%ppm_buffer2,DIM=2) /= mhe_Wim+1 .OR. &
                  ubound(VC(nV)%ppm_buffer2,DIM=3) /= mhe_Him+1) then
                  deallocate(VC(nV)%ppm_buffer2)
               end if
            end if
            if( .NOT. allocated(VC(nV)%ppm_buffer2) ) then
               allocate( VC(nV)%ppm_buffer2(1:3,0:mhe_Wim+1,0:mhe_Him+1) )
            end if
! ------------- copy im(:,:,:,mhe_nR) section to ppm_buffer2
            do j=1,mhe_Him
               do i=1,mhe_Wim
                  RGB(1) = 255.0_4 * VC(nV)%im(1,i,j,1,kk)
                  RGB(2) = 255.0_4 * VC(nV)%im(2,i,j,1,kk)
                  RGB(3) = 255.0_4 * VC(nV)%im(3,i,j,1,kk)  

                  VC(nV)%ppm_buffer2(1,i,j) = char( floor(RGB(1)) ) ! R
                  VC(nV)%ppm_buffer2(2,i,j) = char( floor(RGB(2)) ) ! G
                  VC(nV)%ppm_buffer2(3,i,j) = char( floor(RGB(3)) ) ! B
               end do
            end do
! pad top and bottom row of pixels in ppm_buffer2
            VC(nV)%ppm_buffer2(1:3,1:mhe_Wim,0) = VC(nV)%ppm_buffer2(1:3,1:mhe_Wim,1)
            VC(nV)%ppm_buffer2(1:3,1:mhe_Wim,mhe_Him+1) = VC(nV)%ppm_buffer2(1:3,1:mhe_Wim,mhe_Him)
! pad sides, padded array corners are data corners
            VC(nV)%ppm_buffer2(1:3,0,0:mhe_Him+1) = VC(nV)%ppm_buffer2(1:3,1,0:mhe_Him+1)
            VC(nV)%ppm_buffer2(1:3,mhe_Wim+1,0:mhe_Him+1) = VC(nV)%ppm_buffer2(1:3,mhe_Wim,0:mhe_Him+1)

! target array for resamplng interpolation
            if( allocated(VC(nV)%perspective) ) then
               if( ubound(VC(nV)%perspective,DIM=2) /= mhe_Wim .OR. &
                  ubound(VC(nV)%perspective,DIM=3) /= mhe_Him) then
                  deallocate(VC(nV)%perspective)
               end if
            end if
            if( .NOT. allocated(VC(nV)%perspective) ) then
               allocate( VC(nV)%perspective(1:3,1:mhe_Wim,1:mhe_Him) )
            end if

! viewing increments, assume W and H are divisible by 4
            dW = mhe_Alpha * PI/180.0_4 / mhe_Wim
            dH = mhe_Beta  * PI/180.0_4 / mhe_Him

! convert to radians, no shrink .. have padding
            a_span = mhe_Alpha * PI / 180.0_4
            b_span = mhe_Beta  * PI / 180.0_4

            dWW = 2.0_4*tan(.5_4*a_span)/mhe_Wim
            dHH = 2.0_4*tan(.5_4*b_span)/mhe_Him
            do j=1,mhe_Him
               bb = (mhe_Him/2.0_4 - j)*dHH + dHH/2.0_4 ! angle coords, persp
               do i=1,mhe_Wim
                  aa = (mhe_Wim/2.0_4 - i)*dWW + dWW/2.0_4 ! angle coords, persp
                  aaa = atan(aa)               ! project
                  bbb = atan( bb / sqrt(aa*aa + 1.0_4) )
      ! interpolation: bilinear
                  ddx0 = mhe_Wim/2.0_4 - (aaa - dW/2.0_4)/dW ! hit spherical pix
                  ii = floor(ddx0)
                  ddx0 = ddx0 - ii
                  ddx1 = 1.0_4 - ddx0
                  
                  ddy0 = mhe_Him/2.0_4 - (bbb - dH/2.0_4)/dH
                  jj = floor(ddy0)
                  ddy0 = ddy0 - jj
                  ddy1 = 1.0_4 - ddy0
                  do k=1,3
                     VC(nV)%perspective(k,i,j) =                         &
                     ichar(VC(nV)%ppm_buffer2(k,ii,jj))    *ddx1*ddy1 +  &
                     ichar(VC(nV)%ppm_buffer2(k,ii+1,jj))  *ddx0*ddy1 +  &
                     ichar(VC(nV)%ppm_buffer2(k,ii,jj+1))  *ddx1*ddy0 +  &
                     ichar(VC(nV)%ppm_buffer2(k,ii+1,jj+1))*ddx0*ddy0
                  end do
               end do
            end do
            
            do j=1,mhe_Him ! copy back to ppm_buffer for writing .ppm
               do i=1,mhe_Wim
                  VC(nV)%ppm_buffer(1,i,j) = char(floor(VC(nV)%perspective(1,i,j)) )
                  VC(nV)%ppm_buffer(2,i,j) = char(floor(VC(nV)%perspective(2,i,j)) )
                  VC(nV)%ppm_buffer(3,i,j) = char(floor(VC(nV)%perspective(3,i,j)) )
               end do
            end do
! cccccccccccccc end spherical -> perspective projection ccccccccccccc
         else ! spherical

! char & byte are not vectorizable data types, but could be done
! in lower level fashion
            if( mhe_srendtype .eq. 1 .OR. mhe_srendtype .eq. 3) then ! polar remap
            do j=1,mhe_Him
            do i=1,mhe_Wim
               RGB(1) = 255.0_4 * VC(nV)%tim(1,i,j,1,kk)
               RGB(2) = 255.0_4 * VC(nV)%tim(2,i,j,1,kk)
               RGB(3) = 255.0_4 * VC(nV)%tim(3,i,j,1,kk)  

               VC(nV)%ppm_buffer(1,i,j) = char( floor(RGB(1)) ) ! R
               VC(nV)%ppm_buffer(2,i,j) = char( floor(RGB(2)) ) ! G
               VC(nV)%ppm_buffer(3,i,j) = char( floor(RGB(3)) ) ! B
            end do
            end do
            else ! not polar remapped, use im
            do j=1,mhe_Him
            do i=1,mhe_Wim
               RGB(1) = 255.0_4 * VC(nV)%im(1,i,j,1,kk)
               RGB(2) = 255.0_4 * VC(nV)%im(2,i,j,1,kk)
               RGB(3) = 255.0_4 * VC(nV)%im(3,i,j,1,kk)  

               VC(nV)%ppm_buffer(1,i,j) = char( floor(RGB(1)) ) ! R
               VC(nV)%ppm_buffer(2,i,j) = char( floor(RGB(2)) ) ! G
               VC(nV)%ppm_buffer(3,i,j) = char( floor(RGB(3)) ) ! B
            end do
            end do
            end if

! ------  done copy im to ppm_buffer

         end if ! perspective or spherical
! --------------------------------------------------------------------

#ifdef SREND_TIMING
         call system_clock(it2); print *,'Compose: in, before write file:',(it2-it1)*rc; it1 = it2
#endif



! write file now -----------------------------------------------------
! The following just writes the file, or pieces of it sequentially.
         fun = SREND_FILE_UNIT + nV
   
         if( mhe_tiles_right*mhe_tiles_down == 1) then
            rfname_name = trim(rfname_root(kk)) // '.ppm' ! always write a .ppm
            if( mhe_srendtype .eq. 0) then ! single .ppm in one write
               open(unit=fun,status='unknown', file=rfname_name,access='stream')
               write(header,"('P6',I5,1X,I5,'   255')") mhe_Wim, mhe_Him
               write (fun) trim(header(1:19)),achar(10) ! 20
            else if (mhe_srendtype .eq. 1) then ! npole
               open(unit=fun,status='unknown', file=rfname_name,access='stream')
               write(header,"('P6',I5,1X,I5,'   255')") mhe_Wim, mhe_Him*4
               write (fun) trim(header(1:19)),achar(10) ! 20
            else ! equatorial or spole
               open(unit=fun,status='unknown', file=VC(nV)%rfname(kk),access='stream',position='append')
            end if        
            write (fun) VC(nV)%ppm_buffer
            close (fun)
         else ! AMR tiling
            write(rfname_tile,"('_',I3.3,'_',I3.3,'.ppm')") mhe_TDi, mhe_TRi
            rfname_name = trim(rfname_root(kk)) // trim(rfname_tile)
            open(unit=fun,status='unknown', file=rfname_name,access='stream')
            write(header,"('P6',I5,1X,I5,'   255')") mhe_Wim, mhe_Him
            write (fun) trim(header(1:19)),achar(10) ! 20
            write (fun) VC(nV)%ppm_buffer
            close (fun)
         end if
         
      end do ! kk 1:mhe_nR ----------------------------------------------
      
      
! ------ NOTE: srend_tile_montage.F90 must be compiled to "montage" and be in working directory
! --------------------- called by all ranks to work, but only rank SREND_TILE_MONTAGE does it
!!#define SREND_TILE_MONTAGE 0
! ./motage method: ./montage tile.sss tile.sss.ppm [alpha beta]  !! angles if perspective
#ifndef SREND_NOMPI
#ifdef SREND_TILE_MONTAGE
      if( (mhe_srendtype == 0 .OR. mhe_srendtype == 3) .AND. mhe_tiles_right*mhe_tiles_down > 1) then
         call MPI_barrier(srend_COMM, MYer)
         if(MYid == SREND_TILE_MONTAGE) then ! one rank to do this, say defined as: SREND_TLE_MONTAGE 0
            do kk=1,mhe_nR
               rfname_command = './montage ' // trim(rfname_root(kk)) // ' ' // &
                                                trim(rfname_root(kk)) // '.ppm'
               if(mhe_perspective .eq. 1) then
                  write(fangles,"(f7.3,' ',f7.3)") mhe_alpha, mhe_beta
                  rfname_command = trim(rfname_command) // ' ' // trim(fangles) 
               end if
               rfname_command = trim(rfname_command) // '; rm ' // trim(rfname_root(kk)) // '_*'
               if( rfname_ext(kk) /= 'ppm') then
                  rfname_command = trim(rfname_command) // '; convert ' // trim(rfname_root(kk)) // '.ppm ' //&
                                                 trim(rfname_root(kk)) // '.'     // trim(rfname_ext(kk)) // &
                                                 ';rm ' // trim(rfname_root(kk)) //  '.ppm '
               end if
               call system( trim(rfname_command) ) ! if below fails
!               call execute_command_line(rfname_command,wait=.FALSE.) ! asynchronous
            end do
         end if
#ifdef SREND_CLEAR_FBUFS
         call clear_finish_buffs()
#endif
         return
      end if
#endif
#endif

! -------- convert with Imagemagick if not .ppm
      if( (mhe_srendtype .eq. 0 .OR. mhe_srendtype .eq. 3) .AND. mhe_tiles_right*mhe_tiles_down == 1) then ! don't do .ppm
         do kk=1,mhe_nR
            if(rfname_ext(kk)(1:3) /= 'ppm') then
               rfname_command = 'convert ' // trim(rfname_root(kk)) //  '.ppm ' //                         &
                                              trim(rfname_root(kk)) //  '.'     // trim(rfname_ext(kk)) // &
                                              ';rm ' // trim(rfname_root(kk)) //  '.ppm '
               call system( trim(rfname_command) )
!               call execute_command_line(rfname_command,wait=.FALSE.) ! asynchronous
            end if
         end do
      end if ! an image .ppm, .jpg, .png, regular=0 or spole=3, don't do equatorial or npole

! WWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWW


#if !defined(SREND_NOMPI) && defined(SREND_FTRACE)
      print *,'BASE MYid=',MYid,'srend_base: after write nsh=1'
#endif


#ifdef SREND_TIMING
      call system_clock(it2); print *,'Compose: in write after file write:',(it2-it1)*rc; it1 = it2
#endif


#ifdef SREND_CLEAR_FBUFS
      call clear_finish_buffs()
#endif

      return ! from rflag = 3

! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
! NOTE: The Portable Pix Map (PPM) format is directly read and 
! displayed by many browsers and desktops.  It is also a base format
! which ImageMagick and ffmpeg easily use; it amounts to a raw RGB 
! raster image with a header.
! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
!          ./ffmpeg -i %06d.ppm -b 1200k full.ogv
! This generates a smooth movie.
!   ffmpeg      : http://www.ffmpeg.org
!   -i %06d.ppm : input files ######.ppm, 000001.ppm, ...
!   -b 1200k    : bitrate 
!   target      : full.ogv
! wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww
! wwwwwwwwwwwwwww end : write to disk  wwwwwwwwwwwwwwwwwwwwwwwwwwwwwww
! wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww


   CONTAINS
   
! -----------------------------------------------------------------
! clears memory allocated during image finishing, if SREND_CLEAR_FBUFS defined
   subroutine clear_finish_buffs()
      if( allocated(VC(nV)%rfname) ) deallocate(VC(nV)%rfname)
      if( allocated(VC(nV)%ppm_buffer) ) deallocate(VC(nV)%ppm_buffer)
      if( allocated(VC(nV)%ppm_buffer2) ) deallocate(VC(nV)%ppm_buffer2)
      if( allocated(VC(nV)%perspective) ) deallocate(VC(nV)%perspective)
      if( allocated(VC(nV)%tim) ) deallocate(VC(nV)%tim)
      if( allocated(VC(nV)%im) ) deallocate(VC(nV)%im)
   end subroutine clear_finish_buffs
   

! --------------------------------------------------------------------
! sorts using oct90 and wh() structure in srend_base, L the list of indices
! This is called by srend_cope. srend_finish, and srend_tile_finish
   subroutine n2sort(L,n)
      INTEGER*4,intent(IN) :: n
      INTEGER*4 :: i,j,ii,jj
      INTEGER*4, intent(INOUT) :: L(1:n)
      
      do i=1,n-1
         do j=i+1, n
     
            ii=L(i)
            jj=L(j)

! The far corner of jj is "behind" the near corner of ii => swap
! This is not transitive.  A block which occludes nothing below
! rises to the top.  This is for back2front order, but can be
! reversed for front2back or eye2out.

if( oct(0,0,0,ii) .AND. wh(5,jj) < wh(11,ii) .AND. wh(6,jj) < wh(12,ii) .AND. wh(7,jj) < wh(13,ii)  ) goto 9994 ! 000
if( oct(0,0,1,ii) .AND. wh(5,jj) < wh(11,ii) .AND. wh(6,jj) < wh(12,ii) .AND. wh(13,jj) > wh(7,ii)  ) goto 9994 ! 001
if( oct(0,1,0,ii) .AND. wh(5,jj) < wh(11,ii) .AND. wh(12,jj) > wh(6,ii) .AND. wh(7,jj) < wh(13,ii)  ) goto 9994 ! 010
if( oct(0,1,1,ii) .AND. wh(5,jj) < wh(11,ii) .AND. wh(12,jj) > wh(6,ii) .AND. wh(13,jj) > wh(7,ii)  ) goto 9994 ! 011
if( oct(1,0,0,ii) .AND. wh(11,jj) > wh(5,ii) .AND. wh(6,jj) < wh(12,ii) .AND. wh(7,jj) < wh(13,ii)  ) goto 9994 ! 100
if( oct(1,0,1,ii) .AND. wh(11,jj) > wh(5,ii) .AND. wh(6,jj) < wh(12,ii) .AND. wh(13,jj) > wh(7,ii)  ) goto 9994 ! 101
if( oct(1,1,0,ii) .AND. wh(11,jj) > wh(5,ii) .AND. wh(12,jj) > wh(6,ii) .AND. wh(7,jj) < wh(13,ii)  ) goto 9994 ! 110
if( oct(1,1,1,ii) .AND. wh(11,jj) > wh(5,ii) .AND. wh(12,jj) > wh(6,ii) .AND. wh(13,jj) > wh(7,ii)  ) goto 9994 ! 111
            cycle
      
9994        continue
            L(i) = jj
            L(j) = ii

         end do
      end do
      
   end subroutine n2sort

   end subroutine Srend_base! Srend subroutine
! ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss


! +++++++++++ extra routine for Python scaling GUI ++++++++++++++
! This is for the ctl.py color table/scaling editor. Assume REAL*4 and INTEGER*4 as per ctl.py
   subroutine srend_scale(n,rdat,bdat,hist_rdat,hist_bdat,a,b,h,k,rmin,rmax)
      IMPLICIT none
      INTEGER*4,intent(IN) :: n
      REAL*4,intent(IN) :: rdat(1:n)
      character*1,intent(INOUT) :: bdat(1:n)
      INTEGER*4,intent(INOUT) :: hist_rdat(0:255), hist_bdat(0:255)! log scaled [0,100]
      REAL*4,intent(IN) :: a,b,h,k ! y = k + a*(x-h)/sqrt((x-h)^2 + b^2)
      REAL*4,intent(INOUT) :: rmax, rmin
      
      REAL*4 :: rspan,x,y, log_nrmax, log_nbmax
      INTEGER*4 :: i, iy, nrmax, nbmax 

      
      rmax = rdat(1); rmin = rdat(1); hist_bdat = 0 
      do i=1,n
         x = rdat(i)
         if( x>rmax ) rmax = x
         if( x<rmin ) rmin = x
         y = k + a*(x-h) / sqrt(x*x + b*b)
         if( isnan(y)) y = 0.0_4
         iy = max(0,min(255,floor(y)))
         bdat(i) = achar( iy )
         hist_bdat(iy) = hist_bdat(iy) + 1
      end do
      
      rspan = rmax - rmin
      hist_rdat = 0
      if( rspan == 0.0_4) then
         hist_rdat(128) = 100 ! all values x are the same
         hist_bdat = floor( 100.0_4 * hist_bdat / n) ! only 1 is non-zero and becomes 100
         return
      end if
      do i=1,n
         y = 255.0_4 * (rdat(i) - rmin) / rspan
         iy = floor(y)
         hist_rdat(iy) = hist_rdat(iy) + 1
      end do
      nrmax = 0; nbmax = 0
      do i=0,255
         if(hist_rdat(i) > nrmax) nrmax = hist_rdat(i)
         if(hist_bdat(i) > nbmax) nbmax = hist_bdat(i)
      end do
      log_nrmax = log(1.0_4+nrmax);log_nbmax = log(1.0_4+nbmax)
      hist_rdat = floor( 100.0_4 * log(1.0_4+hist_rdat)/log_nrmax )
      hist_bdat = floor( 100.0_4 * log(1.0_4+hist_bdat)/log_nbmax )
      
      end subroutine srend_scale

end module srend
! ********************************************************************
! ********************************************************************

! ---- ifdef SRENDERING ---- encloses everything
#endif
