! t_mpi_4R_2C_1F.F90  2016jul09  wetherbee
! -----------------------------------------------------------------------
! This is free and unencumbered software released into the public domain.
!
! Anyone is free to copy, modify, publish, use, compile, sell, or 
! distribute this software, either in source code form or as a compiled 
! binary, for any purpose, commercial or non-commercial, and by any 
! means.
!
! In jurisdictions that recognize copyright laws, the author or authors 
! of this software dedicate any and all copyright interest in the 
! software to the public domain. We make this dedication for the benefit 
! of the public at large and to the detriment of our heirs and 
! successors. We intend this dedication to be an overt act of 
! relinquishment in perpetuity of all present and future rights to this 
! software under copyright law.
!
! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
! EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
! MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
! IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 
! OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
! ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
! OTHER DEALINGS IN THE SOFTWARE.
!
! For more information, please refer to http://unlicense.org/
! ----------------------------------------------------------------------

!    - rendering to extra ranks (apart from rendering ranks) for compose and finish
!    - an intermediate compose collecting between render and finish
! --------------------------------------------------------------------
!
! render(0)   render(1)   render(2)  render(3)       compose(4 & 5) >>>> finish(6)
!        |           |           |          |           ^
!         \           \           \          \         /
!           ------------>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
!
! MPI: 4 rendering ranks + 2 compositors + 1 finish (7 ranks total)
! SET: XR=2,YR=2,ZR=1
! mpif90  Srend.F90 -O3 -c
! gcc sleep.c -c
! mpif90 t_mpi_4R_2C_1F.F90 Srend.o sleep.o -O3 -o t_mpi_4R_2C_1F
! mpirun -np 7 ./t_mpi_4R_2C_1F
! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc

! test Srend
! Build-time switches for Srend; set these same defines in Srend.F90.
! NOTE(review): the build lines above compile Srend.F90 separately, so the
! definitions below presumably only take effect if repeated there -- confirm.
#define SREND_COMM_DUP ! optional actually, but useful to prevent MPI clashing
#define SREND_YIELDING call sleep_n1() !one nanosecond, effectively a sched_yield
! keep the following define commented out for this MPI test
!#define SREND_NOMPI ! must be commented out for MPI, defined for SMP/serial
! cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc

! ranks in each dimension, will be ordered xyz
! volume will be split in each xyz dimension by XR, YR, ZR respectively
#define XR 2
#define YR 2
#define ZR 1
! total number of rendering ranks; parenthesized so the product stays a single
! operand wherever the macro is expanded (for example after a division)
#define RR (XR*YR*ZR)
! dimensions of N^3 volume to create by formula
#define N 256

program t_mpi_4R_2C_1F ! extra ranks
! Test driver for Srend using a three-stage MPI layout:
!   ranks 0 .. RR-1  build one sub-block of a synthetic N^3 byte volume each
!                    and call srend_render on it,
!   ranks RR, RR+1   call srend_compose with two sources and target rank RR+2,
!   rank  RR+2       calls srend_finish with two sources.
! num_passes passes are run; the eye position is scaled from E0 toward the
! origin from pass to pass, and every pass ends with a barrier on
! MPI_COMM_WORLD that all three rank groups take part in.
#ifdef _OPENMP
      USE OMP_LIB
#endif
      USE srend

      IMPLICIT NONE
      include 'mpif.h'

      character*1,dimension(:,:,:),allocatable :: fv ! data
      character,save :: filenames*200
! --------------------------------------------------------------------
! loop variables in main program
      integer tt
! number of passes; the render, compose and finish loops must all run the
! same number of times because each pass ends in a world-wide barrier
      integer*4, parameter :: num_passes = 4
! for timing
      integer*8,save :: it1, it2, itc, total0, total1
      real*4,save :: rc
! MPI
      integer*4 :: MYer, MYid=0, MYn
! flythrough parameter
      real*4, dimension(3),save :: E, E0, V
! counters
      integer*4 :: i,j,k
      real*4 :: x,y,z, cr
! target MPI ranks for compositing
      integer*4 :: xoff, yoff, zoff       ! xyz offsets for data
      integer*4 :: nx, ny, nz             ! xyz dimensions of data
      integer*4 :: C_rank(1:1)  ! compositor rank, 4 or 5

! ccccccccc end variable declarations cccccccccccccccccccccccccccccccc

      call system_clock(it1,itc) ! start timer
      rc = 1.0 / itc             ! rc = 1/ticks_per_second

! init MPI
! NOTE(review): plain MPI_init is used although the render loop below runs
! inside an OpenMP parallel region; confirm the thread support level that
! Srend requires.
      call MPI_init(MYer)
      call MPI_comm_rank(MPI_COMM_WORLD, MYid, MYer)
      call MPI_comm_size(MPI_COMM_WORLD, MYn, MYer)
      call MPI_barrier(MPI_COMM_WORLD,MYer) ! all meet here
      print *,'MPI init: MYid=',MYid,'Ranks=',MYn, 'volume N^3, N=', N

! The layout needs RR renderers + 2 compositors + 1 finisher. Every rank
! sees the same communicator size, so on a mismatch all ranks take this
! branch together and MPI can be shut down cleanly before stopping.
      if(MYn /= RR+2+1) then
        print *,'ranks needed=', RR+2+1
        print *,'XR,YR,ZR=', XR,YR,ZR
        call MPI_finalize(MYer)
        STOP 1
      end if

      if(MYid < RR) then ! rendering ranks: 0,1,2,3

! compositor rank: even renderers send to the first compositor, odd ones
! to the second
        C_rank(1) = RR + mod(MYid,2) ! MYid=0,2 C_rank=4; MYid=1,3 C_rank=5

! find width of each block (integer division: N is assumed to be a
! multiple of the rank count in each dimension)
        nx = N/XR
        ny = N/YR
        nz = N/ZR

! find offsets for the block of this rank, ranks ordered x fastest, then y, z
        xoff = mod(MYid,XR) * nx
        yoff = mod(MYid,XR*YR)/XR * ny
        zoff = MYid/(XR*YR) * nz

! debug
        print *,'Renderer: MYid=',MYid,' nxyz=',nx,ny,nz,'xyzoff=',xoff,yoff,zoff

! allocate array for rendering: nx+2, ny+2, nz+2 cells, indexed from the
! block offset up to offset+width+1
        allocate( fv(xoff:xoff+nx+1,yoff:yoff+ny+1,zoff:zoff+nz+1) )
        print *,'Allocated: MYid=',MYid,'fv size(fv)=',size(fv) ! debug

! create data: fv array of char*1, attempt to create interesting data for tests
! x,y,z map the global cell index onto the range 1 .. -1
        do k=zoff,zoff+nz+1
          z = 1. - (2.*k)/N
          do j=yoff,yoff+ny+1
            y = 1. - (2.*j)/N
            do i=xoff,xoff+nx+1
              x = 1. - (2.*i)/N
              cr = 4.0 + min(1.,1./(x*x+y*y+z*z + .1)**4)*(124.+124./3.*(sin(1./(x*y+.1))+ &
                   cos(1./(x+y*z+.1))+sin(1.0/(z*x+y+.1))**2) )
              fv(i,j,k) = char(floor(cr))
            end do
          end do
        end do

! wireframe on the edges of the whole volume: each rank writes char(0) on
! the global edges that touch its own block
! edges parallel to x
        if(yoff==0    .AND. zoff==0   ) fv(:    ,0:1  ,0:1  ) = char(0)
        if(yoff==0    .AND. zoff+nz==N) fv(:    ,0:1  ,N:N+1) = char(0)
        if(yoff+ny==N .AND. zoff==0   ) fv(:    ,N:N+1,0:1  ) = char(0)
        if(yoff+ny==N .AND. zoff+nz==N) fv(:    ,N:N+1,N:N+1) = char(0)

! edges parallel to y
        if(xoff==0    .AND. zoff==0   ) fv(0:1  ,:    ,0:1  ) = char(0)
        if(xoff+nx==N .AND. zoff==0   ) fv(N:N+1,:    ,0:1  ) = char(0)
        if(xoff==0    .AND. zoff+nz==N) fv(0:1  ,:    ,N:N+1) = char(0)
        if(xoff+nx==N .AND. zoff+nz==N) fv(N:N+1,:    ,N:N+1) = char(0)

! edges parallel to z
        if(xoff==0    .AND. yoff==0   ) fv(0:1  ,0:1  ,:    ) = char(0)
        if(xoff+nx==N .AND. yoff==0   ) fv(N:N+1,0:1  ,:    ) = char(0)
        if(xoff==0    .AND. yoff+ny==N) fv(0:1  ,N:N+1,:    ) = char(0)
        if(xoff+nx==N .AND. yoff+ny==N) fv(N:N+1,N:N+1,:    ) = char(0)

! parameters for run
        E0 = (/0.5,1.3,1.7/)  ! initial eye position
        V = -E0               ! view vector

        call system_clock(it2)
        print *,'Prepped: MYid=',MYid,' span=',(it2-it1)*rc
        total0 = it2 ! starting time before compute loops, prep time is not counted


! PPPPPPPPPPP parallel region PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP
! export OMP_NUM_THREADS=4 , or however many threads should be used here
! NOTE: All threads render.
!$OMP PARALLEL default(shared) private(tt)

        do tt= 1,num_passes

! only the master thread updates the shared per-pass state; the barrier
! after it makes the new values visible before any thread renders
!$OMP MASTER
          print *,'Render pass start: MYid=',MYid,' pass = ',tt
          call system_clock(it1)
          write(filenames,'("r/",I6.6,".ppm,")') tt  !pass number within: r/000001.ppm, trailing "," needed
          E = E0 * (num_passes+1.0-tt)/num_passes      !move eye through volume
!$OMP END MASTER
!$OMP BARRIER

          call srend_render(  &! rendering multiple blocks
          1,1,                    &! nV, nV_out
          E,                      &! Eye position
          V,                      &! View vector
          (/0.,1.,0./),           &! Up vector
          120.0, 120.0,           &! Alpha-horz, Beta-vert
          0.0,                    &! stereo: EyeRight
          -10.0,10.0,             &! x0,x1 -- clipping planes perpendicular to axes
          -10.0,10.0,             &! y0,y1
          -10.0,10.0,             &! z0,z1
          0.1, 10.0, 1,           &! near clip0,far clip1,nsh
          180.0,                  &! farpolar clip
          1,                      &! 0 spherical, 1 perspective
          0,                      &! 0=norm, 1=npole, 2=equator,3=spole
          0.25,                   &! sampling in cell units,(-)for out>in
          fv,                     &! data array
          nx,ny,nz,               &! XN,YN,ZN,Bd,AMR of data array passed in
          1,                      &! Bd
          xoff,yoff,zoff,         &! offset
          N,                      &! Vdim
          1024, 1024,             &! rendering Width, Height in pixels
          1,                      &! nR
          (/48/),                 &! cotab_offset(1:nR), use 1 or 48 for tests
          filenames,              &! output file name masks
          1,1,                    &! tiles right, tiles down: usually 1,1
          C_rank)                  ! target MPI rank

! sync threads between calls
!$OMP BARRIER

! the master thread reports the pass time and joins the world-wide barrier;
! the other threads wait for it at the barrier of the next pass
!$OMP MASTER
          call system_clock(it2)
          print *,'Render pass done: MYid=',MYid,' pass=',tt,' span=',(it2-it1)*rc

          call MPI_barrier(MPI_COMM_WORLD,MYer) ! all meet here, renderers
!$OMP END MASTER

        end do ! tt pass loop
!$OMP END PARALLEL
! pppppppppp end parallel region ppppppppppppppppppppppppppppppppppppp

      else if(MYid == RR .OR. MYid == RR+1) then ! MYid = RR,RR+1 = 4,5 here
! compositors: two sources each (the mod(MYid,2) mapping above with four
! renderers), result goes to the finisher rank RR+2
        do tt=1,num_passes
          call system_clock(it1)

          call srend_compose(1,1,2,RR+2)  ! nV, nV_out, numsources, target MPI rank

          call MPI_barrier(MPI_COMM_WORLD,MYer) ! all meet here, compositors

          call system_clock(it2)
          print *,'MYid=',MYid,'finished: tt=',tt,'span=',(it2-it1)*rc
        end do
      else ! finish, MYid = RR+2 = 6 here
! finisher: two sources, one per compositor rank
        do tt=1,num_passes
          call system_clock(it1)

          call srend_finish(1,2)  ! nV, numsources

          call MPI_barrier(MPI_COMM_WORLD,MYer) ! all meet here, finisher

          call system_clock(it2)
          print *,'MYid=',MYid,'finished: tt=',tt,'span=',(it2-it1)*rc
        end do
      end if ! render or composite/finish

      print *,'Before barrier MYid=',MYid
      call MPI_barrier(MPI_COMM_WORLD,MYer) ! all meet here
      print *,'After barrier MYid=',MYid

! total0 is only set on rendering ranks; rank 0 is one of them
      if(MYid == 0) then
         call system_clock(it2)
         total1 = it2
         print *,'total runtime in calc loop (node 0) =', (total1-total0)*rc
      end if

      call MPI_finalize(MYer)

end program t_mpi_4R_2C_1F ! program main



