!-----------------------------------------------------------------------------!
!   CP2K: A general program to perform molecular dynamics simulations         !
!   Copyright (C) 2000 - 2010  CP2K developers group                          !
!-----------------------------------------------------------------------------!

! *****************************************************************************
!> \brief Calculate the plane wave density for each molecule independently
!>      by collocating the primitive Gaussian functions on the grids
!>      constructed in the molecolar box. The molecule is centered in the
!>      small box and no PBC are used.
!>      Each molecule is handled by one processor entirely, therefore
!>      no further distribution is allowed, unless OMP is used (if it works)
!> \author MI (05.01.2005)
! *****************************************************************************
MODULE kg_gpw_collocate_den

  USE atomic_kind_types,               ONLY: atomic_kind_type,&
                                             get_atomic_kind,&
                                             get_atomic_kind_set
  USE basis_set_types,                 ONLY: get_gto_basis_set,&
                                             gto_basis_set_type
  USE cell_types,                      ONLY: cell_type,&
                                             pbc
  USE cp_control_types,                ONLY: dft_control_type
  USE cp_dbcsr_interface,              ONLY: cp_dbcsr_distribution,&
                                             cp_dbcsr_get_block_p
  USE cp_dbcsr_operations,             ONLY: cp_dbcsr_deallocate_matrix,&
                                             cp_dbcsr_from_sm,&
                                             sm_from_dbcsr
  USE cp_dbcsr_types,                  ONLY: cp_dbcsr_type
  USE cube_utils,                      ONLY: cube_info_type
  USE dbcsr_methods,                   ONLY: dbcsr_distribution_mp
  USE distribution_2d_types,           ONLY: distribution_2d_type
  USE gaussian_gridlevels,             ONLY: gaussian_gridlevel,&
                                             gridlevel_info_type
  USE input_section_types,             ONLY: section_vals_get_subs_vals,&
                                             section_vals_type
  USE kinds,                           ONLY: dp
  USE lgrid_types,                     ONLY: lgrid_type
  USE memory_utilities,                ONLY: reallocate
  USE orbital_pointers,                ONLY: ncoset
  USE pw_env_types,                    ONLY: pw_env_get,&
                                             pw_env_type
  USE pw_methods,                      ONLY: pw_integrate_function
  USE pw_types,                        ONLY: pw_p_type
  USE qs_collocate_density,            ONLY: collocate_pgf_product_rspace,&
                                             density_rs2pw
  USE qs_environment_types,            ONLY: get_qs_env,&
                                             qs_environment_type
  USE qs_integrate_potential,          ONLY: integrate_pgf_product_rspace,&
                                             potential_pw2rs
  USE qs_modify_pab_block,             ONLY: FUNC_AB,&
                                             FUNC_DADB
  USE realspace_grid_types,            ONLY: realspace_grid_desc_p_type,&
                                             realspace_grid_p_type,&
                                             rs_grid_create,&
                                             rs_grid_release,&
                                             rs_grid_zero
  USE sparse_matrix_types,             ONLY: real_matrix_p_type
  USE termination,                     ONLY: stop_memory,&
                                             stop_program
  USE timings,                         ONLY: timeset,&
                                             timestop
#include "cp_common_uses.h"

  IMPLICIT NONE
  CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'kg_gpw_collocate_den'

  INTEGER, PARAMETER                       :: add_tasks = 1000, &
                                              max_tasks = 2000
  REAL(kind=dp), PARAMETER                 :: mult_tasks = 2.0_dp

! *** Public functions ***

  PUBLIC :: calculate_rho_mol, integrate_mol_potential

CONTAINS

!****f* kg_gpw_collocate_den/calculate_rho_mol

! *****************************************************************************
!> \brief This function is called within a loop over all the molecules
!>      to calculate the molecular density on the molecular grid only.
!>      The molecular densities are used by KG_GPW to calculate the Kinetic energy
!>      correction by subtracting the sum of the kinetic energies calculated
!>      for each molecule independently
!> \param qs_env the qs environment
!> \param matrix_p global density matrix
!> \param rho_r molecular density on the molecular real space mesh
!> \param rho_g molecular density on the molecular reciprocal space mesh
!> \param total_rho integral of the molecular density (output)
!> \param atom global index of the atoms in the molecule (input)
!> \param kind index of the kind of the atoms in the molecule (input)
!> \param ratom internal coordinates of the atoms in the box of the molecule (input)
!> \param compute_tau logical to control if tau is required
!> \param error variable to control error logging, stopping,...
!>        see module cp_error_handling
!> \note
!>      In KG_GPW a molecular pw environment is created for each molecule kind
!>      which is based on a molecular cell box that does not use PBC
!>      The molecule is located in the center of the bos and the coordinates passed
!>      to this routine are internal coordinates.
!>      At this level, it should not be necessary to have the atomic positions
!>      in the global box
!> \author MI
! *****************************************************************************
  SUBROUTINE calculate_rho_mol(qs_env,matrix_p,rho_r,rho_g,total_rho,pw_env,&
                               atom,kind,ratom,compute_tau,error)

    TYPE(qs_environment_type), POINTER       :: qs_env
    TYPE(cp_dbcsr_type), POINTER             :: matrix_p
    TYPE(pw_p_type), INTENT(INOUT)           :: rho_r, rho_g
    REAL(dp), INTENT(OUT)                    :: total_rho
    TYPE(pw_env_type), POINTER               :: pw_env
    INTEGER, DIMENSION(:), INTENT(IN)        :: atom, kind
    REAL(dp), DIMENSION(:, :), INTENT(IN)    :: ratom
    LOGICAL, INTENT(IN), OPTIONAL            :: compute_tau
    TYPE(cp_error_type), INTENT(inout)       :: error

    CHARACTER(LEN=*), PARAMETER :: routineN = 'calculate_rho_mol', &
      routineP = moduleN//':'//routineN

    INTEGER :: bcol, brow, curr_tasks, dir, first_pgfb, first_setb, &
      ga_gb_function, handle, i, iat, iatom, igrid_level, ikind, ikind_old, &
      ipgf, iset, istat, itask, ithread, j, jat, jatom, jkind, jkind_old, &
      jpgf, jset, k, maxco, maxsgf, maxsgf_set, n, na1, na2, nat_mol, nb1, &
      nb2, ncoa, ncob, npme, nseta, nsetb, nthread, num_tasks, sgfa, sgfb, tp
    INTEGER, DIMENSION(:), POINTER           :: la_max, la_min, lb, lb_max, &
                                                lb_min, npgfa, npgfb, nsgfa, &
                                                nsgfb, ntasks, ub
    INTEGER, DIMENSION(:, :), POINTER        :: first_sgfa, first_sgfb, ival, &
                                                latom, tasks_local
    INTEGER, DIMENSION(:, :, :), POINTER     :: tasks
    LOGICAL                                  :: distributed_rs_grids, &
                                                failure, found, &
                                                map_consistent, my_compute_tau
    REAL(KIND=dp)                            :: dab, eps_rho_rspace, &
                                                kind_radius_b, rab2, scale, &
                                                zetp
    REAL(KIND=dp), DIMENSION(3)              :: ra, rab, rb, rp
    REAL(KIND=dp), DIMENSION(:), POINTER     :: set_radius_a, set_radius_b
    REAL(KIND=dp), DIMENSION(:, :), POINTER  :: p_block, pab, rpgfa, rpgfb, &
                                                sphi_a, sphi_b, work, zeta, &
                                                zetb
    REAL(KIND=dp), DIMENSION(:, :, :), &
      POINTER                                :: pabt, workt
    TYPE(atomic_kind_type), DIMENSION(:), &
      POINTER                                :: atomic_kind_set
    TYPE(atomic_kind_type), POINTER          :: atomic_kind
    TYPE(cell_type), POINTER                 :: cell
    TYPE(cube_info_type), DIMENSION(:), &
      POINTER                                :: cube_info
    TYPE(dft_control_type), POINTER          :: dft_control
    TYPE(gridlevel_info_type), POINTER       :: gridlevel_info
    TYPE(gto_basis_set_type), POINTER        :: orb_basis_a, orb_basis_b
    TYPE(lgrid_type)                         :: lgrid
    TYPE(realspace_grid_desc_p_type), &
      DIMENSION(:), POINTER                  :: rs_descs
    TYPE(realspace_grid_p_type), &
      DIMENSION(:), POINTER                  :: rs_rho
    TYPE(section_vals_type), POINTER         :: input, interp_section

!$  INTEGER                                  :: omp_get_max_threads, omp_get_thread_num

    failure=.FALSE.

    ! by default, do not compute the kinetic energy density (tau)
    ! if compute_tau, all grids referencing to rho are actually tau
    IF (PRESENT(compute_tau)) THEN
       my_compute_tau = compute_tau
    ELSE
       my_compute_tau = .FALSE.
    ENDIF

    IF (my_compute_tau) THEN
       CALL timeset(routineN,handle)
       ga_gb_function = FUNC_DADB
    ELSE
       CALL timeset(routineN,handle)
       ga_gb_function = FUNC_AB
    ENDIF

    NULLIFY(atomic_kind_set,dft_control,input)
    CALL get_qs_env(qs_env=qs_env,&
                    atomic_kind_set=atomic_kind_set,&
                    dft_control=dft_control,&
                    input=input,&
                    cell=cell,&
                    error=error)

    ! *** assign from pw_env
    NULLIFY(gridlevel_info,cube_info,interp_section)
    gridlevel_info=>pw_env%gridlevel_info
    cube_info=>pw_env%cube_info

    interp_section => section_vals_get_subs_vals(input,"DFT%MGRID%INTERPOLATOR",&
         error=error)

    ! *** set up the pw multi-grids
    NULLIFY(rs_descs)
    CPPrecondition(ASSOCIATED(pw_env),cp_failure_level,routineP,error,failure)
    CALL pw_env_get(pw_env, rs_descs=rs_descs, error=error)

    ! *** set up the rs multi-grids
    distributed_rs_grids=.FALSE.
    NULLIFY(rs_rho)
    ALLOCATE (rs_rho(SIZE(rs_descs)),STAT=istat)
    IF (istat /= 0) CALL stop_memory(routineP,"rs_rho")
    DO i=1,SIZE(rs_rho)
       CALL rs_grid_create(rs_rho(i)%rs_grid, rs_descs(i)%rs_desc, error=error)
       CALL rs_grid_zero(rs_rho(i)%rs_grid)
    END DO

    eps_rho_rspace = dft_control%qs_control%eps_rho_rspace
    map_consistent = dft_control%qs_control%map_consistent
    nthread = 1
!$  nthread = omp_get_max_threads()

!   *** Allocate work storage ***

    CALL get_atomic_kind_set(atomic_kind_set=atomic_kind_set,&
                             maxco=maxco,&
                             maxsgf=maxsgf,&
                             maxsgf_set=maxsgf_set)

    IF ( nthread > 1 ) THEN
      NULLIFY(lgrid%r)
      n=0
      DO igrid_level = 1,gridlevel_info%ngrid_levels
        n = MAX(n,rs_rho(igrid_level)%rs_grid%ngpts_local)
      END DO
      n = n*nthread
      CALL reallocate(lgrid%r,1,n)
    END IF

    NULLIFY(pabt,workt,ntasks,tasks,tasks_local,ival,latom)
    CALL reallocate(pabt,1,maxco,1,maxco,0,nthread-1)
    CALL reallocate(workt,1,maxco,1,maxsgf_set,0,nthread-1)
    CALL reallocate(ntasks,1,gridlevel_info%ngrid_levels)
    CALL reallocate(tasks,1,11,1,max_tasks,1,gridlevel_info%ngrid_levels)
    CALL reallocate(tasks_local,1,4,1,max_tasks)
    CALL reallocate(ival,1,6,1,max_tasks)
    CALL reallocate(latom,1,2,1,max_tasks)
    curr_tasks = max_tasks

    nat_mol = SIZE(atom,1)

    ikind_old = 0
    jkind_old = 0
    DO iat = 1,nat_mol
       ikind = KIND(iat)
       iatom = atom(iat)
       ra(1:3) = ratom(1:3,iat)

       IF(ikind /= ikind_old) THEN
         NULLIFY(atomic_kind,orb_basis_a,la_max,la_min,npgfa,nsgfa,&
                 rpgfa,set_radius_a,sphi_a,zeta)
         atomic_kind => atomic_kind_set(ikind)

         CALL get_atomic_kind(atomic_kind=atomic_kind,&
                              orb_basis_set=orb_basis_a)

         IF (.NOT.ASSOCIATED(orb_basis_a)) CYCLE
         CALL get_gto_basis_set(gto_basis_set=orb_basis_a,&
                                first_sgf=first_sgfa,&
                                lmax=la_max,&
                                lmin=la_min,&
                                npgf=npgfa,&
                                nset=nseta,&
                                nsgf_set=nsgfa,&
                                pgf_radius=rpgfa,&
                                set_radius=set_radius_a,&
                                sphi=sphi_a,&
                                zet=zeta)
         ikind_old = ikind
       END IF
       IF (.NOT.ASSOCIATED(orb_basis_a)) CYCLE

       DO jat = iat, nat_mol
          jkind = KIND(jat)
          jatom = atom(jat)
          rb(1:3) = ratom(1:3,jat)

          IF(jkind /= jkind_old) THEN
            NULLIFY(atomic_kind,orb_basis_b,lb_max,lb_min,npgfb,nsgfb,&
                 rpgfb,set_radius_b,sphi_b,zetb)

            atomic_kind => atomic_kind_set(jkind)

            CALL get_atomic_kind(atomic_kind=atomic_kind,&
                              orb_basis_set=orb_basis_b)

            IF (.NOT.ASSOCIATED(orb_basis_b)) CYCLE

            CALL get_gto_basis_set(gto_basis_set=orb_basis_b,&
                                   first_sgf=first_sgfb,&
                                   kind_radius=kind_radius_b,&
                                   lmax=lb_max,&
                                   lmin=lb_min,&
                                   npgf=npgfb,&
                                   nset=nsetb,&
                                   nsgf_set=nsgfb,&
                                   pgf_radius=rpgfb,&
                                   set_radius=set_radius_b,&
                                   sphi=sphi_b,&
                                   zet=zetb)
            jkind_old = jkind
          END IF
          IF (.NOT.ASSOCIATED(orb_basis_b)) CYCLE

          ntasks = 0
          tasks = 0

          IF (iatom <= jatom) THEN
             brow = iatom
             bcol = jatom
          ELSE
             brow = jatom
             bcol = iatom
          END IF

          ! bad, should do better loop ordering XXXXXXXXXX
          NULLIFY(p_block)
          CALL cp_dbcsr_get_block_p(matrix=matrix_p,&
               row=brow,col=bcol,BLOCK=p_block,found=found)
          IF (.NOT.ASSOCIATED(p_block)) CYCLE

          IF (.NOT. map_consistent) THEN
             IF ( ALL ( 100.0_dp*ABS(p_block) < eps_rho_rspace) ) CYCLE
          END IF

          rab2 = 0.0_dp
          DO i = 1,3
            rab(i) = rb(i)-ra(i)
            rab2 = rab2 + rab(i)*rab(i)
          END DO
          dab = SQRT(rab2)

          DO iset=1,nseta

            IF (set_radius_a(iset) + kind_radius_b < dab) CYCLE

            IF (iatom == jatom) THEN
               first_setb = iset
            ELSE
               first_setb = 1
            END IF

            DO jset=first_setb,nsetb

              IF (set_radius_a(iset) + set_radius_b(jset) < dab) CYCLE

              DO ipgf=1,npgfa(iset)

                IF (rpgfa(ipgf,iset) + set_radius_b(jset) < dab) CYCLE

                IF ((iatom == jatom).AND.(iset == jset)) THEN
                  first_pgfb = ipgf
                ELSE
                  first_pgfb = 1
                END IF

                DO jpgf=first_pgfb,npgfb(jset)

                  IF (rpgfa(ipgf,iset) + rpgfb(jpgf,jset) < dab) CYCLE

                  zetp = zeta(ipgf,iset) + zetb(jpgf,jset)

                  igrid_level = gaussian_gridlevel(gridlevel_info,zetp)

                  ntasks(igrid_level) = ntasks(igrid_level) + 1
                  n = ntasks(igrid_level)
                  IF ( n > curr_tasks ) THEN
                    curr_tasks = curr_tasks*mult_tasks
                    CALL reallocate(tasks,1,11,1,curr_tasks,&
                                    1,gridlevel_info%ngrid_levels)
                  END IF

                  tasks (1,n,igrid_level) = n

                  DO dir = 1,3
                     rp(:) = ra(:) + zetb(jpgf,jset)/zetp*rab(:)
                     rp(:) = pbc(rp,cell)
                     tp = FLOOR(DOT_PRODUCT(cell%h_inv(dir,:),rp)*rs_rho(igrid_level)%rs_grid%desc%npts(dir))
                     tp = MODULO ( tp, rs_rho(igrid_level)%rs_grid%desc%npts(dir) )
                     tasks (8+dir,n,igrid_level) = tp + rs_rho(igrid_level)%rs_grid%desc%lb(dir)
                  END DO

                  tasks (3,n,igrid_level) = iatom
                  tasks (4,n,igrid_level) = jatom
                  tasks (5,n,igrid_level) = iset
                  tasks (6,n,igrid_level) = jset
                  tasks (7,n,igrid_level) = ipgf
                  tasks (8,n,igrid_level) = jpgf

                END DO  ! jpgf
              END DO  ! ipgf
            END DO  ! jset
          END DO  ! iset

          DO igrid_level = 1, gridlevel_info%ngrid_levels
            num_tasks = ntasks ( igrid_level )
            IF ( num_tasks  > SIZE ( tasks_local, 2 ) ) &
              CALL reallocate(tasks_local,1,4,1,num_tasks )
            IF ( num_tasks  > SIZE ( ival, 2 ) ) &
              CALL reallocate(ival,1,6,1,num_tasks )
            IF ( num_tasks  > SIZE ( latom, 2 ) ) &
              CALL reallocate(latom,1,2,1,num_tasks )

!$OMP parallel do private(i)
            DO i=1,num_tasks
              tasks_local(1,i) = tasks(1,i,igrid_level)
              tasks_local(2,i) = tasks(9,i,igrid_level)
              tasks_local(3,i) = tasks(10,i,igrid_level)
              tasks_local(4,i) = tasks(11,i,igrid_level)
              latom(1,i) = tasks(3,i,igrid_level)
              latom(2,i) = tasks(4,i,igrid_level)
              ival(1,i) = tasks(3,i,igrid_level)
              ival(2,i) = tasks(4,i,igrid_level)
              ival(3,i) = tasks(5,i,igrid_level)
              ival(4,i) = tasks(6,i,igrid_level)
              ival(5,i) = tasks(7,i,igrid_level)
              ival(6,i) = tasks(8,i,igrid_level)
            END DO

!$OMP parallel do private(i)
            DO i=num_tasks+1,SIZE(tasks_local,2)
              tasks_local(1,i) = 0
              tasks_local(2,i) = 0
              tasks_local(3,i) = 0
              tasks_local(4,i) = 0
            END DO
            npme = 0

            IF ( nthread > 1 ) THEN
              lb => rs_rho(igrid_level)%rs_grid%lb_local
              ub => rs_rho(igrid_level)%rs_grid%ub_local
              lgrid%ldim = rs_rho(igrid_level)%rs_grid%ngpts_local
!$OMP parallel private(ithread,n)
!$            ithread = omp_get_thread_num()
              n = ithread*lgrid%ldim + 1
              CALL dcopy(lgrid%ldim,0._dp,0,lgrid%r(n),1)
!$OMP end parallel
            END IF

!$OMP parallel &
!$OMP default(none) &
!$OMP private(ithread,itask,iset,jset,ncoa,ncob,sgfa,sgfb) &
!$OMP private(work,pab,istat,ipgf,jpgf,na1,na2,nb1,nb2,scale) &
!$OMP shared(iatom,jatom,ra,rb,rab,rab2,brow,bcol,p_block) &
!$OMP shared(maxco,maxsgf_set,ival,num_tasks) &
!$OMP shared(npgfa,npgfb,ncoset,la_max,lb_max,first_sgfa,first_sgfb) &
!$OMP shared(nsgfa,nsgfb,sphi_a,sphi_b,la_min,lb_min,zeta,zetb) &
!$OMP shared(rs_rho,igrid_level,cube_info,eps_rho_rspace,lgrid,nthread) &
!$OMP shared(workt,pabt,ga_gb_function,map_consistent,error) &
!$OMP shared(cell)
            ithread = 0
!$          ithread = omp_get_thread_num()
            pab => pabt(:,:,ithread)
            work => workt(:,:,ithread)
!$OMP do
            DO itask = 1,num_tasks

              IF (.NOT.ASSOCIATED(p_block)) &
                  CALL stop_program(routineP,"p_block not associated in matrixp")
              iset = ival (3,itask)
              jset = ival (4,itask)
              ncoa = npgfa(iset)*ncoset(la_max(iset))
              sgfa = first_sgfa(1,iset)
              ncob = npgfb(jset)*ncoset(lb_max(jset))
              sgfb = first_sgfb(1,jset)
              IF (iatom <= jatom) THEN
                CALL dgemm("N","N",ncoa,nsgfb(jset),nsgfa(iset),&
                          1.0_dp,sphi_a(1,sgfa),SIZE(sphi_a,1),&
                          p_block(sgfa,sgfb),SIZE(p_block,1),&
                          0.0_dp,work(1,1),maxco)
                CALL dgemm("N","T",ncoa,ncob,nsgfb(jset),&
                          1.0_dp,work(1,1),maxco,&
                          sphi_b(1,sgfb),SIZE(sphi_b,1),&
                          0.0_dp,pab(1,1),maxco)
              ELSE
                CALL dgemm("N","N",ncob,nsgfa(iset),nsgfb(jset),&
                          1.0_dp,sphi_b(1,sgfb),SIZE(sphi_b,1),&
                          p_block(sgfb,sgfa),SIZE(p_block,1),&
                          0.0_dp,work(1,1),maxco)
                CALL dgemm("N","T",ncob,ncoa,nsgfa(iset),&
                          1.0_dp,work(1,1),maxco,&
                          sphi_a(1,sgfa),SIZE(sphi_a,1),&
                          0.0_dp,pab(1,1),maxco)
              END IF
              ipgf   = ival (5,itask)
              jpgf   = ival (6,itask)
              na1 = (ipgf - 1)*ncoset(la_max(iset)) + 1
              na2 = ipgf*ncoset(la_max(iset))
              nb1 = (jpgf - 1)*ncoset(lb_max(jset)) + 1
              nb2 = jpgf*ncoset(lb_max(jset))

              IF ((iatom == jatom).AND.&
                (iset == jset).AND.&
                (ipgf == jpgf)) THEN
                scale = 1.0_dp
              ELSE
                scale = 2.0_dp
              END IF

              IF ( nthread > 1 ) THEN
                IF (iatom <= jatom) THEN
                  CALL collocate_pgf_product_rspace(&
                       la_max(iset),zeta(ipgf,iset),la_min(iset),&
                       lb_max(jset),zetb(jpgf,jset),lb_min(jset),&
                       ra,rab,rab2,scale,pab,na1-1,nb1-1,&
                       rs_rho(igrid_level)%rs_grid,cell,cube_info(igrid_level),&
                       eps_rho_rspace,&
                       ga_gb_function=ga_gb_function,&
                       lgrid=lgrid,ithread=ithread, &
                       map_consistent=map_consistent,error=error)
                ELSE
                  CALL collocate_pgf_product_rspace(&
                       lb_max(jset),zetb(jpgf,jset),lb_min(jset),&
                       la_max(iset),zeta(ipgf,iset),la_min(iset),&
                       rb,-rab,rab2,scale,pab,nb1-1,na1-1,&
                       rs_rho(igrid_level)%rs_grid,cell,cube_info(igrid_level),&
                       eps_rho_rspace,&
                       ga_gb_function=ga_gb_function,&
                       lgrid=lgrid,ithread=ithread, &
                       map_consistent=map_consistent,error=error)
                END IF
              ELSE
                IF (iatom <= jatom) THEN
                  CALL collocate_pgf_product_rspace(&
                       la_max(iset),zeta(ipgf,iset),la_min(iset),&
                       lb_max(jset),zetb(jpgf,jset),lb_min(jset),&
                       ra,rab,rab2,scale,pab,na1-1,nb1-1,&
                       rs_rho(igrid_level)%rs_grid,cell,cube_info(igrid_level),&
                       eps_rho_rspace,&
                       ga_gb_function=ga_gb_function,&
                       map_consistent=map_consistent,error=error)
                ELSE
                  CALL collocate_pgf_product_rspace(&
                       lb_max(jset),zetb(jpgf,jset),lb_min(jset),&
                       la_max(iset),zeta(ipgf,iset),la_min(iset),&
                       rb,-rab,rab2,scale,pab,nb1-1,na1-1,&
                       rs_rho(igrid_level)%rs_grid,cell,cube_info(igrid_level),&
                       eps_rho_rspace,&
                       ga_gb_function=ga_gb_function,&
                       map_consistent=map_consistent,error=error)
                END IF
              END IF
            END DO  ! itask

!$OMP end parallel
            IF ( nthread > 1 ) THEN
              n = (ub(1)-lb(1)+1)*(ub(2)-lb(2)+1)
              DO i=1,nthread
!$OMP parallel do &
!$OMP default(none) &
!$OMP private(j,k) &
!$OMP shared(i,lb,ub,lgrid,rs_rho,n,igrid_level)
                DO j=lb(3),ub(3)
                  k = lgrid%ldim*(i-1) + n*(j-lb(3)) + 1
                  CALL daxpy (n,1._dp,lgrid%r(k),1,&
                       rs_rho(igrid_level)%rs_grid%r(lb(1),lb(2),j),1)
                END DO
              END DO
            END IF
          END DO  ! igrid_level

       END DO  ! jat
    END DO  ! iat

    IF ( nthread > 1 ) THEN
      DEALLOCATE (lgrid%r,STAT=istat)
      CPPostconditionNoFail(istat==0,cp_warning_level,routineP,error)
    END IF

    DEALLOCATE (pabt,workt,ntasks,tasks,tasks_local,ival,latom,STAT=istat)
    CPPostconditionNoFail(istat==0,cp_warning_level,routineP,error)

    CALL density_rs2pw(pw_env,rs_rho,rho_r,rho_g,interp_section=interp_section,error=error)

    total_rho = pw_integrate_function(rho_r%pw,isign=-1,error=error)

    CALL timestop(handle)

  END SUBROUTINE calculate_rho_mol

!****f* kg_gpw_collocate_den/integrate_mol_potential

! *****************************************************************************
!> \brief This function is called within a loop over all the molecules
!>      to calculate  integrate a given potential defined on the molecular grid only.
!>      This is used by KG_GPW to calculate the Kinetic energy
!>      correction as sum over the molecular contributions to the kinetic energy calculated
!>      through the kinetic energy functional of the single molecular density,
!>      for each molecule independently.
!> \param qs_env the qs environment
!> \param vxc_mol potential on the radial grid as calculated from the molecular density
!> \param matrix_p global density matrix
!> \param matrix_h global ks matrix
!> \param pw_env pw environment for this molecule kind
!> \param atom global index of the atoms in the molecule (input)
!> \param kind index of the kind of the atoms in the molecule (input)
!> \param ratom internal coordinates of the atoms in the box of the molecule (input)
!> \param forces_mol forces on the atoms of the molecule, coming from this additional potential
!> \param compute_tau logical to control if tau is required
!> \param error variable to control error logging, stopping, see module cp_error_handling
!> \note
!>      In KG_GPW a molecular pw environment is created for each molecule kind
!>      The corresponding molecular cell box  does not use PBC and is smaller than the global box.
!>      The molecule is located in the center of the box and the coordinates passed
!>      to this routine are internal coordinates.
!>      At this level, it should not be necessary to have the atomic positions in the global box
!> \author MI
! *****************************************************************************
  SUBROUTINE integrate_mol_potential(qs_env,vxc_mol,matrix_p_sm,&
                                    matrix_h_sm,pw_env,atom, kind, ratom,&
                                    forces_mol,compute_tau,distribution_2d,error)

    TYPE(qs_environment_type), POINTER       :: qs_env
    TYPE(pw_p_type), INTENT(IN)              :: vxc_mol
    TYPE(real_matrix_p_type), INTENT(IN)     :: matrix_p_sm
    TYPE(real_matrix_p_type), INTENT(INOUT)  :: matrix_h_sm
    TYPE(pw_env_type), POINTER               :: pw_env
    INTEGER, DIMENSION(:), INTENT(IN)        :: atom, kind
    REAL(dp), DIMENSION(:, :), INTENT(IN)    :: ratom
    REAL(dp), DIMENSION(:, :), &
      INTENT(INOUT), OPTIONAL                :: forces_mol
    LOGICAL, INTENT(IN), OPTIONAL            :: compute_tau
    TYPE(distribution_2d_type), POINTER      :: distribution_2d
    TYPE(cp_error_type), INTENT(inout)       :: error

    CHARACTER(LEN=*), PARAMETER :: routineN = 'integrate_mol_potential', &
      routineP = moduleN//':'//routineN

    INTEGER :: bcol, brow, curr_tasks, handle, i, iat, iatom, igrid_level, &
      ijsets, ikind, ikind_old, ipgf, iset, istat, itask, ithread, jat, &
      jatom, jkind, jkind_old, jpgf, jset, maxco, maxsgf, maxsgf_set, n, na1, &
      na2, nat_mol, nb1, nb2, ncoa, ncob, npme, nset_pairs, nseta, nsetb, &
      nthread, num_tasks, sgfa, sgfb
    INTEGER, DIMENSION(:), POINTER           :: la_max, la_min, lb_max, &
                                                lb_min, npgfa, npgfb, nsgfa, &
                                                nsgfb, ntasks
    INTEGER, DIMENSION(:, :), POINTER        :: atasks, first_sgfa, &
                                                first_sgfb, ival, tasks_local
    INTEGER, DIMENSION(:, :, :), POINTER     :: tasks
    LOGICAL                                  :: calculate_forces, failure, &
                                                found, map_consistent, &
                                                my_compute_tau
    REAL(dp)                                 :: dab, eps_gvg_rspace, &
                                                kind_radius_b, rab2, zetp
    REAL(KIND=dp), DIMENSION(3)              :: force_a, force_b, ra, rab, rb
    REAL(KIND=dp), DIMENSION(:), POINTER     :: set_radius_a, set_radius_b
    REAL(KIND=dp), DIMENSION(:, :), POINTER  :: h_block, hab, p_block, pab, &
                                                rpgfa, rpgfb, sphi_a, sphi_b, &
                                                work, zeta, zetb
    REAL(KIND=dp), DIMENSION(:, :, :), &
      POINTER                                :: habt, pabt, workt
    TYPE(atomic_kind_type), DIMENSION(:), &
      POINTER                                :: atomic_kind_set
    TYPE(atomic_kind_type), POINTER          :: atomic_kind
    TYPE(cell_type), POINTER                 :: cell
    TYPE(cp_dbcsr_type), POINTER             :: matrix_h, matrix_p
    TYPE(cube_info_type), DIMENSION(:), &
      POINTER                                :: cube_info
    TYPE(dft_control_type), POINTER          :: dft_control
    TYPE(gridlevel_info_type), POINTER       :: gridlevel_info
    TYPE(gto_basis_set_type), POINTER        :: orb_basis_a, orb_basis_b
    TYPE(realspace_grid_desc_p_type), &
      DIMENSION(:), POINTER                  :: rs_descs
    TYPE(realspace_grid_p_type), &
      DIMENSION(:), POINTER                  :: rs_v
    TYPE(section_vals_type), POINTER         :: input, interp_section

!$  INTEGER                                  :: omp_get_max_threads, omp_get_thread_num

    failure=.FALSE.

    ALLOCATE(matrix_p,matrix_h)!sm->dbcsr
    !CALL cp_dbcsr_init(matrix_p, error)!sm->dbcsr
    !CALL cp_dbcsr_init(matrix_h, error)!sm->dbcsr
    CALL cp_dbcsr_from_sm(matrix_p, matrix_p_sm%matrix, error, distribution_2d)!sm->dbcsr
    CALL cp_dbcsr_from_sm(matrix_h, matrix_h_sm%matrix, error, distribution_2d,&
         mp_obj=dbcsr_distribution_mp (cp_dbcsr_distribution (matrix_p)))!sm->dbcsr
    ! by default, do not compute the kinetic energy density (tau)
    ! if compute_tau, all grids referening to rho are actually tau
    IF (PRESENT(compute_tau)) THEN
       my_compute_tau = compute_tau
    ELSE
       my_compute_tau = .FALSE.
    ENDIF

    IF (my_compute_tau) THEN
       CALL timeset(routineN,handle)
    ELSE
       CALL timeset(routineN,handle)
    ENDIF

    calculate_forces = .FALSE.
    IF(PRESENT(forces_mol)) calculate_forces = .TRUE.

    NULLIFY(atomic_kind_set,dft_control,input)
    CALL get_qs_env(qs_env=qs_env,&
                    atomic_kind_set=atomic_kind_set,&
                    dft_control=dft_control,&
                    input=input,&
                    cell=cell,&
                    error=error)

    NULLIFY(rs_descs)
    CPPrecondition(ASSOCIATED(pw_env),cp_failure_level,routineP,error,failure)
    CALL pw_env_get(pw_env, rs_descs=rs_descs, error=error)
    ALLOCATE (rs_v(SIZE(rs_descs)),STAT=istat)
    IF (istat /= 0) CALL stop_memory(routineP,"rs_v")
    DO i=1, SIZE(rs_v)
      CALL rs_grid_create(rs_v(i)%rs_grid, rs_descs(i)%rs_desc, error=error)
    END DO

    ! *** assign from pw_env for this molecule kind
    NULLIFY(gridlevel_info,cube_info)
    gridlevel_info=>pw_env%gridlevel_info
    cube_info=>pw_env%cube_info

    interp_section => section_vals_get_subs_vals(input,"DFT%MGRID%INTERPOLATOR",&
         error=error)
    CALL potential_pw2rs(rs_v,vxc_mol,pw_env,interp_section,error)

    map_consistent=dft_control%qs_control%map_consistent
    IF (map_consistent) THEN
      eps_gvg_rspace = dft_control%qs_control%eps_rho_rspace ! needs to be consistent with rho_rspace
    ELSE
      eps_gvg_rspace = dft_control%qs_control%eps_gvg_rspace
    ENDIF

    nthread = 1
!$  nthread = omp_get_max_threads()

!   *** Allocate work storage ***

    CALL get_atomic_kind_set(atomic_kind_set=atomic_kind_set,&
                             maxco=maxco,&
                             maxsgf=maxsgf,&
                             maxsgf_set=maxsgf_set)

    NULLIFY ( pabt, habt, workt )
    CALL reallocate(habt,1,maxco,1,maxco,0,nthread)
    CALL reallocate(workt,1,maxco,1,maxsgf_set,0,nthread)
    CALL reallocate(pabt,1,maxco,1,maxco,0,nthread)

    NULLIFY(atasks,ntasks,tasks,tasks_local,ival)

    CALL reallocate(ntasks,1,gridlevel_info%ngrid_levels)
    CALL reallocate(tasks,1,8,1,max_tasks,1,gridlevel_info%ngrid_levels)
    CALL reallocate(tasks_local,1,2,1,max_tasks)
    CALL reallocate(ival,1,6,1,max_tasks)
    CALL reallocate(atasks,1,2,1,max_tasks)
    curr_tasks = max_tasks

    nat_mol = SIZE(atom,1)

    ikind_old = 0
    jkind_old = 0
    DO iat = 1,nat_mol
       ikind = KIND(iat)
       iatom = atom(iat)
       ra(1:3) = ratom(1:3,iat)

       IF(ikind /= ikind_old) THEN
         NULLIFY(atomic_kind,orb_basis_a,la_max,la_min,npgfa,nsgfa,&
                 rpgfa,set_radius_a,sphi_a,zeta)
         atomic_kind => atomic_kind_set(ikind)

         CALL get_atomic_kind(atomic_kind=atomic_kind,&
                              orb_basis_set=orb_basis_a)

         IF (.NOT.ASSOCIATED(orb_basis_a)) CYCLE
         CALL get_gto_basis_set(gto_basis_set=orb_basis_a,&
                                first_sgf=first_sgfa,&
                                lmax=la_max,&
                                lmin=la_min,&
                                npgf=npgfa,&
                                nset=nseta,&
                                nsgf_set=nsgfa,&
                                pgf_radius=rpgfa,&
                                set_radius=set_radius_a,&
                                sphi=sphi_a,&
                                zet=zeta)
         ikind_old = ikind
       END IF
       IF (.NOT.ASSOCIATED(orb_basis_a)) CYCLE

       DO jat = iat, nat_mol
          jkind = KIND(jat)
          jatom = atom(jat)
          rb(1:3) = ratom(1:3,jat)

          IF(jkind /= jkind_old) THEN
            NULLIFY(atomic_kind,orb_basis_b,lb_max,lb_min,npgfb,nsgfb,&
                 rpgfb,set_radius_b,sphi_b,zetb)

            atomic_kind => atomic_kind_set(jkind)

            CALL get_atomic_kind(atomic_kind=atomic_kind,&
                              orb_basis_set=orb_basis_b)

            IF (.NOT.ASSOCIATED(orb_basis_b)) CYCLE

            CALL get_gto_basis_set(gto_basis_set=orb_basis_b,&
                                   first_sgf=first_sgfb,&
                                   kind_radius=kind_radius_b,&
                                   lmax=lb_max,&
                                   lmin=lb_min,&
                                   npgf=npgfb,&
                                   nset=nsetb,&
                                   nsgf_set=nsgfb,&
                                   pgf_radius=rpgfb,&
                                   set_radius=set_radius_b,&
                                   sphi=sphi_b,&
                                   zet=zetb)
            jkind_old = jkind
          END IF
          IF (.NOT.ASSOCIATED(orb_basis_b)) CYCLE

          ntasks = 0
          tasks = 0

          IF (iatom <= jatom) THEN
             brow = iatom
             bcol = jatom
          ELSE
             brow = jatom
             bcol = iatom
          END IF

          NULLIFY(h_block)
          CALL cp_dbcsr_get_block_p(matrix=matrix_h,&
               row=brow,col=bcol,BLOCK=h_block,found=found)
          CPPrecondition(ASSOCIATED(h_block),cp_failure_level,routineP,error,failure)

          NULLIFY(p_block)
          CALL cp_dbcsr_get_block_p(matrix=matrix_p,&
               row=brow,col=bcol,BLOCK=p_block,found=found)
          CPPrecondition(ASSOCIATED(p_block),cp_failure_level,routineP,error,failure)

          rab2 = 0.0_dp
          DO i = 1,3
            rab(i) = rb(i)-ra(i)
            rab2 = rab2 + rab(i)*rab(i)
          END DO
          dab = SQRT(rab2)

          DO iset=1,nseta

            IF (set_radius_a(iset) + kind_radius_b < dab) CYCLE

            DO jset=1,nsetb

              IF (set_radius_a(iset) + set_radius_b(jset) < dab) CYCLE

              DO ipgf=1,npgfa(iset)

                IF (rpgfa(ipgf,iset) + set_radius_b(jset) < dab) CYCLE

                DO jpgf=1,npgfb(jset)

                  IF (rpgfa(ipgf,iset) + rpgfb(jpgf,jset) < dab) CYCLE

                  zetp = zeta(ipgf,iset) + zetb(jpgf,jset)

                  igrid_level = gaussian_gridlevel(gridlevel_info,zetp)

                  ntasks(igrid_level) = ntasks(igrid_level) + 1
                  n = ntasks(igrid_level)
                  IF ( n > curr_tasks ) THEN
                    curr_tasks = curr_tasks*mult_tasks
                    CALL reallocate(tasks,1,8,1,curr_tasks,&
                                    1,gridlevel_info%ngrid_levels)
                  END IF

                  tasks (1,n,igrid_level) = n
                  tasks (3,n,igrid_level) = iatom
                  tasks (4,n,igrid_level) = jatom
                  tasks (5,n,igrid_level) = iset
                  tasks (6,n,igrid_level) = jset
                  tasks (7,n,igrid_level) = ipgf
                  tasks (8,n,igrid_level) = jpgf

                END DO  ! jpgf
              END DO  ! ipgf
            END DO  ! jset
          END DO  ! iset

          DO igrid_level = 1, gridlevel_info%ngrid_levels
            num_tasks = ntasks ( igrid_level )
            IF ( num_tasks  > SIZE ( tasks_local, 2 ) ) &
              CALL reallocate(tasks_local,1,2,1,num_tasks)
            IF ( num_tasks > SIZE ( ival, 2 ) ) &
              CALL reallocate(ival,1,6,1,num_tasks)

!$OMP parallel do private(i)
            DO i=1,num_tasks
              tasks_local(1,i) = tasks(1,i,igrid_level)
              tasks_local(2,i) = tasks(2,i,igrid_level)
              ival(1,i) = tasks(3,i,igrid_level)
              ival(2,i) = tasks(4,i,igrid_level)
              ival(3,i) = tasks(5,i,igrid_level)
              ival(4,i) = tasks(6,i,igrid_level)
              ival(5,i) = tasks(7,i,igrid_level)
              ival(6,i) = tasks(8,i,igrid_level)
            END DO

!$OMP parallel do private(i)
            DO i=num_tasks+1,SIZE(tasks_local,2)
              tasks_local(1,i) = 0
              tasks_local(2,i) = 0
            END DO

! fully replicated grids, each processor can process all its tasks
           ! get number of tasks available locally
            npme = SIZE ( tasks_local, 2 )
            DO i = 1, SIZE ( tasks_local, 2)
              IF ( tasks_local ( 1, i ) <= 0 ) THEN
                 npme = i - 1
                 EXIT
              END IF
            END DO

            CALL  pair_get_loop_vars( npme, ival, nset_pairs, atasks )

!$OMP parallel &
!$OMP default(none) &
!$OMP private(ithread,itask,iset,jset,ncoa,ncob,sgfa,sgfb) &
!$OMP private(work,hab,pab,istat,ipgf,jpgf,na1,na2,nb1,nb2) &
!$OMP private(force_a,force_b,ijsets) &
!$OMP shared(iatom,jatom,ra,rb,rab,rab2,brow,bcol,p_block,h_block) &
!$OMP shared(maxco,maxsgf_set,ival,num_tasks,atasks,calculate_forces) &
!$OMP shared(npgfa,npgfb,ncoset,la_max,lb_max,first_sgfa,first_sgfb) &
!$OMP shared(nsgfa,nsgfb,sphi_a,sphi_b,la_min,lb_min,eps_gvg_rspace,zeta,zetb) &
!$OMP shared(rs_v,igrid_level,cube_info,nthread,nset_pairs) &
!$OMP shared(workt,habt,pabt,my_compute_tau,map_consistent,forces_mol,iat,jat,error) &
!$OMP shared(cell)
            ithread = 0
!$          ithread = omp_get_thread_num()
            hab => habt(:,:,ithread)
            pab => pabt(:,:,ithread)
            work => workt(:,:,ithread)

!$OMP do
            DO ijsets = 1,nset_pairs
              IF (calculate_forces) THEN
                force_a(:) = 0.0_dp
                force_b(:) = 0.0_dp
              END IF

              itask = atasks(1,ijsets)
              iset = ival (3,itask)
              jset = ival (4,itask)
              ncoa = npgfa(iset)*ncoset(la_max(iset))
              sgfa = first_sgfa(1,iset)
              ncob = npgfb(jset)*ncoset(lb_max(jset))
              sgfb = first_sgfb(1,jset)

              IF(calculate_forces) THEN
                IF (iatom <= jatom) THEN
                   CALL dgemm("N","N",ncoa,nsgfb(jset),nsgfa(iset),&
                              1.0_dp,sphi_a(1,sgfa),SIZE(sphi_a,1),&
                              p_block(sgfa,sgfb),SIZE(p_block,1),&
                              0.0_dp,work(1,1),SIZE(work,1))
                   CALL dgemm("N","T",ncoa,ncob,nsgfb(jset),&
                              1.0_dp,work(1,1),SIZE(work,1),&
                              sphi_b(1,sgfb),SIZE(sphi_b,1),&
                              0.0_dp,pab(1,1),SIZE(pab,1))
                ELSE
                   CALL dgemm("N","N",ncob,nsgfa(iset),nsgfb(jset),&
                              1.0_dp,sphi_b(1,sgfb),SIZE(sphi_b,1),&
                              p_block(sgfb,sgfa),SIZE(p_block,1),&
                              0.0_dp,work(1,1),SIZE(work,1))
                   CALL dgemm("N","T",ncob,ncoa,nsgfa(iset),&
                              1.0_dp,work(1,1),SIZE(work,1),&
                              sphi_a(1,sgfa),SIZE(sphi_a,1),&
                              0.0_dp,pab(1,1),SIZE(pab,1))
               END IF
              ELSE
                pab = 0._dp
              END IF
              hab = 0._dp
              DO itask = atasks(1,ijsets),atasks(2,ijsets)
                ipgf   = ival (5,itask)
                jpgf   = ival (6,itask)
                na1 = (ipgf - 1)*ncoset(la_max(iset)) + 1
                na2 = ipgf*ncoset(la_max(iset))
                nb1 = (jpgf - 1)*ncoset(lb_max(jset)) + 1
                nb2 = jpgf*ncoset(lb_max(jset))

                IF(calculate_forces) THEN
                  IF (iatom <= jatom) THEN
                     CALL integrate_pgf_product_rspace(&
                          la_max(iset),zeta(ipgf,iset),la_min(iset),&
                          lb_max(jset),zetb(jpgf,jset),lb_min(jset),&
                          ra,rab,rab2,rs_v(igrid_level)%rs_grid,cell,&
                          cube_info(igrid_level),&
                          hab,pab=pab,o1=na1-1,o2=nb1-1, &
                          eps_gvg_rspace=eps_gvg_rspace,&
                          calculate_forces=.TRUE.,&
                          force_a=force_a,force_b=force_b,ithread=ithread,&
                          compute_tau=my_compute_tau,map_consistent=map_consistent,error=error)
                   ELSE
                     CALL integrate_pgf_product_rspace(&
                          lb_max(jset),zetb(jpgf,jset),lb_min(jset),&
                          la_max(iset),zeta(ipgf,iset),la_min(iset),&
                          rb,-rab,rab2,rs_v(igrid_level)%rs_grid,cell,&
                          cube_info(igrid_level),&
                          hab,pab=pab,o1=nb1-1,o2=na1-1, &
                          eps_gvg_rspace=eps_gvg_rspace,&
                          calculate_forces=.TRUE.,&
                          force_a=force_b,force_b=force_a,ithread=ithread,&
                          compute_tau=my_compute_tau,map_consistent=map_consistent,error=error)
                   END IF
                ELSE
                   IF (iatom <= jatom) THEN
                     CALL integrate_pgf_product_rspace(&
                          la_max(iset),zeta(ipgf,iset),la_min(iset),&
                          lb_max(jset),zetb(jpgf,jset),lb_min(jset),&
                          ra,rab,rab2,rs_v(igrid_level)%rs_grid,cell,&
                          cube_info(igrid_level),&
                          hab,o1=na1-1,o2=nb1-1,&
                          eps_gvg_rspace=eps_gvg_rspace,&
                          calculate_forces=.FALSE.,&
                          ithread=ithread,&
                          compute_tau=my_compute_tau,map_consistent=map_consistent,error=error)
                   ELSE
                     CALL integrate_pgf_product_rspace(&
                          lb_max(jset),zetb(jpgf,jset),lb_min(jset),&
                          la_max(iset),zeta(ipgf,iset),la_min(iset),&
                          rb,-rab,rab2,rs_v(igrid_level)%rs_grid,cell,&
                          cube_info(igrid_level),&
                          hab,o1=nb1-1,o2=na1-1,&
                          eps_gvg_rspace=eps_gvg_rspace,&
                          calculate_forces=.FALSE.,&
                          ithread=ithread, &
                          compute_tau=my_compute_tau,map_consistent=map_consistent,error=error)
                   END IF
                END IF
              END DO ! itask

              IF (iatom <= jatom) THEN
                CALL dgemm("N","N",ncoa,nsgfb(jset),ncob,&
                           1.0_dp,hab(1,1),SIZE(hab,1),&
                           sphi_b(1,sgfb),SIZE(sphi_b,1),&
                           0.0_dp,work(1,1),SIZE(work,1))
                CALL dgemm("T","N",nsgfa(iset),nsgfb(jset),ncoa,&
                           -1.0_dp,sphi_a(1,sgfa),SIZE(sphi_a,1),&
                           work(1,1),SIZE(work,1),&
                           1.0_dp,h_block(sgfa,sgfb),SIZE(h_block,1))
              ELSE
                CALL dgemm("N","N",ncob,nsgfa(iset),ncoa,&
                           1.0_dp,hab(1,1),SIZE(hab,1),&
                           sphi_a(1,sgfa),SIZE(sphi_a,1),&
                           0.0_dp,work(1,1),SIZE(work,1))
                CALL dgemm("T","N",nsgfb(jset),nsgfa(iset),ncob,&
                           -1.0_dp,sphi_b(1,sgfb),SIZE(sphi_b,1),&
                           work(1,1),SIZE(work,1),&
                           1.0_dp,h_block(sgfb,sgfa),SIZE(h_block,1))
              END IF

              IF (calculate_forces) THEN
!$OMP critical (qs_integrate_force)
                 forces_mol(:,iat)= forces_mol(:,iat)+ 2.0_dp*force_a(:)
                 IF (iatom /= jatom) THEN
                    forces_mol(:,jat) = forces_mol(:,jat) + 2.0_dp*force_b(:)
                 END IF
!$OMP end critical (qs_integrate_force)
              END IF

            END DO  ! ijsets

!$OMP end parallel

          END DO  ! igrid_level
       END DO  ! jat
    END DO  ! iat

    DEALLOCATE (habt,workt,ntasks,tasks,tasks_local,ival,&
                pabt,atasks,STAT=istat)
    CPPostconditionNoFail(istat==0,cp_warning_level,routineP,error)

    IF (ASSOCIATED(rs_v)) THEN
      DO igrid_level = 1, gridlevel_info%ngrid_levels
        CALL rs_grid_release(rs_v(igrid_level)%rs_grid, error=error)
      END DO
      DEALLOCATE(rs_v, STAT=istat)
      CPPostconditionNoFail(istat==0,cp_warning_level,routineP,error)
    END IF

    CALL sm_from_dbcsr ( matrix_h_sm%matrix, matrix_h, distribution_2d, error=error )!sm->dbcsr
    CALL cp_dbcsr_deallocate_matrix ( matrix_h, error=error )!sm->dbcsr
    CALL cp_dbcsr_deallocate_matrix ( matrix_p, error=error )!sm->dbcsr

    CALL timestop(handle)

  END SUBROUTINE integrate_mol_potential

! *****************************************************************************
  SUBROUTINE pair_get_loop_vars ( npme, ival, nset_pairs, atasks )

    INTEGER, INTENT(IN)                      :: npme
    INTEGER, DIMENSION(6, npme), INTENT(IN)  :: ival
    INTEGER, INTENT(OUT)                     :: nset_pairs
    INTEGER, DIMENSION(:, :), POINTER        :: atasks

    INTEGER                                  :: iset, iset_old, itask, jset, &
                                                jset_old

     IF(SIZE(atasks,2) < npme) CALL reallocate(atasks,1,2,1,npme)

     nset_pairs = 0
     iset_old = 0
     jset_old = 0
     DO itask = 1,npme
       iset = ival(3,itask)
       jset = ival(4,itask)
       IF ( iset /= iset_old .OR. jset /= jset_old ) THEN
         IF(nset_pairs>0) atasks(2,nset_pairs) = itask - 1
         nset_pairs = nset_pairs + 1
         atasks(1,nset_pairs) = itask
         iset_old = iset
         jset_old = jset
        END IF
     END DO  ! itask
     IF(nset_pairs>0) atasks(2,nset_pairs) = npme

  END SUBROUTINE pair_get_loop_vars

END MODULE kg_gpw_collocate_den
