Commit f263758b authored by Claus Kleinwort's avatar Claus Kleinwort
Browse files

Implementation of parameter groups, sparse similarity operations

git-svn-id: http://svnsrv.desy.de/public/MillepedeII/trunk@187 3547b9b0-65b8-46d3-b95d-921b3f43af62
parent 17e0d7fc
......@@ -5,7 +5,7 @@
!! \author Claus Kleinwort, DESY (maintenance and developement)
!!
!! \copyright
!! Copyright (c) 2009 - 2019 Deutsches Elektronen-Synchroton,
!! Copyright (c) 2009 - 2020 Deutsches Elektronen-Synchroton,
!! Member of the Helmholtz Association, (DESY), HAMBURG, GERMANY \n\n
!! This library is free software; you can redistribute it and/or modify
!! it under the terms of the GNU Library General Public License as
......@@ -38,17 +38,14 @@ MODULE mpbits
INTEGER(mpl) :: ndimb !< dimension for bit (field) array
INTEGER(mpl) :: ndimb2 !< dimension for bit map
INTEGER(mpi) :: n !< matrix size (counters)
INTEGER(mpi) :: n !< matrix size (counters, sparse)
INTEGER(mpi) :: n2 !< matrix size (map)
INTEGER(mpi) :: ibfw !< bit field width
INTEGER(mpi) :: ireqpe !< min number of pair entries
INTEGER(mpi) :: isngpe !< upper bound for pair entry single precision storage
INTEGER(mpi) :: icmprs !< compression flag for sparsity (column indices)
INTEGER(mpi) :: iextnd !< flag for extended storage (both 'halves' of sym. mat. for improved access patterns)
INTEGER(mpi) :: nspc !< number of precision for sparse global matrix (1=D, 2=D+f)
INTEGER(mpi) :: mxcnt !< max value for bit field counters
INTEGER(mpi) :: nencdm !< max value for column counter
INTEGER(mpi) :: nencdb !< number of bits for encoding column counter
INTEGER(mpi) :: nthrd !< number of threads
INTEGER(mpi), DIMENSION(:), ALLOCATABLE :: bitFieldCounters !< fit field counters for global parameters pairs (tracks)
INTEGER(mpi), DIMENSION(:), ALLOCATABLE :: bitMap !< fit field map for global parameters pairs (measurements)
......@@ -64,30 +61,29 @@ END MODULE mpbits
!!
SUBROUTINE inbits(im,jm,inc) ! include element (I,J)
USE mpbits
IMPLICIT NONE
INTEGER(mpi), INTENT(IN) :: im
INTEGER(mpi), INTENT(IN) :: jm
INTEGER(mpi), INTENT(IN) :: inc
INTEGER(mpl) :: l
INTEGER(mpl) :: ll
INTEGER(mpi) :: i
INTEGER(mpi) :: j
INTEGER(mpi) :: noffj
INTEGER(mpi) :: nout
INTEGER(mpi) :: m
INTEGER(mpi) :: mm
INTEGER(mpi) :: icount
INTEGER(mpi) :: ib
INTEGER(mpi) :: jcount
INTEGER(mpl) :: noffi
LOGICAL :: btest
IF(im == jm) RETURN ! diagonal
! diagonal included now !
!IF(im == jm) RETURN ! diagonal
j=MIN(im,jm)
i=MAX(im,jm)
IF(j <= 0) RETURN ! out low
IF(i > n) RETURN ! out high
noffi=INT(i-1,mpl)*INT(i-2,mpl)*INT(ibfw,mpl)/2 ! for J=1
noffi=INT(i-1,mpl)*INT(i,mpl)*INT(ibfw,mpl)/2 ! for J=1
noffj=(j-1)*ibfw
l=noffi/bs+i+noffj/bs ! row offset + column offset
! add I instead of 1 to keep bit maps of different rows in different words (openMP !)
......@@ -96,36 +92,29 @@ SUBROUTINE inbits(im,jm,inc) ! include element (I,J)
bitFieldCounters(l)=ibset(bitFieldCounters(l),m)
ELSE
! get counter from bit field
ll=l
mm=m
icount=0
DO ib=0,ibfw-1
IF (btest(bitFieldCounters(ll),mm)) icount=ibset(icount,ib)
mm=mm+1
IF (mm >= bs) THEN
ll=ll+1
mm=mm-bs
END IF
END DO
nout=m+ibfw-bs ! number of bits outside word
IF (nout <= 0) THEN
! inside single word
CALL mvbits(bitFieldCounters(l),m,ibfw,icount,0)
ELSE
! spread over two words
CALL mvbits(bitFieldCounters(l),m,ibfw-nout,icount,0)
CALL mvbits(bitFieldCounters(l+1),0,nout,icount,ibfw-nout)
ENDIF
! increment
jcount=icount
icount=MIN(icount+inc,mxcnt)
! store counter into bit field
IF (icount /= jcount) THEN
ll=l
mm=m
DO ib=0,ibfw-1
IF (btest(icount,ib)) THEN
bitFieldCounters(ll)=ibset(bitFieldCounters(ll),mm)
ELSE
bitFieldCounters(ll)=ibclr(bitFieldCounters(ll),mm)
END IF
mm=mm+1
IF (mm >= bs) THEN
ll=ll+1
mm=mm-bs
END IF
END DO
IF (nout <= 0) THEN
! inside single word
CALL mvbits(icount,0,ibfw,bitFieldCounters(l),m)
ELSE
! spread over two words
CALL mvbits(icount,0,ibfw-nout,bitFieldCounters(l),m)
CALL mvbits(icount,ibfw-nout,nout,bitFieldCounters(l+1),0)
ENDIF
END IF
END IF
RETURN
......@@ -138,37 +127,32 @@ END SUBROUTINE inbits
!! \param [in] jreqpe min number of pair entries
!! \param [in] jhispe mupper bound for pair entry histogrammimg
!! \param [in] jsngpe upper bound for pair entry single precision storage
!! \param [in] jcmprs compression flag for sparsity (column indices)
!! \param [in] jextnd flag for extended storage
!! \param [out] idimb dimension for bit (field) array
!! \param [out] iencdb number of bits for encoding column counter
!! \param [out] ispc number of precision for sparse global matrix
!!
SUBROUTINE clbits(in,jreqpe,jhispe,jsngpe,jcmprs,jextnd,idimb,iencdb,ispc)
SUBROUTINE clbits(in,jreqpe,jhispe,jsngpe,jextnd,idimb,ispc)
USE mpbits
USE mpdalc
IMPLICIT NONE
INTEGER(mpi), INTENT(IN) :: in
INTEGER(mpi), INTENT(IN) :: jreqpe
INTEGER(mpi), INTENT(IN) :: jhispe
INTEGER(mpi), INTENT(IN) :: jsngpe
INTEGER(mpi), INTENT(IN) :: jcmprs
INTEGER(mpi), INTENT(IN) :: jextnd
INTEGER(mpl), INTENT(OUT) :: idimb
INTEGER(mpi), INTENT(OUT) :: iencdb
INTEGER(mpi), INTENT(OUT) :: ispc
INTEGER(mpl) :: noffd
INTEGER(mpi) :: i
INTEGER(mpi) :: icount
INTEGER(mpi) :: mb
INTEGER(mpi) :: nbcol
!$ INTEGER(mpi) :: OMP_GET_MAX_THREADS
! save input parameter
n=in
ireqpe=jreqpe
isngpe=jsngpe
icmprs=jcmprs+jextnd ! enforce compression for extended storage
iextnd=jextnd
! number of precision types (D, F)
ispc=1
......@@ -186,7 +170,7 @@ SUBROUTINE clbits(in,jreqpe,jhispe,jsngpe,jcmprs,jextnd,idimb,iencdb,ispc)
END IF
END DO
! bit field array size
noffd=INT(n,mpl)*INT(n-1,mpl)*INT(ibfw,mpl)/2
noffd=INT(n,mpl)*INT(n+1,mpl)*INT(ibfw,mpl)/2
ndimb=noffd/bs+n
idimb=ndimb
mb=INT(4.0E-6*REAL(ndimb,mps),mpi)
......@@ -200,14 +184,6 @@ SUBROUTINE clbits(in,jreqpe,jhispe,jsngpe,jcmprs,jextnd,idimb,iencdb,ispc)
END IF
CALL mpalloc(bitFieldCounters,ndimb,'INBITS: bit storage')
bitFieldCounters=0
! encoding for compression
nbcol=bs/2 ! one half of the bits for column number, other for column counter
DO i=bs/2,bs-2
IF (btest(n,i)) nbcol=i+1 ! more bits for column number
END DO
nencdb=bs-nbcol
iencdb=nencdb
nencdm=ishft(1,nencdb)-1
nthrd=1
!$ NTHRD=OMP_GET_MAX_THREADS()
RETURN
......@@ -215,17 +191,18 @@ END SUBROUTINE clbits
!> Analyze bit fields.
!!
!! \param [out] ndims (1): (reduced) size of bit array; (2): size of column lists;
!! (3/4): number of (double/single precision) off diagonal elements;
!! \param[out] ncmprs compression info (per row)
!! \param [in] npgrp parameter groups
!! \param [out] ndims (1): (reduced) size of bit array; (2): size of column lists;
!! (3/4): number of (double/single precision) off diagonal elements;
!! \param[out] nsparr row offsets
!! \param[in] ihst >0: histogram number
!!
SUBROUTINE ndbits(ndims,ncmprs,nsparr,ihst)
SUBROUTINE ndbits(npgrp,ndims,nsparr,ihst)
USE mpbits
IMPLICIT NONE
INTEGER(mpi), DIMENSION(:), INTENT(IN) :: npgrp
INTEGER(mpl), DIMENSION(4), INTENT(OUT) :: ndims
INTEGER(mpi), DIMENSION(:), INTENT(OUT) :: ncmprs
INTEGER(mpl), DIMENSION(:,:), INTENT(OUT) :: nsparr
INTEGER(mpi), INTENT(IN) :: ihst
......@@ -235,33 +212,35 @@ SUBROUTINE ndbits(ndims,ncmprs,nsparr,ihst)
INTEGER(mpi) :: ichunk
INTEGER(mpi) :: i
INTEGER(mpi) :: j
INTEGER(mpi) :: m
INTEGER(mpi) :: last
INTEGER(mpi) :: lrgn
INTEGER(mpi) :: next
INTEGER(mpi) :: icp
INTEGER(mpi) :: mm
INTEGER(mpi) :: jp
INTEGER(mpi) :: nj
INTEGER(mpi) :: n1
INTEGER(mpi) :: nd
INTEGER(mpi) :: ib
INTEGER(mpi) :: ir
INTEGER(mpi) :: icount
INTEGER(mpi) :: iproc
INTEGER(mpi) :: iword
INTEGER(mpi) :: k
INTEGER(mpi) :: mb
INTEGER(mpi) :: n1
INTEGER(mpl) :: ll
INTEGER(mpl) :: lb
INTEGER(mpl) :: nin
INTEGER(mpl) :: npar
INTEGER(mpl) :: ntot
INTEGER(mpl) :: noffi
INTEGER(mpl) :: noffj
REAL(mps) :: cpr
REAL(mps) :: fracu
REAL(mps) :: fracz
LOGICAL :: btest
!$ INTEGER(mpi) :: OMP_GET_THREAD_NUM
nd=npgrp(n+1)-npgrp(1) ! number of diagonal elements
ndims(1)=ndimb
ndims(2)=0
ndims(3)=0
......@@ -270,222 +249,195 @@ SUBROUTINE ndbits(ndims,ncmprs,nsparr,ihst)
ll=0
lb=0
ichunk=MIN((n+nthrd-1)/nthrd/32+1,256)
IF (ibfw > 1.OR.icmprs > 0) THEN
! reduce bit field counters to (precision type) bits, analyze precision type bit fields ('1st half' (j<i))
! parallelize row loop
! private copy of NTOT for each thread, combined at end, init with 0.
!$OMP PARALLEL DO &
!$OMP PRIVATE(I,NOFFI,LL,MM,LB,MB,IWORD,IPROC,J,ICOUNT,IB,INR,IRGN,LAST,LRGN,NEXT,JP,IR) &
!$OMP REDUCTION(+:NTOT) &
!$OMP SCHEDULE(DYNAMIC,ICHUNK)
DO i=1,n
noffi=INT(i-1,mpl)*INT(i-2,mpl)*INT(ibfw,mpl)/2
ll=noffi/bs+i
mm=0
lb=ll
mb=0
iword=0 ! temporary bit fields
iproc=0
!$ IPROC=OMP_GET_THREAD_NUM() ! thread number
inr(1)=0
inr(2)=0
irgn(1)=0
irgn(2)=0
last=0
lrgn=0
DO j=1,i-1
! get (pair) counter
icount=0
next=0
DO ib=0,ibfw-1
IF (btest(bitFieldCounters(ll),mm)) icount=ibset(icount,ib)
mm=mm+1
IF (mm >= bs) THEN
ll=ll+1
mm=mm-bs
END IF
END DO
! reduce bit field counters to (precision type) bits, analyze precision type bit fields ('1st half' (j<=i))
! parallelize row loop
! private copy of NTOT for each thread, combined at end, init with 0.
!$OMP PARALLEL DO &
!$OMP PRIVATE(I,NOFFI,LL,MM,LB,MB,IWORD,IPROC,J,ICOUNT,IB,INR,IRGN,LAST,LRGN,NEXT,JP,IR,NPAR) &
!$OMP REDUCTION(+:NTOT) &
!$OMP SCHEDULE(DYNAMIC,ICHUNK)
DO i=1,n
noffi=INT(i-1,mpl)*INT(i,mpl)*INT(ibfw,mpl)/2
ll=noffi/bs+i
mm=0
lb=ll
mb=0
iword=0 ! reset temporary bit fields
iproc=0
!$ IPROC=OMP_GET_THREAD_NUM() ! thread number
inr(1)=0
inr(2)=0
irgn(1)=1 ! 'end marker' region
irgn(2)=1
last=0
lrgn=0
npar=0
IF (icount > 0) THEN
ntot=ntot+1
IF (iproc == 0.AND.ihst > 0) CALL hmpent(ihst,REAL(icount,mps))
DO j=1,i ! loop until diagonal element
! get (pair) counter
icount=0
next=0
DO ib=0,ibfw-1
IF (btest(bitFieldCounters(ll),mm)) icount=ibset(icount,ib)
mm=mm+1
IF (mm >= bs) THEN
ll=ll+1
mm=mm-bs
END IF
END DO
! keep pair ?
IF (icount >= ireqpe) THEN
next=1 ! double
IF (icount <= isngpe) next=2 ! single
iword=ibset(iword,mb+next-1)
inr(next)=inr(next)+1
IF (next /= last.OR.lrgn >= nencdm) THEN
irgn(next)=irgn(next)+1
lrgn=0
END IF
lrgn=lrgn+1
END IF
last=next
IF (icount > 0) THEN
npar=npar+npgrp(j+1)-npgrp(j)
IF (iproc == 0.AND.ihst > 0) CALL hmpent(ihst,REAL(icount,mps))
END IF
mb=mb+nspc
IF (mb >= bs) THEN
bitFieldCounters(lb)=iword ! store
iword=0
lb=lb+1
mb=mb-bs
! keep pair ?
IF (icount >= ireqpe) THEN
next=1 ! double
IF (icount <= isngpe) next=2 ! single
iword=ibset(iword,mb+next-1)
inr(next)=inr(next)+npgrp(j+1)-npgrp(j) ! number of parameters
IF (next /= last) THEN
irgn(next)=irgn(next)+1
END IF
END DO
bitFieldCounters(lb)=iword ! store
! save row statistics
ir=i+1
DO jp=1,nspc
nsparr(1,ir)=irgn(jp) ! number of regions per row and precision
nsparr(2,ir)=inr(jp) ! number of columns per row and precision
ir=ir+n+1
END DO
END IF
last=next
! save condensed bitfield
mb=mb+nspc
IF (mb >= bs) THEN
bitFieldCounters(lb)=iword ! store
iword=0
lb=lb+1
mb=mb-bs
END IF
END DO
bitFieldCounters(lb)=iword ! store
ntot=ntot+npar*(npgrp(i+1)-npgrp(i))
! save row statistics
ir=i+1
DO jp=1,nspc
nsparr(1,ir)=irgn(jp) ! number of regions per row and precision
nsparr(2,ir)=inr(jp) ! number of columns per row and precision (groups)
ir=ir+n+1
END DO
END DO
!$OMP END PARALLEL DO
! analyze precision type bit fields for extended storage, check for row compression
! analyze precision type bit fields for extended storage, check for row compression
! parallelize row loop
! private copy of NDIMS for each thread, combined at end, init with 0.
!$OMP PARALLEL DO &
!$OMP PRIVATE(I,NOFFI,NOFFJ,LL,MM,INR,IRGN,LAST,LRGN,J,NEXT,ICP,NWCP,JP,IR,IB) &
!$OMP REDUCTION(+:NDIMS) &
!$OMP SCHEDULE(DYNAMIC,ICHUNK)
DO i=1,n
! restore row statistics
irgn(1)=INT(nsparr(1,i+1),mpi)
irgn(2)=INT(nsparr(1,i+n+2),mpi)
inr(1)=INT(nsparr(2,i+1),mpi)
inr(2)=INT(nsparr(2,i+n+2),mpi)
! parallelize row loop
! private copy of NDIMS for each thread, combined at end, init with 0.
!$OMP PARALLEL DO &
!$OMP PRIVATE(I,NOFFI,NOFFJ,LL,MM,INR,IRGN,LAST,LRGN,J,NEXT,ICP,NWCP,JP,IR,IB) &
!$OMP REDUCTION(+:NDIMS) &
!$OMP SCHEDULE(DYNAMIC,ICHUNK)
DO i=1,n
! restore row statistics
irgn(1)=INT(nsparr(1,i+1),mpi)
irgn(2)=INT(nsparr(1,i+n+2),mpi)
inr(1)=INT(nsparr(2,i+1),mpi)
inr(2)=INT(nsparr(2,i+n+2),mpi)
! analyze precision type bit fields for extended storage ('2nd half' (j>i) too) ?
IF (iextnd > 0) THEN
! analyze precision type bit fields for extended storage ('2nd half' (j>i) too) ?
IF (iextnd > 0) THEN
noffj=(i-1)*nspc
mm=MOD(noffj,bs)
noffj=(i-1)*nspc
mm=INT(MOD(noffj,bs),mpi)
last=0
lrgn=0
last=0
lrgn=0
! remaining columns
DO j=i+1, n
! index for pair (J,I)
noffi=INT(j-1,mpl)*INT(j-2,mpl)*INT(ibfw,mpl)/2 ! for I=1
ll=noffi/bs+j+noffj/bs ! row offset + column offset
! remaining columns
DO j=i+1, n
! index for pair (J,I)
noffi=INT(j-1,mpl)*INT(j,mpl)*INT(ibfw,mpl)/2 ! for I=1
ll=noffi/bs+j+noffj/bs ! row offset + column offset
! get precision type
next=0
DO ib=0,nspc-1
IF (btest(bitFieldCounters(ll),mm+ib)) next=ibset(next,ib)
END DO
! keep pair ?
IF (next > 0) THEN
inr(next)=inr(next)+1
IF (next /= last.OR.lrgn >= nencdm) THEN
irgn(next)=irgn(next)+1
lrgn=0
END IF
lrgn=lrgn+1
END IF
last=next
! get precision type
next=0
DO ib=0,nspc-1
IF (btest(bitFieldCounters(ll),mm+ib)) next=ibset(next,ib)
END DO
END IF
! row statistics, compression
ir=i+1
DO jp=1,nspc
icp=0
nwcp(0)=inr(jp) ! list of column indices (default)
IF (inr(jp) > 0) THEN
nwcp(1)=irgn(jp)+(irgn(jp)+7)/8 ! list of regions of consecutive columns (and group offsets)
! compress row ?
IF ((nwcp(1) < nwcp(0).AND.icmprs > 0).OR.iextnd > 0) THEN
icp=1
ncmprs(i+n*(jp-1))=irgn(jp) ! number of regions per row and precision
! keep pair ?
IF (next > 0) THEN
inr(next)=inr(next)+npgrp(j+1)-npgrp(j) ! number of parameters
IF (next /= last) THEN
irgn(next)=irgn(next)+1
END IF
! total space
ndims(2) =ndims(2) +nwcp(icp)
ndims(jp+2)=ndims(jp+2)+nwcp(0)
END IF
! per row and precision
nsparr(1,ir)=nwcp(icp)
nsparr(2,ir)=nwcp(0)
ir=ir+n+1
last=next
END DO
END DO
!$OMP END PARALLEL DO
END IF
! sum up, fill row offsets
lb=1
n1=0
ll=n+1
! row statistics, compression
ir=i+1
DO jp=1,nspc
DO i=1,n
n1=n1+1
nsparr(1,n1)=lb
nsparr(2,n1)=ll
lb=lb+nsparr(1,n1+1)
ll=ll+nsparr(2,n1+1)
END DO
n1=n1+1
nsparr(1,n1)=lb
nsparr(2,n1)=ll
ll=1
icp=0
nwcp(0)=inr(jp) ! list of column indices (default)
IF (inr(jp) > 0) THEN
nwcp(1)=irgn(jp)*2 ! list of regions (group starts and offsets)
! compress row ?
IF ((nwcp(1) < nwcp(0)).OR.iextnd > 0) THEN
icp=1
END IF
! total space
ndims(2) =ndims(2) +nwcp(icp)
ndims(jp+2)=ndims(jp+2)+nwcp(0)*(npgrp(i+1)-npgrp(i))
END IF
! per row and precision
nsparr(1,ir)=nwcp(icp)
nsparr(2,ir)=nwcp(0)*(npgrp(i+1)-npgrp(i))
ir=ir+n+1
END DO
END DO
!$OMP END PARALLEL DO
ELSE
nin=0
nsparr(1,1)=1
nsparr(2,1)=n+1
n1=1
! sum up, fill row offsets
lb=0
n1=0
ll=nd
DO jp=1,nspc
DO i=1,n
noffi=INT(i-1,mpl)*INT(i-2,mpl)/2
ll=noffi/bs+i
nj=(i-1)/bs
DO k=0,nj
DO m=0,bs-1
IF(btest(bitFieldCounters(ll+k),m)) nin=nin+1
END DO
END DO
n1=n1+1
nsparr(1,n1)=nsparr(1,1)+nin
nsparr(2,n1)=nsparr(2,1)+nin
nsparr(1,n1)=lb
nsparr(2,n1)=ll
lb=lb+nsparr(1,n1+1)
ll=ll+nsparr(2,n1+1)
END DO
ndims(2)=nin
ndims(3)=nin
ntot=nin
END IF
n1=n1+1
nsparr(1,n1)=lb
nsparr(2,n1)=ll
ll=0
END DO
nin=ndims(3)+ndims(4)
fracz=200.0*REAL(ntot,mps)/REAL(n,mps)/REAL(n-1,mps)
fracu=200.0*REAL(nin,mps)/REAL(n,mps)/REAL(n-1,mps)
fracz=200.0*REAL(ntot,mps)/REAL(nd,mps)/REAL(nd-1,mps)
fracu=200.0*REAL(nin,mps)/REAL(nd,mps)/REAL(nd-1,mps)
WRITE(*,*) ' '
WRITE(*,*) 'NDBITS: number of diagonal elements',n
WRITE(*,*) 'NDBITS: number of diagonal elements',nd
WRITE(*,*) 'NDBITS: number of used off-diagonal elements',nin
WRITE(*,1000) 'fraction of non-zero off-diagonal elements', fracz
WRITE(*,1000) 'fraction of used off-diagonal elements', fracu
IF (icmprs /= 0) THEN
cpr=100.0*REAL(mpi*ndims(2)+mpd*ndims(3)+mps*ndims(4),mps)/REAL((mpd+mpi)*nin,mps)
WRITE(*,1000) 'compression ratio for off-diagonal elements', cpr
END IF
cpr=100.0*REAL(mpi*ndims(2)+mpd*ndims(3)+mps*ndims(4),mps)/REAL((mpd+mpi)*nin,mps)
WRITE(*,1000) 'compression ratio for off-diagonal elements', cpr
1000 FORMAT(' NDBITS: ',a,f6.2,' %')