pro dhwaveiter, lucy_image, B, raw_image, gain, con_var, fft_psf, niter, chimin, $
    threshold=threshold, ndamp=ndamp, speedup=speedup, init=init, dotv=dotv
;+
; NAME:
;   DHWAVEITER
; PURPOSE:
;   Execute multiple iterations of Lucy's method with hfilter wavelet
;   filtering of residuals to reduce noise amplification.  This version
;   solves for the model (lucy_image) on an oversampled grid.  It also is
;   modified to include the constant readout noise in the iteration ala
;   Snyder.  This version speeds convergence using a conjugate-gradient
;   approach.
; CALLING SEQUENCE:
;   dhwaveiter, lucy_image, B, raw_image, gain, con_var, fft_psf, niter $
;       [, chimin], [keywords...]
; INPUTS:
;   LUCY_IMAGE  Current version of image (will be changed by iterations)
;               (at least B * raw image size)
;   B           Oversampling factor
;   RAW_IMAGE   Original image
;   GAIN        Gain in electrons per DN (7.5 for WFPC).  This can be an array
;               of the same size as the RAW_IMAGE if the gain is variable (e.g.
;               due to a strongly variable flat-field or to different numbers of
;               exposures.  Bad pixels can be eliminated by setting gain=0 for those
;               pixels.
;   CON_VAR     Readout noise factor
;               = readout_noise^2/gain if readout_noise is in electrons, or
;               = readout_noise^2*gain if readout_noise is in DN
;               This can also be an array.  If a sky (constant or image) has been
;               subtracted, this should be = sky + con_var
;   FFT_PSF     Complex FFT of point spread function (same size as lucy_image)
;   NITER       Number of iterations to do
; OPTIONAL (KEYWORD) INPUT PARAMETERS:
;   CHIMIN      Minimum value of chi-square (0 by default, i.e. just do niter
;               iterations.  I rarely stop at chisq=1.)
;   THRESHOLD   Threshold for noise damping.  Roughly corresponds to
;               difference in sigma where damping turns on.  Default value
;               is 3 sigma.  threshold=0 gives standard Lucy method.
;               Note that this is sqrt(threshold) for the old dampiter.pro.
;   NDAMP       Determines how sharply damping sets in.  Larger numbers
;               produce flatter likelihood function (more damping).
;               Default is ndamp=10.  This parameter probably does not
;               need to be changed.  It only affects the acceleration.
;   SPEEDUP     Speedup method to use:
;                  speedup <= 0 -> no speedup (standard Lucy iteration)
;                  speedup  = 1 -> line-search (Hook & Lucy acceleration)
;                  speedup >= 2 -> conjugate gradient (Turbo Lucy)
;               Default is speedup = 2.
;   INIT        Set to non-zero value if this is first call.  Performs
;               extra initialization when speedup=2.
;   DOTV        If set, displays results of each iteration.  Default is DOTV=1;
;               use DOTV=0 to turn off display.
; OUTPUTS:
;   LUCY_IMAGE is modified on output.
; COMMON BLOCKS:
;   None.
; SIDE EFFECTS:
;   None.
; RESTRICTIONS:
;   LUCY_IMAGE and FFT_PSF must be the same size and must be at least B times
;   bigger than RAW_IMAGE.  To avoid wraparound edge effects due to the use of
;   FFTs for convolution, they ought to be bigger than RAW_IMAGE by about the
;   size of the non-zero extent of the PSF.  For efficiency in the FFTs, their
;   sizes should be products of powers of small primes (powers of 2 and 3 are best.)
; PROCEDURE:
;   The standard Lucy iteration is performed, but the differences between the
;   data and model are denoised using wavelet filtering (HFILTER) as suggested
;   by Starck & Murtagh.  This retains significant differences on all
;   scales but discards noise, so it effectively stops the iterations in
;   regions where the difference between the model and the data looks like
;   the expected noise.  The acceleration is performed using DAMPSPEEDIT
;   from the damped Lucy iteration, which is not exactly correct but which
;   works well and avoids noise amplification.
; MODIFICATION HISTORY:
;   Created by R. White in 1996
;   This version updated with some better documentation, 2005 April 25
;-
    on_error, 2
    if n_elements(chimin) le 0 then chimin=0.0
    if n_elements(speedup) le 0 then speedup=2
    if n_elements(threshold) le 0 then threshold=3.0
    if n_elements(ndamp) le 0 then ndamp=10
    if n_elements(dotv) le 0 then dotv=1

    if (ndamp LE 1) AND (threshold NE 0) then begin
        message,'Warning: damping is ignored for NDAMP='+string(ndamp),/inform
        threshold = 0.0
        ndamp = 1
    endif

    if speedup LE 0 then begin
        message,'Lucy iteration with wavelet noise damping, no acceleration',/inform
    endif else if speedup LE 1 then begin
        message,'Accelerated Lucy iteration with wavelet noise damping',/inform
    endif else begin
        message,'Turbo Lucy iteration with wavelet noise damping',/inform

        ; special initialization for Turbo speedup -- first 5 steps don't use
        ; CG acceleration

        if keyword_set(init) then begin
            cglucy_restart, -4
            message,'Starting with 5 accelerated (non-turbo) steps',/inform
        endif
    endelse

    temp=size(raw_image)
    if temp[0] NE 2 then message,'RAW_IMAGE must be 2-D'
    x_raw=temp[1]
    y_raw=temp[2]
    nz = n_elements(raw_image)

    temp=size(lucy_image)
    if temp[0] NE 2 then message,'LUCY_IMAGE must be 2-D'
    x_size=temp[1]
    y_size=temp[2]
    if x_size LT B*x_raw OR y_size LT B*y_raw then $
        message,'LUCY_IMAGE must be at least B times bigger than RAW_IMAGE'

    temp = size(fft_psf)
    if temp[0] NE 2 then message,'FFT_PSF must be 2-D'
    if temp[1] NE x_size OR temp[2] NE y_size then $
        message,'FFT_PSF must be same size as LUCY_IMAGE'

    temp = size(gain)
    if temp[0] NE 0 then begin
        if temp[0] NE 2 then message,'GAIN must be a scalar or 2-D image'
        if temp[1] NE x_raw OR temp[2] NE y_raw then $
            message,'GAIN must be same size as RAW_IMAGE'
    endif

    ratio = fltarr(x_size,y_size)

    ; get normalization factor
    if n_elements(gain) EQ 1 then begin
        ratio[0:B*x_raw-1,0:B*y_raw-1] = gain
    endif else begin
        ratio[0:B*x_raw-1,0:B*y_raw-1] = $
            rebin(gain,B*x_raw,B*y_raw,/sample)
    endelse
    norm = float( fft( fft(ratio,-1) * conj(fft_psf), +1))
    zero=where(norm lt 1.e-3*max(norm))
    if zero[0] ge 0 then begin
        norm[zero] = 1.0
        ; Zero lucy_image where norm is zero -- those pixels don't affect
        ; the image anyway, and this will help speedup get a fast start.
        lucy_image[zero] = 0.0
    endif

    ; blur initial guess and calculate chi-square
    fft_conv = fft_psf*fft(lucy_image,-1)
    lucy_conv = B^2*rebin( $
        (float(fft(temporary(fft_conv),+1)))[0:B*x_raw-1,0:B*y_raw-1], $
        x_raw,y_raw)
    chisq = total(gain*(lucy_conv - raw_image)^2/(lucy_conv + con_var))/(nz-1)
    print,'initial chi-sq per degree freedom=',chisq

    for i=1,niter do begin

        ; compute residuals and variance (in DN) and
        ; filter them by damping differences

        rrr = raw_image-lucy_conv
        if threshold GT 0 then begin
            rrr = hfilter(rrr, (lucy_conv+con_var)/gain, threshold, ndamp=ndamp)
            if max(abs(rrr)) eq 0 then begin
               print,'iteration ',i,': converged, filtered difference=0'
               return
            endif
        endif

        ; embed the ratio in a bigger array (same size as lucy_image) with zero padding
        ; expand ratio by pixel replication back to oversampled size

        ratio[0:B*x_raw-1,0:B*y_raw-1] = rebin( $
            gain*(1 + temporary(rrr) / (lucy_conv+con_var)) > 0, $
            B*x_raw,B*y_raw,/sample)
        fft_srat = fft(ratio,-1) * conj(fft_psf)
        if speedup le 0 then begin

            ; no speedup

            lucy_image = lucy_image * float(fft(temporary(fft_srat),+1))/norm
            fft_conv = fft_psf*fft(lucy_image,-1)
            ; bin in BxB blocks to get convolved model on same grid as data
            lucy_conv = B^2*rebin( $
                (float(fft(temporary(fft_conv),+1)))[0:B*x_raw-1,0:B*y_raw-1], $
                x_raw,y_raw)

        endif else begin

            ; Call dampspeedit to accelerate convergence
            ; It updates lucy_image and lucy_conv

            new_lucy = lucy_image * float(fft(temporary(fft_srat),+1))/norm
            dampspeedit,lucy_image,temporary(new_lucy),lucy_conv,B,fft_psf,raw_image, $
                gain,con_var,threshold^2,ndamp,speedup=speedup

        endelse
        if keyword_set(dotv) then begin
            dtv,lucy_image[0:B*x_raw-1,0:B*y_raw-1]*B^2
            xyouts,0.05,0.95,'Iter'+string(i,format='(i4)')+' of'+ $
                string(niter,format='(i4)'),/normal
        endif

        ; calculate chi-square

        chisq = total(gain*(lucy_conv - raw_image)^2/(lucy_conv + con_var))/(nz-1)

        print,'iteration ',i,': chi-sq per degree freedom=',chisq
        if chisq LE chimin then begin
            print,'chi-sq <= ',chimin,', converged'
            return
        endif
    endfor
end