// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package gonum

import (
	"math"

	"gonum.org/v1/gonum/blas"
	"gonum.org/v1/gonum/blas/blas64"
	"gonum.org/v1/gonum/lapack"
)

// Dlaqr23 performs the orthogonal similarity transformation of an n×n upper
// Hessenberg matrix to detect and deflate fully converged eigenvalues from a
// trailing principal submatrix using aggressive early deflation [1].
//
// On return, H will be overwritten by a new Hessenberg matrix that is a
// perturbation of an orthogonal similarity transformation of H. It is hoped
// that on output H will have many zero subdiagonal entries.
//
// If wantt is true, the matrix H will be fully updated so that the
// quasi-triangular Schur factor can be computed. If wantt is false, then only
// enough of H will be updated to preserve the eigenvalues.
//
// If wantz is true, the orthogonal similarity transformation will be
// accumulated into Z[iloz:ihiz+1,ktop:kbot+1], otherwise Z is not referenced.
//
// ktop and kbot determine a block [ktop:kbot+1,ktop:kbot+1] along the diagonal
// of H. It must hold that
//  0 <= ilo <= ihi < n,     if n > 0,
//  ilo == 0 and ihi == -1,  if n == 0,
// and the block must be isolated, that is, it must hold that
//  ktop == 0   or H[ktop,ktop-1] == 0,
//  kbot == n-1 or H[kbot+1,kbot] == 0,
// otherwise Dlaqr23 will panic.
//
// nw is the deflation window size. It must hold that
//  0 <= nw <= kbot-ktop+1,
// otherwise Dlaqr23 will panic.
//
// iloz and ihiz specify the rows of the n×n matrix Z to which transformations
// will be applied if wantz is true. It must hold that
//  0 <= iloz <= ktop,  and  kbot <= ihiz < n,
// otherwise Dlaqr23 will panic.
//
// sr and si must have length kbot+1, otherwise Dlaqr23 will panic.
//
// v and ldv represent an nw×nw work matrix.
// t and ldt represent an nw×nh work matrix, and nh must be at least nw.
// wv and ldwv represent an nv×nw work matrix.
//
// work must have length at least lwork and lwork must be at least max(1,2*nw),
// otherwise Dlaqr23 will panic. Larger values of lwork may result in greater
// efficiency. On return, work[0] will contain the optimal value of lwork.
//
// If lwork is -1, instead of performing Dlaqr23, the function only estimates the
// optimal workspace size and stores it into work[0]. Neither h nor z are
// accessed.
//
// recur is the non-negative recursion depth. For recur > 0, Dlaqr23 behaves
// as DLAQR3, for recur == 0 it behaves as DLAQR2.
//
// On return, ns and nd will contain respectively the number of unconverged
// (i.e., approximate) eigenvalues and converged eigenvalues that are stored in
// sr and si.
//
// On return, the real and imaginary parts of approximate eigenvalues that may
// be used for shifts will be stored respectively in sr[kbot-nd-ns+1:kbot-nd+1]
// and si[kbot-nd-ns+1:kbot-nd+1].
//
// On return, the real and imaginary parts of converged eigenvalues will be
// stored respectively in sr[kbot-nd+1:kbot+1] and si[kbot-nd+1:kbot+1].
//
// References:
//  [1] K. Braman, R. Byers, R. Mathias. The Multishift QR Algorithm. Part II:
//      Aggressive Early Deflation. SIAM J. Matrix Anal. Appl 23(4) (2002), pp. 948—973
//      URL: http://dx.doi.org/10.1137/S0895479801384585
//
func (impl Implementation) Dlaqr23(wantt, wantz bool, n, ktop, kbot, nw int, h []float64, ldh int, iloz, ihiz int, z []float64, ldz int, sr, si []float64, v []float64, ldv int, nh int, t []float64, ldt int, nv int, wv []float64, ldwv int, work []float64, lwork int, recur int) (ns, nd int) {
	switch {
	case n < 0:
		panic(nLT0)
	case ktop < 0 || max(0, n-1) < ktop:
		panic(badKtop)
	case kbot < min(ktop, n-1) || n <= kbot:
		panic(badKbot)
	case nw < 0 || kbot-ktop+1+1 < nw:
		panic(badNw)
	case ldh < max(1, n):
		panic(badLdH)
	case wantz && (iloz < 0 || ktop < iloz):
		panic(badIloz)
	case wantz && (ihiz < kbot || n <= ihiz):
		panic(badIhiz)
	case ldz < 1, wantz && ldz < n:
		panic(badLdZ)
	case ldv < max(1, nw):
		panic(badLdV)
	case nh < nw:
		panic(badNh)
	case ldt < max(1, nh):
		panic(badLdT)
	case nv < 0:
		panic(nvLT0)
	case ldwv < max(1, nw):
		panic(badLdWV)
	case lwork < max(1, 2*nw) && lwork != -1:
		panic(badLWork)
	case len(work) < max(1, lwork):
		panic(shortWork)
	case recur < 0:
		panic(recurLT0)
	}

	// Quick return for zero window size.
	if nw == 0 {
		work[0] = 1
		return 0, 0
	}

	// LAPACK code does not enforce the documented behavior
	//  nw <= kbot-ktop+1
	// but we do (we panic above).
	jw := nw
	lwkopt := max(1, 2*nw)
	if jw > 2 {
		// Workspace query call to Dgehrd.
		impl.Dgehrd(jw, 0, jw-2, t, ldt, work, work, -1)
		lwk1 := int(work[0])
		// Workspace query call to Dormhr.
		impl.Dormhr(blas.Right, blas.NoTrans, jw, jw, 0, jw-2, t, ldt, work, v, ldv, work, -1)
		lwk2 := int(work[0])
		if recur > 0 {
			// Workspace query call to Dlaqr04.
			impl.Dlaqr04(true, true, jw, 0, jw-1, t, ldt, sr, si, 0, jw-1, v, ldv, work, -1, recur-1)
			lwk3 := int(work[0])
			// Optimal workspace.
			lwkopt = max(jw+max(lwk1, lwk2), lwk3)
		} else {
			// Optimal workspace.
			lwkopt = jw + max(lwk1, lwk2)
		}
	}
	// Quick return in case of workspace query.
	if lwork == -1 {
		work[0] = float64(lwkopt)
		return 0, 0
	}

	// Check input slices only if not doing workspace query.
	switch {
	case len(h) < (n-1)*ldh+n:
		panic(shortH)
	case len(v) < (nw-1)*ldv+nw:
		panic(shortV)
	case len(t) < (nw-1)*ldt+nh:
		panic(shortT)
	case len(wv) < (nv-1)*ldwv+nw:
		panic(shortWV)
	case wantz && len(z) < (n-1)*ldz+n:
		panic(shortZ)
	case len(sr) != kbot+1:
		panic(badLenSr)
	case len(si) != kbot+1:
		panic(badLenSi)
	case ktop > 0 && h[ktop*ldh+ktop-1] != 0:
		panic(notIsolated)
	case kbot+1 < n && h[(kbot+1)*ldh+kbot] != 0:
		panic(notIsolated)
	}

	// Machine constants.
	ulp := dlamchP
	smlnum := float64(n) / ulp * dlamchS

	// Setup deflation window.
	var s float64
	kwtop := kbot - jw + 1
	if kwtop != ktop {
		s = h[kwtop*ldh+kwtop-1]
	}
	if kwtop == kbot {
		// 1×1 deflation window.
		sr[kwtop] = h[kwtop*ldh+kwtop]
		si[kwtop] = 0
		ns = 1
		nd = 0
		if math.Abs(s) <= math.Max(smlnum, ulp*math.Abs(h[kwtop*ldh+kwtop])) {
			ns = 0
			nd = 1
			if kwtop > ktop {
				h[kwtop*ldh+kwtop-1] = 0
			}
		}
		work[0] = 1
		return ns, nd
	}

	// Convert to spike-triangular form. In case of a rare QR failure, this
	// routine continues to do aggressive early deflation using that part of
	// the deflation window that converged using infqr here and there to
	// keep track.
	impl.Dlacpy(blas.Upper, jw, jw, h[kwtop*ldh+kwtop:], ldh, t, ldt)
	bi := blas64.Implementation()
	bi.Dcopy(jw-1, h[(kwtop+1)*ldh+kwtop:], ldh+1, t[ldt:], ldt+1)
	impl.Dlaset(blas.All, jw, jw, 0, 1, v, ldv)
	nmin := impl.Ilaenv(12, "DLAQR3", "SV", jw, 0, jw-1, lwork)
	var infqr int
	if recur > 0 && jw > nmin {
		infqr = impl.Dlaqr04(true, true, jw, 0, jw-1, t, ldt, sr[kwtop:], si[kwtop:], 0, jw-1, v, ldv, work, lwork, recur-1)
	} else {
		infqr = impl.Dlahqr(true, true, jw, 0, jw-1, t, ldt, sr[kwtop:], si[kwtop:], 0, jw-1, v, ldv)
	}
	// Note that ilo == 0 which conveniently coincides with the success
	// value of infqr, that is, infqr as an index always points to the first
	// converged eigenvalue.

	// Dtrexc needs a clean margin near the diagonal.
	for j := 0; j < jw-3; j++ {
		t[(j+2)*ldt+j] = 0
		t[(j+3)*ldt+j] = 0
	}
	if jw >= 3 {
		t[(jw-1)*ldt+jw-3] = 0
	}

	ns = jw
	ilst := infqr
	// Deflation detection loop.
	for ilst < ns {
		bulge := false
		if ns >= 2 {
			bulge = t[(ns-1)*ldt+ns-2] != 0
		}
		if !bulge {
			// Real eigenvalue.
			abst := math.Abs(t[(ns-1)*ldt+ns-1])
			if abst == 0 {
				abst = math.Abs(s)
			}
			if math.Abs(s*v[ns-1]) <= math.Max(smlnum, ulp*abst) {
				// Deflatable.
				ns--
			} else {
				// Undeflatable, move it up out of the way.
				// Dtrexc can not fail in this case.
				_, ilst, _ = impl.Dtrexc(lapack.UpdateSchur, jw, t, ldt, v, ldv, ns-1, ilst, work)
				ilst++
			}
			continue
		}
		// Complex conjugate pair.
		abst := math.Abs(t[(ns-1)*ldt+ns-1]) + math.Sqrt(math.Abs(t[(ns-1)*ldt+ns-2]))*math.Sqrt(math.Abs(t[(ns-2)*ldt+ns-1]))
		if abst == 0 {
			abst = math.Abs(s)
		}
		if math.Max(math.Abs(s*v[ns-1]), math.Abs(s*v[ns-2])) <= math.Max(smlnum, ulp*abst) {
			// Deflatable.
			ns -= 2
		} else {
			// Undeflatable, move them up out of the way.
			// Dtrexc does the right thing with ilst in case of a
			// rare exchange failure.
			_, ilst, _ = impl.Dtrexc(lapack.UpdateSchur, jw, t, ldt, v, ldv, ns-1, ilst, work)
			ilst += 2
		}
	}

	// Return to Hessenberg form.
	if ns == 0 {
		s = 0
	}
	if ns < jw {
		// Sorting diagonal blocks of T improves accuracy for graded
		// matrices. Bubble sort deals well with exchange failures.
		sorted := false
		i := ns
		for !sorted {
			sorted = true
			kend := i - 1
			i = infqr
			var k int
			if i == ns-1 || t[(i+1)*ldt+i] == 0 {
				k = i + 1
			} else {
				k = i + 2
			}
			for k <= kend {
				var evi float64
				if k == i+1 {
					evi = math.Abs(t[i*ldt+i])
				} else {
					evi = math.Abs(t[i*ldt+i]) + math.Sqrt(math.Abs(t[(i+1)*ldt+i]))*math.Sqrt(math.Abs(t[i*ldt+i+1]))
				}

				var evk float64
				if k == kend || t[(k+1)*ldt+k] == 0 {
					evk = math.Abs(t[k*ldt+k])
				} else {
					evk = math.Abs(t[k*ldt+k]) + math.Sqrt(math.Abs(t[(k+1)*ldt+k]))*math.Sqrt(math.Abs(t[k*ldt+k+1]))
				}

				if evi >= evk {
					i = k
				} else {
					sorted = false
					_, ilst, ok := impl.Dtrexc(lapack.UpdateSchur, jw, t, ldt, v, ldv, i, k, work)
					if ok {
						i = ilst
					} else {
						i = k
					}
				}
				if i == kend || t[(i+1)*ldt+i] == 0 {
					k = i + 1
				} else {
					k = i + 2
				}
			}
		}
	}

	// Restore shift/eigenvalue array from T.
	for i := jw - 1; i >= infqr; {
		if i == infqr || t[i*ldt+i-1] == 0 {
			sr[kwtop+i] = t[i*ldt+i]
			si[kwtop+i] = 0
			i--
			continue
		}
		aa := t[(i-1)*ldt+i-1]
		bb := t[(i-1)*ldt+i]
		cc := t[i*ldt+i-1]
		dd := t[i*ldt+i]
		_, _, _, _, sr[kwtop+i-1], si[kwtop+i-1], sr[kwtop+i], si[kwtop+i], _, _ = impl.Dlanv2(aa, bb, cc, dd)
		i -= 2
	}

	if ns < jw || s == 0 {
		if ns > 1 && s != 0 {
			// Reflect spike back into lower triangle.
			bi.Dcopy(ns, v[:ns], 1, work[:ns], 1)
			_, tau := impl.Dlarfg(ns, work[0], work[1:ns], 1)
			work[0] = 1
			impl.Dlaset(blas.Lower, jw-2, jw-2, 0, 0, t[2*ldt:], ldt)
			impl.Dlarf(blas.Left, ns, jw, work[:ns], 1, tau, t, ldt, work[jw:])
			impl.Dlarf(blas.Right, ns, ns, work[:ns], 1, tau, t, ldt, work[jw:])
			impl.Dlarf(blas.Right, jw, ns, work[:ns], 1, tau, v, ldv, work[jw:])
			impl.Dgehrd(jw, 0, ns-1, t, ldt, work[:jw-1], work[jw:], lwork-jw)
		}

		// Copy updated reduced window into place.
		if kwtop > 0 {
			h[kwtop*ldh+kwtop-1] = s * v[0]
		}
		impl.Dlacpy(blas.Upper, jw, jw, t, ldt, h[kwtop*ldh+kwtop:], ldh)
		bi.Dcopy(jw-1, t[ldt:], ldt+1, h[(kwtop+1)*ldh+kwtop:], ldh+1)

		// Accumulate orthogonal matrix in order to update H and Z, if
		// requested.
		if ns > 1 && s != 0 {
			// work[:ns-1] contains the elementary reflectors stored
			// by a call to Dgehrd above.
			impl.Dormhr(blas.Right, blas.NoTrans, jw, ns, 0, ns-1,
				t, ldt, work[:ns-1], v, ldv, work[jw:], lwork-jw)
		}

		// Update vertical slab in H.
		var ltop int
		if !wantt {
			ltop = ktop
		}
		for krow := ltop; krow < kwtop; krow += nv {
			kln := min(nv, kwtop-krow)
			bi.Dgemm(blas.NoTrans, blas.NoTrans, kln, jw, jw,
				1, h[krow*ldh+kwtop:], ldh, v, ldv,
				0, wv, ldwv)
			impl.Dlacpy(blas.All, kln, jw, wv, ldwv, h[krow*ldh+kwtop:], ldh)
		}

		// Update horizontal slab in H.
		if wantt {
			for kcol := kbot + 1; kcol < n; kcol += nh {
				kln := min(nh, n-kcol)
				bi.Dgemm(blas.Trans, blas.NoTrans, jw, kln, jw,
					1, v, ldv, h[kwtop*ldh+kcol:], ldh,
					0, t, ldt)
				impl.Dlacpy(blas.All, jw, kln, t, ldt, h[kwtop*ldh+kcol:], ldh)
			}
		}

		// Update vertical slab in Z.
		if wantz {
			for krow := iloz; krow <= ihiz; krow += nv {
				kln := min(nv, ihiz-krow+1)
				bi.Dgemm(blas.NoTrans, blas.NoTrans, kln, jw, jw,
					1, z[krow*ldz+kwtop:], ldz, v, ldv,
					0, wv, ldwv)
				impl.Dlacpy(blas.All, kln, jw, wv, ldwv, z[krow*ldz+kwtop:], ldz)
			}
		}
	}

	// The number of deflations.
	nd = jw - ns
	// Shifts are converged eigenvalues that could not be deflated.
	// Subtracting infqr from the spike length takes care of the case of a
	// rare QR failure while calculating eigenvalues of the deflation
	// window.
	ns -= infqr
	work[0] = float64(lwkopt)
	return ns, nd
}