LinearSolverClass.f90 Source File


Source Code

module LinearSolverClass
   use, intrinsic :: iso_fortran_env
   use omp_lib
   use IOClass
   use TimeClass
   use MathClass
   use ArrayClass
   use SparseClass
   use RandomClass
   !use MPIClass
   implicit none

   interface BiCGSTAB
      module procedure bicgstab_real32, bicgstab_real64, bicgstab_complex64
   end interface

   interface GPBiCG
      module procedure GPBiCG_real32, GPBiCG_real64, GPBiCG_complex64
   end interface

   interface Gauss_Jordan_PV
      module procedure Gauss_Jordan_PV_real32, Gauss_Jordan_PV_real64, Gauss_Jordan_PV_complex64
   end interface

   interface StochasticGradientDescent
      module procedure StochasticGradientDescent_scalar_function
   end interface

   type :: LinearSolver_
      ! non-Element-by-element
      real(real64), allocatable :: a(:, :)
      real(real64), allocatable :: b(:)
      real(real64), allocatable :: x(:)
      ! Element-by-element
      real(real64), allocatable :: a_e(:, :, :)
      real(real64), allocatable :: b_e(:, :)
      real(real64), allocatable :: x_e(:, :)

      ! For Sparse
      ! COO format
      real(real64), allocatable   :: val(:)
      integer(int32), allocatable :: index_I(:)
      integer(int32), allocatable :: index_J(:)
      integer(int32), allocatable :: row_domain_id(:)
      integer(int32), allocatable :: column_domain_id(:)
      ! CRS format
      real(real64), allocatable   :: CRS_val(:)
      integer(int64), allocatable :: CRS_index_I(:)
      integer(int32), allocatable :: CRS_index_J(:)
      integer(int32), allocatable :: CRS_row_domain_id(:)
      integer(int32), allocatable :: CRS_column_domain_id(:)

      integer(int32), allocatable   :: b_Index_J(:)
      integer(int32), allocatable   :: b_Domain_ID(:)
      logical, allocatable   :: Locked(:)

      ! info
      integer(int32), allocatable   :: NumberOfNode(:)
      integer(int32) :: DOF = 1
      logical :: debug = .false.
      !
      integer(int32), allocatable :: connectivity(:, :)
      integer(int32) :: itrmax = 1000000
      integer(int32) :: currentID = 1
      integer(int32) :: b_currentID = 1
      real(real64) :: er0 = dble(1.0e-10)
      logical :: ReadyForFix = .false.

   contains

      procedure, public :: init => initLinearSolver

      procedure, public :: set => setLinearSolver

      procedure, public :: assemble => assembleLinearSolver

      procedure, pass ::  importLinearSolver
      procedure, pass :: importCRS_LinearSolver
      generic, public :: import => importLinearSolver, importCRS_LinearSolver
      procedure, public :: fix => fixLinearSolver
      procedure, public :: solve => solveLinearSolver

      procedure, public :: show => showLinearSolver
      procedure, public :: globalMatrix => globalMatrixLinearSolver
      procedure, public :: globalVector => globalVectorLinearSolver

      procedure, public  :: convertCOOtoCRS => convertCOOtoCRSLinearSolver
      procedure, public  :: matmulCRS => matmulCRSLinearSolver
      procedure, public  :: matmulCOO => matmulCOOLinearSolver

      procedure, public :: prepareFix => prepareFixLinearSolver
      procedure, public :: getCOOFormat => getCOOFormatLinearSolver
      procedure, public :: exportAsCOO => exportAsCOOLinearSolver
      procedure, public :: exportRHS => exportRHSLinearSolver

   end type

   interface crs_matvec
      module procedure :: crs_matvec_generic, crs_matvec_for_CRStype
   end interface crs_matvec

   interface sub_crs_matvec
      module procedure :: sub_crs_matvec_generic, sub_crs_matvec_for_CRStype
   end interface sub_crs_matvec

   interface crs_matmul
      module procedure :: crs_matmul_generic, crs_matmul_for_CRStype
   end interface crs_matmul
contains

!====================================================================================
   subroutine initLinearSolver(obj, NumberOfNode, DOF)
      class(LinearSolver_), intent(inout) :: obj
      integer(int32), optional, intent(in) :: NumberOfNode(:), DOF
      integer(int32) :: i, j, k, n, num_total_unk, node_count

      obj%ReadyForFix = .false.
      ! non-Element-by-element
      if (allocated(obj%a)) deallocate (obj%a)
      if (allocated(obj%b)) deallocate (obj%b)
      if (allocated(obj%x)) deallocate (obj%x)
      ! Element-by-element
      if (allocated(obj%a_e)) deallocate (obj%a_e)
      if (allocated(obj%b_e)) deallocate (obj%b_e)
      if (allocated(obj%x_e)) deallocate (obj%x_e)

      if (allocated(obj%val)) deallocate (obj%val)
      if (allocated(obj%index_I)) deallocate (obj%index_I)
      if (allocated(obj%index_J)) deallocate (obj%index_J)
      if (allocated(obj%row_Domain_ID)) deallocate (obj%row_Domain_ID)
      if (allocated(obj%column_Domain_ID)) deallocate (obj%column_Domain_ID)

      if (allocated(obj%b_Index_J)) deallocate (obj%b_Index_J)
      if (allocated(obj%b_Domain_ID)) deallocate (obj%b_Domain_ID)
      if (allocated(obj%Locked)) deallocate (obj%Locked)

      if (allocated(obj%connectivity)) deallocate (obj%connectivity)

      n = input(default=1, option=DOF)
      ! Number of node of n th domains is NumberOfNode(n)
      if (present(NumberOfNode)) then
         num_total_unk = sum(NumberOfNode)*n
         allocate (obj%b_Index_J(num_total_unk))
         allocate (obj%b_Domain_ID(num_total_unk))
         allocate (obj%Locked(num_total_unk))

         allocate (obj%b(num_total_unk))
         obj%NumberOfNode = NumberOfNode
         obj%DOF = DOF
         obj%b(:) = 0.0d0

         num_total_unk = 0
         do i = 1, size(NumberOfNode)
            node_count = 0
            do j = 1, NumberOfNode(i)
               do k = 1, n
                  num_total_unk = num_total_unk + 1
                  node_count = node_count + 1
                  obj%b_Domain_ID(num_total_unk) = i
                  obj%Locked(num_total_unk) = .false.
                  obj%b_Index_J(num_total_unk) = node_count
               end do
            end do
         end do
      end if

      obj%itrmax = 1000000
      obj%currentID = 1
      obj%b_currentID = 1
      obj%er0 = dble(1.0e-08)
   end subroutine
!====================================================================================

!====================================================================================
   recursive subroutine assembleLinearSolver(obj, connectivity, DOF, eMatrix, eVector, DomainIDs)
      class(LinearSolver_), intent(inout) :: obj
      integer(int32), intent(in) :: connectivity(:) ! connectivity matrix
      !(global_node_id#1, global_node_id#2, global_node_id#3, . )
      integer(int32), intent(in) :: DOF ! degree of freedom
      integer(int32), optional, intent(in) :: DomainIDs(:) ! DomainID
      real(real64), optional, intent(in) :: eMatrix(:, :) ! elemental matrix
      real(real64), optional, intent(in) :: eVector(:) ! elemental Vector
      integer(int32) :: i, j, k, l, m, node_id1, node_id2, domain_ID1, domain_ID2
      integer(int32), allocatable :: domID(:)

      if (present(eMatrix)) then
         if (present(DomainIDs)) then
            do j = 1, size(connectivity, 1)
               do k = 1, size(connectivity, 1)
                  do l = 1, DOF
                     do m = 1, DOF
                        node_id1 = connectivity(j)
                        node_id2 = connectivity(k)
                        if (j < 1 .or. k < 1) then
                           print *, "ERROR :: Assemble solver j<1 .or. k<1"
                           stop
                        end if
                        if (j > size(DomainIDs) .or. k > size(DomainIDs)) then
                           print *, j, size(DomainIDs), k
                           print *, DOmainIDs
                           print *, "ERROR :: Assemble solver j >= size(DomainIDs) .or.k >= size(DomainIDs)"
                           stop
                        end if

                        domain_ID1 = DomainIDs(j)
                        domain_ID2 = DomainIDs(k)
                        call obj%set( &
                           low=DOF*(node_id1 - 1) + l, &
                           column=DOF*(node_id2 - 1) + m, &
                           entryvalue=eMatrix(DOF*(j - 1) + l, DOF*(k - 1) + m), &
                           row_DomainID=Domain_ID1, &
                           column_DomainID=Domain_ID2)
                     end do
                  end do
               end do
            end do
         else
            do j = 1, size(DomainIDs, 1)
               do k = 1, size(DomainIDs, 1)
                  do l = 1, DOF
                     do m = 1, DOF
                        node_id1 = connectivity(j)
                        node_id2 = connectivity(k)
                        call obj%set( &
                           low=DOF*(node_id1 - 1) + l, &
                           column=DOF*(node_id2 - 1) + m, &
                           entryvalue=eMatrix(DOF*(j - 1) + l, DOF*(k - 1) + m), &
                           row_DomainID=1, &
                           column_DomainID=1)
                     end do
                  end do
               end do
            end do
         end if
      end if

      if (present(eVector)) then
         if (present(DomainIDs)) then
            do j = 1, size(connectivity)
               do l = 1, DOF
                  node_id1 = connectivity(j)
                  domain_ID1 = DomainIDs(j)
                  call obj%set( &
                     low=DOF*(node_id1 - 1) + l, &
                     entryvalue=eVector(DOF*(j - 1) + l), &
                     row_DomainID=Domain_ID1)
               end do
            end do
         else
            do j = 1, size(connectivity)
               do l = 1, DOF
                  node_id1 = connectivity(j)
                  call obj%set( &
                     low=DOF*(node_id1 - 1) + l, &
                     entryvalue=eVector(DOF*(j - 1) + l), &
                     row_DomainID=1)
               end do
            end do
         end if
      end if

   end subroutine
!====================================================================================

!====================================================================================
   recursive subroutine fixLinearSolver(obj, nodeid, entryvalue, entryID, DOF, row_DomainID, &
                                        nodeids, entryvalues, debug)
      class(LinearSolver_), intent(inout) :: obj
      logical, optional, intent(in) :: debug
      integer(int32), intent(in) :: nodeid
      integer(int32), optional, intent(in) :: entryID, DOF, row_DomainID, nodeids(:), entryvalues(:)
      real(real64), intent(in) :: entryvalue
      integer(int32), allocatable :: Index_I(:), Index_J(:), NumNodeBeforeDomainID(:)
      integer(int32) :: i, j, n, offset, m
      type(Time_) :: time

      ! [CASE1] :: single-domain & dense matrix
      ! [CASE2] :: single-domain & Sparse matrix
      ! [CASE3] :: multi-domain  & dense matrix
      ! [CASE4] :: multi-domain  & Sparse matrix

      ! [CASE0] :: none of above

      ! [CASE1][CASE0]
      if (.not. allocated(obj%val)) then
         if (allocated(obj%a) .and. allocated(obj%b)) then
            ! [CASE1]
            ! it only has obj%a and obj%b
            !x(nodeid) = entryvalue
            n = size(obj%b)

            obj%b(:) = obj%b(:) - obj%a(:, nodeid)*entryvalue
            obj%a(:, nodeid) = 0.0d0
            obj%a(nodeid, :) = 0.0d0
            obj%a(nodeid, nodeid) = 1.0d0
            obj%b(nodeid) = entryvalue
            return
         else
            ![CASE0]
            print *, "[ERROR] LinearSolver >> [CASE0] :: No Ax=b is set."
         end if
      end if

      ! Below, cases [CASE2] or [CASE4]
      if (.not. present(row_DomainID)) then
         ![CASE2] >> set : row_DomainID = 1 >> [Case4]
         call obj%fix(nodeid=nodeid, entryvalue=entryvalue, entryID=entryID, DOF=DOF, &
                      row_DomainID=1, debug=debug)
      end if

      if (obj%debug) then
         print *, "[fixLinearSolver] >> ReadyForFix"
      end if

      if (.not. obj%ReadyForFix) then
         call obj%prepareFix()
      end if

      if (obj%debug) then
         print *, "[Done] >> ReadyForFix"
      end if

      if (present(debug)) then
         if (debug) then
            call time%start()
            print *, "fixLinearSolver  >> [0] started!"

         end if
      end if

      if (present(DOF)) then
         if (.not. present(entryID)) then
            print *, "ERROR :: fixLinearSolver >> argument [DOF] should be called with [entryID]"
            print *, "e.g. x-entry of nodeid=10 in terms of 3d(x,y,z) space is >> "
            print *, "nodeid =10, entryID=1, DOF=3"
            stop
         end if
         n = (nodeid - 1)*DOF + entryID
         if (present(row_DomainID)) then
            call obj%fix(nodeid=n, entryvalue=entryvalue, row_DomainID=row_DomainID, debug=debug)
         else
            call obj%fix(nodeid=n, entryvalue=entryvalue, row_DomainID=1, debug=debug)
         end if
         return
      end if

      if (.not. allocated(obj%row_domain_id) .and. .not. present(row_DomainID)) then
         ! only for COO-format
         if (.not. allocated(obj%val) .or. .not. allocated(obj%b)) then
            print *, "ERROR >> fixLinearSolver .not. allocated(val) "
            stop
         end if

         if (present(debug)) then
            if (debug) then
               print *, "fixLinearSolver  >> [1] fix started!"
            end if
         end if

         do i = 1, size(obj%val)
            if (obj%index_J(i) == nodeid) then
               if (.not. obj%Locked(obj%index_I(i))) then
                  obj%b(obj%index_I(i)) = obj%b(obj%index_I(i)) - obj%val(i)*entryvalue
               end if
               obj%val(i) = 0.0d0
            end if
         end do

         do i = 1, size(obj%index_I)

            if (obj%index_I(i) == nodeid) then
               if (obj%index_J(i) == nodeid) then
                  obj%val(i) = 1.0d0
               else
                  obj%val(i) = 0.0d0
               end if
               if (.not. obj%Locked(obj%index_I(i))) then
                  obj%b(obj%index_I(i)) = entryvalue
                  obj%Locked(obj%index_I(i)) = .true.
               end if

               !print *, "Locked ",obj%index_I(i),"by",entryvalue
            end if

            if (obj%index_I(i) == nodeid) then
               if (obj%index_J(i) == nodeid) then
                  obj%val(i) = 1.0d0
               end if
            end if
         end do

         if (present(debug)) then
            if (debug) then
               print *, "fixLinearSolver  >> [ok] done!"
            end if
         end if
         return

      elseif (allocated(obj%row_domain_id) .and. present(row_DomainID)) then
         ! only for COO-format

         if (present(debug)) then
            if (debug) then
               print *, "fixLinearSolver  >> [1] Multi-domain started!"
               call time%show()
            end if
         end if

         ! fix b vector
         n = row_DomainID
         if (n == 1) then
            offset = 0
         else
            offset = sum(obj%NumberOfNode(1:n - 1))*obj%DOF
         end if
         if (.not. obj%Locked(offset + nodeid)) then
            obj%b(offset + nodeid) = entryvalue
            obj%Locked(offset + nodeid) = .true.
            !print *, "Locked ",(offset + nodeid),"by",entryvalue
         end if

         if (present(debug)) then
            if (debug) then
               print *, "fixLinearSolver  >> [1-2] count offset number"
               call time%show()
            end if
         end if

         allocate (NumNodeBeforeDomainID(size(obj%row_Domain_ID, 1)))

         NumNodeBeforeDomainID(1) = 0

         do m = 2, size(obj%NumberOfNode)
            NumNodeBeforeDomainID(m) = sum(obj%NumberOfNode(1:m - 1))
         end do

         if (.not. allocated(obj%val) .or. .not. allocated(obj%b)) then
            print *, "ERROR >> fixLinearSolver .not. allocated(val) "
            stop
         end if

         !print *, "obj%b",obj%b

         if (present(debug)) then
            if (debug) then
               print *, "fixLinearSolver  >> [2] Updating b-vector"
               call time%show()
            end if
         end if

         ! update b-vector (Right-hand side vector)
         do i = 1, size(obj%val)

            if (obj%val(i) == 0.0d0) then
               cycle
            end if

            if (obj%column_Domain_ID(i) /= row_DomainID) then
               cycle
            end if

            if (obj%index_J(i) == nodeid) then

               n = obj%row_Domain_ID(i)

               offset = NumNodeBeforeDomainID(n)*obj%DOF

               n = obj%Index_I(i)

               if (.not. obj%Locked(offset + n)) then
                  if (size(obj%NumberOfNode) == 1) then
                     obj%b(n) = obj%b(n) - obj%val(i)*entryvalue
                  else
                     obj%b(offset + n) = obj%b(offset + n) - obj%val(i)*entryvalue
                  end if
               end if

               obj%val(i) = 0.0d0
            else
               cycle
            end if
         end do

         if (present(debug)) then
            if (debug) then
               print *, "fixLinearSolver  >> [3] Updated b-vector"
               call time%show()
            end if
         end if

         do i = 1, size(obj%index_I) ! for all queries of A matrix

            if (obj%index_I(i) == nodeid .and. obj%row_Domain_ID(i) == row_DomainID) then
               obj%val(i) = 0.0d0
               if (obj%index_J(i) == nodeid .and. obj%column_Domain_ID(i) == row_DomainID) then
                  obj%val(i) = 1.0d0
               end if
            end if

         end do

         if (present(debug)) then
            if (debug) then
               print *, "fixLinearSolver  >> [ok] Done!"
               call time%show()
            end if
         end if

      else
         print *, "ERROR  :: fixLinearSolver >> allocated(obj%row_domain_id) /= present(row_DomainID)"
         print *, allocated(obj%row_domain_id), present(row_DomainID)
         stop
      end if

   end subroutine
!====================================================================================

!====================================================================================
   recursive subroutine setLinearSolver(obj, low, column, entryvalue, init, row_DomainID, column_DomainID)
      class(LinearSolver_), intent(inout) :: obj
      integer(int32), optional, intent(in) :: low, column, row_DomainID, column_DomainID
      real(real64), optional, intent(in) :: entryvalue
      logical, optional, intent(in) :: init
      integer(int32) :: i, row_DomID, column_DomID, row_offset, column_offset, j, k, find_num, max_thread

      row_DomID = input(default=1, option=row_DomainID)
      column_DomID = input(default=1, option=column_DomainID)
      if (allocated(obj%NumberOfNode)) then
         if (present(row_DomainID)) then
            if (row_DomainID == 1) then
               row_offset = 0
            else
               row_offset = sum(obj%NumberOfNode(1:row_DomainID - 1))
            end if
         end if
         if (present(column_DomainID)) then
            if (column_DomainID == 1) then
               column_offset = 0
            else
               column_offset = sum(obj%NumberOfNode(1:column_DomainID - 1))
            end if
         end if
      end if

      if (present(init)) then
         if (init .eqv. .true.) then
            if (allocated(obj%val)) then
               obj%val(:) = 0.0d0
            end if
            if (allocated(obj%b)) then
               obj%b(:) = 0.0d0
            end if
            obj%currentID = 1
         end if
      end if

      if (present(low) .and. present(column)) then

         if (.not. allocated(obj%val)) then
            allocate (obj%val(1))
            obj%val(1) = input(default=0.0d0, option=entryvalue)
            allocate (obj%index_I(1))
            obj%index_I(1) = low
            allocate (obj%index_J(1))
            obj%index_J(1) = column
            allocate (obj%row_Domain_ID(1))
            allocate (obj%column_Domain_ID(1))
            obj%row_Domain_ID(obj%currentID) = row_DomID
            obj%column_Domain_ID(obj%currentID) = column_DomID

            ! DomainIDは,rowとcolumnの両方必要!!!
            ! DomainIDは,rowとcolumnの両方必要!!!
            ! DomainIDは,rowとcolumnの両方必要!!!
            ! DomainIDは,rowとcolumnの両方必要!!!
            ! DomainIDは,rowとcolumnの両方必要!!!
            ! DomainIDは,rowとcolumnの両方必要!!!
            ! DomainIDは,rowとcolumnの両方必要!!!
            return
         end if

         ! if already exists, add.
!     find_num = 0
!     !$omp parallel
!     !$omp do reduction(+:find_num)
!     do i=1,size(obj%index_I)
!       if(obj%index_i(i)==0 ) cycle
!       if(obj%row_domain_id(i) == row_DomainID .and. obj%column_domain_id(i) == column_DomainID )then
!         if(obj%index_I(i) == low )then
!           if(obj%index_J(i) == column )then
!             obj%val(i) = obj%val(i) + entryvalue
!             find_num = 1
!           endif
!         endif
!       endif
!     enddo
!     !$omp end do
!     !$omp end parallel
!     if(find_num>=1)then
!       return
!     endif

         if (obj%currentID + 1 > size(obj%val)) then
            call extendArray(obj%val, 0.0d0, size(obj%val))
            call extendArray(obj%index_I, 0, size(obj%index_I))
            call extendArray(obj%index_J, 0, size(obj%index_J))
            call extendArray(obj%row_Domain_ID, 0, size(obj%row_Domain_ID))
            call extendArray(obj%column_Domain_ID, 0, size(obj%column_Domain_ID))
         end if
         obj%currentID = obj%currentID + 1
         obj%val(obj%currentID) = entryvalue
         obj%index_I(obj%currentID) = low
         obj%index_J(obj%currentID) = column
         if (present(row_DomainID)) then
            obj%row_Domain_ID(obj%currentID) = row_DomainID
         else
            obj%row_Domain_ID(obj%currentID) = 1
         end if
         if (present(column_DomainID)) then
            obj%column_Domain_ID(obj%currentID) = column_DomainID
         else
            obj%column_Domain_ID(obj%currentID) = 1
         end if

         return
      elseif (present(low) .and. .not. present(column)) then ! for right-hand side vector

         if (present(row_DomainID)) then
            ! multi-domain problem
            if (.not. allocated(obj%b)) then
               print *, "ERROR :: setLinearSolver >> for multi-domain problem, please call %init method"
               print *, "with % init( NumberOfNode , DOF )"
               stop
            else

               ! Right-hand side vector
               ! Extend one-by-one
               if (row_DomainID == 1) then
                  row_offset = 0
               else
                  row_offset = sum(obj%NumberOfNode(1:row_DomainID - 1))*obj%DOF
               end if
               obj%b(row_offset + low) = obj%b(row_offset + low) + entryvalue
            end if
            return
         else
            ! single-domain problem
            if (.not. allocated(obj%b)) then
               allocate (obj%b(low))
               obj%b(low) = input(default=0.0d0, option=entryvalue)
               obj%CurrentID = low
               return
            else
               ! Right-hand side vector
               if (low > size(obj%b)) then
                  if (obj%currentID < size(obj%val)) then
                     obj%b(low) = entryvalue
                  else
                     call extendArray(obj%b, 0.0d0, low - size(obj%b))
                     obj%b(low) = obj%b(low) + entryvalue
                  end if
               end if
            end if
         end if
      else
         return
      end if

   end subroutine
!====================================================================================

!====================================================================================
   subroutine importLinearSolver(obj, a, x, b, a_e, b_e, x_e, connectivity, val, index_I, index_J)
      class(LinearSolver_), intent(inout) :: obj
      real(8), optional, intent(in) :: a(:, :), b(:), x(:), a_e(:, :, :), b_e(:, :), x_e(:, :)
      real(8), optional, intent(in) :: val(:)
      integer(int32), optional, intent(in) :: index_I(:), index_J(:)
      integer(int32), optional, intent(in) :: connectivity(:, :)
      integer(int32) :: k, l, m

      if (present(val)) then
         if (.not. allocated(obj%val)) then
            allocate (obj%val(size(val)))
            obj%val = val
         end if
      end if

      if (present(index_i)) then
         if (.not. allocated(obj%index_i)) then
            allocate (obj%index_i(size(index_i)))
            obj%index_i = index_i
         end if
      end if

      if (present(index_j)) then
         if (.not. allocated(obj%index_j)) then
            allocate (obj%index_j(size(index_j)))
            obj%index_j = index_j
         end if
      end if

      ! in case of non element-by-element
      if (present(a)) then
         ! Set Ax=b
         k = size(a, 1)
         l = size(a, 2)
         if (.not. allocated(obj%a)) then
            allocate (obj%a(k, l))
         elseif (size(obj%a, 1) /= k .or. size(obj%a, 2) /= l) then
            deallocate (obj%a)
            allocate (obj%a(k, l))
         end if
         obj%a(:, :) = a(:, :)
      end if

      if (present(b)) then
         k = size(b, 1)
         if (.not. allocated(obj%b)) then
            allocate (obj%b(k))
         elseif (size(obj%b, 1) /= k) then
            deallocate (obj%b)
            allocate (obj%b(k))
         end if
         obj%b(:) = b(:)
      end if

      if (present(x)) then
         k = size(x, 1)
         if (.not. allocated(obj%x)) then
            allocate (obj%x(k))
         elseif (size(obj%x, 1) /= k) then
            deallocate (obj%x)
            allocate (obj%x(k))
         end if
         obj%x(:) = x(:)
      end if

      if (present(a_e)) then
         ! Set A_e x_e =b_e
         k = size(a_e, 1)
         l = size(a_e, 2)
         m = size(a_e, 3)
         if (.not. allocated(obj%a_e)) then
            allocate (obj%a_e(k, l, m))
         end if
         obj%a_e(:, :, :) = a_e(:, :, :)
      end if

      if (present(b_e)) then
         ! Set Ax=b
         k = size(b_e, 1)
         l = size(b_e, 2)
         if (.not. allocated(obj%b_e)) then
            allocate (obj%b_e(k, l))
         elseif (size(obj%b_e, 1) /= k .or. size(obj%b_e, 2) /= l) then
            deallocate (obj%b_e)
            allocate (obj%b_e(k, l))
         end if
         obj%b_e(:, :) = b_e(:, :)
      end if

      if (present(x_e)) then
         ! Set Ax=b
         k = size(x_e, 1)
         l = size(x_e, 2)
         if (.not. allocated(obj%x_e)) then
            allocate (obj%x_e(k, l))
         elseif (size(obj%x_e, 1) /= k .or. size(obj%x_e, 2) /= l) then
            deallocate (obj%x_e)
            allocate (obj%x_e(k, l))
         end if
         obj%x_e(:, :) = x_e(:, :)
      end if

   end subroutine importLinearSolver
!====================================================================================

!====================================================================================
   subroutine prepareFixLinearSolver(obj, debug)
      class(LinearSolver_), intent(inout) :: obj
      logical, optional, intent(in) :: debug
      integer(int32), allocatable :: Index_I(:), Index_J(:), row_domain_id(:), column_Domain_ID(:)
      real(real64), allocatable:: val(:)
      integer(int32), allocatable :: array(:, :)
      integer(int32) :: i, m, n, rn, rd, cn, cd, same_n, count_reduc, j
      integer(int32) :: Index_I_max, Index_J_max, row_domain_id_max, column_Domain_ID_max

      ! It's too slow
      if (obj%ReadyForFix) return
      ! remove overlapped elements

      count_reduc = 0

      if (present(debug)) then
         if (debug) then
            print *, "prepareFixLinearSolver >> [1] heap-sort started."
         end if
      end if

      ! first, heap sort
      n = size(obj%val)
      allocate (array(n, 4))
      array(:, 1) = obj%Index_I
      array(:, 2) = obj%Index_J
      array(:, 3) = obj%row_domain_id
      array(:, 4) = obj%column_Domain_ID

      call heapsortArray(array, obj%val)

      obj%Index_I = array(:, 1)
      obj%Index_J = array(:, 2)
      obj%row_domain_id = array(:, 3)
      obj%column_Domain_ID = array(:, 4)

      if (present(debug)) then
         if (debug) then
            print *, "prepareFixLinearSolver >> [2] Remove overlaps"
         end if
      end if
      ! second, remove overlap
      do i = 1, n - 1
         if (obj%index_i(i) == 0) cycle
         do j = i + 1, n
            if (obj%Index_I(i) == obj%Index_I(j) .and. obj%Index_J(i) == obj%Index_J(j)) then
               if (obj%row_domain_id(i) == obj%row_domain_id(j) .and. obj%column_domain_id(i) == obj%column_domain_id(j)) then
                  obj%val(i) = obj%val(i) + obj%val(j)
                  obj%Index_I(j) = 0
                  obj%row_domain_id(j) = 0
                  obj%Index_j(j) = 0
                  obj%column_domain_id(j) = 0
               else
                  exit
               end if
            else
               exit
            end if
         end do
      end do
      ! regacy

!  do i=1,size(obj%Index_I)-1
!    if(obj%Index_I(i) == 0 ) cycle
!
!    !$OMP parallel
!    !$OMP do
!    do j=i+1, size(obj%Index_I)
!      if(obj%row_domain_id(j)==obj%row_domain_id(i) )then
!        if(obj%column_domain_id(j)==obj%column_domain_id(i) )then
!          if(obj%Index_I(j) == obj%Index_I(i) )then
!            if(obj%Index_J(j) == obj%Index_J(i) )then
!              obj%val(i) = obj%val(i) + obj%val(j)
!              obj%Index_I(j) = 0
!              obj%row_domain_id(j) = 0
!              obj%Index_j(j) = 0
!              obj%column_domain_id(j) = 0
!            else
!              cycle
!            endif
!          else
!            cycle
!          endif
!        else
!          cycle
!        endif
!      else
!        cycle
!      endif
!    enddo
!    !$OMP end do
!    !$OMP end parallel
!  enddo

      !
      if (present(debug)) then
         if (debug) then
            print *, "prepareFixLinearSolver >> [3] Renew info"
         end if
      end if

      count_reduc = 0
      do i = 1, size(obj%index_I)
         if (obj%index_I(i) /= 0) then
            count_reduc = count_reduc + 1
         end if
      end do

      allocate (val(count_reduc))
      allocate (Index_I(count_reduc))
      allocate (Index_J(count_reduc))
      allocate (row_domain_id(count_reduc))
      allocate (column_Domain_ID(count_reduc))

      n = 0
      do i = 1, size(obj%Index_I)
         if (obj%Index_I(i) == 0) cycle
         n = n + 1
         val(n) = obj%val(i)
         Index_I(n) = obj%Index_I(i)
         Index_J(n) = obj%Index_J(i)
         row_domain_id(n) = obj%row_domain_id(i)
         column_Domain_ID(n) = obj%column_Domain_ID(i)
      end do

      deallocate (obj%val)
      deallocate (obj%Index_I)
      deallocate (obj%Index_J)
      deallocate (obj%row_domain_id)
      deallocate (obj%column_Domain_ID)

      obj%val = val
      obj%Index_I = Index_I
      obj%Index_J = Index_J
      obj%row_domain_id = row_domain_id
      obj%column_Domain_ID = column_Domain_ID

      obj%ReadyForFix = .true.
      if (present(debug)) then
         if (debug) then
            print *, "prepareFixLinearSolver >> [ok] Done"
         end if
      end if

   end subroutine
!====================================================================================

!====================================================================================
   subroutine solveLinearSolver(obj, Solver, MPI, OpenCL, CUDAC, preconditioning, CRS)
      class(LinearSolver_), intent(inout) :: obj
      character(*), intent(in) :: Solver
      logical, optional, intent(in) :: MPI, OpenCL, CUDAC, preconditioning, CRS
      integer(int32), allocatable :: Index_I(:), Index_J(:), row_domain_id(:), column_Domain_ID(:)
      integer(int64), allocatable :: CRS_Index_I(:)
      real(real64), allocatable:: val(:)
      integer(int32) :: i, m, n, rn, rd, cn, cd, same_n, count_reduc, j

      print *, "[Warning] Please do not use this procedure"
      print *, "because this procedure has some bugs"
      print *, "Please use FEMSolverClass instead of this solver."

      ! if not allocated COO format
      if (.not. allocated(obj%val)) then
         if (allocated(obj%a) .and. allocated(obj%b)) then
            ! it only has obj%a and obj%b
            n = size(obj%b)
            obj%x = zeros(n)
            if (Solver == "BiCGSTAB") then
               call bicgstab1d(a=obj%a, b=obj%b, x=obj%x, n=n, itrmax=obj%itrmax, er=obj%er0)
            elseif (Solver == "GPBiCG") then
               call bicgstab1d(a=obj%a, b=obj%b, x=obj%x, n=n, itrmax=obj%itrmax, er=obj%er0)
            elseif (Solver == "GaussJordan") then
               call gauss_jordan_pv(obj%a, obj%x, obj%b, size(obj%b, 1))
            else
               call gauss_jordan_pv(obj%a, obj%x, obj%b, size(obj%b, 1))
            end if
            return
         end if
      end if

      if (.not. allocated(obj%a) .and. .not. allocated(obj%val)) then
         print *, "solveLinearSolver >> ERROR :: .not. allocated(obj%b) "
         stop
      end if

      if (.not. allocated(obj%b)) then
         print *, "solveLinearSolver >> ERROR :: .not. allocated(obj%b) "
         stop
      end if

      if (.not. allocated(obj%x)) then
         allocate (obj%x(size(obj%b)))
         obj%x(:) = 0.0d0
      end if

      if (size(obj%x) /= size(obj%b)) then
         deallocate (obj%x)
         allocate (obj%x(size(obj%b)))
         obj%x(:) = 0.0d0
      end if

      ! No MPI, No OpenCl and No CUDAC
      if (allocated(obj%a)) then
         if (allocated(obj%b)) then
            if (allocated(obj%x)) then
               ! run as non EBE-mode
               if (Solver == "GaussSeidel") then
                  call gauss_seidel(obj%a, obj%b, obj%x, size(obj%a, 1), obj%itrmax, obj%er0)
               elseif (Solver == "GaussJordanPV" .or. Solver == "GaussJordan") then
                  call gauss_jordan_pv(obj%a, obj%x, obj%b, size(obj%a, 1))
               elseif (Solver == "BiCGSTAB") then
                  if (present(CRS)) then
                     if (CRS .eqv. .true.) then
                        if (.not. allocated(obj%CRS_val)) then
                           call obj%convertCOOtoCRS()
                        end if
                        call bicgstab_CRS(obj%val, obj%CRS_index_I, obj%CRS_index_J, obj%x, obj%b, obj%itrmax, obj%er0, obj%debug)
                     elseif (allocated(obj%val)) then
                        ! COO format
                        if (allocated(obj%locked)) then
                      call bicgstab_COO(obj%val, obj%index_I, obj%index_J, obj%x, obj%b, obj%itrmax, obj%er0, obj%debug, obj%locked)
                        else
                           call bicgstab_COO(obj%val, obj%index_I, obj%index_J, obj%x, obj%b, obj%itrmax, obj%er0, obj%debug)
                        end if
                     else
                        call bicgstab1d(obj%a, obj%b, obj%x, size(obj%a, 1), obj%itrmax, obj%er0)
                     end if
                  else
                     call bicgstab1d(obj%a, obj%b, obj%x, size(obj%a, 1), obj%itrmax, obj%er0)
                  end if
               elseif (Solver == "GPBiCG") then
                  if (present(preconditioning)) then
                     if (preconditioning .eqv. .true.) then
                        call preconditioned_GPBiCG(obj%a, obj%b, obj%x, size(obj%a, 1), obj%itrmax, obj%er0)
                     else
                        call GPBiCG(obj%a, obj%b, obj%x, size(obj%a, 1), obj%itrmax, obj%er0)
                     end if
                  else
                     call GPBiCG(obj%a, obj%b, obj%x, size(obj%a, 1), obj%itrmax, obj%er0)
                  end if
               else
                  print *, "LinearSolver_ ERROR:: no such solver as :: ", Solver
               end if
               return
            end if
         end if
      else
         if (allocated(obj%NumberOfNode)) then
            ! May be overlapped!
            ! remove overlap
            print *, "solveLinearSolver >> preparing..."
            Index_I = obj%Index_I
            Index_J = obj%Index_J
            do i = 1, size(Index_I)
               m = obj%row_Domain_ID(i)
               if (m == 1) then
                  n = 0
               else
                  n = sum(obj%NumberOfNode(1:m - 1))*obj%DOF
               end if
               Index_I(i) = Index_I(i) + n
            end do
            do i = 1, size(Index_J)
               m = obj%column_Domain_ID(i)
               if (m == 1) then
                  n = 0
               else
                  n = sum(obj%NumberOfNode(1:m - 1))*obj%DOF
               end if
               Index_J(i) = Index_J(i) + n
            end do
            print *, "solveLinearSolver >> start!"
            if (present(CRS)) then
               ! create CRS_Index_I from Index_I@COO

               if (CRS) then
                  call bicgstab_CRS(obj%val, CRS_index_I, index_J, obj%x, obj%b, obj%itrmax, obj%er0, obj%debug)
                  return
               end if

            end if
            if (Solver == "BiCGSTAB") then
               if (allocated(obj%locked)) then
                  call bicgstab_COO(obj%val, index_I, index_J, obj%x, obj%b, obj%itrmax, obj%er0, obj%debug, obj%locked)
               else
                  call bicgstab_COO(obj%val, index_I, index_J, obj%x, obj%b, obj%itrmax, obj%er0, obj%debug)
               end if
            else
               call GaussJordan_COO(obj%val, index_I, index_J, obj%x, obj%b)
            end if
         else
            if (present(CRS)) then
               if (CRS) then
                  call bicgstab_CRS(obj%val, CRS_index_I, index_J, obj%x, obj%b, obj%itrmax, obj%er0, obj%debug)
                  return
               end if
            end if
            if (Solver == "BiCGSTAB") then
               if (allocated(obj%locked)) then
                  call bicgstab_COO(obj%val, obj%index_I, obj%index_J, obj%x, obj%b, obj%itrmax, obj%er0, obj%debug &
                                    , obj%locked)
               else
                  call bicgstab_COO(obj%val, obj%index_I, obj%index_J, obj%x, obj%b, obj%itrmax, obj%er0, obj%debug)
               end if
            else
               call GaussJordan_COO(obj%val, obj%index_I, obj%index_J, obj%x, obj%b)
            end if
         end if
         return
      end if

      print *, "LinearSolver_ ERROR:: EBE-mode is not implemented yet."
      stop
   end subroutine solveLinearSolver
!====================================================================================

!====================================================================================
   subroutine gauss_seidel(a, b, x, n, itrmax, er0)
      integer(int32), intent(in) :: n, itrmax
      real(real64), intent(in)  :: a(n, n), b(n), er0
      real(real64), intent(out) :: x(n)
      real(real64) s, er, rd(n), r(n)
      integer(int32) i, itr
      do i = 1, n
         if (a(i, i) == 0.0d0) stop 'a(i, i) == 0.0d0'
         rd(i) = 1.0d0/a(i, i)
      end do
      x(1:n) = 0.0d0
      do itr = 1, itrmax
         do i = 1, n
            s = dot_product(a(i, 1:i - 1), x(1:i - 1))
            s = s + dot_product(a(i, i + 1:n), x(i + 1:n))
            x(i) = rd(i)*(b(i) - s)
         end do
         r(1:n) = b(1:n) - matmul(a, x)
         er = dot_product(r, r)
         if (er <= er0) then
            write (20, *) '# converged#'
            exit
         end if
      end do
   end subroutine gauss_seidel
!===================================================================================
   subroutine GaussJordan_COO(a0, index_i, index_j, x, b)
      integer(int32) :: n
      real(real64), intent(in) :: a0(:), b(:)
      integer(int32), intent(in) :: index_i(:), index_j(:)
      real(real64), allocatable, intent(inout) :: x(:)
      integer(int32) i, j, jj, k, m, nn, mm, id1, id2
      real(real64), allocatable :: a(:, :), w(:)
      real(real32) ar, am, t

      print *, "Solver :: GaussJordan"
      ! CAUTION this code is not optimized.
      ! It just converts a COO-formatted sparse matrix
      ! to Dense matrix, and pass them to
      ! Gauss_jordan_pv_real64
      n = size(b)
      a = zeros(n, n)
      x = zeros(n)
      print *, "DOF = ", n
      do i = 1, size(index_i)
         a(index_i(i), index_j(i)) = a0(i)
      end do

      nn = n
      x(:) = b(:)
      do k = 1, n
         m = k
         am = abs(a(k, k))
         do i = k + 1, n
            if (abs(a(i, k)) > am) then
               am = abs(a(i, k))
               m = i
            end if
         end do
         if (am == 0.0d0) stop ' A is singular '
         if (k /= m) then
            w(k:n) = a(k, k:n)
            a(k, k:n) = a(m, k:n)
            a(m, k:n) = w(k:n)
            t = x(k)
            x(k) = x(m)
            x(m) = t
         end if
         ! gauss_jordan
         if (a(k, k) == 0.0d0) stop 'devide by zero3gauss_jordan_pv'
         ar = 1.0d0/a(k, k)
         a(k, k) = 1.0d0
         a(k, k + 1:n) = ar*a(k, k + 1:n)
         x(k) = ar*x(k)
         do i = 1, n
            if (i /= k) then
               a(i, k + 1:n) = a(i, K + 1:n) - a(i, k)*a(k, k + 1:n)
               x(i) = x(i) - a(i, k)*x(k)
               a(i, k) = 0.0d0
            end if
         end do
      end do

      return

!
!
!
!
!  nn = size(b,1)
!  n  = size(b,1)
!  a = a0
!  w = zeros(n)
!  x = b
!
!  do k = 1, n
!     m = k
!
!     ! am = A(k,k)
!     am = 0.0d0
!     do j=1,size(index_i)
!      if(index_i(j)==k .and. index_j(j)==k )then
!        am = abs(a(j))
!        exit
!      endif
!     enddo
!
!     !
!     !do i = k+1, n
!     ! if (abs(a(i,k)) > am) then
!     !   am = abs(a(i,k))
!     !   m = i
!     ! endif
!     !enddo
!
!     do j=1,size(index_i)
!      if(index_i(j)>=k+1 .and.  k == index_j(j) )then
!        if( abs(a(j)) > am )then
!          am = abs(a(j) )
!          m = index_i(j)
!        endif
!      endif
!     enddo
!
!
!
!     if (am == 0.0d0)   stop  ' A is singular '
!
!     !if ( k /= m) then
!     !  w(k:n) = a(k, k:n)
!     !  a(k,k:n) = a(m, k:n)
!     !  a(m, k:n) =w(k:n)
!     !  t = x(k)
!     !  x(k) = x(m)
!     !  x(m) = t
!     !endif
!     if( k/=m)then
!      !(1)  w(k:n) = a(k, k:n)
!      do j=1,size(index_i)
!        id1 = index_i(j)
!        if(id1==k)then
!          id2 = index_j(j)
!          if(k<=id2 .and. id2 <= n)then
!            w(id2) = a(j)
!          endif
!        enddo
!      enddo
!      !(2)  a(k,k:n) = a(m, k:n)
!      do j=1,size(index_i)
!        id1 = index_i(j)
!        if(id1==k)then
!          id2 = index_j(j)
!          if(k<=id2 .and. id2 <= n)then
!            w(id2) = a(j)
!          endif
!        enddo
!      enddo
!
!
!
!     endif
!
!
!     ! gauss_jordan
!     if (a(k, k) == 0.0d0)  stop  'devide by zero3gauss_jordan_pv'
!            ar = 1.0d0 / a(k,k)
!     a(k,k) = 1.0d0
!     a(k,k+1:n) = ar * a(k, k+1:n)
!     x(k) = ar * x(k)
!     do i= 1, n
!       if (i /= k) then
!         a(i, k+1:n) = a(i, K+1:n) - a(i,k) * a(k, k+1:n)
!         x(i) = x(i) - a(i,k) * x(k)
!         a(i,k) = 0.0d0
!       endif
!     enddo
!  enddo

   end subroutine
!===========================================================================

!===================================================================================
   subroutine gauss_jordan_pv_real64(a0, x, b, n)
      integer(int32), intent(in) :: n
      real(real64), intent(in) :: a0(n, n), b(n)
      real(real64), intent(inout) :: x(n)
      integer(int32) i, j, k, m, nn, mm
      real(real64) ar, am, t, a(n, n), w(n)
      nn = size(a0, 1)

      a(:, :) = a0(:, :)
      x(:) = b(:)
      do k = 1, n
         m = k
         am = abs(a(k, k))
         do i = k + 1, n
            if (abs(a(i, k)) > am) then
               am = abs(a(i, k))
               m = i
            end if
         end do
         if (am == 0.0d0) stop ' A is singular '
         if (k /= m) then
            w(k:n) = a(k, k:n)
            a(k, k:n) = a(m, k:n)
            a(m, k:n) = w(k:n)
            t = x(k)
            x(k) = x(m)
            x(m) = t
         end if
         ! gauss_jordan
         if (a(k, k) == 0.0d0) stop 'devide by zero3gauss_jordan_pv'
         ar = 1.0d0/a(k, k)
         a(k, k) = 1.0d0
         a(k, k + 1:n) = ar*a(k, k + 1:n)
         x(k) = ar*x(k)
         do i = 1, n
            if (i /= k) then
               a(i, k + 1:n) = a(i, K + 1:n) - a(i, k)*a(k, k + 1:n)
               x(i) = x(i) - a(i, k)*x(k)
               a(i, k) = 0.0d0
            end if
         end do
      end do

   end subroutine
!===========================================================================

!===================================================================================
   subroutine gauss_jordan_pv_real32(a0, x, b, n)
      integer(int32), intent(in) :: n
      real(real32), intent(in) :: a0(n, n), b(n)
      real(real32), intent(out) :: x(n)
      integer(int32) i, j, k, m, nn, mm
      real(real32) ar, am, t, a(n, n), w(n)
      nn = size(a0, 1)

      a(:, :) = a0(:, :)
      x(:) = b(:)
      do k = 1, n
         m = k
         am = abs(a(k, k))
         do i = k + 1, n
            if (abs(a(i, k)) > am) then
               am = abs(a(i, k))
               m = i
            end if
         end do
         if (am == 0.0d0) stop ' A is singular '
         if (k /= m) then
            w(k:n) = a(k, k:n)
            a(k, k:n) = a(m, k:n)
            a(m, k:n) = w(k:n)
            t = x(k)
            x(k) = x(m)
            x(m) = t
         end if
         ! gauss_jordan
         if (a(k, k) == 0.0d0) stop 'devide by zero3gauss_jordan_pv'
         ar = 1.0d0/a(k, k)
         a(k, k) = 1.0d0
         a(k, k + 1:n) = ar*a(k, k + 1:n)
         x(k) = ar*x(k)
         do i = 1, n
            if (i /= k) then
               a(i, k + 1:n) = a(i, K + 1:n) - a(i, k)*a(k, k + 1:n)
               x(i) = x(i) - a(i, k)*x(k)
               a(i, k) = 0.0d0
            end if
         end do
      end do

   end subroutine
!===========================================================================

!===================================================================================
   subroutine gauss_jordan_pv_complex64(a0, x, b, n)
      integer(int32), intent(in) :: n
      complex(real64), intent(in) :: a0(n, n), b(n)
      complex(real64), intent(out) :: x(n)
      integer(int32) i, j, k, m, nn, mm
      complex(real64) ar, am, t, a(n, n), w(n)
      nn = size(a0, 1)

      a(:, :) = a0(:, :)
      x(:) = b(:)
      do k = 1, n
         m = k
         am = abs(a(k, k))
         do i = k + 1, n
            if (abs(a(i, k)) > abs(am)) then
               am = abs(a(i, k))
               m = i
            end if
         end do
         if (am == 0.0d0) stop ' A is singular '
         if (k /= m) then
            w(k:n) = a(k, k:n)
            a(k, k:n) = a(m, k:n)
            a(m, k:n) = w(k:n)
            t = x(k)
            x(k) = x(m)
            x(m) = t
         end if
         ! gauss_jordan
         if (a(k, k) == 0.0d0) stop 'devide by zero3gauss_jordan_pv'
         ar = 1.0d0/a(k, k)
         a(k, k) = 1.0d0
         a(k, k + 1:n) = ar*a(k, k + 1:n)
         x(k) = ar*x(k)
         do i = 1, n
            if (i /= k) then
               a(i, k + 1:n) = a(i, K + 1:n) - a(i, k)*a(k, k + 1:n)
               x(i) = x(i) - a(i, k)*x(k)
               a(i, k) = 0.0d0
            end if
         end do
      end do

   end subroutine
!===========================================================================

   subroutine bicgstab_diffusion(a, b, x, n, itrmax, er, DBC, DBCVal)
      integer(int32), intent(in) :: n, itrmax, DBC(:, :)
      real(real64), intent(in) :: a(n, n), b(n), er, DBCVal(:, :)
      real(real64), intent(inout) :: x(n)
      integer(int32) itr, i, j
      real(real64) alp, bet, c1, c2, c3, ev, vv, rr, er0, init_rr
      real(real64) r(n), r0(n), p(n), y(n), e(n), v(n)
      er0 = 1.00e-8
      r(:) = b - matmul(a, x)
      ! if DBC => reset residual
      do i = 1, size(DBC, 1)
         do j = 1, size(DBC, 2)
            if (DBC(i, j) < 1) then
               cycle
            else
               r(DBC(i, j)) = 0.0d0
               x(DBC(i, j)) = DBCVal(i, j)
            end if
         end do
      end do

      !
      c1 = dot_product(r, r)
      init_rr = c1
      if (c1 == 0.0d0) then
         print *, "Caution :: Initial residual is zero"
         return
      end if
      p(:) = r(:)
      r0(:) = r(:)
      do itr = 1, itrmax
         !c1 = dot_product(r0,r)
         y(:) = matmul(a, p)
         c2 = dot_product(r0, y)
         alp = c1/c2
         e(:) = r(:) - alp*y(:)
         v(:) = matmul(a, e)
         ev = dot_product(e, v)
         vv = dot_product(v, v)
         if (vv == 0.0d0) stop "Bicgstab devide by zero"
         c3 = ev/vv
         x(:) = x(:) + alp*p(:) + c3*e(:)
         r(:) = e(:) - c3*v(:)

         ! if DBC => reset residual
         do i = 1, size(DBC, 1)
            do j = 1, size(DBC, 2)
               if (DBC(i, j) < 1) then
                  cycle
               else
                  r(DBC(i, j)) = 0.0d0
                  x(DBC(i, j)) = DBCVal(i, j)
               end if
            end do
         end do
         rr = dot_product(r, r)
         !    write(*,*) 'itr, er =', itr,rr
         if (rr/init_rr < er0) then

            print *, "[ok] :: BICGSTAB is converged in ", i, " steps."
            exit
         end if

         c1 = dot_product(r0, r)
         bet = c1/(c2*c3)
         if ((c2*c3) == 0.0d0) stop "Bicgstab devide by zero"

         p(:) = r(:) + bet*(p(:) - c3*y(:))
         if (itrmax == itr) then
            print *, "ERROR :: BICGSTAB did not converge"
            return
         end if
      end do

   end subroutine bicgstab_diffusion
!===============================================================
   subroutine bicgstab_CRS(a, ptr_i, index_j, x, b, itrmax, er, debug)
      integer(int32), intent(inout) :: index_j(:), itrmax
      integer(int64), intent(inout) :: ptr_i(:)
      real(real64), intent(inout) :: a(:), b(:), er
      real(real64), intent(inout) :: x(:)
      logical, optional, intent(in) :: debug
      logical :: speak = .false.
      integer(int32) itr, i, j, n
      real(real64) alp, bet, c1, c2, c3, ev, vv, rr, er0, init_rr
      real(real64), allocatable:: r(:), r0(:), p(:), y(:), e(:), v(:), pa(:), ax(:)

      er0 = er
      if (present(debug)) then
         speak = debug
      end if

      n = size(b)
      if (speak) print *, "BiCGSTAB STARTED >> DOF:", n
      allocate (r(n), r0(n), p(n), y(n), e(n), v(n))

      r(:) = b(:)
      if (speak) print *, "BiCGSTAB >> [1] initialize"

      ax = crs_matvec(CRS_value=a, CRS_col=index_j, &
                      CRS_row_ptr=ptr_i, old_vector=x)
      r = b - ax

      if (speak) print *, "BiCGSTAB >> [2] dp1"

      c1 = dot_product(r, r)

      init_rr = c1
      if (speak) print *, "BiCGSTAB >>      |r|^2 = ", init_rr

      if (c1 < er0) return

      p(:) = r(:)
      r0(:) = r(:)

      do itr = 1, itrmax
         if (speak) print *, "BiCGSTAB >> ["//str(itr)//"] initialize"
         c1 = dot_product(r0, r)

         y = crs_matvec(CRS_value=a, CRS_col=index_j, &
                        CRS_row_ptr=ptr_i, old_vector=p)

         c2 = dot_product(r0, y)
         alp = c1/c2
         e(:) = r(:) - alp*y(:)
         v = crs_matvec(CRS_value=a, CRS_col=index_j, &
                        CRS_row_ptr=ptr_i, old_vector=e)

         if (speak) print *, "BiCGSTAB >> ["//str(itr)//"] half"

         ev = dot_product(e, v)
         vv = dot_product(v, v)

         if (vv == 0.0d0) stop "Bicgstab devide by zero"
         c3 = ev/vv
         x(:) = x(:) + alp*p(:) + c3*e(:)
         r(:) = e(:) - c3*v(:)
         rr = dot_product(r, r)

         if (speak) then
            print *, rr/init_rr
         end if

         !    write(*,*) 'itr, er =', itr,rr
         if (rr/init_rr < er0) exit
         c1 = dot_product(r0, r)
         bet = c1/(c2*c3)
         if ((c2*c3) == 0.0d0) stop "Bicgstab devide by zero"
         p(:) = r(:) + bet*(p(:) - c3*y(:))
      end do
   end subroutine
!===============================================================

   !===============================================================

   subroutine bicgstab_COO(a, index_i, index_j, x, b, itrmax, er, debug, locked)
      integer(int32), intent(inout) :: index_i(:), index_j(:), itrmax
      real(real64), intent(inout) :: a(:), b(:), er
      real(real64), intent(inout) :: x(:)
      logical, optional, intent(in) :: debug, locked(:)
      logical :: speak = .false.
      integer(int32) itr, i, j, n
      real(real64) alp, bet, c1, c2, c3, ev, vv, rr, er0, init_rr
      real(real64), allocatable:: r(:), r0(:), p(:), y(:), e(:), v(:)

      print *, "[ok] BiCGSTAB for COO  started."

      if (present(debug)) then
         speak = debug
      end if

      if (speak) print *, "BiCGSTAB STARTED >> DOF:", n
      n = size(b)
      allocate (r(n), r0(n), p(n), y(n), e(n), v(n))
      er0 = er
      r(:) = b(:)
      if (speak) print *, "BiCGSTAB >> [1] initialize"

      do i = 1, size(a)
         if (index_i(i) <= 0) cycle
         r(index_i(i)) = r(index_i(i)) - a(i)*x(index_j(i))
      end do
      if (speak) print *, dot_product(r, r), size(a)

      ! >> if fixed >> r=0, x=b
      if (present(locked)) then
    !!$OMP parallel do private(i)
         do i = 1, size(locked)
            if (locked(i)) then
               r(i) = 0.0d0
               x(i) = b(i)
            end if
         end do
    !!$OMP end parallel do
      end if

      !r(:) = b - matmul(a,x)
      c1 = dot_product(r, r)
      if (speak) print *, "BiCGSTAB >> [2] dp1", c1
      init_rr = c1
      if (c1 < er0) return
      p(:) = r(:)
      r0(:) = r(:)
      do itr = 1, itrmax

         if (speak) print *, "BiCGSTAB >> ["//str(itr)//"] initialize"
         c1 = dot_product(r0, r)

         !y(:) = matmul(a,p)
         y(:) = 0.0d0

    !!$OMP parallel do reduction(+:y) private(i)
         do i = 1, size(a)
            if (index_i(i) <= 0) then
               ! do nothing
            else
               y(index_i(i)) = y(index_i(i)) + a(i)*p(index_j(i))
            end if
         end do
    !!$OMP end parallel do

         c2 = dot_product(r0, y)
         alp = c1/c2
         e(:) = r(:) - alp*y(:)
         !v(:) = matmul(a,e)

         if (speak) print *, "BiCGSTAB >> ["//str(itr)//"] half"

         v(:) = 0.0d0

    !!$OMP parallel do reduction(+:v) private(i)
         do i = 1, size(a)
            if (index_i(i) <= 0) then
               ! do nothing
            else
               v(index_i(i)) = v(index_i(i)) + a(i)*e(index_j(i))
            end if
         end do
    !!$OMP end parallel do

         ev = dot_product(e, v)
         vv = dot_product(v, v)

         if (vv == 0.0d0) stop "Bicgstab devide by zero"
         c3 = ev/vv
         x(:) = x(:) + alp*p(:) + c3*e(:)
         r(:) = e(:) - c3*v(:)

         ! >> if fixed >> r=0, x=b
         if (present(locked)) then
      !!$OMP parallel do private(i)
            do i = 1, size(locked)
               if (locked(i)) then
                  r(i) = 0.0d0
                  x(i) = b(i)
               end if
            end do
      !!  $OMP end parallel do
         end if

         rr = dot_product(r, r)

         if (speak) print *, 'itr, |er|/|er0| =', itr, rr/init_rr
         if (rr/init_rr < er0) exit
         c1 = dot_product(r0, r)
         bet = c1/(c2*c3)
         if ((c2*c3) == 0.0d0) stop "Bicgstab devide by zero"
         p(:) = r(:) + bet*(p(:) - c3*y(:))

      end do

      print *, "[ok] BiCGSTAB_COO >> error norm", rr/init_rr
   end subroutine
!===============================================================

!===============================================================
   subroutine bicgstab_real64(a, b, x, n, itrmax, er)
      integer(int32), intent(in) :: n, itrmax
      real(real64), intent(in) :: a(n, n), b(n), er
      real(real64), intent(inout) :: x(n)
      integer(int32) itr
      real(real64) alp, bet, c1, c2, c3, ev, vv, rr, er0, init_rr
      real(real64) r(n), r0(n), p(n), y(n), e(n), v(n)
      er0 = er

      r(:) = b - matmul(a, x)
      c1 = dot_product(r, r)
      init_rr = c1
      if (c1 < er0) return
      p(:) = r(:)
      r0(:) = r(:)
      do itr = 1, itrmax
         c1 = dot_product(r0, r)

         y(:) = matmul(a, p)

         c2 = dot_product(r0, y)
         alp = c1/c2
         e(:) = r(:) - alp*y(:)
         v(:) = matmul(a, e)
         ev = dot_product(e, v)
         vv = dot_product(v, v)
         if (vv == 0.0d0) stop "Bicgstab devide by zero"
         c3 = ev/vv
         x(:) = x(:) + alp*p(:) + c3*e(:)
         r(:) = e(:) - c3*v(:)
         rr = dot_product(r, r)

         !    write(*,*) 'itr, er =', itr,rr
         if (rr/init_rr < er0) exit
         c1 = dot_product(r0, r)
         bet = c1/(c2*c3)
         if ((c2*c3) == 0.0d0) stop "Bicgstab devide by zero"

         p(:) = r(:) + bet*(p(:) - c3*y(:))
      end do
   end subroutine
!===============================================================

!===============================================================
   subroutine bicgstab_real32(a, b, x, n, itrmax, er)
      integer(int32), intent(in) :: n, itrmax
      real(real32), intent(in) :: a(n, n), b(n), er
      real(real32), intent(inout) :: x(n)
      integer(int32) itr
      real(real32) alp, bet, c1, c2, c3, ev, vv, rr, er0, init_rr
      real(real32) r(n), r0(n), p(n), y(n), e(n), v(n)
      er0 = er

      r(:) = b - matmul(a, x)
      c1 = dot_product(r, r)
      init_rr = c1
      if (c1 < er0) return
      p(:) = r(:)
      r0(:) = r(:)
      do itr = 1, itrmax
         c1 = dot_product(r0, r)

         y(:) = matmul(a, p)

         c2 = dot_product(r0, y)
         alp = c1/c2
         e(:) = r(:) - alp*y(:)
         v(:) = matmul(a, e)
         ev = dot_product(e, v)
         vv = dot_product(v, v)
         if (vv == 0.0d0) stop "Bicgstab devide by zero"
         c3 = ev/vv
         x(:) = x(:) + alp*p(:) + c3*e(:)
         r(:) = e(:) - c3*v(:)
         rr = dot_product(r, r)

         !    write(*,*) 'itr, er =', itr,rr
         if (rr/init_rr < er0) exit
         c1 = dot_product(r0, r)
         bet = c1/(c2*c3)
         if ((c2*c3) == 0.0d0) stop "Bicgstab devide by zero"

         p(:) = r(:) + bet*(p(:) - c3*y(:))
      end do
   end subroutine
!===============================================================

!===============================================================
   subroutine bicgstab_complex64(a, b, x, n, itrmax, er)
      integer(int32), intent(in) :: n, itrmax
      complex(real64), intent(in) :: a(n, n), b(n), er
      complex(real64), intent(inout) :: x(n)
      integer(int32) itr
      complex(real64) alp, bet, c1, c2, c3, ev, vv, rr, er0, init_rr
      complex(real64) r(n), r0(n), p(n), y(n), e(n), v(n)
      er0 = er

      r(:) = b - matmul(a, x)
      c1 = dot_product(r, r)
      init_rr = c1
      if (abs(c1) < abs(er0)) return
      p(:) = r(:)
      r0(:) = r(:)
      do itr = 1, itrmax
         c1 = dot_product(r0, r)

         y(:) = matmul(a, p)

         c2 = dot_product(r0, y)
         alp = c1/c2
         e(:) = r(:) - alp*y(:)
         v(:) = matmul(a, e)
         ev = dot_product(e, v)
         vv = dot_product(v, v)
         if (vv == 0.0d0) stop "Bicgstab devide by zero"
         c3 = ev/vv
         x(:) = x(:) + alp*p(:) + c3*e(:)
         r(:) = e(:) - c3*v(:)
         rr = dot_product(r, r)

         !    write(*,*) 'itr, er =', itr,rr
         if (abs(rr/init_rr) < abs(er0)) exit
         c1 = dot_product(r0, r)
         bet = c1/(c2*c3)
         if ((c2*c3) == 0.0d0) stop "Bicgstab devide by zero"

         p(:) = r(:) + bet*(p(:) - c3*y(:))
      end do
   end subroutine
!===============================================================

!===============================================================
   subroutine bicgstab1d(a, b, x, n, itrmax, er)
      integer(int32), intent(in) :: n, itrmax
      real(real64), intent(in) :: a(n, n), b(n), er
      real(real64), intent(inout) :: x(n)
      integer(int32) itr
      real(real64) alp, bet, c1, c2, c3, ev, vv, rr, er0, init_rr
      real(real64) r(n), r0(n), p(n), y(n), e(n), v(n)
      er0 = er

      r(:) = b - matmul(a, x)
      c1 = dot_product(r, r)
      init_rr = c1
      if (c1 < er0) return
      p(:) = r(:)
      r0(:) = r(:)
      do itr = 1, itrmax
         c1 = dot_product(r0, r)

         y(:) = matmul(a, p)

         c2 = dot_product(r0, y)
         alp = c1/c2
         e(:) = r(:) - alp*y(:)
         v(:) = matmul(a, e)
         ev = dot_product(e, v)
         vv = dot_product(v, v)
         if (vv == 0.0d0) stop "Bicgstab devide by zero"
         c3 = ev/vv
         x(:) = x(:) + alp*p(:) + c3*e(:)
         r(:) = e(:) - c3*v(:)
         rr = dot_product(r, r)

         !    write(*,*) 'itr, er =', itr,rr
         if (rr/init_rr < er0) exit
         c1 = dot_product(r0, r)
         bet = c1/(c2*c3)
         if ((c2*c3) == 0.0d0) stop "Bicgstab devide by zero"

         p(:) = r(:) + bet*(p(:) - c3*y(:))
      end do
   end subroutine bicgstab1d
!===============================================================

   subroutine bicgstab_nr(a, b, x, n, itrmax, er, u_nod_x, u_nod_y)
      integer(int32), intent(in) :: n, itrmax, u_nod_x(:), u_nod_y(:)
      real(real64), intent(in) :: a(n, n), b(n), er
      real(real64), intent(inout) :: x(n)
      integer(int32) itr
      real(real64) alp, bet, c1, c2, c3, ev, vv, rr, er0, init_rr
      real(real64) r(n), r0(n), p(n), y(n), e(n), v(n)
      er0 = 1.00e-14

      r(:) = b - matmul(a, x)

      call modify_residual(r, u_nod_x, u_nod_y)

      c1 = dot_product(r, r)

      init_rr = c1

      if (c1 < er0) return

      p(:) = r(:)

      r0(:) = r(:)

      do itr = 1, itrmax
         y(:) = matmul(a, p)
         c2 = dot_product(r0, y)
         alp = c1/c2
         e(:) = r(:) - alp*y(:)
         v(:) = matmul(a, e)
         ev = dot_product(e, v)
         vv = dot_product(v, v)
         if (vv == 0.0d0) stop "Bicgstab devide by zero"
         c3 = ev/vv
         x(:) = x(:) + alp*p(:) + c3*e(:)
         r(:) = e(:) - c3*v(:)
         call modify_residual(r, u_nod_x, u_nod_y)
         rr = dot_product(r, r)
         !    write(*,*) 'itr, er =', itr,rr
         if (rr/init_rr < er0) exit
         c1 = dot_product(r0, r)
         bet = c1/(c2*c3)
         if ((c2*c3) == 0.0d0) stop "Bicgstab devide by zero"

         p(:) = r(:) + bet*(p(:) - c3*y(:))
      end do
   end subroutine bicgstab_nr
!====================================================================================
   subroutine bicgstab_nr1(a, b, x, n, itrmax, er, u_nod_x, u_nod_y, u_nod_dis_x, u_nod_dis_y)
      integer(int32), intent(in) :: n, itrmax, u_nod_x(:), u_nod_y(:)
      real(real64), intent(in) :: a(n, n), b(n), er, u_nod_dis_x(:), u_nod_dis_y(:)
      real(real64), intent(inout) :: x(n)
      integer(int32) itr
      real(real64) alp, bet, c1, c2, c3, ev, vv, rr, er0, init_rr
      real(real64) r(n), r0(n), p(n), y(n), e(n), v(n)
      er0 = 1.00e-14
      r(:) = b - matmul(a, x)
      call modify_residual_1(r, x, u_nod_x, u_nod_y, u_nod_dis_x, u_nod_dis_y)
      c1 = dot_product(r, r)
      init_rr = c1
      if (c1 < er0) return
      p(:) = r(:)
      r0(:) = r(:)
      do itr = 1, itrmax
         y(:) = matmul(a, p)
         c2 = dot_product(r0, y)
         alp = c1/c2
         e(:) = r(:) - alp*y(:)
         v(:) = matmul(a, e)
         ev = dot_product(e, v)
         vv = dot_product(v, v)
         if (vv == 0.0d0) stop "Bicgstab devide by zero"
         c3 = ev/vv
         x(:) = x(:) + alp*p(:) + c3*e(:)
         r(:) = e(:) - c3*v(:)
         call modify_residual_1(r, x, u_nod_x, u_nod_y, u_nod_dis_x, u_nod_dis_y)
         rr = dot_product(r, r)
         !    write(*,*) 'itr, er =', itr,rr
         if (rr/init_rr < er0) exit
         c1 = dot_product(r0, r)
         bet = c1/(c2*c3)
         if ((c2*c3) == 0.0d0) stop "Bicgstab devide by zero"

         p(:) = r(:) + bet*(p(:) - c3*y(:))
      end do
   end subroutine bicgstab_nr1
!====================================================================================

   subroutine bicgstab_dirichlet(a, b, x, n, itrmax, er, DBoundNodID, DBoundVal, SetBC)
      integer(int32), intent(in) :: n, itrmax, DBoundNodID(:, :), SetBC
      real(real64), intent(in) :: a(n, n), b(n), er, DBoundVal(:, :)
      real(real64), intent(inout) :: x(n)
      integer(int32) itr
      real(real64) alp, bet, c1, c2, c3, ev, vv, rr, er0, init_rr
      real(real64) r(n), r0(n), p(n), y(n), e(n), v(n)
      er0 = 1.00e-14

      r(:) = b - matmul(a, x)

      call modify_residual_dirichlet(r, x, DBoundNodID, DBoundVal, SetBC)

      c1 = dot_product(r, r)

      init_rr = c1

      if (c1 < er0) return

      p(:) = r(:)

      r0(:) = r(:)

      do itr = 1, itrmax
         y(:) = matmul(a, p)
         c2 = dot_product(r0, y)
         alp = c1/c2
         e(:) = r(:) - alp*y(:)
         v(:) = matmul(a, e)
         ev = dot_product(e, v)
         vv = dot_product(v, v)
         if (vv == 0.0d0) stop "Bicgstab devide by zero"
         c3 = ev/vv
         x(:) = x(:) + alp*p(:) + c3*e(:)
         r(:) = e(:) - c3*v(:)
         call modify_residual_dirichlet(r, x, DBoundNodID, DBoundVal, SetBC)
         rr = dot_product(r, r)
         !    write(*,*) 'itr, er =', itr,rr
         if (rr/init_rr < er0) exit
         c1 = dot_product(r0, r)
         bet = c1/(c2*c3)
         if ((c2*c3) == 0.0d0) stop "Bicgstab devide by zero"

         p(:) = r(:) + bet*(p(:) - c3*y(:))
      end do
   end subroutine bicgstab_dirichlet
!====================================================================================

   subroutine modify_residual_1(r, x, u_nod_x, u_nod_y, u_nod_dis_x, u_nod_dis_y)
      integer(int32), intent(in)::u_nod_x(:), u_nod_y(:)
      real(real64), intent(in) :: u_nod_dis_x(:), u_nod_dis_y(:)
      real(real64), intent(inout)::r(:), x(:)
      integer(int32) i

      do i = 1, size(u_nod_x)

         r(2*u_nod_x(i) - 1) = 0.0d0
         x(2*u_nod_x(i) - 1) = u_nod_dis_x(i)
      end do

      do i = 1, size(u_nod_y)

         r(2*u_nod_y(i)) = 0.0d0
         x(2*u_nod_y(i)) = u_nod_dis_y(i)
      end do

   end subroutine modify_residual_1
!====================================================================================
   subroutine modify_residual(r, u_nod_x, u_nod_y)
      integer(int32), intent(in)::u_nod_x(:), u_nod_y(:)
      real(real64), intent(inout)::r(:)
      integer(int32) i

      do i = 1, size(u_nod_x)

         r(2*u_nod_x(i) - 1) = 0.0d0
      end do

      do i = 1, size(u_nod_y)

         r(2*u_nod_y(i)) = 0.0d0
      end do

   end subroutine modify_residual
!====================================================================================
   subroutine modify_residual_dirichlet(r, x, DBoundNodID, DBoundVal, SetBC)
      integer(int32), intent(in)::DBoundNodID(:, :), SetBC
      real(real64), intent(in)::DBoundVal(:, :)
      real(real64), intent(inout)::r(:), x(:)

      integer(int32) :: i, j, k, dim_num, dbc_num
      real(real64) :: val

      if (SetBC == 1) then

         dim_num = size(DBoundNodID, 2)
         dbc_num = size(DBoundNodID, 1)

         do i = 1, dim_num
            do j = 1, dbc_num
               k = DBoundNodID(j, i)
               val = DBoundVal(j, i)
               if (k < 1) then
                  cycle
               elseif (k >= 1) then
                  x(dim_num*(k - 1) + i) = val*dble(SetBC)
                  r(dim_num*(k - 1) + i) = 0.0d0
               else
                  cycle
               end if
            end do
         end do
      else

         dim_num = size(DBoundNodID, 2)
         dbc_num = size(DBoundNodID, 1)

         do i = 1, dim_num
            do j = 1, dbc_num
               k = DBoundNodID(j, i)
               val = DBoundVal(j, i)
               if (k < 1) then
                  cycle
               elseif (k >= 1) then
                  r(dim_num*(k - 1) + i) = 0.0d0
                  x(dim_num*(k - 1) + i) = 0.0d0

               else
                  cycle
               end if
            end do
         end do
      end if

   end subroutine
!====================================================================================

!===========================================================================
   subroutine GPBiCG_real64(a, b, x, n, itrmax, er)
      integer(int32), intent(in) :: n
      integer(int32), optional, intent(in) :: itrmax
      real(real64), intent(in) :: a(n, n), b(n)
      real(real64), optional, intent(in)::er
      real(real64), intent(inout) :: x(n)
      integer(int32) itr, itrmax_
      real(real64) alp, c1, rr, er0, init_rr, beta
      real(real64) gzi, nu, val1, val2, r0rk, eps
      real(real64) r(n), r0(n), p(n), y(n), ap(n), q(n)
      real(real64) u(n), w(n), t(n), t_(n), z(n)
      eps = 1.0e-18
      er0 = input(default=eps, option=er)

      r(:) = b - matmul(a, x)
      c1 = dot_product(r, r)
      init_rr = c1
      if (c1 < er0) return
      beta = 0.0d0
      p(:) = 0.0d0
      r0(:) = r(:)
      w(:) = 0.0d0
      u(:) = 0.0d0
      t(:) = 0.0d0
      t_(:) = 0.0d0
      z(:) = 0.0d0
      itrmax_ = input(default=1000, option=itrmax)
      do itr = 1, itrmax_
         p(:) = r(:) + beta*(p(:) - u(:))  ! triple checked.

         ap(:) = matmul(a, p) ! triple checked. ap=v,
         alp = dot_product(r0, r)/dot_product(r0, ap)! triple checked.
         y(:) = t(:) - r(:) - alp*w(:) + alp*ap(:) !triple checked.

         t(:) = r(:) - alp*ap(:) ! triple checked.

         q(:) = matmul(a, t) ! triple checked. s=q=at=c

         r0rk = dot_product(r0, r)
         if (itr == 1) then
            gzi = dot_product(q, t)/dot_product(q, q)! double checked.
            nu = 0.0d0 ! double checked.
         else
            val1 = dot_product(y, y)*dot_product(q, t) - dot_product(y, t)*dot_product(q, y)
            val2 = dot_product(q, q)*dot_product(y, y) - dot_product(y, q)*dot_product(q, y)

            if (val2 == 0.0d0) then
               !print *, itr
               !print *,  r(:), alp*ap(:)
               stop "GPBiCG devide by zero"
            end if
            gzi = val1/val2
            val1 = dot_product(q, q)*dot_product(y, t) - dot_product(y, q)*dot_product(q, t)
            nu = val1/val2  !triple checked.
         end if

         u(:) = gzi*ap + nu*(t_(:) - r(:) + beta*u(:)) !double checked.
         z(:) = gzi*r(:) + nu*z(:) - alp*u(:)!double checked.
         x(:) = x(:) + alp*p(:) + z(:) !double checked.

         r(:) = t(:) - nu*y(:) - gzi*q(:)!double checked.
         t_(:) = t(:)
         rr = dot_product(r, r)
         !print *, 'itr, er =', itr,sqrt(rr),sqrt(rr)/sqrt(init_rr)
         !print *, "ans = ",x(:)
         if (sqrt(rr)/sqrt(init_rr) < er0) exit
         r0rk = dot_product(r0, r)
         beta = alp/gzi*dot_product(r0, r)/r0rk
         w(:) = q(:) + beta*ap(:)
         if (itr == itrmax_) then
            print *, "ERROR :: GPBiCG did not converge."
         end if
      end do
   end subroutine
!===============================================================

!===========================================================================
   subroutine GPBiCG_real32(a, b, x, n, itrmax, er)
      integer(int32), intent(in) :: n
      integer(int32), optional, intent(in) :: itrmax
      real(real32), intent(in) :: a(n, n), b(n)
      real(real32), optional, intent(in)::er
      real(real32), intent(inout) :: x(n)
      integer(int32) itr, itrmax_
      real(real32) alp, c1, rr, er0, init_rr, beta
      real(real32) gzi, nu, val1, val2, r0rk, eps
      real(real32) r(n), r0(n), p(n), y(n), ap(n), q(n)
      real(real32) u(n), w(n), t(n), t_(n), z(n)
      eps = 1.0e-18
      er0 = input(default=eps, option=er)

      r(:) = b - matmul(a, x)
      c1 = dot_product(r, r)
      init_rr = c1
      if (c1 < er0) return
      beta = 0.0d0
      p(:) = 0.0d0
      r0(:) = r(:)
      w(:) = 0.0d0
      u(:) = 0.0d0
      t(:) = 0.0d0
      t_(:) = 0.0d0
      z(:) = 0.0d0
      itrmax_ = input(default=1000, option=itrmax)
      do itr = 1, itrmax_
         p(:) = r(:) + beta*(p(:) - u(:))  ! triple checked.

         ap(:) = matmul(a, p) ! triple checked. ap=v,
         alp = dot_product(r0, r)/dot_product(r0, ap)! triple checked.
         y(:) = t(:) - r(:) - alp*w(:) + alp*ap(:) !triple checked.

         t(:) = r(:) - alp*ap(:) ! triple checked.

         q(:) = matmul(a, t) ! triple checked. s=q=at=c

         r0rk = dot_product(r0, r)
         if (itr == 1) then
            gzi = dot_product(q, t)/dot_product(q, q)! double checked.
            nu = 0.0d0 ! double checked.
         else
            val1 = dot_product(y, y)*dot_product(q, t) - dot_product(y, t)*dot_product(q, y)
            val2 = dot_product(q, q)*dot_product(y, y) - dot_product(y, q)*dot_product(q, y)

            if (val2 == 0.0d0) then
               !print *, itr
               !print *,  r(:), alp*ap(:)
               stop "GPBiCG devide by zero"
            end if
            gzi = val1/val2
            val1 = dot_product(q, q)*dot_product(y, t) - dot_product(y, q)*dot_product(q, t)
            nu = val1/val2  !triple checked.
         end if

         u(:) = gzi*ap + nu*(t_(:) - r(:) + beta*u(:)) !double checked.
         z(:) = gzi*r(:) + nu*z(:) - alp*u(:)!double checked.
         x(:) = x(:) + alp*p(:) + z(:) !double checked.

         r(:) = t(:) - nu*y(:) - gzi*q(:)!double checked.
         t_(:) = t(:)
         rr = dot_product(r, r)
         !print *, 'itr, er =', itr,sqrt(rr),sqrt(rr)/sqrt(init_rr)
         !print *, "ans = ",x(:)
         if (sqrt(rr)/sqrt(init_rr) < er0) exit
         r0rk = dot_product(r0, r)
         beta = alp/gzi*dot_product(r0, r)/r0rk
         w(:) = q(:) + beta*ap(:)
         if (itr == itrmax_) then
            print *, "ERROR :: GPBiCG did not converge."
         end if
      end do
   end subroutine
!===============================================================

!===========================================================================
   subroutine GPBiCG_complex64(a, b, x, n, itrmax, er)
      integer(int32), intent(in) :: n
      integer(int32), optional, intent(in) :: itrmax
      complex(real64), intent(in) :: a(n, n), b(n)
      complex(real64), optional, intent(in)::er
      complex(real64), intent(inout) :: x(n)
      integer(int32) itr, itrmax_
      complex(real64) alp, c1, rr, er0, init_rr, beta
      complex(real64) gzi, nu, val1, val2, r0rk, eps
      complex(real64) r(n), r0(n), p(n), y(n), ap(n), q(n)
      complex(real64) u(n), w(n), t(n), t_(n), z(n)
      eps = 1.0e-18
      er0 = input(default=eps, option=er)

      r(:) = b - matmul(a, x)
      c1 = dot_product(r, r)
      init_rr = c1
      if (abs(c1) < abs(er0)) return
      beta = 0.0d0
      p(:) = 0.0d0
      r0(:) = r(:)
      w(:) = 0.0d0
      u(:) = 0.0d0
      t(:) = 0.0d0
      t_(:) = 0.0d0
      z(:) = 0.0d0
      itrmax_ = input(default=1000, option=itrmax)
      do itr = 1, itrmax_
         p(:) = r(:) + beta*(p(:) - u(:))  ! triple checked.

         ap(:) = matmul(a, p) ! triple checked. ap=v,
         alp = dot_product(r0, r)/dot_product(r0, ap)! triple checked.
         y(:) = t(:) - r(:) - alp*w(:) + alp*ap(:) !triple checked.

         t(:) = r(:) - alp*ap(:) ! triple checked.

         q(:) = matmul(a, t) ! triple checked. s=q=at=c

         r0rk = dot_product(r0, r)
         if (itr == 1) then
            gzi = dot_product(q, t)/dot_product(q, q)! double checked.
            nu = 0.0d0 ! double checked.
         else
            val1 = dot_product(y, y)*dot_product(q, t) - dot_product(y, t)*dot_product(q, y)
            val2 = dot_product(q, q)*dot_product(y, y) - dot_product(y, q)*dot_product(q, y)

            if (val2 == 0.0d0) then
               !print *, itr
               !print *,  r(:), alp*ap(:)
               stop "GPBiCG devide by zero"
            end if
            gzi = val1/val2
            val1 = dot_product(q, q)*dot_product(y, t) - dot_product(y, q)*dot_product(q, t)
            nu = val1/val2  !triple checked.
         end if

         u(:) = gzi*ap + nu*(t_(:) - r(:) + beta*u(:)) !double checked.
         z(:) = gzi*r(:) + nu*z(:) - alp*u(:)!double checked.
         x(:) = x(:) + alp*p(:) + z(:) !double checked.

         r(:) = t(:) - nu*y(:) - gzi*q(:)!double checked.
         t_(:) = t(:)
         rr = dot_product(r, r)
         !print *, 'itr, er =', itr,sqrt(rr),sqrt(rr)/sqrt(init_rr)
         !print *, "ans = ",x(:)
         if (abs(sqrt(rr)/sqrt(init_rr)) < abs(er0)) exit
         r0rk = dot_product(r0, r)
         beta = alp/gzi*dot_product(r0, r)/r0rk
         w(:) = q(:) + beta*ap(:)
         if (itr == itrmax_) then
            print *, "ERROR :: GPBiCG did not converge."
         end if
      end do
   end subroutine
!===============================================================

!===========================================================================
   subroutine preconditioned_GPBiCG(a, b, x, n, itrmax, er)
      integer(int32), intent(in) :: n
      integer(int32), optional, intent(in) :: itrmax
      real(real64), intent(in) :: a(1:n, 1:n), b(1:n)
      real(real64), optional, intent(in)::er
      real(real64), intent(inout) :: x(1:n)
      real(real64) :: L(1:n, 1:n), d(1:n)
      ! presented by Moe Thuthu et al., 2009, algorithm #3
      integer(int32) itr, itrmax_, i, j, k
      real(real64) alp, c1, rr, er0, init_rr, beta, lld, ld
      real(real64) gzi, nu, val1, val2, r0rk, eps
      real(real64) r(n), rk(n), r_(n), r_k(n), r0(n), p(n), y(n), ap(n), q(n)
      real(real64) u(n), w(n), t(n), t_(n), z(n)

      print *, "<<< Under implementation >>>"

      ! Incomplete Cholosky decomposition.
      ! http://www.slis.tsukuba.ac.jp/~fujisawa.makoto.fu/cgi-bin/wiki/index.php?%A5%B3%A5%EC%A5%B9%A5%AD%A1%BC%CA%AC%B2%F2
      ! >>>>>>>>>>
      L(:, :) = 0.0d0
      d(:) = 0.0d0
      d(1) = a(1, 1)
      L(1, 1) = 1.0d0
      L(:, :) = imcompleteCholosky(a)
      call showArray(L)

      !do i=2, n
      !  ! i < kの場合
      !  do j=1, i
      !    if( abs(a(i,j)) < dble(1.0e-10)  )then
      !      cycle
      !    else
      !      lld = a(i,j)
      !      do k=1, j
      !        lld = lld - L(i,k)*L(j,k)*d(k)
      !      enddo
      !      if(d(j)==0.0d0)then
      !        stop "Error :: d(j)==0.0d0"
      !      endif
      !      L(i,j) = 1.0d0/d(j)*lld
      !    endif
      !    ld = a(i,i)
      !    do k=1, i
      !      ld = ld - L(i,k)*L(i,k)*d(k)
      !    enddo
      !    d(i) = ld
      !    L(i,i) = 1.0d0
      !  enddo
      !
      !enddo
!
      print *, d

      ! <<<<<<<<<<

      eps = dble(1.00e-14)
      er0 = input(default=eps, option=er)

      r(:) = b - matmul(a, x)
      call icres(L, d, r, p, n)

      c1 = dot_product(r, r)
      init_rr = c1
      if (c1 < er0) return
      beta = 0.0d0
      !p(:) = 0.0d0
      r0(:) = r(:)
      w(:) = 0.0d0
      u(:) = 0.0d0
      t(:) = 0.0d0
      t_(:) = 0.0d0
      z(:) = 0.0d0
      itrmax_ = input(default=100000, option=itrmax)
      itrmax_ = 10
      do itr = 1, itrmax_
         call icres(L, d, r, r_k, n)

         !p(:) = r(:)+beta*(p(:)-u(:) )  ! triple checked.
         !r0rk=dot_product(r0,r)
         ap = matmul(a, p) ! triple checked. ap=v,
         alp = dot_product(r, r_k)/dot_product(p, ap)! triple checked.
         x(:) = x(:) + alp*ap(:)
         rk(:) = r(:)
         r(:) = r(:) - alp*ap(:)
         !y(:) = t(:) - r(:) - alp*w(:) + alp*ap(:) !triple checked.
         !t(:) = r(:) - alp*ap(:) ! triple checked.

         !q(:) = matmul(a,t) ! triple checked. s=q=at=c
         !if(itr==1)then
         !  gzi  = dot_product(q,t)/dot_product(q,q)! double checked.
         !  nu   = 0.0d0 ! double checked.
         !else
         !  val1 = dot_product(y,y)*dot_product(q,t) - dot_product(y,t)*dot_product(q,y)
         !  val2 = dot_product(q,q)*dot_product(y,y) - dot_product(y,q)*dot_product(q,y)
         !
         !  if(  val2==0.0d0 ) stop "Bicgstab devide by zero"
         !  gzi  = val1/val2
         !  val1 = dot_product(q,q)*dot_product(y,t) - dot_product(y,q)*dot_product(q,t)
         !  nu  = val1/val2  !triple checked.
         !endif

         !u(:) = gzi*ap + nu*(t_(:) -r(:) + beta*u(:)  ) !double checked.
         !z(:) = gzi*r(:) + nu*z(:) - alp*u(:)!double checked.
         !x(:) = x(:) +alp*p(:) +z(:) !double checked.
         !r(:) = t(:) -nu*y(:) -gzi*q(:)!double checked.
         !t_(:)=t(:)
         rr = dot_product(r, r)
         print *, rr
         !print *, 'itr, er =', itr,rr,sqrt(rr)/sqrt(init_rr)
         !print *, "ans = ",x(:)
         if (sqrt(rr)/sqrt(init_rr) < er0) exit
         call icres(L, d, r, r_, n)
         beta = dot_product(r, r_)/dot_product(rk, r_k)
         p(:) = r_(:) + beta*p(:)
         !beta=alp/gzi*dot_product(r0,r)/r0rk
         !w(:) = q(:) + beta * ap(:)
         if (itr == itrmax_) then
            print *, "ERROR :: preconditioned-CG did not converge."
         end if
      end do
   end subroutine
!===============================================================

   subroutine icres(L, d, r, u, n)
      real(real64) :: y(n), rly, lu
      real(real64), intent(inout) :: L(1:n, 1:n), d(1:n), r(1:n), u(1:n)
      integer(int32), intent(in)  :: n
      integer(int32) :: i, j
      do i = 1, n
         rly = r(i)
         do j = 1, i
            rly = rly - L(i, j)*y(j)
         end do
         y(i) = rly/L(i, i)
      end do

      do i = n, 1, -1
         lu = 0.0d0
         do j = i + 1, n
            lu = lu + L(j, i)*u(j)
         end do
         u(i) = y(i) - d(i)*lu
      end do

   end subroutine icres

!==========================================================
   function eigen_3d(tensor) result(eigenvector)
      real(real64), intent(in) :: tensor(3, 3)
      integer(int32) :: i, j, n
      real(real64) :: eigenvector(3, 3), a, b, c, d
      real(real64) :: eigenvalue(3), mat(3, 3), ev(3), zero(3), unitmat(3, 3)

      zero(:) = 0.0d0
      unitmat(:, :) = 0.0d0
      unitmat(1, 1) = 1.0d0
      unitmat(2, 2) = 1.0d0
      unitmat(3, 3) = 1.0d0
      ! get eigen values
      ! (1) solve cubic equation
      !https://keisan.casio.jp/exec/user/1305724050
      a = 1.0d0
      b = -tensor(1, 1) - tensor(2, 2) - tensor(3, 3)
      c = tensor(1, 1)*tensor(2, 2) + tensor(2, 2)*tensor(3, 3) + tensor(3, 3)*tensor(1, 1) &
          - tensor(1, 2)*tensor(2, 1) - tensor(2, 3)*tensor(3, 2) - tensor(3, 1)*tensor(1, 3)
      d = -tensor(1, 1)*tensor(2, 2)*tensor(3, 3) - tensor(1, 2)*tensor(2, 3)*tensor(3, 1) &
          - tensor(1, 3)*tensor(2, 1)*tensor(3, 2) + tensor(1, 3)*tensor(2, 2)*tensor(3, 1) &
          + tensor(2, 3)*tensor(3, 2)*tensor(1, 1) + tensor(3, 3)*tensor(1, 2)*tensor(2, 1)
      eigenvalue = cubic_equation(a, b, c, d)

      do i = 1, 3
         mat(:, :) = eigenvalue(i)*unitmat(:, :) - tensor(:, :)
         call gauss_jordan_pv(mat, ev, zero, 3)
         eigenvector(:, 3) = ev(:)
      end do

   end function
!====================================================================================
   subroutine showLinearSolver(obj)
      class(LinearSolver_), intent(in) :: obj
      real(real64), allocatable :: A_ij(:, :)
      integer(int32) :: i, j, n, m

      n = maxval(obj%Index_I)
      m = maxval(obj%Index_J)
      n = maxval((/n, m/))

      A_ij = zeros(n, n)
      do i = 1, size(obj%val)
         A_ij(obj%Index_I(i), obj%Index_J(i)) = A_ij(obj%Index_I(i), obj%Index_J(i)) + obj%val(i)
      end do

      call print(A_ij)
   end subroutine
!====================================================================================
!====================================================================================
   function globalMatrixLinearSolver(obj) result(ret)
      class(LinearSolver_), intent(in) :: obj
      real(real64), allocatable :: ret(:, :)
      integer(int32) :: i, j, n, m, p1, p2, domain_id, offset

      if (allocated(obj%NumberOfNode)) then
         n = sum(obj%NumberOfNode)*obj%DOF
         ret = zeros(n, n)
         do i = 1, size(obj%val)
            domain_id = obj%row_domain_id(i)
            offset = sum(obj%NumberOfNode(:domain_id - 1))
            !print *, offset
            p1 = offset*obj%DOF + obj%Index_I(i)

            domain_id = obj%column_domain_id(i)
            offset = sum(obj%NumberOfNode(:domain_id - 1))

            p2 = offset*obj%DOF + obj%Index_J(i)
            ret(p1, p2) = ret(p1, p2) + obj%val(i)
         end do
         return
      else
         n = maxval(obj%Index_I)
         m = maxval(obj%Index_J)
         n = maxval((/n, m/))

         ret = zeros(n, n)
         do i = 1, size(obj%val)
            ret(obj%Index_I(i), obj%Index_J(i)) = ret(obj%Index_I(i), obj%Index_J(i)) + obj%val(i)
         end do
         return
      end if
   end function
!====================================================================================

!====================================================================================
   function globalVectorLinearSolver(obj) result(ret)
      class(LinearSolver_), intent(in) :: obj
      real(real64), allocatable :: ret(:)
      integer(int32) :: i, j, n, m, p1, p2, domain_id, offset

      if (allocated(obj%NumberOfNode)) then
         n = sum(obj%NumberOfNode)*obj%DOF
         allocate (ret(n))
         ret(:) = 0.0d0
         do i = 1, size(obj%b_Index_J)
            domain_id = obj%b_domain_id(i)
            offset = sum(obj%NumberOfNode(:domain_id - 1))
            p1 = offset*obj%DOF + obj%b_Index_J(i)
            ret(p1) = ret(p1) + obj%b(i)
         end do
         return
      else
         ret = obj%b
      end if
   end function
!====================================================================================

   subroutine convertCOOtoCRSLinearSolver(obj, OpenMP)
      class(LinearSolver_), intent(inout) :: obj
      integer(int32), allocatable :: buf(:)
      integer(int32) :: n, nnz, i, nrhs
      logical, optional, intent(in) :: OpenMP
      logical :: omp_swich
      !real(real64),allocatable   :: CRS_val(:)
      !integer(int32),allocatable :: CRS_index_I(:)
      !integer(int32),allocatable :: CRS_index_J(:)
      !integer(int32),allocatable :: CRS_row_domain_id(:)
      !integer(int32),allocatable :: CRS_column_domain_id(:)
      omp_swich = input(default=.false., option=OpenMP)
      ! Notice :: COO format data should be created and sorted.
      ! Further, multi-domain is not supported.

      nrhs = size(obj%b) + 1
      obj%CRS_index_I = int(zeros(nrhs))

      if (omp_swich) then

         !$OMP parallel do private(i)
         do i = 1, size(obj%index_I)
            obj%CRS_index_I(obj%index_I(i)) = obj%CRS_index_I(obj%index_I(i)) + 1
         end do
         !$OMP end parallel do
         buf = obj%CRS_index_I

         !$OMP parallel do private(i)
         do i = nrhs - 1, 2, -1
            obj%CRS_index_I(i) = sum(buf(1:i - 1))
         end do
         !$OMP end parallel do

         obj%CRS_index_I(1) = 0
         obj%CRS_index_I(nrhs) = size(obj%val)
      else
         do i = 1, size(obj%index_I)
            obj%CRS_index_I(obj%index_I(i)) = obj%CRS_index_I(obj%index_I(i)) + 1
         end do
         buf = obj%CRS_index_I

         do i = nrhs - 1, 2, -1
            obj%CRS_index_I(i) = sum(buf(1:i - 1))
         end do

         obj%CRS_index_I(1) = 0
         obj%CRS_index_I(nrhs) = size(obj%val)
      end if
   end subroutine
! #######################################################
   function matmulCRSLinearSolver(obj, openMP) result(mm)
      class(LinearSolver_), intent(inout) :: obj
      real(real64), allocatable :: mm(:)
      integer(int32) :: i, j
      logical, optional, intent(in) :: openMP
      logical :: omp_swich
      omp_swich = input(default=.false., option=OpenMP)

      mm = zeros(size(obj%b))
      ! Notice :: CRS format data should be created and sorted.

      if (omp_swich) then
         !$OMP parallel do private(i)
         do i = 1, size(obj%b)
            do j = obj%CRS_Index_I(i) + 1, obj%CRS_Index_I(i + 1)
               mm(i) = mm(i) + obj%b(obj%Index_J(j))*obj%val(j)
            end do
         end do
         !$OMP end parallel do
      else
         do i = 1, size(obj%b)
            do j = obj%CRS_Index_I(i) + 1, obj%CRS_Index_I(i + 1)
               mm(i) = mm(i) + obj%b(obj%Index_J(j))*obj%val(j)
            end do
         end do
      end if
   end function

! #######################################################
   function matmulCOOLinearSolver(obj, OpenMP) result(mm)
      class(LinearSolver_), intent(inout) :: obj
      real(real64), allocatable :: mm(:)
      integer(int32) :: i, j
      logical, optional, intent(in) :: openMP
      logical :: omp_swich
      omp_swich = input(default=.false., option=OpenMP)

      mm = zeros(size(obj%b))
      ! Notice :: CRS format data should be created and sorted.
      if (omp_swich) then
         !$OMP parallel do private(i)
         do i = 1, size(obj%val)
            mm(obj%index_I(i)) = mm(obj%index_I(i)) + obj%val(i)*obj%b(obj%index_J(i))
         end do
         !$OMP end parallel do
      else
         do i = 1, size(obj%val)
            mm(obj%index_I(i)) = mm(obj%index_I(i)) + obj%val(i)*obj%b(obj%index_J(i))
         end do
      end if
   end function
! #######################################################

! #######################################################
   subroutine getCOOFormatLinearSolver(obj)
      class(LinearSolver_), intent(inout) :: obj
      integer(int32) :: i, j, n

      if (.not. allocated(obj%val)) then
         if (allocated(obj%a) .and. allocated(obj%b)) then
            ! it only has obj%a and obj%b
            ! so, convert it to COO format
            ! count non-zero values
            n = 0
            do j = 1, size(obj%a, 2)
               do i = 1, size(obj%a, 1)
                  if (obj%a(i, j) /= 0.0d0) then
                     n = n + 1
                  end if
               end do
            end do

            obj%val = zeros(n)
            obj%index_I = int(zeros(n))
            obj%index_J = int(zeros(n))

            obj%row_domain_id = int(zeros(n))
            obj%column_domain_id = int(zeros(n))
            obj%b_Index_J = int(zeros(size(obj%b)))
            obj%b_Domain_ID = int(zeros(size(obj%b)))

            obj%row_domain_id(:) = 1
            obj%column_domain_id(:) = 1
            obj%b_domain_id(:) = 1
            do i = 1, size(obj%b)
               obj%b_Index_J(i) = i
            end do

            n = 0
            do j = 1, size(obj%a, 2)
               do i = 1, size(obj%a, 1)
                  if (obj%a(i, j) /= 0.0d0) then
                     n = n + 1
                     obj%val(n) = obj%a(i, j)
                     obj%index_I(n) = i
                     obj%index_J(n) = j
                  end if
               end do
            end do

         end if
      end if

   end subroutine
! #######################################################

! #######################################################
   subroutine exportAsCOOLinearSolver(obj, name)
      class(LinearSolver_), intent(inout) :: obj
      character(*), intent(in) :: name
      type(IO_) :: f

      call f%open(name, "w")
      call f%write(obj%Index_I, obj%Index_J, obj%val)
      call f%close()

   end subroutine
! #######################################################

! #######################################################
   subroutine exportRHSLinearSolver(obj, name)
      class(LinearSolver_), intent(inout) :: obj
      character(*), intent(in) :: name
      type(IO_) :: f

      call f%open(name, "w")
      call f%write(obj%b)
      call f%close()

   end subroutine
! #######################################################

! #######################################################
! FITTING by Stochastic Gradient Descend
   function fit(f, training_data, params, eta, error, max_itr, use_ratio, logfile, algorithm) result(fit_params)

      interface
         function f(x, params) result(ret)
            use iso_fortran_env
            real(real64), intent(in) :: x
            real(real64), intent(in) :: params(:)
            real(real64) :: ret
         end function
      end interface

      real(real64), intent(in) :: training_data(:, :) !(sample, {input=1, output=2})
      real(real64), intent(in) :: params(:), eta
      real(real64), optional, intent(inout) :: error
      real(real64), optional, intent(in) :: use_ratio
      real(real64), allocatable :: grad_params(:), dp(:)
      integer(int32), optional, intent(in) :: max_itr
      character(*), optional, intent(in) :: logfile
      real(real64), allocatable :: fit_params(:)
      real(real64) :: error_function, error_function_b, error_function_f, err_0, h
      integer(int32) :: i, j, n, trial, shuffle_id, param_id
      real(real64) :: eps_val = dble(1.0e-4)
      real(real64) :: tol = dble(1.0e-9)
      real(real64) :: sgd_use_ratio
      character(*), optional, intent(in) :: algorithm
      character(:), allocatable :: algorithm_type

      type(IO_) :: log_file

      real(real64) :: eta_zero
      real(real64) :: eta_tr
      integer(int32):: max_trial = 1000000
      integer(int32) :: num_select !
      type(Random_) :: random
      integer(int32), allocatable :: selected_data_ids(:)

      if (present(algorithm)) then
         algorithm_type = algorithm
      else
         algorithm_type = "SGD"
      end if

      if (algorithm_type == "SGD") then

         sgd_use_ratio = 0.050d0
         if (present(use_ratio)) then
            sgd_use_ratio = use_ratio
         end if

         num_select = int(dble(size(training_data, 1))*sgd_use_ratio) + 1

         selected_data_ids = zeros(num_select)

         if (present(max_itr)) then
            max_trial = max_itr
         end if
         if (present(logfile)) then
            call log_file%open(logfile, "w")
            call log_file%write("# itr    error-norm")
         end if

         eta_zero = eta
         fit_params = params
         error_function = 0.0d0
         do i = 1, size(training_data, 1)
            error_function = error_function + (f(x=training_data(i, 1), params=fit_params) - training_data(i, 2))**2
         end do

         ! compute grad for each params
         ! by Stochastic Gradient Descend
         do trial = 1, max_trial
            error_function = 0.0d0
            !$OMP parallel default(shared)
            !$OMP do reduction(+:error_function)
            do i = 1, size(training_data, 1)
               error_function = error_function + (f(x=training_data(i, 1), params=fit_params) - training_data(i, 2))**2
            end do
            !$OMP end do
            !$OMP end parallel

            if (trial == 1) then
               err_0 = error_function
            end if

            if (present(error)) then
               error = error_function/err_0
            end if

            if (error_function/err_0 <= tol) then

               if (present(logfile)) then
                  call log_file%close()
               end if
               exit
            end if

            if (trial == max_trial) then

               if (present(logfile)) then
                  call log_file%close()
               end if
               !print *, "ERROR: fit() > SGD did not converge. "
               return
            end if

            ! Robbins-Monro Method
            eta_tr = eta_zero/dble(trial)

            ! shuffle training data and select 1
            shuffle_id = random%randint(from=1, to=size(training_data, 1))
            do i = 1, num_select
               selected_data_ids(i) = random%randint(from=1, to=size(training_data, 1))
            end do

            ! compute grad
            grad_params = zeros(size(params))
            !$OMP parallel private(shuffle_id,dp,error_function_f,error_function_b)
            !$OMP do reduction(+:grad_params)
            do j = 1, num_select
               shuffle_id = selected_data_ids(j)
               dp = zeros(size(params))
               do param_id = 1, size(params)
                  dp(:) = 0.0d0
                  dp(param_id) = eps_val

                  error_function_f = 0.0d0
                  do i = 1, size(training_data, 1)
                     error_function_f = error_function_f + (f(x=training_data(shuffle_id, 1), params=fit_params + dp) &
                                                            - training_data(shuffle_id, 2))**2
                  end do

                  dp(param_id) = -eps_val

                  error_function_b = 0.0d0
                  do i = 1, size(training_data, 1)
                     error_function_b = error_function_b + (f(x=training_data(shuffle_id, 1), params=fit_params + dp) &
                                                            - training_data(shuffle_id, 2))**2
                  end do
                  ! numerical derivative
                  grad_params(param_id) = grad_params(param_id) + &
                                          (error_function_f - error_function_b)/(2.0d0*eps_val)
               end do
            end do
            !$OMP end do
            !$OMP end parallel

            ! average_gradient
            grad_params = grad_params/dble(num_select)
            !grad_params = grad_params/norm(grad_params)

            fit_params = fit_params - eta_tr*grad_params

            if (norm(grad_params) <= tol) exit
            grad_params = 0.0d0

            if (present(logfile)) then
               write (log_file%fh, *) trial, error
            end if
         end do

         if (present(logfile)) then
            call log_file%close()
         end if

      else
         print *, "[ERROR] FIT > no algorithm was found:: ", algorithm_type
      end if

   end function

   function LOBPCG_sparse(A_val, A_col, A_rowptr, lambda_min, tolerance) result(eigen_vectors)
      real(real64), intent(in) :: A_val(:)
      integer(int32), intent(in)::A_col(:)
      integer(int64), intent(in)::A_rowptr(:)
      real(real64), allocatable :: eigen_vectors(:, :)
      real(real64), allocatable :: V(:, :), A_(:, :), xtemp(:, :), &
                                   lambda_and_x(:, :), lambda(:), bmin(:), x(:, :), r(:, :), x0s(:, :), p(:, :), w(:), &
                                   alpha, beta, gamma, XX(:), WW(:), PP(:), SA(:, :), SB(:, :), w_x_p(:, :), WW_XX_PP(:, :), &
                                   SB_inv(:, :), x_(:, :), lambda_mat(:, :), lambda_ids(:), Bm(:, :), residual(:), norms(:)
      integer(int32) :: num_eigen
      real(real64), optional, intent(in) :: tolerance
      real(real64) :: tol = dble(1.0e-6)
      real(real64) :: mu, normval
      real(real64), intent(inout) :: lambda_min(:)
      type(Random_) :: random
      integer(int32) :: i, j, n, id, itr
      integer(int32) :: m

      tol = input(default=dble(1.0e-14), option=tolerance)
      num_eigen = size(lambda_min, 1)
      n = size(A_rowptr, 1) - 1

      if (num_eigen*3 >= n) then
         print *, "ERROR :: num_eigen*3 should be < n"
         allocate (eigen_vectors(0, 0))
         return
      end if
      !m = 3*num_eigen
      residual = zeros(num_eigen)
      !https://www.researchgate.net/figure/Algorithm-of-LOBPCG-method-for-matrix-A-Here-the-matrix-T-is-a-preconditioner_fig1_323863889
      !    !x0s = random%randn(n,3)
      !initial guess x0

      lambda_min = 0.0d0

      x = zeros(n, num_eigen)
      x = (random%randn(n, num_eigen) - 0.50d0)*2.0d0

      x = x*10.0d0
      do i = 1, size(x, 1)
         do j = 1, size(x, 2)
            if (abs(x(i, j)) < 1.0d0) then
               x(i, j) = x(i, j)/abs(x(i, j))
            end if
         end do
      end do
      itr = 0

      do
         itr = itr + 1

         if (itr == 1) then

            ! step 1 to make initial values
            if (num_eigen /= 1) then
               xtemp = X

               call gram_real(n, num_eigen, Xtemp, X)

               ! [ok] X(:,:) :: ok!
               A_ = matmul(transpose(X), crs_matmul(A_val, A_col, A_rowptr, X))

               A_ = 0.50d0*A_ + 0.50d0*transpose(A_)

               ! [ok] A_(:,:) has correct size
               call eigenValueAndVector(A=A_, lambda=lambda, x=x_, tol=tol*dble(1.0e-3))

               lambda_mat = zeros(num_eigen, num_eigen)
               lambda_mat(:, :) = 0.0d0
               do i = 1, size(lambda_mat, 1)
                  lambda_mat(i, i) = lambda(i)
               end do
               ! [ok] lambda_mat

               ! [ok] A_ X = X \Lambda

               ! X1 = X0 B
               x = matmul(x, x_)
               ![ok] size and content of R
               R = crs_matmul(A_val, A_col, A_rowptr, X) - matmul(X, lambda_mat)
            else
               mu = dot_product(x(:, 1), crs_matvec(A_val, A_col, A_rowptr, x(:, 1)))/dot_product(x(:, 1), x(:, 1))
               R = crs_matmul(A_val, A_col, A_rowptr, X) - mu*X
            end if

            ! 2m次元固有値問題
            V = zeros(n, 2*num_eigen)
            V(:, 1:num_eigen) = X
            V(:, num_eigen + 1:) = R
            ! [ok]直交化
            xtemp = V
            call gram_real(n, 2*num_eigen, xtemp, V)
            X = V(:, 1:num_eigen)
            R = V(:, num_eigen + 1:)

            A_ = matmul(transpose(V), crs_matmul(A_val, A_col, A_rowptr, V))
            ! try

            A_ = 0.50d0*A_ + 0.50d0*transpose(A_)
            ![ok] size(A_,1) & size(A_,2)

            !2m 次元固有値問題

            call eigenValueAndVector(A=A_, lambda=lambda, x=x_, tol=dble(1.0e-14))

            ![ok] 下から m 個の固有値と固有ベクトルからなる行列:
            lambda_ids = linspace([1.0d0, dble(2*num_eigen)], 2*num_eigen)
            call heapsort(n=2*num_eigen, array=lambda, val=lambda_ids)
            ![ok] sort

            lambda_mat = zeros(num_eigen, num_eigen)
            do i = 1, num_eigen
               lambda_mat(i, i) = lambda(i)
            end do

            ! [ok] check eigen values

            Bm = zeros(2*num_eigen, num_eigen)
            do i = 1, num_eigen
               Bm(:, i) = x_(:, int(lambda_ids(i)))
            end do
            ! X2, R2, P2
            ! V = {X, R}
            X = matmul(V, Bm)

            !V = zeros(n,num_eigen*2 )
            V(:, 1:num_eigen) = 0.0d0

            V(:, num_eigen + 1:2*num_eigen) = r(:, :)
            ! R=R1 orR=R2?
            ! 今はR1と仮定
            P = matmul(V, Bm)

            R = crs_matmul(A_val, A_col, A_rowptr, X) - matmul(X, lambda_mat)

            !V(:,1:num_eigen) =0.0d0
            !V(:,num_eigen+1:2*num_eigen)  = r(:,:)
            ! R=R1 orR=R2?
            ! 今はR2と仮定
            !P = matmul(V,Bm)

            do i = 1, num_eigen
               Residual(i) = norm(R(:, i))
            end do
            call print("residual: "+str(maxval(Residual)))
            cycle

         else
            V = zeros(n, 3*num_eigen)

            V(:, 1:num_eigen) = x(:, :)
            V(:, num_eigen + 1:2*num_eigen) = r(:, :)
            V(:, 2*num_eigen + 1:) = p(:, :)

            xtemp = V
            call gram_real(n, 3*num_eigen, xtemp, V)

            x(:, :) = V(:, 1:num_eigen)
            r(:, :) = V(:, num_eigen + 1:2*num_eigen)
            p(:, :) = V(:, 2*num_eigen + 1:)

            A_ = matmul(transpose(V), crs_matmul(A_val, A_col, A_rowptr, V))! try

            A_ = 0.50d0*A_ + 0.50d0*transpose(A_)

            x_ = zeros(3*num_eigen, 3*num_eigen)

            call eigenValueAndVector(A=A_, lambda=lambda, x=x_, tol=dble(1.0e-14))

            lambda_ids = linspace([1.0d0, dble(3*num_eigen)], 3*num_eigen)
            lambda_mat = zeros(num_eigen, num_eigen)

            call heapsort(n=3*num_eigen, array=lambda, val=lambda_ids)
            ![ok] sort of lambda

            do i = 1, num_eigen
               lambda_mat(i, i) = lambda(i)
            end do

            Bm = zeros(3*num_eigen, num_eigen)
            do i = 1, num_eigen
               Bm(:, i) = x_(:, int(lambda_ids(i)))
            end do

            X = matmul(V, Bm)

            !V = zeros(n,3*num_eigen )
            V(:, 1:num_eigen) = 0.0d0
            P = matmul(V, Bm)
            !V(:,num_eigen+1:2*num_eigen)  = r(:,:)
            !V(:,2*num_eigen+1:)  = p(:,:)

            R = crs_matmul(A_val, A_col, A_rowptr, X) - matmul(X, Lambda_mat)

            do i = 1, num_eigen
               Residual(i) = norm(R(:, i))
            end do

            print *, "info: ", itr, maxval(residual), minval(residual)

            if (maxval(Residual) < tol) then
               exit
            end if

         end if
         !あとは収束判定
         if (itr == 20000) then
            print *, "ERROR :: LOBPCG >> did not converge!"
            exit
         end if
      end do
      print *, "residual:", maxval(residual)
      print *, "itr=", itr

      do i = 1, num_eigen
         lambda_min(i) = lambda(i)
      end do
      eigen_vectors = x

   end function
!! ###################################################################

! ###################################################################
   function LOBPCG_dense(A, B, lambda_min) result(eigen_vectors)
      real(real64), intent(in) :: A(:, :), B(:, :)
      real(real64), allocatable :: eigen_vectors(:, :)
      real(real64), allocatable :: V(:, :), A_(:, :), xtemp(:, :), &
                                   lambda_and_x(:, :), lambda(:), bmin(:), x(:, :), r(:, :), x0s(:, :), p(:, :), w(:), &
                                   alpha, beta, gamma, XX(:), WW(:), PP(:), SA(:, :), SB(:, :), w_x_p(:, :), WW_XX_PP(:, :), &
                                   SB_inv(:, :), x_(:, :), lambda_mat(:, :), lambda_ids(:), Bm(:, :), residual(:), norms(:)
      integer(int32) :: num_eigen

      real(real64) :: tol = dble(1.0e-14)
      real(real64) :: mu, normval
      real(real64), intent(inout) :: lambda_min(:)
      type(Random_) :: random
      integer(int32) :: i, j, n, id, itr
      integer(int32) :: m

      num_eigen = size(lambda_min, 1)
      n = size(A, 1)

      if (num_eigen*3 >= n) then
         print *, "ERROR :: num_eigen*3 should be < n"
         allocate (eigen_vectors(0, 0))
         return
      end if
      !m = 3*num_eigen
      residual = zeros(num_eigen)
      !https://www.researchgate.net/figure/Algorithm-of-LOBPCG-method-for-matrix-A-Here-the-matrix-T-is-a-preconditioner_fig1_323863889
      !    !x0s = random%randn(n,3)
      !initial guess x0

      lambda_min = 0.0d0

      n = size(A, 1)

      x = zeros(n, num_eigen)
      x = (random%randn(n, num_eigen) - 0.50d0)*2.0d0

      x = x*10.0d0
      do i = 1, size(x, 1)
         do j = 1, size(x, 2)
            if (abs(x(i, j)) < 1.0d0) then
               x(i, j) = x(i, j)/abs(x(i, j))
            end if
         end do
      end do
      itr = 0

      do
         itr = itr + 1

         if (itr == 1) then

            ! step 1 to make initial values
            if (num_eigen /= 1) then
               xtemp = X
               call gram_real(n, num_eigen, Xtemp, X)

               ! [ok] X(:,:) :: ok!
               A_ = matmul(transpose(X), matmul(A, X))

               A_ = 0.50d0*A_ + 0.50d0*transpose(A_)

               ! [ok] A_(:,:) has correct size
               call eigenValueAndVector(A=A_, lambda=lambda, x=x_, tol=tol)

               lambda_mat = zeros(num_eigen, num_eigen)
               lambda_mat(:, :) = 0.0d0
               do i = 1, size(lambda_mat, 1)
                  lambda_mat(i, i) = lambda(i)
               end do
               ! [ok] lambda_mat

               ! [ok] A_ X = X \Lambda

               ! X1 = X0 B
               x = matmul(x, x_)
               ![ok] size and content of R
               R = matmul(A, X) - matmul(X, lambda_mat)
            else
               mu = dot_product(x(:, 1), matmul(A(:, :), x(:, 1)))/dot_product(x(:, 1), x(:, 1))
               R = matmul(A, X) - mu*X
            end if

            ! 2m次元固有値問題
            V = zeros(n, 2*num_eigen)
            V(:, 1:num_eigen) = X
            V(:, num_eigen + 1:) = R
            ! [ok]直交化
            xtemp = V
            call gram_real(n, 2*num_eigen, xtemp, V)
            X = V(:, 1:num_eigen)
            R = V(:, num_eigen + 1:)

            A_ = matmul(transpose(V), matmul(A, V))
            ! try

            A_ = 0.50d0*A_ + 0.50d0*transpose(A_)
            ![ok] size(A_,1) & size(A_,2)

            !2m 次元固有値問題

            call eigenValueAndVector(A=A_, lambda=lambda, x=x_, tol=tol)

            ![ok] 下から m 個の固有値と固有ベクトルからなる行列:
            lambda_ids = linspace([1.0d0, dble(2*num_eigen)], 2*num_eigen)
            call heapsort(n=2*num_eigen, array=lambda, val=lambda_ids)
            ![ok] sort

            lambda_mat = zeros(num_eigen, num_eigen)
            do i = 1, num_eigen
               lambda_mat(i, i) = lambda(i)
            end do

            ! [ok] check eigen values

            Bm = zeros(2*num_eigen, num_eigen)
            do i = 1, num_eigen
               Bm(:, i) = x_(:, int(lambda_ids(i)))
            end do
            ! X2, R2, P2
            ! V = {X, R}
            X = matmul(V, Bm)

            !V = zeros(n,num_eigen*2 )
            V(:, 1:num_eigen) = 0.0d0

            V(:, num_eigen + 1:2*num_eigen) = r(:, :)
            ! R=R1 orR=R2?
            ! 今はR1と仮定
            P = matmul(V, Bm)

            R = matmul(A, X) - matmul(X, lambda_mat)

            !V(:,1:num_eigen) =0.0d0
            !V(:,num_eigen+1:2*num_eigen)  = r(:,:)
            ! R=R1 orR=R2?
            ! 今はR2と仮定
            !P = matmul(V,Bm)

            do i = 1, num_eigen
               Residual(i) = norm(R(:, i))
            end do
            call print("residual: "+str(maxval(Residual)))
            cycle

         else
            V = zeros(n, 3*num_eigen)

            V(:, 1:num_eigen) = x(:, :)
            V(:, num_eigen + 1:2*num_eigen) = r(:, :)
            V(:, 2*num_eigen + 1:) = p(:, :)

            xtemp = V
            call gram_real(n, 3*num_eigen, xtemp, V)

            x(:, :) = V(:, 1:num_eigen)
            r(:, :) = V(:, num_eigen + 1:2*num_eigen)
            p(:, :) = V(:, 2*num_eigen + 1:)

            A_ = matmul(transpose(V), matmul(A, V))! try

            A_ = 0.50d0*A_ + 0.50d0*transpose(A_)

            x_ = zeros(3*num_eigen, 3*num_eigen)

            call eigenValueAndVector(A=A_, lambda=lambda, x=x_, tol=tol)

            lambda_ids = linspace([1.0d0, dble(3*num_eigen)], 3*num_eigen)
            lambda_mat = zeros(num_eigen, num_eigen)

            call heapsort(n=3*num_eigen, array=lambda, val=lambda_ids)
            ![ok] sort of lambda

            do i = 1, num_eigen
               lambda_mat(i, i) = lambda(i)
            end do

            Bm = zeros(3*num_eigen, num_eigen)
            do i = 1, num_eigen
               Bm(:, i) = x_(:, int(lambda_ids(i)))
            end do

            X = matmul(V, Bm)

            !V = zeros(n,3*num_eigen )
            V(:, 1:num_eigen) = 0.0d0
            P = matmul(V, Bm)
            !V(:,num_eigen+1:2*num_eigen)  = r(:,:)
            !V(:,2*num_eigen+1:)  = p(:,:)

            R = matmul(A, X) - matmul(X, Lambda_mat)

            do i = 1, num_eigen
               Residual(i) = norm(R(:, i))
            end do

            if (maxval(Residual) < tol) then
               exit
            end if

         end if
         !あとは収束判定
         if (itr == 20000) then
            print *, "ERROR :: LOBPCG >> did not converge!"
            exit
         end if
      end do
      print *, "residual:", maxval(residual)
      print *, "itr=", itr

      do i = 1, num_eigen
         lambda_min(i) = lambda(i)
      end do
      eigen_vectors = x

   end function
!! ###################################################################

   function LOBPCG_dense_single(A, B, lambda_min) result(eigen_vector)
      real(real64), intent(in) :: A(:, :), B(:, :)
      real(real64), allocatable :: eigen_vector(:)
      real(real64), allocatable :: x0(:), r0(:), p0(:), V(:, :), A_(:, :), xtemp(:, :), &
                                   lambda_and_x(:, :), lambda(:), bmin(:), x(:), r(:), x0s(:, :), p(:), w(:), &
                                   alpha, beta, gamma, XX(:), WW(:), PP(:), SA(:, :), SB(:, :), w_x_p(:, :), WW_XX_PP(:, :), &
                                   SB_inv(:, :), x_(:, :)
      real(real64) :: tol = dble(1.0e-10)
      real(real64) :: mu
      real(real64), intent(inout) :: lambda_min
      type(Random_) :: random
      integer(int32) :: i, j, n, id, itr
      integer(int32) :: m = 3

      !https://www.researchgate.net/figure/Algorithm-of-LOBPCG-method-for-matrix-A-Here-the-matrix-T-is-a-preconditioner_fig1_323863889
      !    !x0s = random%randn(n,3)
      !initial guess x0
      n = size(A, 1)

      lambda_min = 0.0d0

      n = size(A, 1)
      xtemp = zeros(n, 3)

      eigen_vector = zeros(n)
      !xtemp = x0s
      x = zeros(n)
      do i = 1, n
         x(i) = 1.0d0
      end do
      x = (random%randn(n) - 0.50d0)*2.0d0

      x = x/norm(x)
      mu = dot_product(x, matmul(A, x))/dot_product(x, x)

      r = matmul(A, x) - mu*x
      !r = r/norm(r)
      p = zeros(n)

      V = zeros(n, 3)
      V(:, 1) = x(:)
      V(:, 2) = r(:)
      V(:, 3) = p(:)
      xtemp = V

      call gram_real(n, 2, xtemp, v)
      x(:) = V(:, 1)
      r(:) = V(:, 2)

      V = zeros(n, 3)
      V(:, 1) = x(:)
      V(:, 2) = r(:)
      V(:, 3) = p(:)
      !

      itr = 0
      do
         itr = itr + 1

         A_ = matmul(transpose(V), matmul(A, V))
         A_ = 0.50d0*A_ + 0.50d0*transpose(A_)

         !lambda = eigenValue(A_,tol=tol)

         call eigenValueAndVector(A=A_, lambda=lambda, x=x_, tol=tol)

         ! 固有ベクトルを正規化

         do i = 1, size(x_, 2)
            x_(:, i) = x_(:, i)/norm(x_(:, i))
         end do
         if (itr == 1) then
            id = minvalID(lambda(1:2))
         else
            id = minvalID(lambda(1:3))
         end if
         print *, "---check---"

         ! 最小固有ベクトル
         bmin = x_(:, id)

         x = matmul(V, bmin)

         mu = lambda(id)
         r = matmul(A, x) - mu*x(:)

         p = zeros(n)
         p(:) = V(:, 2)*bmin(2) + V(:, 3)*bmin(3)

         if (norm(r) < tol .and. norm(x) /= 0.0d0) exit

         print *, "error", norm(r)

         ! 直交化
         V = zeros(n, 3)
         V(:, 1) = x(:)
         V(:, 2) = r(:)
         V(:, 3) = p(:)
         call gram_real(n, 3, V, xtemp)
         x(:) = xtemp(:, 1)
         r(:) = xtemp(:, 2)
         p(:) = xtemp(:, 3)
         V(:, 1) = x(:)
         V(:, 2) = r(:)
         V(:, 3) = p(:)

         !print *, dot_product(x,r)
         !print *, dot_product(r,p)
         !print *, dot_product(x,p)

         !x = x/norm(x)
         !r = r/norm(r)
         !p = p/norm(p)

         !call print(xtemp)
         !stop
         if (itr == 10000) then
            print *, "ERROR :: LOBPCG >> did not converge!"
            exit
         end if
      end do
      print *, "residual:", norm(r)
      print *, "itr=", itr

      lambda_min = lambda(id)
      eigen_vector = x

   end function
!! ###################################################################
   subroutine gram_real(m, n, mat_v, mat_v_out)
      ! cited from
      !http://park.itc.u-tokyo.ac.jp/kato-yusuke-lab/nagai/note_141009_LOBPCG.pdf
      implicit none
      ! m : ベクトルの次元
      ! n : ベクトルの本数
      integer, intent(in)::m, n
      real(8), intent(in)::mat_v(1:m, 1:n)
      real(8), intent(out)::mat_v_out(1:m, 1:n)
      integer::i, j
      real(8)::v(1:m), nai, vi(1:m), viold(1:m)
      real(8)::norm
      mat_v_out = mat_v
      do i = 1, n
         viold = mat_v_out(:, i)
         do j = 1, i - 1
            nai = dot_product(mat_v_out(1:m, j), viold(1:m))
            vi = viold - nai*mat_v_out(:, j)
            viold = vi
         end do
         norm = sqrt(dble(dot_product(viold, viold)))
         if (norm == 0.0d0) then
            ! DEBUG Right?
            print *, "ERROR gram_real :: norm-zero"
            mat_v_out(j, j) = 1.0d0
            !
            !stop

            !mat_v_out(:,j) = viold
         else
            mat_v_out(:, j) = viold/norm
         end if
      end do
      return
   end subroutine gram_real

!function LOBPCG(A,B,num_eigens,err) result(eigens)
!    type(FEMSolver_),intent(in) :: A, B
!    complex(real64),allocatable :: eigens(:,:)
!    integer(int32),intent(in) :: num_eigens
!    real(real64),optional,intent(in) :: err
!    real(real64) :: error_tolerance
!    integer(int32) :: i,j,m,nx,nv
!
!    ! Locally Optimal Block Preconditioned Conjugate Gradient (LOBPCG)
!    ! https://arxiv.org/pdf/1704.07458.pdf
!
!
!    if(.not.allocated(A%CRS_val) )then
!        print *, "[ERROR] eigFEMSolver >> (.not.allocated(this%CRS_val) )"
!        return
!    endif
!
!
!    error_tolerance = input(default=dble(1.0e-14),option=err)
!
!    m = size(A%CRS_Index_Row)-1
!    nx= num_eigens
!    allocate(eigens(0:num_eigens,1:m))
!    !eigens(0,1:m) :: eigen values
!    !eigens(1:nx,1:m) :: eigen vectors from 1st to nx-th
!    eigens(:,:) = 0.0d0
!
!
!
!
!
!end function
! ###################################################################

!subroutine RayleighRitz(S,C,Theta)
!    complex(real64),intent(in) :: S(:,:)
!    complex(real64),allocatable,intent(out) :: C(:,:),Theta(:,:)
!    complex(real64),allocatable :: D(:,:)
!
!    !Rayleigh-Ritz procedure
!    D = matmul( transpose(S), )
!
!end subroutine
   subroutine to_Dense(CRS_val, CRS_col, CRS_rowptr, DenseMatrix)

      real(real64), allocatable, intent(inout) :: DenseMatrix(:, :)
      real(real64), allocatable, intent(in) :: CRS_val(:)
      integer(int32), allocatable, intent(in) :: CRS_col(:)
      integer(int64), allocatable, intent(in) :: CRS_rowptr(:)

      integer(int64) :: nonzero_count, i, j, k, n

      n = size(CRS_rowptr) - 1

      allocate (DenseMatrix(n, n))
      DenseMatrix(:, :) = 0.0d0

      do i = 1, size(CRS_rowptr) - 1
         do j = CRS_rowptr(i), CRS_rowptr(i + 1) - 1

            DenseMatrix(i, CRS_col(j)) = CRS_val(j)
         end do
      end do
   end subroutine

!subroutine to_CRS(DenseMatrix,CRS_val,CRS_col,CRS_rowptr)
!  real(real64),intent(in) :: DenseMatrix(:,:)
!  real(real64),allocatable,intent(inout) :: CRS_val(:)
!  integer(int32),allocatable,intent(inout) :: CRS_col(:)
!  integer(int64),allocatable,intent(inout) :: CRS_rowptr(:)
!
!  integer(int64) :: nonzero_count,i,j,k,n
!
!  nonzero_count = 0
!  do i=1,size(DenseMatrix,1)
!      do j=1,size(DenseMatrix,2)
!          if(DenseMatrix(i,j)/=0.0d0 ) then
!              nonzero_count = nonzero_count + 1
!          endif
!      enddo
!  enddo
!  CRS_val = zeros(nonzero_count)
!  CRS_col = int(linspace([1.0d0,dble(nonzero_count)], nonzero_count))
!  CRS_rowptr = int(zeros(size(DenseMatrix,1)+1) )
!
!  nonzero_count = 0
!  do i=1,size(DenseMatrix,1)
!      do j=1,size(DenseMatrix,2)
!          if(DenseMatrix(i,j)/=0.0d0 ) then
!              nonzero_count = nonzero_count + 1
!              CRS_val(nonzero_count) = DenseMatrix(i,j)
!              CRS_col(nonzero_count) = j
!              CRS_rowptr(i) = CRS_rowptr(i) + 1
!          endif
!      enddo
!  enddo
!
!  n = CRS_rowptr(1)
!  !CRS_rowptr(1) =0
!  do i=1,size(CRS_rowptr)-1
!      CRS_rowptr(i+1) = CRS_rowptr(i+1) + CRS_rowptr(i)
!  enddo
!  do i=size(CRS_rowptr)-1,1,-1
!      CRS_rowptr(i+1) = CRS_rowptr(i)
!  enddo
!  do i=size(CRS_rowptr),1,-1
!      CRS_rowptr(i) = CRS_rowptr(i) +1
!  enddo
!  CRS_rowptr(1) = 1
!
!
!end subroutine

!
   function crs_matmul_generic(CRS_value, CRS_col, CRS_row_ptr, old_vectors) result(new_vectors)
      real(real64), intent(in)  :: CRS_value(:), Old_vectors(:, :)
      integeR(int32), intent(in):: CRS_col(:)
      integeR(int64), intent(in):: CRS_row_ptr(:)

      real(real64), allocatable :: new_vectors(:, :)

      integer(int64) :: i, j, n, gid, lid, row, CRS_id, col, m
      !> x_i = A_ij b_j

      n = size(CRS_row_ptr) - 1
      m = size(old_vectors, 2)
      if (size(old_vectors, 1) /= n) then
         print *, "ERROR crs_matmul :: inconsistent size for old_vectors"
         return
      end if

      new_vectors = zeros(n, m)
  !!$OMP parallel do default(shared) private(CRS_id,col)
      do row = 1, n
         do CRS_id = CRS_row_ptr(row), CRS_row_ptr(row + 1) - 1
            col = CRS_col(CRS_id)
            do j = 1, m
              !!$OMP atomic
               new_vectors(row, j) = new_vectors(row, j) &
                                     + CRS_value(CRS_id)*old_vectors(col, j)
            end do
         end do
      end do
  !!$OMP end parallel do

   end function
! ###################################################################

   function crs_matmul_for_CRStype(CRS, old_vectors) result(new_vectors)
      type(CRS_), intent(in) :: CRS
      real(real64), intent(in)  :: Old_vectors(:, :)
      real(real64), allocatable :: new_vectors(:, :)

      new_vectors = crs_matmul_generic( &
                    CRS_value=CRS%val, &
                    CRS_col=CRS%col_idx, &
                    CRS_row_ptr=CRS%row_ptr, &
                    old_vectors=old_vectors)

   end function

   function crs_matvec_generic(CRS_value, CRS_col, CRS_row_ptr, old_vector) result(new_vector)
      real(real64), intent(in)  :: CRS_value(:), Old_vector(:)
      integeR(int32), intent(in):: CRS_col(:)
      integeR(int64), intent(in):: CRS_row_ptr(:)

      real(real64), allocatable :: new_vector(:)
      integer(int64) :: i, j, n, gid, lid, row, CRS_id, col
      !> x_i = A_ij b_j

      n = size(CRS_row_ptr) - 1
      if (size(old_vector) /= n) then
         print *, "ERROR crs_matvec :: inconsistent size for old_vector"
         return
      end if

      new_vector = zeros(n)
      !$OMP parallel do default(shared) private(CRS_id,col)
      do row = 1, n
         do CRS_id = CRS_row_ptr(row), CRS_row_ptr(row + 1) - 1
            col = CRS_col(CRS_id)
            !$OMP atomic
            new_vector(row) = new_vector(row) + CRS_value(CRS_id)*old_vector(col)
         end do
      end do
      !$OMP end parallel do

   end function
! ###################################################################

   subroutine sub_crs_matvec_generic(CRS_value, CRS_col, CRS_row_ptr, old_vector, new_vector, precondition)
      real(real64), intent(in)  :: CRS_value(:), Old_vector(:)
      integeR(int32), intent(in):: CRS_col(:)
      integeR(int64), intent(in):: CRS_row_ptr(:)
      type(CRS_), optional, intent(in) :: precondition
      real(real64), allocatable, intent(inout) :: new_vector(:)
      real(real64), allocatable :: precon_old_vector(:)
      integer(int64) :: i, j, n, gid, lid, row, CRS_id, col
      !> x_i = A_ij b_j

      n = size(CRS_row_ptr) - 1
      if (size(old_vector) /= n) then
         print *, "ERROR crs_matvec :: inconsistent size for old_vector"
         return
      end if

      if (.not. allocated(new_vector)) then
         new_vector = zeros(n)
      else
         new_vector(:) = 0.0d0
      end if

      if (present(precondition)) then
         call precondition%ILU_matvec(old_vector=old_vector, new_vector=precon_old_vector)

         !$OMP parallel do default(shared) private(CRS_id,col)
         do row = 1, n
            do CRS_id = CRS_row_ptr(row), CRS_row_ptr(row + 1) - 1
               col = CRS_col(CRS_id)
               !$OMP atomic
               new_vector(row) = new_vector(row) + CRS_value(CRS_id)*precon_old_vector(col)
            end do
         end do
         !$OMP end parallel do
      else

         !$OMP parallel do default(shared) private(CRS_id,col)
         do row = 1, n
            do CRS_id = CRS_row_ptr(row), CRS_row_ptr(row + 1) - 1
               col = CRS_col(CRS_id)
               !$OMP atomic
               new_vector(row) = new_vector(row) + CRS_value(CRS_id)*old_vector(col)
            end do
         end do
         !$OMP end parallel do

      end if

   end subroutine
! ###################################################################

   function crs_matvec_for_CRStype(CRS, old_vector) result(new_vector)
      type(CRS_), intent(in) :: CRS
      real(real64), intent(in)  :: Old_vector(:)
      real(real64), allocatable :: new_vector(:)

      new_vector = crs_matvec_generic( &
                   CRS_value=CRS%val, &
                   CRS_col=CRS%col_idx, &
                   CRS_row_ptr=CRS%row_ptr, &
                   old_vector=old_vector)

   end function

   subroutine sub_crs_matvec_for_CRStype(CRS, old_vector, new_vector, precondition)
      type(CRS_), intent(in) :: CRS
      type(CRS_), optional, intent(in) :: precondition
      real(real64), intent(in)  :: Old_vector(:)
      real(real64), allocatable, intent(inout) :: new_vector(:)

      call sub_crs_matvec_generic( &
         CRS_value=CRS%val, &
         CRS_col=CRS%col_idx, &
         CRS_row_ptr=CRS%row_ptr, &
         old_vector=old_vector, &
         new_vector=new_vector, &
         precondition=precondition)

   end subroutine

   pure function eyesMatrix(rank1, rank2) result(ret)
      integer(int32), intent(in) ::rank1, rank2
      real(real64), allocatable :: ret(:, :)
      integer(int32) :: i, min_rank

      allocate (ret(rank1, rank2))
      ret(:, :) = 0.0d0
      min_rank = minval([rank1, rank2])
      do i = 1, min_rank
         if (rank2 > i) exit
         ret(i, i) = 1.0d0
      end do

   end function
! ###################################################################

   subroutine reduce_crs_matrix(CRS_val, CRS_col, CRS_rowptr, remove_IDs)
      real(real64), allocatable, intent(inout) :: CRS_val(:)
      integer(int32), allocatable, intent(inout) :: CRS_col(:)
      integer(int64), allocatable, intent(inout) :: CRS_rowptr(:)

      real(real64), allocatable :: new_CRS_val(:)
      integer(int32), allocatable :: new_CRS_col(:)
      integer(int64), allocatable :: new_CRS_rowptr(:)

      integer(int32), intent(in) :: remove_IDs(:)
      integer(int32), allocatable :: remove_IDs_sorted(:)
      integer(int32) :: i, old_row_id, old_col_id, old_col_ptr, count_m1
      integer(int32) :: j
      integer(int32) :: new_row_id, new_col_id, new_col_ptr, remove_id_ptr, rem_count, k, l, m
      integer(int32), allocatable :: new_id_from_old_id(:)

      ! remove_ids should be unique and sorted
      remove_IDs_sorted = remove_IDs
      remove_IDs_sorted = unique(remove_IDs_sorted)
      call heapsort(n=size(remove_IDs_sorted), array=remove_IDs_sorted)

      ! allocate
      new_CRS_val = CRS_val
      new_CRS_col = CRS_col
      new_CRS_rowptr = CRS_rowptr

      !new_CRS_val = zeros(size(CRS_val) - size(remove_IDs_sorted) )
      !new_CRS_col = zeros(size(CRS_col) - size(remove_IDs_sorted) )
      !new_CRS_rowptr = zeros(size(CRS_rowptr) - size(remove_IDs_sorted) )
  !!
      new_id_from_old_id = zeros(size(CRS_rowptr) - 1)
      do i = 1, size(CRS_rowptr) - 1
         new_id_from_old_id(i) = i
      end do

      do i = 1, size(remove_ids_sorted)
         new_id_from_old_id(remove_ids_sorted(i) + 1:) = new_id_from_old_id(remove_ids_sorted(i) + 1:) - 1
         new_id_from_old_id(remove_ids_sorted(i)) = -1
      end do

      ! if a id is listed in remove_ids_sorted, ignore

      !
      new_row_id = 0
      new_col_id = 0
      new_col_ptr = 0
      remove_id_ptr = 1

      ! create new_CRS_col
      ! only for column
      ! blanks are indicated by -1
      ! 当該は-1して,前送り
      do i = 1, size(CRS_rowptr) - 1
         do j = CRS_rowptr(i), CRS_rowptr(i + 1) - 1
            old_col_ptr = j
            old_col_id = CRS_col(old_col_ptr)
            new_CRS_col(old_col_ptr) = new_id_from_old_id(old_col_id)
         end do
      end do

      ! 列に-1
      do i = 1, size(remove_IDs_sorted)
         do j = CRS_rowptr(remove_IDs_sorted(i)), CRS_rowptr(remove_IDs_sorted(i) + 1) - 1
            old_col_ptr = j
            new_CRS_col(old_col_ptr) = -1
         end do
      end do

      ! renew row_ptr
      new_CRS_rowptr = 0.0d0
      do i = 1, size(CRS_rowptr) - 1
         do j = CRS_rowptr(i), CRS_rowptr(i + 1) - 1
            if (new_CRS_col(j) /= -1) then
               new_CRS_rowptr(i) = new_CRS_rowptr(i) + 1
            end if
         end do
      end do

      do i = 1, size(new_CRS_rowptr) - 1
         new_CRS_rowptr(i + 1) = new_CRS_rowptr(i + 1) + new_CRS_rowptr(i)
      end do

      do i = size(new_CRS_rowptr), 2, -1
         new_CRS_rowptr(i) = new_CRS_rowptr(i - 1) + 1
      end do
      new_CRS_rowptr(1) = 1

      ! set -1 to removable rows
      do i = 1, size(remove_ids_sorted)
         new_CRS_rowptr(remove_ids_sorted(i)) = -1
      end do

      ! renew CRS_val
      k = 0
      do i = 1, sizE(new_CRS_col)
         if (new_CRS_col(i) == -1) then
            k = k + 1
         end if
      end do

      new_CRS_val = zeros(size(CRS_val) - k)
      k = 0
      do i = 1, size(new_CRS_col)
         if (new_CRS_col(i) == -1) then
            cycle
         else
            k = k + 1
            new_CRS_val(k) = CRS_val(i)
         end if
      end do

      call searchAndRemove(new_CRS_col, eq=-1)
      call searchAndRemove(new_CRS_rowptr, eq=-1)

      deallocate (CRS_val)
      deallocate (CRS_col)
      deallocate (CRS_rowptr)

      CRS_val = new_CRS_val
      CRS_col = new_CRS_col
      CRS_rowptr = new_CRS_rowptr

   end subroutine
! ###################################################################

   function UpperTriangularMatrix(CRS_val, CRS_col, CRS_rowptr) result(UP)
      real(real64), intent(in) :: CRS_val(:)
      integer(int32), intent(in) :: CRS_col(:)
      integer(int64), intent(in) :: CRS_rowptr(:)
      integer(int64) :: i, j, col, row, N, offset
      real(real64) :: val
      real(real64), allocatable :: UP(:)

      N = size(CRS_rowptr) - 1
      UP = zeros(N*(N + 1)/2)

      !$OMP parallel default(shared) private(col,val,i)
      !$OMP do
      do row = 1, N
         do i = CRS_rowptr(row), CRS_rowptr(row + 1) - 1
            col = CRS_col(i)
            val = CRS_val(i)
            if (row <= col) then
            !!$OMP atomic
               UP(row + (col - 1)*col/2) = val
            end if
         end do
      end do
      !$OMP end do
      !$OMP end parallel

   end function
! ###################################################################

   function fillby(element, vec_size, num_repeat) result(new_vec)
      real(real64), intent(in) :: element(:)
      integer(int32), optional, intent(in) :: vec_size, num_repeat
      real(real64), allocatable :: new_vec(:)
      integer(int32) :: i, j

      if (present(vec_size)) then
         allocate (new_vec(vec_size))
         i = 0
         do
            do j = 1, 3
               i = i + 1
               if (i > size(new_vec)) then
                  return
               end if
               new_vec(i) = element(j)
            end do
         end do

      elseif (present(num_repeat)) then
         allocate (new_vec(num_repeat*size(element)))
         do i = 1, num_repeat
            new_vec((i - 1)*size(element) + 1:i*size(element)) = element(1:size(element))
         end do
      else
         new_vec = element
      end if

   end function

! ################################################################

   function unique(old_vec) result(new_vec)
      integer(int32), intent(in) :: old_vec(:)
      integer(int32), allocatable :: new_vec(:), remove_is_one(:)
      integer(int32) :: n_size, i, id, j

      n_size = size(old_vec)
      ! caution; O(N^2)

      remove_is_one = zeros(n_size)
      do i = 1, n_size
         if (remove_is_one(i) == 1) cycle
         do j = i + 1, n_size
            if (old_vec(i) == old_vec(j)) then
               remove_is_one(j) = 1
            end if
         end do
      end do

      new_vec = zeros(n_size - sum(remove_is_one))

      id = 0
      do i = 1, n_size
         if (remove_is_one(i) == 1) then
            cycle
         else
            id = id + 1
            new_vec(id) = old_vec(i)
         end if

      end do

   end function

   subroutine Lanczos(A, V, T)
      real(real64), intent(in) :: A(:, :)
      real(real64), allocatable, intent(inout) :: V(:, :), T(:, :)
      real(real64), allocatable :: v1(:), w1(:), vj(:), wj(:)
      real(real64) :: alp1, beta_j, alp_j
      type(Random_) :: random
      integer(int32) :: i, N, j, m

      ! https://en.wikipedia.org/wiki/Lanczos_algorithm

      N = size(A, 1)
      V = zeros(N, N)
      T = zeros(N, N)

      v1 = 2.0d0*random%randn(N) - 1.0d0
      v1 = v1/norm(v1)
      w1 = matmul(A, v1)
      alp1 = dot_product(w1, v1)
      w1 = w1 - alp1*v1
      beta_j = norm(w1)

      T(1, 1) = alp1
      V(:, 1) = v1

      do i = 2, N
         beta_j = norm(w1)
         if (beta_j /= 0.0d0) then
            vj = w1/beta_j
         else
            vj = 2.0d0*random%randn(N) - 1.0d0
            stop
         end if
         wj = matmul(A, vj)
         alp_j = dot_product(wj, vj)
         wj = wj - alp_j*vj - beta_j*v1
         v1 = vj
         w1 = wj

         T(i, i) = alp_j
         T(i, i - 1) = beta_j
         T(i - 1, i) = beta_j
         V(:, i) = vj
      end do

   end subroutine
! #####################################################
   subroutine PBiCGSTAB_CRS(CRS, x, b, itrmax, er, relative_er, debug, ILU_MATRIX)
      type(CRS_), intent(inout) :: CRS
      integer(int32), intent(inout) :: itrmax
      real(real64), intent(inout) ::  b(:), er
      real(real64), optional, intent(in) :: relative_er
      real(real64), intent(inout) :: x(:)
      logical, optional, intent(in) :: debug
      type(CRS_), optional, intent(inout) :: ILU_MATRIX

      call bicgstab_CRS_ILU(a=CRS%val, ptr_i=CRS%row_ptr, index_j=CRS%col_idx, &
                            x=x, b=b, itrmax=itrmax, er=er, relative_er=relative_er, debug=debug, ILU_MATRIX=ILU_MATRIX)

   end subroutine

! #####################################################
   subroutine bicgstab_CRS_ILU(a, ptr_i, index_j, x, b, itrmax, er, relative_er, debug, ILU_MATRIX)
      integer(int32), intent(inout) :: index_j(:), itrmax
      integer(int64), intent(inout) :: ptr_i(:)
      real(real64), intent(inout) :: a(:), b(:), er
      real(real64), optional, intent(in) :: relative_er
      real(real64), intent(inout) :: x(:)
      logical, optional, intent(in) :: debug
      type(CRS_), optional, intent(inout) :: ILU_MATRIX
      type(CRS_) :: Pmat
      logical :: speak = .false.
      integer(int32) itr, i, j, n
      real(real64) alp, bet, c1, c2, c3, ev, vv, rr, er0, init_rr, re_er0
      real(real64), allocatable:: r(:), r0(:), p(:), y(:), e(:), v(:), pa(:), ax(:), pd(:), ps(:)

      er0 = er
      if (present(debug)) then
         speak = debug
      end if

      call Pmat%init(val=a, row_ptr=int(ptr_i, int64), col_idx=index_j)

      if (speak) print *, "BiCGSTAB ILU(0) STARTED >> "
      if (present(ILU_MATRIX)) then
         if (allocated(ILU_MATRIX%val)) then
            Pmat = ILU_MATRIX
         else
            call Pmat%ILU(0, debug=debug)
            ILU_MATRIX = Pmat
         end if
      else
         call Pmat%ILU(0, debug=debug)
      end if

      n = size(b)
      if (speak) print *, "BiCGSTAB STARTED >> DOF:", n
      allocate (r(n), r0(n), p(n), y(n), e(n), v(n))

      r(:) = b(:)
      if (speak) print *, "BiCGSTAB >> [1] initialize"

      call sub_crs_matvec(CRS_value=a, CRS_col=index_j, &
                          CRS_row_ptr=ptr_i, old_vector=x, new_vector=ax)
      r = b - ax

      if (speak) print *, "BiCGSTAB >> [2] dp1"

      c1 = dot_product(r, r)
      !call omp_dot_product(r,r,c1)

      init_rr = c1
      !if(speak) print *, "BiCGSTAB >>      |r|^2 = ",init_rr

      if (c1 < er0) return

      p(:) = r(:)
      r0(:) = r(:)

      do itr = 1, itrmax
         if (speak) print *, "BiCGSTAB >> ["//str(itr)//"] initialize"
         c1 = dot_product(r0, r)
         !call omp_dot_product(r0,r,c1)

         y(:) = 0.0d0

         ! compute y = A P^{-1} d (p:=d)
         call sub_crs_matvec(CRS_value=a, CRS_col=index_j, &
                             CRS_row_ptr=ptr_i, old_vector=p, new_vector=y, precondition=Pmat)

         c2 = dot_product(r0, y)
         !call omp_dot_product(r0,y,c2)

         alp = c1/c2

         ! compute e := s = r - alpa * A P^{-1} d (p:=d)
         e(:) = r(:) - alp*y(:)

         v(:) = 0.0d0
         ! v = A P^{-1} e = A P^{-1} s
         call sub_crs_matvec(CRS_value=a, CRS_col=index_j, &
                             CRS_row_ptr=ptr_i, old_vector=e, new_vector=v, precondition=Pmat)

         if (speak) print *, "BiCGSTAB >> ["//str(itr)//"] half"

         ev = dot_product(e, v)
         vv = dot_product(v, v)
         !call omp_dot_product(e,v,ev)
         !call omp_dot_product(v,v,vv)

         if (vv == 0.0d0) stop "Bicgstab devide by zero"
         ! w = c3 =
         c3 = ev/vv
         !https://cattech-lab.com/science-tools/simulation-lecture-mini-cg/
         call Pmat%ILU_matvec(old_vector=p, new_vector=pd)
         call Pmat%ILU_matvec(old_vector=e, new_vector=ps)
         x(:) = x(:) + alp*pd(:) + c3*ps(:)
         !x(:) = x(:) + alp * p(:) + c3 * e(:)
         r(:) = e(:) - c3*v(:)

         rr = dot_product(r, r)
         !call omp_dot_product(r,r,rr)

         if (itr == 1) then
            re_er0 = rr
         end if

         if (speak) then
            print *, sqrt(rr)
         end if
         if (present(relative_er)) then
            if (sqrt(rr/re_er0) < relative_er) then
               exit
            end if
         end if
         !    write(*,*) 'itr, er =', itr,rr
         if (sqrt(rr) < er0) exit

         c1 = dot_product(r0, r)
         !call omp_dot_product(r0,r,c1)

         bet = c1/(c2*c3)
         if ((c2*c3) == 0.0d0) stop "Bicgstab devide by zero"
         p(:) = r(:) + bet*(p(:) - c3*y(:))

      end do
   end subroutine
!===============================================================

! ###################################################
   function SGD_objective_function(fx, training_data_x, training_data_fx, &
                                   params) result(ret)
      interface
         function fx(x, params) result(ret)
            use iso_fortran_env
            real(real64), intent(in) :: x
            real(real64), intent(in) :: params(:)
            real(real64) :: ret
         end function
      end interface

      real(real64), intent(in) :: training_data_x(:)
      real(real64), intent(in) :: training_data_fx(:)
      real(real64), intent(in) :: params(:)
      real(real64) :: ret
      integer(int32) :: i

      ret = 0.0d0
      do i = 1, size(training_data_x)
         ret = ret + 1.0d0/dble(size(training_data_x))*(training_data_fx(i) - fx(x=training_data_x(i), params=params))**2
      end do

   end function
! ###################################################

   function StochasticGradientDescent_scalar_function(fx, &
                                      training_data_x, training_data_fx, init_params, eta, max_itr, tol, logfile_idx) result(params)
      interface
         function fx(x, params) result(ret)
            use iso_fortran_env
            real(real64), intent(in) :: x
            real(real64), intent(in) :: params(:)
            real(real64) :: ret
         end function
      end interface

      real(real64), intent(in) :: training_data_x(:)
      real(real64), intent(in) :: training_data_fx(:)
      real(real64), intent(in) :: init_params(:)
      real(real64), intent(in) :: eta, tol
      integer(int32), intent(in) :: max_itr
      integer(int32), optional, intent(in) :: logfile_idx
      real(real64), allocatable :: params(:), d_params(:)
      integer(int32) :: itr, idx
      real(real64) :: num
      integer(int32) :: SGD_log

      params = init_params

      if (present(logfile_idx)) then
         open (newunit=SGD_log, file="SGD"+zfill(logfile_idx, 6) + ".log", status="replace")
      else
         open (newunit=SGD_log, file="SGD.log", status="replace")
      end if
      do itr = 1, max_itr
         call random_number(num)
         idx = int(num*dble(size(training_data_x)) + 1.0d0)
         !print *, idx
         d_params = -eta*SGD_gradient( &
                    fx=fx, &
                    training_data_x=training_data_x(idx), &
                    training_data_fx=training_data_fx(idx), &
                    params=params &
                    )

         write (SGD_log, *) itr, SGD_objective_function( &
            fx=fx, &
            training_data_x=training_data_x, &
            training_data_fx=training_data_fx, &
            params=params &
            )
         ! 勾配がNaNであれば無視
         if (maxval(d_params(:) - d_params(:)) /= 0.00d0 .or. minval(d_params(:) - d_params(:)) /= 0.00d0) cycle

         if (maxval(abs(d_params)) < tol) exit
         params = params + d_params

      end do
      close (SGD_log)

   end function
! ###################################################

! ###################################################
   function SGD_gradient(fx, training_data_x, training_data_fx, &
                         params) result(grad)
      interface
         function fx(x, params) result(ret)
            use iso_fortran_env
            real(real64), intent(in) :: x
            real(real64), intent(in) :: params(:)
            real(real64) :: ret
         end function
      end interface

      real(real64), intent(in) :: training_data_x
      real(real64), intent(in) :: training_data_fx
      real(real64), intent(in) :: params(:)

      real(real64), allocatable :: grad(:)
      real(real64), allocatable :: d_params(:)
      real(real64) :: epsilon
      integer(int32) :: i, j

      epsilon = dble(1.0e-4)
      grad = 0.0d0*params
      d_params = 0.0d0*params

      do j = 1, size(grad)
         d_params = 0.0d0*params
         d_params(j) = epsilon
         grad(j) = -2.0d0*(training_data_fx - fx(x=training_data_x, params=params)) &
                   *(fx(x=training_data_x, params=params + d_params) - fx(x=training_data_x, params=params))/epsilon
      end do

   end function
! ###################################################

   subroutine importCRS_LinearSolver(this, CRS)
      class(LinearSolver_), intent(inout) :: this
      type(CRS_), intent(in) :: CRS

      this%CRS_val = CRS%val
      this%CRS_Index_I = CRS%col_idx
      this%CRS_Index_J = CRS%row_ptr

   end subroutine

end module