708 if( solveNormal ) temp.
Resize( dim );
709 T2 *_x = &x[0] , *_r = &r[0] , *_d = &d[0] , *_q = &q[0];
710 const T2* _b = &b[0];
712 std::vector< int > partition( threads+1 );
715 for(
int i=0 ; i<A.
rows ; i++ ) eCount += A.
rowSizes[i];
717#pragma omp parallel for num_threads( threads )
718 for(
int t=0 ; t<threads ; t++ )
721 for(
int i=0 ; i<A.
rows ; i++ )
724 if( _eCount*threads>=eCount*(t+1) )
731 partition[threads] = A.
rows;
738#pragma omp parallel for num_threads( threads ) schedule( static )
739 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = temp[i] - _r[i];
744#pragma omp parallel for num_threads( threads ) schedule( static )
745 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = _b[i] - _r[i];
747 double delta_new = 0 , delta_0;
748 for( std::size_t i=0 ; i<dim ; i++ ) delta_new += _r[i] * _r[i];
752 fprintf( stderr ,
"[WARNING] Initial residual too low: %g < %f\n" , delta_new , eps );
756 for( ii=0; ii<iters && delta_new>eps*delta_0 ; ii++ )
761 for(
int i=0 ; i<dim ; i++ ) dDotQ += _d[i] * _q[i];
762 T2 alpha = T2( delta_new / dDotQ );
763#pragma omp parallel for num_threads( threads ) schedule( static )
764 for(
int i=0 ; i<dim ; i++ ) _x[i] += _d[i] * alpha;
765 if( (ii%50)==(50-1) )
770#pragma omp parallel for num_threads( threads ) schedule( static )
771 for(
int i=0 ; i<dim ; i++ ) _r[i] = _b[i] - _r[i];
774#pragma omp parallel for num_threads( threads ) schedule( static )
775 for(
int i=0 ; i<dim ; i++ ) _r[i] -= _q[i] * alpha;
777 double delta_old = delta_new;
779 for( std::size_t i=0 ; i<dim ; i++ ) delta_new += _r[i] * _r[i];
780 T2 beta = T2( delta_new / delta_old );
781#pragma omp parallel for num_threads( threads ) schedule( static )
782 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] + _d[i] * beta;
791 int threads = scratch.
threads();
795 if( reset ) x.
Resize( dim );
796 if( solveNormal ) temp.
Resize( dim );
797 T2 *_x = &x[0] , *_r = &r[0] , *_d = &d[0] , *_q = &q[0];
798 const T2* _b = &b[0];
800 double delta_new = 0 , delta_0;
803 A.
Multiply( x , temp , scratch , addDCTerm ) , A.
Multiply( temp , r , scratch , addDCTerm ) , A.
Multiply( b , temp , scratch , addDCTerm );
804#pragma omp parallel for num_threads( threads ) reduction( + : delta_new )
805 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = temp[i] - _r[i] , delta_new += _r[i] * _r[i];
809 A.
Multiply( x , r , scratch , addDCTerm );
810#pragma omp parallel for num_threads( threads ) reduction ( + : delta_new )
811 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = _b[i] - _r[i] , delta_new += _r[i] * _r[i];
816 fprintf( stderr ,
"[WARNING] Initial residual too low: %g < %f\n" , delta_new , eps );
820 for( ii=0 ; ii<iters && delta_new>eps*delta_0 ; ii++ )
822 if( solveNormal ) A.
Multiply( d , temp , scratch , addDCTerm ) , A.
Multiply( temp , q , scratch , addDCTerm );
823 else A.
Multiply( d , q , scratch , addDCTerm );
825#pragma omp parallel for num_threads( threads ) reduction( + : dDotQ )
826 for(
int i=0 ; i<dim ; i++ ) dDotQ += _d[i] * _q[i];
827 T2 alpha = T2( delta_new / dDotQ );
828 double delta_old = delta_new;
830 if( (ii%50)==(50-1) )
832#pragma omp parallel for num_threads( threads )
833 for(
int i=0 ; i<dim ; i++ ) _x[i] += _d[i] * alpha;
835 if( solveNormal ) A.
Multiply( x , temp , scratch , addDCTerm ) , A.
Multiply( temp , r , scratch , addDCTerm );
836 else A.
Multiply( x , r , scratch , addDCTerm );
837#pragma omp parallel for num_threads( threads ) reduction( + : delta_new )
838 for(
int i=0 ; i<dim ; i++ ) _r[i] = _b[i] - _r[i] , delta_new += _r[i] * _r[i] , _x[i] += _d[i] * alpha;
841#pragma omp parallel for num_threads( threads ) reduction( + : delta_new )
842 for(
int i=0 ; i<dim ; i++ ) _r[i] -= _q[i] * alpha , delta_new += _r[i] * _r[i] , _x[i] += _d[i] * alpha;
844 T2 beta = T2( delta_new / delta_old );
845#pragma omp parallel for num_threads( threads )
846 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] + _d[i] * beta;
857 if( threads<1 ) threads = 1;
858 if( threads>1 ) outScratch.
resize( threads , dim );
859 if( reset ) x.
Resize( dim );
862 if( solveNormal ) temp.
Resize( dim );
863 T2 *_x = &x[0] , *_r = &r[0] , *_d = &d[0] , *_q = &q[0];
864 const T2* _b = &b[0];
866 double delta_new = 0 , delta_0;
870 if( threads>1 ) A.
Multiply( x , temp , outScratch , addDCTerm ) , A.
Multiply( temp , r , outScratch , addDCTerm ) , A.
Multiply( b , temp , outScratch , addDCTerm );
872#pragma omp parallel for num_threads( threads ) reduction( + : delta_new )
873 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = temp[i] - _r[i] , delta_new += _r[i] * _r[i];
877 if( threads>1 ) A.
Multiply( x , r , outScratch , addDCTerm );
878 else A.
Multiply( x , r , addDCTerm );
879#pragma omp parallel for num_threads( threads ) reduction( + : delta_new )
880 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = _b[i] - _r[i] , delta_new += _r[i] * _r[i];
886 fprintf( stderr ,
"[WARNING] Initial residual too low: %g < %f\n" , delta_new , eps );
890 for( ii=0 ; ii<iters && delta_new>eps*delta_0 ; ii++ )
894 if( threads>1 ) A.
Multiply( d , temp , outScratch , addDCTerm ) , A.
Multiply( temp , q , outScratch , addDCTerm );
895 else A.
Multiply( d , temp , addDCTerm ) , A.
Multiply( temp , q , addDCTerm );
899 if( threads>1 ) A.
Multiply( d , q , outScratch , addDCTerm );
900 else A.
Multiply( d , q , addDCTerm );
903#pragma omp parallel for num_threads( threads ) reduction( + : dDotQ )
904 for(
int i=0 ; i<dim ; i++ ) dDotQ += _d[i] * _q[i];
905 T2 alpha = T2( delta_new / dDotQ );
906 double delta_old = delta_new;
909 if( (ii%50)==(50-1) )
911#pragma omp parallel for num_threads( threads )
912 for(
int i=0 ; i<dim ; i++ ) _x[i] += _d[i] * alpha;
916 if( threads>1 ) A.
Multiply( x , temp , outScratch , addDCTerm ) , A.
Multiply( temp , r , outScratch , addDCTerm );
917 else A.
Multiply( x , temp , addDCTerm ) , A.
Multiply( temp , r , addDCTerm );
921 if( threads>1 ) A.
Multiply( x , r , outScratch , addDCTerm );
922 else A.
Multiply( x , r , addDCTerm );
924#pragma omp parallel for num_threads( threads ) reduction ( + : delta_new )
925 for(
int i=0 ; i<dim ; i++ ) _r[i] = _b[i] - _r[i] , delta_new += _r[i] * _r[i] , _x[i] += _d[i] * alpha;
929#pragma omp parallel for num_threads( threads ) reduction( + : delta_new )
930 for(
int i=0 ; i<dim ; i++ ) _r[i] -= _q[i] * alpha , delta_new += _r[i] * _r[i] , _x[i] += _d[i] * alpha;
933 T2 beta = T2( delta_new / delta_old );
934#pragma omp parallel for num_threads( threads )
935 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] + _d[i] * beta;