template <typename T>
void omp(const Matrix<T>& X, const Matrix<T>& D, SpMatrix<T>& spalpha,
    const int* L, const T* eps, const T* lambda, const bool vecL = false,
    const bool vecEps = false, const bool Lambda = false, const int numThreads = -1,
    Matrix<T>* path = NULL);
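// Usage sketch (illustrative, not part of the original header): decompose each
// column of X over the dictionary D with OMP, stopping at L atoms or when the
// squared residual falls below eps. example_omp_usage is a hypothetical helper;
// only the prototype above and the Matrix/SpMatrix containers are assumed.
template <typename T>
static void example_omp_usage(const Matrix<T>& X, const Matrix<T>& D) {
   SpMatrix<T> alpha;        // output sparse coefficients, one column per signal
   const int L = 10;         // at most 10 atoms per column of X
   const T eps = T(1e-6);    // or stop when the squared residual drops below eps
   const T lambda = T(0);    // no additional penalty
   omp(X, D, alpha, &L, &eps, &lambda);
}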
template <typename T>
void omp_mask(const Matrix<T>& X, const Matrix<T>& D, SpMatrix<T>& spalpha,
    const Matrix<bool>& mask, const int* L, const T* eps, const T* lambda,
    const bool vecL = false, const bool vecEps = false, const bool Lambda = false,
    const int numThreads = -1, Matrix<T>* path = NULL);
template <typename T>
void coreORMP(Vector<T>& scores, Vector<T>& norm, Vector<T>& tmp,
    Matrix<T>& Un, Matrix<T>& Undn, Matrix<T>& Unds, Matrix<T>& Gs,
    Vector<T>& Rdn, const AbstractMatrix<T>& G, Vector<int>& ind,
    Vector<T>& RUn, T& normX, const T* eps, const int* L, const T* lambda,
    T* path = NULL);
template <typename T>
void coreORMPB(Vector<T>& RtD, const AbstractMatrix<T>& G, Vector<int>& ind,
    Vector<T>& coeffs, T& normX, const int L, const T eps, const T lambda = 0);
template <typename T>
void lasso(const Matrix<T>& X, const Matrix<T>& D, SpMatrix<T>& spalpha,
    int L, const T constraint, const T lambda2 = 0, constraint_type mode = PENALTY,
    const bool pos = false, const bool ols = false, const int numThreads = -1,
    Matrix<T>* path = NULL, const int length_path = -1);

template <typename T>
void lasso(/* ... overload taking a precomputed Gram matrix G and correlations DtX ... */
    const bool pos = false, const bool ols = false, const int numThreads = -1,
    Matrix<T>* path = NULL, const int length_path = -1);
template <typename T>
    const int numThreads = -1, Matrix<T>* path = NULL, const int length_path = -1);

template <typename T>
    const int numThreads = -1, Matrix<T>* path = NULL, const int length_path = -1);
template <typename T>
    const int numThreads = -1);

template <typename T>
    const int numThreads = -1);
template <typename T>
void coreLARS(Vector<T>& Rdn, Vector<T>& Xdn, Vector<T>& A, Vector<T>& u,
    Vector<T>& sig, Vector<T>& av, Vector<T>& RUn, Matrix<T>& Un,
    Matrix<T>& Unds, Matrix<T>& Gs, Matrix<T>& Gsa, Matrix<T>& workT,
    Matrix<T>& R, const AbstractMatrix<T>& G, T& normX, Vector<int>& ind,
    Vector<T>& coeffs, const T constraint, const bool ols = false,
    const bool pos = false, constraint_type mode = L1COEFFS,
    T* path = NULL, int length_path = -1);
template <typename T>
void coreLARS2(Vector<T>& DtR, const AbstractMatrix<T>& G, Matrix<T>& Gs,
    Matrix<T>& Ga, Matrix<T>& invGs, Vector<T>& u, Vector<T>& coeffs,
    Vector<int>& ind, Matrix<T>& work, T& normX, const constraint_type mode,
    const T constraint, const bool pos = false,
    T* pr_path = NULL, int length_path = -1);

template <typename T>
void coreLARS2W(Vector<T>& DtR, AbstractMatrix<T>& G, Matrix<T>& Gs,
    Matrix<T>& Ga, Matrix<T>& invGs, Vector<T>& u, Vector<T>& coeffs,
    const Vector<T>& weights, Vector<int>& ind, Matrix<T>& work, T& normX,
    const constraint_type mode, const T constraint, const bool pos = false);
template <typename T>
void downDateLasso(int& j, int& minBasis, T& normX, const bool ols,
    const bool pos, Vector<T>& Rdn, int* ind, T* coeffs, Vector<T>& sig,
    Vector<T>& av, Vector<T>& Xdn, Vector<T>& RUn, Matrix<T>& Unm,
    Matrix<T>& Gsm, Matrix<T>& Gsam, Matrix<T>& Undsm, Matrix<T>& Rm);
template <typename T>
    const int itermax = 500, const T tol = 0.5, const int numThreads = -1);

template <typename T>
    const int itermax = 500, const T tol = 0.5, const int numThreads = -1);

template <typename T>
    const T thrs, const int itermax = 500,

template <typename T>
    const T thrs, const int itermax = 500,

template <typename T>
    const int itermax = 500, const T tol = 0.5, const int numThreads = -1);

template <typename T>
    const int itermax = 500,

template <typename T>
    const int itermax = 500,

template <typename T>

template <typename T>
template <typename T>
void somp(const Matrix<T>* X, const Matrix<T>& D, SpMatrix<T>* spalpha,
    const int Ngroups, const int L, const T* pr_eps, const bool adapt = false,
    const int numThreads = -1);

template <typename T>
void somp(const Matrix<T>* X, const Matrix<T>& D, SpMatrix<T>* spalpha,
    const int Ngroups, const int L, const T eps, const int numThreads = -1);

template <typename T>
template <typename T>
void omp(const Matrix<T>& X, const Matrix<T>& D, SpMatrix<T>& spalpha,
      const int* pL, const T* peps, const T* pLambda,
      const bool vecL, const bool vecEps,
      const bool vecLambda, const int numThreads, Matrix<T>* path) {
   int NUM_THREADS = init_omp(numThreads);
   for (int i = 0; i < NUM_THREADS; ++i) {
#pragma omp parallel for private(i)
   for (i = 0; i < M; ++i) {
      int numT = omp_get_thread_num();
      coreORMP(scoresT[numT], normT[numT], tmpT[numT], UnT[numT], UndnT[numT], UndsT[numT],
            GsT[numT], Rdn, G, ind, RUn, normX, vecEps ? peps + i : peps,
            vecL ? pL + i : pL, vecLambda ? pLambda + i : pLambda,
            path && i == 0 ? path->rawX() : NULL);
template <typename T>
void omp_mask(const Matrix<T>& X, const Matrix<T>& D, SpMatrix<T>& spalpha,
      const Matrix<bool>& mask, const int* pL, const T* peps, const T* pLambda,
      const bool vecL, const bool vecEps, const bool vecLambda, const int numThreads,
      Matrix<T>* path) {
   int NUM_THREADS = init_omp(numThreads);
   for (int i = 0; i < NUM_THREADS; ++i) {
#pragma omp parallel for private(i)
   for (i = 0; i < M; ++i) {
      int numT = omp_get_thread_num();
      coreORMP(scoresT[numT], normT[numT], tmpT[numT], UnT[numT], UndnT[numT], UndsT[numT],
            GsT[numT], Rdn, G, ind, RUn, normX, vecEps ? peps + i : peps,
            vecL ? pL + i : pL, vecLambda ? pLambda + i : pLambda,
            path && i == 0 ? path->rawX() : NULL);
      T normX = XmaskT[numT].nrm2sq();
      DmaskT[numT].multTrans(XmaskT[numT], Rdn);
      T eps_mask = (vecEps ? *(peps + i) : *peps)*XmaskT[numT].n()/Xi.n();
      coreORMP(scoresT[numT], normT[numT], tmpT[numT],
            UnT[numT], UndnT[numT], UndsT[numT],
            GsT[numT], Rdn, GT[numT], ind, RUn,
            normX, &eps_mask, vecL ? pL + i : pL,
            vecLambda ? pLambda + i : pLambda,
            path && i == 0 ? path->rawX() : NULL);
      DmaskT[numT].setm(D.m());
      DmaskT[numT].setn(D.n());
      XmaskT[numT].setn(X.m());
template <typename T>
void coreORMPB(Vector<T>& RtD, const AbstractMatrix<T>& G, Vector<int>& ind,
      Vector<T>& coeffs, T& normX, const int L, const T eps, const T lambda) {
   coreORMP(scores, norm, tmp, Un, Undn, Unds, Gs, RtD, G, ind, coeffs, normX, &eps, &L, &lambda);
template <typename T>
void coreORMP(Vector<T>& scores, Vector<T>& norm, Vector<T>& tmp, Matrix<T>& Un,
      Matrix<T>& Undn, Matrix<T>& Unds, Matrix<T>& Gs, Vector<T>& Rdn,
      const AbstractMatrix<T>& G, Vector<int>& ind, Vector<T>& RUn,
      T& normX, const T* peps, const int* pL, const T* plambda, T* path) {
   const T eps = abs<T>(*peps);
   const int L = MIN(*pL, Gs.n());
   const T lambda = *plambda;
   if ((normX <= eps) || L == 0) return;
   const int K = scores.n();
   T* const prUn = Un.rawX();
   T* const prUnds = Unds.rawX();
   T* const prUndn = Undn.rawX();
   T* const prGs = Gs.rawX();
   T* const prRUn = RUn.rawX();
   memset(path, 0, K*L*sizeof(T));
   for (j = 0; j < L; ++j) {
      const int currentInd = scores.fmax();
      if (norm[currentInd] < 1e-8) {
      const T invNorm = T(1.0)/sqrt(norm[currentInd]);
      const T RU = Rdn[currentInd]*invNorm;
      const T delta = RU*RU;
      if (delta < 2*lambda) {
      cblas_copy<T>(j, prUndn + currentInd, K, prUn + j*L, 1);
      cblas_scal<T>(j+1, -invNorm, prUn + j*L, 1);
      if (j == L-1 || (normX <= eps)) {
         T* last_path = path + (L-1)*K;
         cblas_copy<T>(j+1, prRUn, 1, last_path, 1);
               j+1, prUn, L, last_path, 1);
         for (int k = 0; k <= j; ++k) {
            path[j*K + ind[k]] = last_path[k];
            T(0.0), prUndn + j*K, 1);
      Rdn.add(Undnj, -RUn[j]);
      for (int k = 0; k <= j; ++k) scores[ind[k]] = T();
      memset(path + (L-1)*K, 0, L*sizeof(T));
      for (int k = 0; k < j; ++k) {
         path[(j-1)*K + ind[k]] = prRUn[k];
template <typename T>
      const bool pos, const bool ols, const int numThreads,
      Matrix<T>* path, const int length_path) {
   lasso(X, G, DtX, spalpha, L, lambda, mode, pos, ols, numThreads, path, length_path);
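// Usage sketch (illustrative, not part of the original header): solve the lasso
// in PENALTY mode, i.e. min_alpha 0.5*||x - D*alpha||_2^2 + lambda*||alpha||_1,
// for every column x of X. example_lasso_usage is a hypothetical helper; the
// public lasso() prototype declared above is assumed.
template <typename T>
static void example_lasso_usage(const Matrix<T>& X, const Matrix<T>& D) {
   SpMatrix<T> alpha;           // sparse solution, one column per signal
   const int L = D.n();         // no cap on the number of active atoms
   const T lambda = T(0.1);     // regularization parameter (the constraint argument)
   lasso(X, D, alpha, L, lambda, T(0), PENALTY);
}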
template <typename T>
      const bool pos, const bool ols, const int numThreads,
      Matrix<T>* path, const int length_path) {
   int NUM_THREADS = init_omp(numThreads);
   for (int i = 0; i < NUM_THREADS; ++i) {
      if (ols) XdnT[i].resize(K);
      if (ols) RUnT[i].resize(L);
#pragma omp parallel for private(i)
   for (i = 0; i < M; ++i) {
      int numT = omp_get_thread_num();
      coreLARS(Rdn, XdnT[numT], AT[numT], uT[numT], sigT[numT], avT[numT],
            RUnT[numT], UnT[numT], UndsT[numT], GsT[numT], GsaT[numT],
            workT[numT], RT[numT], G, normX, ind, coeffs, lambda, ols, pos,
            mode, path && i == 0 ? path->rawX() : NULL, length_path);
template <typename T>
      T* path, int length_path) {
   if (mode == L2ERROR && normX < constraint) return;
   const int LL = Gsm.n();
   const int K = Gsm.m();
   const int L = MIN(LL, K);
   if (length_path <= 1) length_path = 4*L;
   T* const Rdn = Rdnv.rawX();
   T* const Xdn = Xdnv.rawX();
   T* const A = Av.rawX();
   T* const u = uv.rawX();
   T* const sig = sigv.rawX();
   T* const av = avv.rawX();
   T* const RUn = RUnv.rawX();
   T* const Un = Unm.rawX();
   T* const Unds = Undsm.rawX();
   T* const Gs = Gsm.rawX();
   T* const Gsa = Gsam.rawX();
   T* const work = workm.rawX();
   T* const R = Rm.rawX();
   int* ind = indv.rawX();
   T* coeffs = coeffsv.rawX();
   if (ols) Xdnv.copy(Rdnv);
   int currentInd = pos ? Rdnv.max() : Rdnv.fmax();
   int* const ind_orig = ind;
   T* const coeffs_orig = coeffs;
   for (j = 0; j < L; ++j) {
      Cmax = Rdn[currentInd];
      Cmax = abs<T>(Rdn[currentInd]);
      sig[j] = SIGN(Rdn[currentInd]);
      for (int k = 0; k <= j; ++k) Un[j*L + k] = 0.0;
      for (int k = 0; k < j; ++k) Gs[K*j + ind[k]] *= sig[k];
      Rdn[currentInd] = -Rdn[currentInd];
      if (ols) Xdn[currentInd] = -Xdn[currentInd];
      cblas_scal<T>(K, sig[j], Gs + K*j, 1);
      cblas_scal<T>(j+1, sig[j], Gs + currentInd, K);
      cblas_copy<T>(j+1, Gs + currentInd, K, Gsa + j*L, 1);
      for (int k = 0; k < j; ++k) Gsa[k*L + j] = Gsa[j*L + k];
      cblas_copy<T>(j, Gsa + j*L, 1, Unds + j, L);
      for (int k = 0; k < j; ++k) norm2 -= Unds[k*L + j]*Unds[k*L + j];
      cblas_copy<T>(j, Unds + j, L, Un + j*L, 1);
      T invNorm = 1.0/sqrt(norm2);
      cblas_scal<T>(j+1, -invNorm, Un + j*L, 1);
      Unds[j*L + j] = cblas_dot<T>(j+1, Un + j*L, 1, Gsa + j*L, 1);
      for (int k = 0; k <= j; ++k) u[k] = T(1.0);
      T a = T(1.0)/cblas_nrm2<T>(j+1, u, 1);
      cblas_scal<T>(j+1, a, u, 1);
      cblas_gemv<T>(CblasColMajor, CblasNoTrans, K, j+1, T(1.0), Gs, K, u, 1, T(0.0), A, 1);
      for (int k = 0; k <= j; ++k) potentNorm += Rdn[ind[k]]*u[k];
      for (int k = 0; k < K; ++k) {
         work[k] = diff <= 0 ? INFINITY : (Cmax - Rdn[k])/diff;
      for (int k = 0; k <= j; ++k) {
      for (int k = 0; k < K; ++k)
      currentInd = cblas_iamin<T>(K, work, 1);
      memset(work, 0, 2*K*sizeof(T));
      for (int k = 0; k <= j; ++k) {
         const int index = 2*ind[k];
      for (int k = 0; k < K; ++k) {
         const T diff1 = a - A[k];
         work[index] = diff1 <= 0 ? INFINITY : (Cmax - Rdn[k])/diff1;
         const T diff2 = a + A[k];
         work[index+1] = diff2 <= 0 ? INFINITY : (Cmax + Rdn[k])/diff2;
      currentInd = cblas_iamin<T>(2*K, work, 1);
      T gamma = work[currentInd];
      gamma = MIN(gamma, (Cmax - constraint)/a);
      vDiv<T>(j+1, coeffs, u, work);
      cblas_scal<T>(j+1, -T(1.0), work, 1);
      for (int k = 0; k <= j; ++k)
         if (coeffs[k] == 0 || work[k] <= 0) work[k] = INFINITY;
      minBasis = cblas_iamin<T>(j+1, work, 1);
      gammaMin = work[minBasis];
      if (gammaMin < gamma) gamma = gammaMin;
      for (int k = 0; k <= j; ++k) Tu += u[k];
      gamma = MIN(gamma, (constraint - thrs)/Tu);
      const T t = gamma*gamma - 2*gamma*potentNorm;
      if (t > 0 || std::isnan(t) || std::isinf(t)) {
      for (int k = 0; k <= j; ++k) RUn[j] += Xdn[ind[k]]*
      normX -= RUn[j]*RUn[j];
      cblas_axpy<T>(j+1, gamma, u, 1, coeffs, 1);
      for (int k = 0; k < j+1; ++k)
         if (coeffs[k] < 0) coeffs[k] = 0;
      cblas_axpy<T>(K, -gamma, A, 1, Rdn, 1);
      if (!pos) currentInd /= 2;
      for (int k = 0; k <= j; ++k)
         path[iter*K + ind[k]] = coeffs[k]*sig[k];
      if (gamma == gammaMin) {
         downDateLasso<T>(j, minBasis, normX, ols, pos, Rdnv, ind, coeffs, sigv,
               avv, Xdnv, RUnv, Unm, Gsm, Gsam, Undsm, Rm);
         Cmax = abs<T>(Rdn[ind[0]]);
      thrs = abs<T>(Rdn[ind[0]]);
      (mode == PENALTY && (thrs - constraint < 1e-15)) ||
            (mode == L1COEFFS && (thrs - constraint > -1e-15)) ||
            (newAtom && mode == L2ERROR && (normX - constraint < 1e-15)) ||
            (iter >= length_path)) {
   cblas_copy<T>(j+1, RUn, 1, coeffs, 1);
   vMul<T>(j+1, coeffs, sig, coeffs);
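// Reading aid (interpretation, not taken verbatim from the source): the stopping
// tests above compare thrs (the running l1 quantity), normX (the squared
// residual) and constraint, which corresponds to the usual three LARS/lasso
// formulations selected by constraint_type:
//   L1COEFFS : min_alpha 0.5*||x - D*alpha||_2^2    s.t. ||alpha||_1 <= constraint
//   L2ERROR  : min_alpha ||alpha||_1                s.t. ||x - D*alpha||_2^2 <= constraint
//   PENALTY  : min_alpha 0.5*||x - D*alpha||_2^2 + constraint*||alpha||_1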
template <typename T>
   const int L = Gsm.n();
   const int K = Gsm.m();
   T* const Rdn = Rdnv.rawX();
   T* const Xdn = Xdnv.rawX();
   T* const sig = sigv.rawX();
   T* const av = avv.rawX();
   T* const RUn = RUnv.rawX();
   T* const Un = Unm.rawX();
   T* const Unds = Undsm.rawX();
   T* const Gs = Gsm.rawX();
   T* const Gsa = Gsam.rawX();
   T* const R = Rm.rawX();
   int indB = ind[minBasis];
   if (!pos && sig[minBasis] < 0) {
      Rdn[indB] = -Rdn[indB];
      if (ols) Xdn[indB] = -Xdn[indB];
   for (int k = 0; k < num*num; ++k) R[k] = 0.0;
   for (int k = 0; k < num; ++k) R[k*num + k] = 1.0;
   for (int k = minBasis+1; k <= j; ++k) {
      T a = -Un[k*L + minBasis]/Un[minBasis*L + minBasis];
      av[k-minBasis-1] = a;
      cblas_axpy<T>(minBasis, a, Un + minBasis*L, 1, Un + k*L, 1);
   for (int k = minBasis+1; k <= j; ++k) {
      cblas_copy<T>(minBasis, Un + k*L, 1, Un + (k-1)*L, 1);
      cblas_copy<T>(num, Un + k*L + minBasis + 1, 1, Un + (k-1)*L + minBasis, 1);
   T alphab, gamma, lambda;
   for (int k = 0; k < num; ++k) {
      alphab = alpha + av[k]*av[k];
      R[k*num + k] = sqrt(alphab/alpha);
      gamma = av[k]*R[k*num + k]/alphab;
      cblas_copy<T>(num-k-1, av + k + 1, 1, R + k*num + k + 1, 1);
      cblas_scal<T>(num-k-1, gamma, R + k*num + k + 1, 1);
         j, num, T(1.0), R, num, Un + minBasis*L, L);
   for (int k = minBasis+1; k <= j; ++k)
      cblas_axpy<T>(j-minBasis, av[k-minBasis-1], Unds + minBasis*L + minBasis + 1, 1,
            Unds + k*L + minBasis + 1, 1);
   for (int k = 0; k < minBasis; ++k)
      for (int l = minBasis+1; l <= j; ++l)
         Unds[k*L + l - 1] = Unds[k*L + l];
   for (int k = minBasis+1; k <= j; ++k)
      cblas_copy<T>(j-minBasis, Unds + k*L + minBasis + 1, 1, Unds + (k-1)*L + minBasis, 1);
         j-minBasis, num, T(1.0), R, num, Unds + minBasis*L + minBasis, L);
   for (int k = minBasis+1; k <= j; ++k)
      for (int l = 0; l < k; ++l) Unds[k*L + l] = 0.0;
   for (int k = minBasis+1; k <= j; ++k) {
      cblas_copy<T>(K, Gs + k*K, 1, Gs + (k-1)*K, 1);
   if (!pos && sig[minBasis] < T(0.0)) cblas_scal<T>(j, T(-1.0), Gs + indB, K);
   for (int k = minBasis+1; k <= j; ++k) {
      cblas_copy<T>(minBasis, Gsa + k*L, 1, Gsa + (k-1)*L, 1);
      cblas_copy<T>(j-minBasis, Gsa + k*L + minBasis + 1, 1, Gsa + (k-1)*L + minBasis, 1);
   for (int k = 0; k < minBasis; ++k) {
      for (int l = minBasis+1; l <= j; ++l) Gsa[k*L + l - 1] = Gsa[k*L + l];
   for (int k = minBasis+1; k <= j && !pos; ++k) sig[k-1] = sig[k];
   for (int k = minBasis+1; k <= j; ++k) ind[k-1] = ind[k];
   for (int k = minBasis+1; k <= j; ++k) coeffs[k-1] = coeffs[k];
   for (int k = minBasis; k <= j; ++k)
      normX += RUn[k]*RUn[k];
   for (int k = minBasis; k < j; ++k) {
      for (int l = 0; l <= k; ++l) RUn[k] += Xdn[ind[l]]*
      normX -= RUn[k]*RUn[k];
template <typename T>
      const int numThreads) {
   const int M = X.n();
   const int K = D.n();
   const int iterR = 30;
   int NUM_THREADS = init_omp(numThreads);
   for (int i = 0; i < NUM_THREADS; ++i) {
#pragma omp parallel for private(i)
   for (i = 0; i < M; ++i) {
      int numT = omp_get_thread_num();
      coreLARS2(DtRR, G, GsT[numT], GaT[numT], invGsT[numT], uT[numT], coeffs,
            ind, workT[numT], normX, mode, constraint, pos);
      for (int j = 0; j < iterR; ++j) {
         const T sig = sigma*pow(0.7, iterR-1-j);
         for (int k = 0; k < K; ++k) {
            weights[ind[k]] = MAX(1e-4, sig*exp(-sig*abs<T>(coeffs[k])));
         coreLARS2W(DtRR, G, GsT[numT], GaT[numT], invGsT[numT], uT[numT], coeffs, weights,
               ind, workT[numT], normX, mode, constraint, pos);
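// Reading aid (interpretation of the loop above, not taken verbatim from the
// source): each reweighting pass j uses sig = sigma*0.7^(iterR-1-j) and sets
//    weights[k] = max(1e-4, sig*exp(-sig*|coeffs[k]|))
// for the currently selected atoms, so atoms with large coefficients receive a
// smaller l1 weight on the next coreLARS2W pass; this is the usual
// reweighted-l1 heuristic.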
template <typename T>
      const int numThreads) {
   const int M = X.n();
   const int K = D.n();
   int NUM_THREADS = init_omp(numThreads);
   for (int i = 0; i < NUM_THREADS; ++i) {
#pragma omp parallel for private(i)
   for (i = 0; i < M; ++i) {
      int numT = omp_get_thread_num();
      coreLARS2W(DtR, G, GsT[numT], GaT[numT], invGsT[numT], uT[numT], coeffs, we,
            ind, workT[numT], normX, mode, constraint, pos);
template <typename T>
      const int numThreads) {
   const int M = X.n();
   const int K = G.n();
   int NUM_THREADS = init_omp(numThreads);
   for (int i = 0; i < NUM_THREADS; ++i) {
#pragma omp parallel for private(i)
   for (i = 0; i < M; ++i) {
      int numT = omp_get_thread_num();
      coreLARS2W(DtRi, G, GsT[numT], GaT[numT], invGsT[numT], uT[numT], coeffs, we,
            ind, workT[numT], normX, mode, constraint, pos);
template <typename T>
      int L, const T constraint, const T lambda2, constraint_type mode, const bool pos,
      const int numThreads) {
   const int M = X.n();
   const int K = D.n();
   int NUM_THREADS = init_omp(numThreads);
   for (int i = 0; i < NUM_THREADS; ++i) {
#pragma omp parallel for private(i)
   for (i = 0; i < M; ++i) {
      int numT = omp_get_thread_num();
      coreLARS2(DtR, G, GsT[numT], GaT[numT], invGsT[numT], uT[numT], coeffs,
            ind, workT[numT], normX, mode, constraint, pos);
      T constraint_mask = mode == PENALTY || mode == L2ERROR ?
            constraint*XmaskT[numT].n()/Xi.n() : constraint;
      T normX = XmaskT[numT].nrm2sq();
      DmaskT[numT].multTrans(XmaskT[numT], DtR);
            GsT[numT], GaT[numT], invGsT[numT], uT[numT], coeffs,
            ind, workT[numT], normX, mode, constraint_mask, pos);
      DmaskT[numT].setm(D.m());
      DmaskT[numT].setn(D.n());
      XmaskT[numT].setn(X.m());
template <typename T>
      int L, const T constraint, const T lambda2, constraint_type mode, const bool pos,
      const int numThreads, Matrix<T>* path, int length_path) {
   lasso2(X, G, DtX, spalpha, L, constraint, mode, pos, numThreads, path, length_path);
template <typename T>
      const int numThreads, Matrix<T>* path, int length_path) {
   const int M = X.n();
   const int K = G.n();
   int NUM_THREADS = init_omp(numThreads);
   for (int i = 0; i < NUM_THREADS; ++i) {
#pragma omp parallel for private(i)
   for (i = 0; i < M; ++i) {
      int numT = omp_get_thread_num();
      coreLARS2(DtR, G, GsT[numT], GaT[numT], invGsT[numT],
            ind, workT[numT], normX, mode, constraint, pos,
            path && i == 0 ? path->rawX() : NULL, length_path);
template <typename T>
      T* path, int length_path) {
   const int LL = Gs.n();
   const int K = G.n();
   const int L = MIN(LL, K);
   if (length_path <= 1) length_path = 4*L;
   T* const pr_Gs = Gs.rawX();
   T* const pr_invGs = invGs.rawX();
   T* const pr_Ga = Ga.rawX();
   T* const pr_work = work.rawX();
   T* const pr_u = u.rawX();
   T* const pr_DtR = DtR.rawX();
   T* const pr_coeffs = coeffs.rawX();
   int* const pr_ind = ind.rawX();
   int currentInd = pos ? DtR.max() : DtR.fmax();
   if (mode == PENALTY && abs(DtR[currentInd]) < constraint) return;
   if (mode == L2ERROR && normX < constraint) return;
   for (i = 0; i < L; ++i) {
      pr_ind[i] = currentInd;
      for (int j = 0; j <= i; ++j)
         pr_Gs[i*LL + j] = pr_Ga[i*K + pr_ind[j]];
      pr_invGs[0] = T(1.0)/pr_Gs[0];
            pr_invGs, LL, pr_Gs + i*LL, 1, T(0.0), pr_u, 1);
            T(1.0)/(pr_Gs[i*LL + i] - cblas_dot<T>(i, pr_u, 1, pr_Gs + i*LL, 1));
      pr_invGs[i*LL + i] = schur;
      cblas_copy<T>(i, pr_u, 1, pr_invGs + i*LL, 1);
      cblas_scal<T>(i, -schur, pr_invGs + i*LL, 1);
      for (int j = 0; j <= i; ++j)
         pr_work[j] = pr_DtR[pr_ind[j]] > 0 ? T(1.0) : T(-1.0);
            pr_work, 1, T(0.0), pr_u, 1);
      int first_zero = -1;
      for (int j = 0; j <= i; ++j) {
         T ratio = -pr_coeffs[j]/pr_u[j];
         if (ratio > 0 && ratio <= step_max) {
      T current_correlation = abs<T>(pr_DtR[pr_ind[0]]);
            K, pr_u, 1, T(0.0), pr_work + 2*K, 1);
      cblas_copy<T>(K, pr_work + 2*K, 1, pr_work + K, 1);
      cblas_copy<T>(K, pr_work + 2*K, 1, pr_work, 1);
      for (int j = 0; j <= i; ++j) {
      for (int j = 0; j < K; ++j) {
         pr_work[j] = ((pr_work[j] < INFINITY) && (pr_work[j] > T(-1.0))) ?
               (pr_DtR[j] + current_correlation)/(T(1.0) + pr_work[j]) : INFINITY;
      for (int j = 0; j < K; ++j) {
         pr_work[j + K] = ((pr_work[j + K] < INFINITY) && (pr_work[j + K] < T(1.0))) ?
               (current_correlation - pr_DtR[j])/(T(1.0) - pr_work[j + K]) : INFINITY;
      for (int j = 0; j < K; ++j) {
      int index = cblas_iamin<T>(2*K, pr_work, 1);
      T step = pr_work[index];
      currentInd = index % K;
      for (int j = 0; j <= i; ++j)
         coeff1 += pr_DtR[pr_ind[j]] > 0 ? pr_u[j] : -pr_u[j];
      for (int j = 0; j <= i; ++j)
         coeff2 += pr_DtR[pr_ind[j]]*pr_u[j];
      T coeff3 = normX - constraint;
      step_max2 = current_correlation - constraint;
      const T delta = coeff2*coeff2 - coeff1*coeff3;
      step_max2 = delta < 0 ? INFINITY : (coeff2 - sqrt(delta))/coeff1;
      step_max2 = MIN(current_correlation, step_max2);
      step_max2 = coeff1 < 0 ? INFINITY : (constraint - thrs)/coeff1;
      step_max2 = MIN(current_correlation, step_max2);
      step = MIN(MIN(step, step_max2), step_max);
      cblas_axpy<T>(i+1, step, pr_u, 1, pr_coeffs, 1);
      for (int j = 0; j < i+1; ++j)
         if (pr_coeffs[j] < 0) pr_coeffs[j] = 0;
      cblas_axpy<T>(K, -step, pr_work + 2*K, 1, pr_DtR, 1);
      normX += coeff1*step*step - 2*coeff2*step;
      thrs += step*coeff1;
      for (int k = 0; k <= i; ++k)
         path[iter*K + ind[k]] = pr_coeffs[k];
      if (step == step_max) {
         for (int j = first_zero; j < i; ++j) {
            cblas_copy<T>(K, pr_Ga + (j+1)*K, 1, pr_Ga + j*K, 1);
            pr_ind[j] = pr_ind[j+1];
            pr_coeffs[j] = pr_coeffs[j+1];
         for (int j = first_zero; j < i; ++j) {
            cblas_copy<T>(first_zero, pr_Gs + (j+1)*LL, 1, pr_Gs + j*LL, 1);
            cblas_copy<T>(i - first_zero, pr_Gs + (j+1)*LL + first_zero + 1, 1,
                  pr_Gs + j*LL + first_zero, 1);
         const T schur = pr_invGs[first_zero*LL + first_zero];
         cblas_copy<T>(first_zero, pr_invGs + first_zero*LL, 1, pr_u, 1);
         cblas_copy<T>(i - first_zero, pr_invGs + (first_zero+1)*LL + first_zero, LL,
         for (int j = first_zero; j < i; ++j) {
            cblas_copy<T>(first_zero, pr_invGs + (j+1)*LL, 1, pr_invGs + j*LL, 1);
            cblas_copy<T>(i - first_zero, pr_invGs + (j+1)*LL + first_zero + 1, 1,
                  pr_invGs + j*LL + first_zero, 1);
               pr_u, 1, pr_invGs, LL);
      if ((iter >= length_path-1) || abs(step) < 1e-15 ||
            step == step_max2 || (normX < 1e-15) ||
            (mode == L2ERROR && normX - constraint < 1e-15) ||
            (mode == L1COEFFS && (constraint - thrs < 1e-15))) {
template <typename T>
   const int LL = Gs.n();
   const int K = G.n();
   const int L = MIN(LL, K);
   T* const pr_Gs = Gs.rawX();
   T* const pr_invGs = invGs.rawX();
   T* const pr_Ga = Ga.rawX();
   T* const pr_work = work.rawX();
   T* const pr_u = u.rawX();
   T* const pr_DtR = DtR.rawX();
   T* const pr_coeffs = coeffs.rawX();
   T* const pr_weights = weights.rawX();
   int* const pr_ind = ind.rawX();
   int currentInd = pos ? DtR.max() : DtR.fmax();
   if (mode == PENALTY && abs(DtR[currentInd]) < constraint) return;
   if (mode == L2ERROR && normX < constraint) return;
   for (i = 0; i < L; ++i) {
      pr_ind[i] = currentInd;
      for (int j = 0; j <= i; ++j)
         pr_Gs[i*LL + j] = pr_Ga[i*K + pr_ind[j]];
      pr_invGs[0] = T(1.0)/pr_Gs[0];
            pr_invGs, LL, pr_Gs + i*LL, 1, T(0.0), pr_u, 1);
            T(1.0)/(pr_Gs[i*LL + i] - cblas_dot<T>(i, pr_u, 1, pr_Gs + i*LL, 1));
      pr_invGs[i*LL + i] = schur;
      cblas_copy<T>(i, pr_u, 1, pr_invGs + i*LL, 1);
      cblas_scal<T>(i, -schur, pr_invGs + i*LL, 1);
      for (int j = 0; j <= i; ++j)
         pr_work[j] = pr_DtR[pr_ind[j]] > 0 ? weights[pr_ind[j]] : -weights[pr_ind[j]];
            pr_work, 1, T(0.0), pr_u, 1);
      int first_zero = -1;
      for (int j = 0; j <= i; ++j) {
         T ratio = -pr_coeffs[j]/pr_u[j];
         if (ratio > 0 && ratio <= step_max) {
      T current_correlation = abs<T>(pr_DtR[pr_ind[0]]);
            K, pr_u, 1, T(0.0), pr_work + 2*K, 1);
      vDiv<T>(K, pr_work + 2*K, pr_weights, pr_work + 2*K);
      cblas_copy<T>(K, pr_work + 2*K, 1, pr_work + K, 1);
      cblas_copy<T>(K, pr_work + 2*K, 1, pr_work, 1);
      for (int j = 0; j <= i; ++j) {
      for (int j = 0; j < K; ++j) {
         pr_work[j] = ((pr_work[j] < INFINITY) && (pr_work[j] > T(-1.0))) ?
               (pr_DtR[j] + current_correlation)/(T(1.0) + pr_work[j]) : INFINITY;
      for (int j = 0; j < K; ++j) {
         pr_work[j + K] = ((pr_work[j + K] < INFINITY) && (pr_work[j + K] < T(1.0))) ?
               (current_correlation - pr_DtR[j])/(T(1.0) - pr_work[j + K]) : INFINITY;
      for (int j = 0; j < K; ++j) {
      int index = cblas_iamin<T>(2*K, pr_work, 1);
      T step = pr_work[index];
      currentInd = index % K;
      for (int j = 0; j <= i; ++j)
         coeff1 += pr_DtR[pr_ind[j]] > 0 ? pr_weights[pr_ind[j]]*pr_u[j] :
               -pr_weights[pr_ind[j]]*pr_u[j];
      for (int j = 0; j <= i; ++j)
         coeff2 += pr_DtR[pr_ind[j]]*pr_u[j]*pr_weights[pr_ind[j]];
      T coeff3 = normX - constraint;
      step_max2 = current_correlation - constraint;
      const T delta = coeff2*coeff2 - coeff1*coeff3;
      step_max2 = delta < 0 ? INFINITY : (coeff2 - sqrt(delta))/coeff1;
      step_max2 = coeff1 < 0 ? INFINITY : (constraint - thrs)/coeff1;
      step = MIN(MIN(step, step_max2), step_max);
      cblas_axpy<T>(i+1, step, pr_u, 1, pr_coeffs, 1);
      cblas_axpy<T>(K, -step, pr_work + 2*K, 1, pr_DtR, 1);
      normX += coeff1*step*step - 2*coeff2*step;
      thrs += step*coeff1;
      if (step == step_max) {
         for (int j = first_zero; j < i; ++j) {
            cblas_copy<T>(K, pr_Ga + (j+1)*K, 1, pr_Ga + j*K, 1);
            pr_ind[j] = pr_ind[j+1];
            pr_coeffs[j] = pr_coeffs[j+1];
         for (int j = first_zero; j < i; ++j) {
            cblas_copy<T>(first_zero, pr_Gs + (j+1)*LL, 1, pr_Gs + j*LL, 1);
            cblas_copy<T>(i - first_zero, pr_Gs + (j+1)*LL + first_zero + 1, 1,
                  pr_Gs + j*LL + first_zero, 1);
         const T schur = pr_invGs[first_zero*LL + first_zero];
         cblas_copy<T>(first_zero, pr_invGs + first_zero*LL, 1, pr_u, 1);
         cblas_copy<T>(i - first_zero, pr_invGs + (first_zero+1)*LL + first_zero, LL,
         for (int j = first_zero; j < i; ++j) {
            cblas_copy<T>(first_zero, pr_invGs + (j+1)*LL, 1, pr_invGs + j*LL, 1);
            cblas_copy<T>(i - first_zero, pr_invGs + (j+1)*LL + first_zero + 1, 1,
                  pr_invGs + j*LL + first_zero, 1);
               pr_u, 1, pr_invGs, LL);
      if (iter > 4*L || abs(step) < 1e-10 ||
            step == step_max2 || (normX < 1e-10) ||
            (mode == L2ERROR && normX - constraint < 1e-10) ||
            (mode == L1COEFFS && (constraint - thrs < 1e-10))) {
template <typename T>
      const int numThreads) {
   ist(X, D, alpha, lambda, mode, itermax, tol, numThreads);
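// Usage sketch (illustrative, not part of the original header): iterative
// soft-thresholding with a fixed penalty lambda in PENALTY mode.
// example_ist_usage is a hypothetical helper; itermax, tol and numThreads keep
// their default values from the ist() prototype assumed above.
template <typename T>
static void example_ist_usage(const Matrix<T>& X, const Matrix<T>& D) {
   SpMatrix<T> alpha;                  // warm-start / output coefficients
   ist(X, D, alpha, T(0.1), PENALTY);  // lambda = 0.1, remaining arguments defaulted
}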
template <typename T>
      const T tol, const int numThreads) {
   std::cerr << "Mode not implemented" << std::endl;
   cerr << "Current implementation of IST does not support non-normalized dictionaries" << endl;
   int NUM_THREADS = init_omp(numThreads);
   for (int i = 0; i < NUM_THREADS; ++i) {
#pragma omp parallel for private(i)
   for (i = 0; i < M; ++i) {
      int numT = omp_get_thread_num();
      T norm1 = coeffs.asum();
      G.mult(spAlpha, DtR, -1.0, 1.0);
      coreIST(G, DtR, coeffs, lambda, itermax, tol);
template <typename T>
      const T thrs, const int itermax,
   const int K = G.n();
   T* const coeffs = coeffsv.rawX();
   T* const DtR = DtRv.rawX();
   const T lambda_init = thrs;
   T norm1 = coeffsv.asum();
   T lambda = lambda_init;
   vAdd(K, DtR, coeffs, DtR);
   for (int iter = 0; iter < itermax; ++iter) {
      for (int j = 0; j < K; ++j) {
         if (DtR[j] > lambda) {
            coeffs[j] = DtR[j] - lambda;
         } else if (DtR[j] < -lambda) {
            coeffs[j] = DtR[j] + lambda;
         } else if (coeffs[j]) {
      if (iter % 5 == 1) {
         vSub(K, DtR, coeffs, DtR);
         for (int j = 0; j < K; ++j) {
            norm1 += abs(coeffs[j]);
            DtRa += DtR[j]*coeffs[j];
         vAdd(K, DtR, coeffs, DtR);
         const T kappa = -DtRa + norm1*maxDtR;
         if (abs(lambda - maxDtR) < tol && kappa <= tol)
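// Reading aid (interpretation, not taken verbatim from the source): after the
// vAdd above folds the current coefficient into DtR[j], the inner loop of
// coreIST applies the scalar soft-thresholding operator to DtR[j]. A
// stand-alone version of that operator (hypothetical helper):
template <typename T>
static T example_soft_threshold(const T x, const T lambda) {
   if (x > lambda)  return x - lambda;   // shrink positive values toward zero
   if (x < -lambda) return x + lambda;   // shrink negative values toward zero
   return T(0);                          // zero out everything inside [-lambda, lambda]
}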
template <typename T>
      coeffsv, const T normX2, const T eps, const int itermax, const T tol) {
   const int K = G.n();
   T* const coeffs = coeffsv.rawX();
   T* const DtR = DtRv.rawX();
   T norm1 = coeffsv.asum();
   if (!norm1 && err <= eps) return;
   T current_tol = 10.0*tol;
   T lambdasq = lambda*lambda;
   lambdasq *= eps/err;
   lambda = sqrt(lambdasq);
   int* const pr_indices = indices.rawX();
   for (int iter = 0; iter < itermax; ++iter) {
      for (int j = 0; j < K; ++j) {
         T old_coeff = coeffs[j];
         T diff = DtR[j] + old_coeff;
         if (diff > lambda) {
            coeffs[j] = diff - lambda;
            err += lambdasq - DtR[j]*DtR[j];
            pr_indices[count++] = j;
         } else if (diff < -lambda) {
            coeffs[j] = diff + lambda;
            err += lambdasq - DtR[j]*DtR[j];
            pr_indices[count++] = j;
         err += diff*diff - DtR[j]*DtR[j];
         diff = old_coeff - coeffs[j];
      for (int j = 0; j < count; ++j) {
         const int ind = pr_indices[j];
         norm1 += abs(coeffs[ind]);
         DtRa += DtR[ind]*coeffs[ind];
      if (norm1 - DtRa/maxDtR <= current_tol) {
         const bool change = ((old_err > eps) && err < eps + current_tol) ||
               (old_err < eps && err > eps - current_tol);
         if (current_tol == tol) {
         current_tol = MAX(current_tol*0.5, tol);
         lambdasq *= eps/err;
         lambda = sqrt(lambdasq);
template <typename T>
      const T tol, const int numThreads) {
   cerr << "Current implementation of block coordinate descent does not support non-normalized dictionaries" << endl;
   std::cerr << "Mode not implemented" << std::endl;
   int NUM_THREADS = init_omp(numThreads);
#pragma omp parallel for private(i)
   for (i = 0; i < Ngroups; ++i) {
      int numT = omp_get_thread_num();
      X.mult(D, RtD, true, false);
      T norm1 = alphat.asum();
      coreIST(G, DtR_mean, coeffs_mean, lambda/T(2.0), itermax, tol);
            lambda, itermax, tol);
      normX2 -= computeError(normX2, G, DtR_mean, coeffs_mean, spalpha);
      for (int j = 0; j < K; ++j) {
         const T nrm = col.nrm2sq();
      coreGroupIST(G, RtD, alphat, sqr<T>(M)*lambda/T(2.0), itermax, sqr<T>(M)*tol);
template <typename T>
   const int K = G.n();
   const int M = RtDm.m();
   T* const prG = G.rawX();
   T* const RtD = RtDm.rawX();
   T* const coeffs = coeffsm.rawX();
   const T lambda_init = thrs;
   T lambda = lambda_init;
   T* const old_coeff = old_coeffv.rawX();
   T* const norms = normsv.rawX();
   int* const activate = activatev.rawX();
   for (int iter = 0; iter < itermax; ++iter) {
      for (int j = 0; j < K; ++j) {
         if (activate[j] >= 0) {
            vAdd(M, coeffs + j*M, RtD + j*M, coeffs + j*M);
            norms[j] = nrm - lambda;
            vSub(M, old_coeff, coeffs + j*M, old_coeff);
            memset(coeffs + j*M, 0, M*sizeof(T));
            norms[j] = nrm - lambda;
            activate[j] = (activate[j] == 0) ? -10 : activate[j] - 1;
      if (iter % 5 == 4) {
         T norm1 = normsv.asum();
         for (int j = 0; j < K; ++j) {
            DtRa += cblas_dot(M, coeffs + j*M, 1, RtD + j*M, 1);
         if ((maxDtR - lambda) < (tol*maxDtR/norm1) && norm1 - DtRa/maxDtR < tol) break;
template <typename T>
   const int K = G.n();
   const int M = RtDm.m();
   T* const prG = G.rawX();
   T* const RtD = RtDm.rawX();
   T* const coeffs = coeffsm.rawX();
   T* const old_coeff = old_coeffv.rawX();
   T* const norms = normsv.rawX();
   int* const activate = activatev.rawX();
   T norm1 = normsv.sum();
   if (!norm1 && err <= eps) return;
   T current_tol = 10.0*tol;
   T lambdasq = lambda*lambda;
   lambdasq *= eps/err;
   lambda = sqrt(lambdasq);
   for (int iter = 0; iter < itermax; ++iter) {
      for (int j = 0; j < K; ++j) {
         if (activate[j] >= 0) {
            vAdd(M, coeffs + j*M, RtD + j*M, coeffs + j*M);
            norms[j] = nrm - lambda;
            vSub(M, old_coeff, coeffs + j*M, old_coeff);
            err += cblas_dot(M, old_coeff, 1, old_coeff, 1)
            memset(coeffs + j*M, 0, M*sizeof(T));
            err += cblas_dot(M, old_coeff, 1, old_coeff, 1)
            norms[j] = nrm - lambda;
            err += cblas_dot(M, coeffs + j*M, 1, coeffs + j*M, 1)
            activate[j] = (activate[j] == 0) ? -3 : activate[j] - 1;
      norm1 = normsv.sum();
      for (int j = 0; j < K; ++j) {
         DtRa += cblas_dot(M, coeffs + j*M, 1, RtD + j*M, 1);
      if (norm1 - DtRa/maxDtR <= current_tol) {
         const T tol_bis = current_tol*maxDtR;
         const bool change = ((old_err > eps) && err < eps + tol_bis) ||
               (old_err < eps && err > eps - tol_bis);
         if (current_tol == tol) {
         current_tol = MAX(current_tol*0.5, tol);
         lambdasq *= eps/err;
         lambda = sqrt(lambdasq);
template <typename T>
   for (int j = 0; j < G.n(); ++j) {
      err2 -= 2*col.dot(col2);
      for (int k = 0; k < j; ++k) {
         add -= G(j,k)*col.dot(col2);
      add += add - G(j,j)*col.nrm2sq();
template <typename T>
   return normX2 - G.quad(spAlpha) - 2*DtR.dot(spAlpha);
template <typename T>
      const int Ngroups, const int L, const T eps, const int numThreads) {
   somp(X, D, spalpha, Ngroups, L, &eps, false, numThreads);
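// Usage sketch (illustrative, not part of the original header): simultaneous OMP
// over Ngroups groups of signals sharing the dictionary D; X and alpha point to
// arrays of Ngroups matrices. example_somp_usage is a hypothetical helper and the
// somp() prototype reconstructed above is assumed; the scalar-eps overload scales
// the tolerance by the group size internally.
template <typename T>
static void example_somp_usage(const Matrix<T>* X, const Matrix<T>& D,
      SpMatrix<T>* alpha, const int Ngroups) {
   const int L = 10;        // at most 10 atoms shared within each group
   const T eps = T(1e-6);   // per-signal residual tolerance
   somp(X, D, alpha, Ngroups, L, eps);
}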
template <typename T>
      const int Ngroups, const int LL, const T* eps, const bool adapt,
      const int numThreads) {
   if (LL <= 0) return;
   const int K = D.n();
   const int L = MIN(D.m(), MIN(LL, K));
   cerr << "Current implementation of OMP does not support non-normalized dictionaries" << endl;
   int NUM_THREADS = init_omp(numThreads);
#pragma omp parallel for private(i)
   for (i = 0; i < Ngroups; ++i) {
      const int M = X.n();
      T thrs = adapt ? eps[i] : M*(*eps);
template <typename T>
   const int K = G.n();
   const int n = D.m();
   const int M = X.n();
   const bool big_mode = M*K*(n+L) > 2*(M*n*n + K*n*(n+L));
   T normX = Xt.nrm2sq();
   coreORMP(scores, norm, tmp, Un, Undn, Unds, Gs, Rdn, G, r, RUn, normX, &eps, &L, &lambda);
   for (int i = 0; i < L; ++i) {
      if (r[i] == -1) break;
   if (E < eps) return;
   if (E < eps) return;
   for (int i = 0; i < K; ++i) {
   T* const prAs = As.rawX();
   T* const prA = A.rawX();
   T* const prS = S.rawX();
   T* const prGs = Gs.rawX();
   T* const prFs = Fs.rawX();
   T* const prB = B.rawX();
   T* const pr_c = c.rawX();
   T* const pr_tmp = tmp.rawX();
   for (j = 0; j < L; ++j) {
      for (int k = 0; k < j; ++k) scores[r[k]] = -1.0;
      const int currentInd = scores.max();
      const T invNorm = T(1.0)/sqrt(e[currentInd]);
      if (invNorm > 1e3) {
      E -= scores[currentInd];
      for (int k = 0; k < j; ++k) prS[j*L + k] = T();
      for (int k = 0; k < j; ++k) prAs[k*L + j] = prA[k*K + currentInd];
      int iter = invNorm > 1.41 ? 2 : 1;
      for (int k = 0; k < iter; ++k) {
         for (int l = 0; l < j; ++l) {
            T scal = -cblas_dot<T>(j-l+1, prAs + l*L + l, 1, prS + j*L + l, 1);
            cblas_axpy<T>(l+1, scal, prS + l*L, 1, prS + j*L, 1);
      cblas_scal<T>(j+1, invNorm, prS + j*L, 1);
      if (j == L-1 || E <= eps) {
      prAs[j*L + j] = prA[j*K + currentInd];
      XtD.refCol(currentInd, di);
      for (int k = 0; k < j; ++k) pr_c[k] = T();
      for (int k = 0; k <= j; ++k)
         cblas_axpy<T>(j, prS[j*L + k], prB + r[k]*L, 1, pr_c, 1);
      f.add(tmp, f[currentInd]*invNorm*invNorm);
      cblas_axpy<T>(K, T(-1.0), prB + j, L, pr_tmp, 1);
      for (int i = 0; i < j; ++i) {
      SSt.mult(Dg, SStDt, false, true);