% This is the companion code for Sec. IV of the following paper:
% Y. Yang, M. Pesavento, Symeon Chatzinotas, and Björn Ottersten,
% "Successive convex approximation algorithms for sparse signal estimation
% with nonconvex regularizations",
% IEEE Journal of Selected Topics in Signal Processing

clear;
clc;

% Problem dimensions: the matrix A is N-by-K
N         = 20000;
K         = 40000;
density   = 0.01;  % density of the sparse vector
theta     = 0.001; % cap used in the capped L1 penalty min(|x_k|, theta)

Sample    = 20; % number of repetitions in the Monte Carlo simulations

MaxIter_j = 30;  % maximum number of iterations of the proposed algorithm
MaxIter_m = 10;  % maximum number of (outer) iterations of the majorization-minimization algorithm
MaxIter_g = 400; % maximum number of iterations of the GIST algorithm

% All per-iteration records have MaxIter + 1 columns: column 1 stores the
% value at the initial point (iteration 0), column t + 1 the value after
% iteration t.

% the achieved objective value versus the number of iterations
val_j     = zeros(Sample, MaxIter_j + 1); % "_j" stands for the proposed algorithm
val_m     = zeros(Sample, MaxIter_m + 1); % "_m" stands for the majorization-minimization algorithm
val_g     = zeros(Sample, MaxIter_g + 1); % "_g" stands for the GIST algorithm

% the required CPU time versus the number of iterations (cumulative)
time_j    = zeros(Sample, MaxIter_j + 1);
time_m    = zeros(Sample, MaxIter_m + 1);
time_g    = zeros(Sample, MaxIter_g + 1);

% the achieved relative error versus the number of iterations; sized like
% val_* and time_* because entry (s, t + 1) is written for t = 0..MaxIter
error_j   = zeros(Sample, MaxIter_j + 1);
error_g   = zeros(Sample, MaxIter_g + 1);

% intermediate variables
theta_vec = theta*ones(K,1); % capped L1 norm: per-element cap

for s = 1:1:Sample
    disp(['Sample ' num2str(s)]);
    
    % generating the parameters
    [A, b, mu, x0] = FUN_parameter(N, K, density);
    
    %% the GIST algorithm
    % Proximal-gradient iterations with a backtracking search on the
    % curvature estimate c for the objective
    %   1/2*||A*x - b||^2 + mu*sum(min(|x_k|, theta)).
    % initialization stage
    tic;
    mu_vec        = mu*ones(K, 1);
    x_g           = zeros(K, 1);
    residual_g    = A * x_g - b;
    Gradient_g    = ( residual_g' *A )'; % equals A'*residual_g
    time_g(s, 1)  = toc; % end of initialization

    val_g(s, 1)   = 1/2 * (residual_g'*residual_g) + mu_vec'*min(abs(x_g), theta_vec);
    error_g(s, 1) = norm(x_g - x0)/norm(x0);

    disp(['Proximal MM Algorithm: iteration 0 with value ' num2str(val_g(s, 1)) ', and time ' num2str(time_g(s,1))]);

    % backtracking parameters (loop invariant)
    alpha = 0.5; % factor in the sufficient-decrease test
    beta  = 2;   % c is multiplied by beta after every rejected trial point
    for t = 1: 1 :MaxIter_g
        tic;

        % line search for Lipschitz constant (restarted from c = 1 in every iteration)
        c = 1;
        while 1
            u_g  = x_g - Gradient_g/c; % gradient step with stepsize 1/c
            mu_c = mu_vec/c;           % penalty weight of the proximal subproblem

            % Elementwise proximal subproblem
            %   min_x 1/2*(x - u)^2 + (mu/c)*min(|x|, theta).
            % x1 is the best point with |x| >= theta, x2 the best point with
            % |x| <= theta; h1 and h2 are the corresponding subproblem values.
            % Both must use the same weight mu/c that defines x2, otherwise the
            % comparison below selects the wrong candidate whenever c ~= 1.
            x1 = sign(u_g).*max(theta_vec, abs(u_g));
            h1 = 1/2*(x1 - u_g).*(x1 - u_g) + mu_c.*min(abs(x1), theta_vec);

            x2 = sign(u_g).*min(theta_vec, max(zeros(K,1), abs(u_g) - mu_c));
            h2 = 1/2*(x2 - u_g).*(x2 - u_g) + mu_c.*min(abs(x2), theta_vec);

            % keep, per element, the candidate with the smaller subproblem value
            use_x1        = (h1 <= h2);
            x_new         = x2;
            x_new(use_x1) = x1(use_x1);

            residual_new = A*x_new - b;
            val_g_new    = 0.5*(residual_new'*residual_new) + mu_vec'*min(abs(x_new), theta_vec);

            % accept the trial point if the objective decreases sufficiently,
            % otherwise increase c (i.e., shorten the step) and try again
            step = x_new - x_g;
            if val_g_new <= val_g(s,t) - alpha * c / 2 * (step' * step)
                x_g          = x_new;
                val_g(s,t+1) = val_g_new;
                residual_g   = residual_new;
                Gradient_g   = (residual_g'*A)';
                break;
            else
                c = c * beta;
            end
        end

        time_g(s,t+1) = toc + time_g(s,t); % cumulative time; the error evaluation below is not timed

        error_g(s,t+1) = norm(x_g - x0)/norm(x0);

        disp(['Proximal MM Algorithm: iteration ' num2str(t) ' with value ' num2str(val_g(s,t+1))...
            ', and time ' num2str(time_g(s,t+1))]);
    end
    % release the temporaries once, outside the timed loop
    clear Gradient_g residual_g residual_new x_new u_g mu_c x1 x2 h1 h2 use_x1 val_g_new step c;
    
    %% the majorization-minimization method
    % Each outer iteration fixes the subgradient xi_minus of g_minus at the
    % current point and then runs an inner loop (best-response via FUN_quad
    % plus exact line search) on the resulting subproblem.
    % initialization
    tic;
    d_AtA       = sum(A.^2, 1)'; % diagonal of AtA
    mu_vec      = mu * ones(K, 1);
    mu_vec_normalized = mu_vec ./ d_AtA;
    x_m         = zeros(K, 1);
    residual_m  = A*x_m - b;
    Gradient_m  = (residual_m'*A)'; % equals A'*residual_m
    time_m(s,1) = toc; % end of initialization

    val_m(s,1)  = 1/2*(residual_m'*residual_m) + mu_vec'*min(abs(x_m), theta_vec);
    disp(['Majorization-Minimization: iteration 0 with value ' num2str(val_m(s,1))...
        ', and time ' num2str(time_m(s,1))]);

    MaxIter = 200; % maximum number of iterations in the inner loop of MM (loop invariant)
    for t = 1:1:MaxIter_m
        tic;

        % subgradient of g_minus, kept fixed during the inner loop
        xi_minus = mu_vec.*((x_m>=theta_vec) - (x_m<=-theta_vec));

        for tau  = 1:1:MaxIter
            % compute best-response (FUN_quad is defined outside this file)
            Bx = FUN_quad(x_m - (Gradient_m - xi_minus)./d_AtA, mu_vec_normalized, K);

            % compute stepsize: exact line search along x_dif, projected onto
            % [0, 1]. The lower bound guards against a slightly negative
            % ratio caused by round-off, which would give an ascent step.
            % min() is applied first so that a NaN ratio (0/0) still yields 1.
            x_dif    = (Bx - x_m);
            Ax_dif   = A*x_dif;
            stepsize = (-residual_m'*Ax_dif - (mu_vec'*((abs(Bx) - abs(x_m))) - xi_minus'*x_dif))/(Ax_dif'*Ax_dif);
            stepsize = max(min(stepsize, 1), 0);

            % update variable
            x_m = x_m + stepsize*x_dif;

            % calculate intermediate variables for the next iteration (recursive update)
            residual_m = residual_m + stepsize*Ax_dif;
            Gradient_m = (residual_m'*A)';

            % stop the inner loop once the best-response is close to the
            % point it was computed from
            if x_dif'*x_dif<=10^-8
%                 disp(['Majorization-Minimization: inner number of iterations ' num2str(tau)]);
                break;
            end
        end

        time_m(s,t+1) = toc + time_m(s,t); % cumulative time; the objective evaluation below is not timed

        %     calculate objective value
        val_m(s,t+1)  = 1/2*(residual_m'*residual_m) + mu_vec'*min(abs(x_m), theta_vec);

        % "precision" prints 1 if the objective did not increase in this iteration
        disp(['Majorization-Minimization: iteration ' num2str(t) ' with value ' num2str(val_m(s,t+1))...
            ', and time ' num2str(time_m(s,t+1)) ', and precision ' num2str(val_m(s,t+1)<=val_m(s,t)) ]);
    end
    % release the temporaries once, outside the timed loops
    clear mu_vec mu_vec_normalized d_AtA Gradient_m residual_m xi_minus Bx x_dif Ax_dif stepsize;
    
    %% the parallel STELA algorithm
    % Each iteration recomputes the subgradient xi_minus of g_minus,
    % computes the best-response Bx via FUN_quad, and moves from x_j towards
    % Bx with a stepsize obtained in closed form by exact line search.
    % initialization
    tic;
    d_AtA       = sum(A.^2, 1)'; % diagonal of AtA
    mu_vec      = mu*ones(K, 1);
    mu_vec_normalized = mu_vec./d_AtA;
    x_j         = zeros(K, 1); % initial point of x
    residual_j  = A*x_j - b;
    Gradient_j  = (residual_j'*A)'; % equals A'*residual_j
    time_j(s,1) = toc; % end of initialization

    val_j(s,1)   = 1/2*( residual_j' * residual_j ) + mu_vec' * min(abs(x_j), theta_vec);
    error_j(s,1) = norm(x_j - x0)/norm(x0);

    disp(['Parallel STELA: iteration 0 with value ' num2str(val_j(s,1))...
        ', and time ' num2str(time_j(s,1))]);
    for t = 1:1:MaxIter_j
        tic;

        % subgradient of g_minus
        xi_minus = mu_vec.*((x_j>=theta_vec) - (x_j <= -theta_vec));

        % compute best-response (FUN_quad is defined outside this file)
        Bx = FUN_quad(x_j - (Gradient_j - xi_minus)./d_AtA, mu_vec_normalized, K);

        % compute stepsize: exact line search along x_dif, projected onto
        % [0, 1]. The lower bound guards against a slightly negative ratio
        % caused by round-off, which would give an ascent step. min() is
        % applied first so that a NaN ratio (0/0) still yields 1.
        x_dif    = (Bx - x_j);
        Ax_dif   = A*x_dif;
        stepsize = (-residual_j'*Ax_dif - (mu_vec'*((abs(Bx) - abs(x_j))) - xi_minus'*x_dif))/(Ax_dif'*Ax_dif);
        stepsize = max(min(stepsize, 1), 0);

        % update variable
        x_j = x_j + stepsize * x_dif;

        % calculate intermediate variables for the next iteration (recursive update)
        residual_j = residual_j + stepsize*Ax_dif;
        Gradient_j = (residual_j'*A)';

        time_j(s,t+1) = toc + time_j(s,t); % cumulative time; the evaluations below are not timed

        %     calculate objective value
        val_j(s,t+1)   = 1/2 * ( residual_j' * residual_j ) + mu_vec' *min(abs(x_j), theta_vec);
        error_j(s,t+1) = norm( x_j - x0 ) / norm( x0 );

        % "precision" prints 1 if the objective did not increase in this iteration
        disp(['Parallel STELA: iteration ' num2str(t) ' with value ' num2str(val_j(s,t+1))...
            ', and time ' num2str(time_j(s,t+1)) ', and precision ' num2str(val_j(s,t+1)<=val_j(s,t))]);
    end
    % release the temporaries once, outside the timed loop
    clear mu_vec mu_vec_normalized d_AtA Gradient_j residual_j xi_minus Bx x_dif Ax_dif stepsize;

%     plot(1:K,  x0,  'xk');
%     hold on;
%     plot(1:K,  x_j, 'or');
%     legend('original signal', 'estimated signal');
%     xlabel('signal index');
%     ylabel('signal value');

    
end
clear theta_vec A b mu s t tau;
clear x0 x_j x_g x_m;

%% plot the results, averaged over the Monte Carlo repetitions
figure;

% (a) objective value versus CPU time. Each curve is extended by a point at
% time 0 that carries the initial objective value, so that it starts on the
% vertical axis instead of at the end of the initialization stage.
subplot(3, 1, 1);
semilogy([0 mean(time_j, 1)], [mean(val_j(:,1)) mean(val_j, 1)], 'r', 'Linewidth', 1.5);
hold on; box on;
semilogy([0 mean(time_m, 1)], [mean(val_m(:,1)) mean(val_m, 1)], 'ko--', 'Linewidth', 1.5);
semilogy([0 mean(time_g, 1)], [mean(val_g(:,1)) mean(val_g, 1)], 'b-.', 'Linewidth', 1.5);
legend('STELA (proposed)', 'classic MM method(state-of-the-art)', 'proximal MM algorithm (state-of-the-art)');
xlabel('(a): CPU time (seconds)'); ylabel('function value');

% (b) relative error versus CPU time (same time-0 extension as in (a))
subplot(3, 1, 2);
semilogy([0 mean(time_j, 1)], [mean(error_j(:,1)) mean(error_j, 1)], 'r', 'Linewidth', 1.5);
hold on;
semilogy([0 mean(time_g, 1)], [mean(error_g(:,1)) mean(error_g, 1)], 'b-.', 'Linewidth', 1.5);
legend('STELA (proposed)', 'proximal MM algorithm (state-of-the-art)');
xlabel('(b): CPU time (seconds)'); ylabel('error in variable');

% (c) relative error versus the iteration index. Column 1 of error_* holds
% the error at the initial point, so the horizontal axis starts at
% iteration 0 rather than at the column index 1.
subplot(3, 1, 3);
semilogy(0:size(error_j, 2) - 1, mean(error_j, 1), 'r', 'Linewidth', 1.5);
hold on;
semilogy(0:size(error_g, 2) - 1, mean(error_g, 1), 'b-.', 'Linewidth', 1.5);
legend('STELA (proposed)', 'proximal MM algorithm (state-of-the-art)');
xlabel('(c): number of iterations'); ylabel('error in variable');