GSP_LEARN_GRAPH_L2_DEGREES - Learn graph from pairwise distances using l2 prior on nodes degrees

Program code:

function [W, stat] = gsp_learn_graph_l2_degrees(Z, a, params)
%GSP_LEARN_GRAPH_L2_DEGREES Learn graph from pairwise distances using l2 prior on nodes degrees
%   Usage:  [W, stat] = gsp_learn_graph_l2_degrees(Z, a)
%           [W, stat] = gsp_learn_graph_l2_degrees(Z, a, params)
%
%   Inputs:
%         Z         : Matrix with (squared) pairwise distances of nodes
%         a         : L||_F^2 prior constant   (bigger a -> more dense W)
%         params    : Optional parameters
%
%   Outputs:
%         W         : Weighted adjacency matrix
%         stat      : Optional output statistics (adds small overhead)
%
%   'W = gsp_learn_graph_l2_degrees(Z, a, params)' computes a weighted
%   adjacency matrix W from squared pairwise distances in Z, using the
%   smoothness assumption that text{trace}(X^TLX) is small, where X is
%   the data (columns) changing smoothly from node to node on the graph and
%   L = D-W is the combinatorial graph Laplacian.
%
%   Alternatively, Z can contain other types of distances and use the
%   smoothness assumption that
%
%      sum(sum(W .* Z))
%
%   is small. 
%
%   The minimization problem solved is 
%
%      minimize_W sum(sum(W .* Z)) + a/2 * ||W||_F^2/2 
%                 + a/2 * ||sum(W)||_2^2 
%                 s.t. sum(sum(W)) == n
%
%   subject to W being a valid weighted adjacency matrix (non-negative,
%   symmetric, with zero diagonal). Note that 
%
%      .||W||_F^2/2 + ||sum(W)||_2^2  = ||L||^2
%
%   The algorithm used is forward-backward-forward (FBF) based primal dual
%   optimization (see references).
%
%   Example:
%
%         G = gsp_sensor(256);
%         f1 = @(x,y) sin((2-x-y).^2);
%         f2 = @(x,y) cos((x+y).^2);
%         f3 = @(x,y) (x-.5).^2 + (y-.5).^3 + x - y;
%         f4 = @(x,y) sin(3*((x-.5).^2+(y-.5).^2));
%         X = [f1(G.coords(:,1), G.coords(:,2)), f2(G.coords(:,1), G.coords(:,2)), f3(G.coords(:,1), G.coords(:,2)), f4(G.coords(:,1), G.coords(:,2))];
%         figure; subplot(2,2,1); gsp_plot_signal(G, X(:,1)); title('1st smooth signal');
%         subplot(2,2,2); gsp_plot_signal(G, X(:,2)); title('2nd smooth signal');
%         subplot(2,2,3); gsp_plot_signal(G, X(:,3)); title('3rd smooth signal');
%         subplot(2,2,4); gsp_plot_signal(G, X(:,4)); title('4th smooth signal');
%         Z = gsp_distanz(X').^2;
%         [W] = gsp_learn_graph_l2_degrees(Z*25, 1);
%         W(W<1e-5) = 0;
%         G2 = gsp_update_weights(G, W);
%         figure; gsp_plot_graph(G2); title('Graph with edges learned from above 4 signals');
%       
%
%   Additional parameters
%   ---------------------
%  
%    params.W_init   : Initialization point. default: zeros(size(Z))
%    verbosity       : Above 1 adds a small overhead. Default: 1
%    maxit           : Maximum number of iterations. Default: 1000
%    tol             : Tolerance for stopping criterion. Default: 1e-5
%    step_size       : Step size from the interval (0,1). Default: 0.5
%    fix_zeros       : Fix a set of edges to zero (true/false)
%    edge_mask       : Mask indicating the non zero edges if "fix_zeros"
%
%   The stopping criterion is whether both relative primal and dual
%   distance between two iterations are below a given tolerance. 
%   
%   To set the step size use the following rule of thumb: Set it so that
%   relative change of primal and dual converge with similar rates (use
%   verbosity > 1).
%
%
%   See also: gsp_learn_graph_log_degrees gsp_distanz gsp_update_weights
% 
%   References:
%     V. Kalofolias. How to learn a graph from smooth signals. Technical
%     report, AISTATS 2016: proceedings at Journal of Machine Learning
%     Research (JMLR)., 2016.
%     
%     N. Komodakis and J.-C. Pesquet. Playing with duality: An overview of
%     recent primal? dual approaches for solving large-scale optimization
%     problems. Signal Processing Magazine, IEEE, 32(6):31--54, 2015.
%     
%     V. Kalofolias and N. Perraudin. Large Scale Graph Learning from Smooth
%     Signals. arXiv preprint arXiv:1710.05654, 2017.
%     
%
%
%   Url: https://epfl-lts2.github.io/gspbox-html/doc/learn_graph/gsp_learn_graph_l2_degrees.html

% Copyright (C) 2013-2016 Nathanael Perraudin, Johan Paratte, David I Shuman.
% This file is part of GSPbox version 0.7.5
%
% This program is free software: you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation, either version 3 of the License, or
% (at your option) any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with this program.  If not, see <http://www.gnu.org/licenses/>.

% If you use this toolbox please kindly cite
%     N. Perraudin, J. Paratte, D. Shuman, V. Kalofolias, P. Vandergheynst,
%     and D. K. Hammond. GSPBOX: A toolbox for signal processing on graphs.
%     ArXiv e-prints, Aug. 2014.
% http://arxiv.org/abs/1408.5781

% Author: Vassilis Kalofolias
% Testing: gsp_test_learn_graph
% Date: June 2015


if nargin < 3
    params = struct;
end

%% Default parameters
if not(isfield(params, 'verbosity')),   params.verbosity = 1;   end
if not(isfield(params, 'maxit')),       params.maxit = 200;      end
if not(isfield(params, 'tol')),         params.tol = 1e-5;      end
if not(isfield(params, 'step_size')),   params.step_size = .5;      end     % from (0, 1)
if not(isfield(params, 'fix_zeros')),   params.fix_zeros = false;   end
if not(isfield(params, 'max_w')),       params.max_w = inf;         end


%% Fix parameter size and initialize
if isvector(Z)
    z = Z;  % lazy copying of matlab doesn't allocate new memory for z
else
    z = squareform_sp(Z);
end

z = z(:);
l = length(z);                   % number of edges
% n(n-1)/2 = l => n = (1 + sqrt(1+8*l))/ 2
n = round((1 + sqrt(1+8*l))/ 2);    % number of nodes

if isfield(params, 'w_0')
    if not(isfield(params, 'c'))
        error('When params.w_0 is specified, params.c should also be specified');
    else
        c = params.c;
    end
    if isvector(params.w_0)
        w_0 = params.w_0;
    else
        w_0 = squareform_sp(params.w_0);
    end
    w_0 = w_0(:);
else
    w_0 = 0;
    c = 0;
end

% if sparsity pattern is fixed we optimize with respect to a smaller number
% of variables, all included in w
if params.fix_zeros
    if not(isvector(params.edge_mask))
        params.edge_mask = squareform_sp(params.edge_mask);
    end
    % use only the non-zero elements to optimize
    ind = find(params.edge_mask(:));
    z = full(z(ind));
    if not(isscalar(w_0))
        w_0 = full(w_0(ind));
    end
else
    z = full(z);
    w_0 = full(w_0);
end

w = zeros(size(z));

% number of edges over which we finally optimize (l2 <= l)
l2 = length(w);


%% Needed operators
% S*w = sum(W)
if params.fix_zeros
    [S, St] = sum_squareform(n, params.edge_mask(:));
else
    [S, St] = sum_squareform(n);
end

ones_l2 = ones(l2,1);

sum_op = @(w) S*w;
sum_t_op = @(z) St*z;

% edges -> scalar
K_op = @(w) 2*sum(w);

% scalar -> edges
Kt_op = @(z) 2 * z * ones_l2;

norm_K = 2 * sqrt(l2);
%norm_L = l2;         % do i need the square norm or the square root?

%% TODO: Normalization??

%% Learn the graph
% min_{W>=0}     tr(X'*L*X) - gc * sum(log(sum(W))) + gp * norm(W-W0,'fro')^2, where L = diag(sum(W))-W
% min_W       I{W>=0} + W(:)'*Dx(:)  - gc * sum(log(sum(W))) + gp * norm(W-W0,'fro')^2
% min_W                f(W)          +       g(L_op(W))      +   h(W)

% put proximal of trace plus positivity together
f.eval = @(w) 2*w'*z;    % half should be counted
%f.eval = @(W) 0;
f.prox = @(w, c) min(params.max_w, max(0, w - 2*c*z));  % all change the same

% projection of sum of W on n
% g(w) = IND(sum(w) == n)
g.eval = @(z) 1000 * abs(z-n);  % this could be 0. Measures violation of the condition sum(w) = n!!
g.prox = @(z, c) n;
% proximal of conjugate of g: z - c*g.prox(z/c, 1/c)
g_star_prox = @(z, c) z - c*n;

if w_0 == 0
	% "if" not needed, for c = 0 both are the same but gains speed
    % the following is half the squared frobenius norm of the graph Laplacian
    h.eval = @(w) a/2 * (2*norm(w)^2 + norm(sum_op(w))^2);   % CAREFUL: w two times!
    h.grad = @(w) a * (2*w + sum_t_op(sum_op(w)));
    % = (a*2*I + a*St*S)*w
    % Lipschitz constant of h.grad:   a * norm(2*eye(n) + 2*ones(n))
    h.beta = 2 * a * n;    % a*2 + a*(2*(n-1))
    % = normest(a * (2*speye(l) + sum_t_op(sum_op(speye(l)))));
else
    % the following is half the squared frobenius norm of the graph Laplacian
    h.eval = @(w) a/2 * (2*norm(w)^2 + norm(sum_op(w))^2) + c/2 * (2*norm(w-w_0));   % CAREFUL: norm(W, 'fro') = 2*norm(w) two times!
    h.grad = @(w) a * (2*w + sum_t_op(sum_op(w))) + c*2*(w-w_0);
    % = (a*2*I + a*St*S + c*2*I)*w  +  const(w)
    % Lipschitz constant of h.grad:   norm(2*a*eye(n) + 2*a*ones(n) + 2*c*eye(n))
    h.beta = a*2*(n-1) + 2*(a+c);    % norm(ones(n) + c*eye(n)) = n+c
end    
    
    
% if there is no quadratic term, it reduces to a linear program:
if a == 0 && w_0 == 0
    %   X =     linprog(f,   A,   b,   Aeq,           beq,LB,UB,X0)
    [w, stat.fval, stat.exitflag, stat.output, stat.lambda] = linprog(z, [], [], ones(size(z))', 100*n, zeros(size(z)));
else
    %% My custom FBF based primal dual (see [1] = [Komodakis, Pesquet])
    % parameters mu, epsilon for convergence (see [1])
    mu = h.beta + norm_K;     %TODO: is it squared or not??
    epsilon = lin_map(0.0, [0, 1/(1+mu)], [0,1]);   % in (0, 1/(1+mu) )
    
    % INITIALIZATION
    % primal variable ALREADY INITIALIZED
    %w = params.w_init;
    % dual variable
    v_n = K_op(w);
    if nargout > 1 || params.verbosity > 1
        stat.f_eval = nan(params.maxit, 1);
        stat.g_eval = nan(params.maxit, 1);
        stat.h_eval = nan(params.maxit, 1);
        stat.fgh_eval = nan(params.maxit, 1);
        stat.pos_violation = nan(params.maxit, 1);
    end
    if params.verbosity > 1
        fprintf('Relative change of primal, dual variables, and objective fun\n');
    end
    tic
    gn = lin_map(params.step_size, [epsilon, (1-epsilon)/mu], [0,1]);              % in [epsilon, (1-epsilon)/mu]
    for i = 1:params.maxit
        %Y_n = w - gn * (h.grad(w) + Kt_op(v_n));
        Y_n = w - gn * (h.grad(w) + 2*v_n);
        y_n = v_n + gn * (K_op(w));
        P_n = f.prox(Y_n, gn);
        p_n = g_star_prox(y_n, gn); % = y_n - gn*g_prox(y_n/gn, 1/gn)
        %Q_n = P_n - gn * (h.grad(P_n) + Kt_op(p_n));
        Q_n = P_n - gn * (h.grad(P_n) + 2*p_n);
        q_n = p_n + gn * (K_op(P_n));
        
        if nargout > 1 || params.verbosity > 2
            stat.f_eval(i) = f.eval(w);
            stat.g_eval(i) = g.eval(K_op(w));
            stat.h_eval(i) = h.eval(w);
            stat.fgh_eval(i) = stat.f_eval(i) + stat.g_eval(i) + stat.h_eval(i);
            stat.pos_violation(i) = -sum(min(0,w));
        end
        rel_norm_primal = norm(- Y_n + Q_n, 'fro')/norm(w, 'fro');
        rel_norm_dual = norm(- y_n + q_n)/norm(v_n);
        
        if params.verbosity > 2
            fprintf('iter %4d: %6.4e   %6.4e   %6.4e    %6.4e   %6.3e \n', i, rel_norm_primal, rel_norm_dual, stat.fgh_eval(i), full(sum(w)*2), stat.h_eval(i));
        elseif params.verbosity > 1
            fprintf('iter %4d: %6.4e   %6.4e\n', i, rel_norm_primal, rel_norm_dual);
        end
        
        w = w - Y_n + Q_n;
        v_n = v_n - y_n + q_n;
        
        if rel_norm_primal < params.tol && rel_norm_dual < params.tol
            break
        end
    end
    stat.time = toc;
    if params.verbosity > 0
        fprintf('# iters: %4d. Rel primal: %6.4e Rel dual: %6.4e   %6.3e\n', i, rel_norm_primal, rel_norm_dual, f.eval(w) + g.eval(K_op(w)) + h.eval(w));
        fprintf('Time needed is %f seconds\n', stat.time);
    end

% Use the following for testing:
%     g.L = K_op;
%     g.Lt = Kt_op;
%     g.norm_L = norm_K;
%     [w, info] = fbf_primal_dual(w, f, g, h, params);
    %[w, info] = fb_based_primal_dual(w, f, g, h, params);
    %%
    
    if params.verbosity > 3
        figure; plot(real([stat.f_eval, stat.g_eval, stat.h_eval])); hold all; plot(real(stat.fgh_eval), '.'); legend('f = w''*dx2', 'g = 1000 * abs(2sum(w)-n)', 'h = a*norm(L)^2', 'f+g+h');
        figure; plot(stat.pos_violation); title('sum of negative (invalid) values per iteration')
        figure; semilogy(max(0,-diff(real(stat.fgh_eval'))),'b.-'); hold on; semilogy(max(0,diff(real(stat.fgh_eval'))),'ro-'); title('|f(i)-f(i-1)|'); legend('going down','going up');
    end
end

if params.fix_zeros
    w = sparse(ind, ones(size(ind)), w, l, 1);
end

if isvector(Z)
    W = w;
else
    W = squareform_sp(w);
end