function [sol,b,obj] = primal_svm(linear,Y,lambda,opt) % [SOL, B] = PRIMAL_SVM(LINEAR,Y,LAMBDA,OPT) % Solves the SVM optimization problem in the primal (with quatratic % penalization of the training errors). % % If LINEAR is 1, a global variable X containing the training inputs % should be defined. X is an n x d matrix (n = number of points). % If LINEAR is 0, a global variable K (the n x n kernel matrix) should be defined. % Y is the target vector (+1 or -1, length n). % LAMBDA is the regularization parameter ( = 1/C) % % IF LINEAR is 0, SOL is the expansion of the solution (vector beta of length n). % IF LINEAR is 1, SOL is the hyperplane w (vector of length d). % B is the bias % The outputs on the training points are either K*SOL+B or X*SOL+B % OBJ is the objective function value % % OPT is a structure containing the options (in brackets default values): % cg: Do not use Newton, but nonlinear conjugate gradients [0] % lin_cg: Compute the Newton step with linear CG % [0 unless solving sparse linear SVM] % iter_max_Newton: Maximum number of Newton steps [20] % prec: Stopping criterion % cg_prec and cg_it: stopping criteria for the linear CG. % Copyright Olivier Chapelle, olivier.chapelle@tuebingen.mpg.de % Last modified 25/08/2006 if nargin < 4 % Assign the options to their default values opt = []; end; if ~isfield(opt,'cg'), opt.cg = 0; end; if ~isfield(opt,'lin_cg'), opt.lin_cg = 0; end; if ~isfield(opt,'iter_max_Newton'), opt.iter_max_Newton = 20; end; if ~isfield(opt,'prec'), opt.prec = 1e-6; end; if ~isfield(opt,'cg_prec'), opt.cg_prec = 1e-4; end; if ~isfield(opt,'cg_it'), opt.cg_it = 20; end; % Call the right function depending on problem type and CG / Newton % Also check that X / K exists and that the dimension of Y is correct if linear global X; if isempty(X), error('Global variable X undefined'); end; [n,d] = size(X); if issparse(X), opt.lin_cg = 1; end; if size(Y,1)~=n, error('Dimension error'); end; if ~opt.cg [sol,obj] = primal_svm_linear (Y,lambda,opt); else [sol,obj] = primal_svm_linear_cg(Y,lambda,opt); end; else global K; if isempty(K), error('Global variable K undefined'); end; n = size(Y,1); if any(size(K)~=n), error('Dimension error'); end; if ~opt.cg [sol,obj] = primal_svm_nonlinear (Y,lambda,opt); else [sol,obj] = primal_svm_nonlinear_cg(Y,lambda,opt); end; end; % The last component of the solution is the bias b. b = sol(end); sol = sol(1:end-1); fprintf('\n'); function [w,obj] = primal_svm_linear(Y,lambda,opt) % ------------------------------- % Train a linear SVM using Newton % ------------------------------- global X; [n,d] = size(X); w = zeros(d+1,1); % The last component of w is b. iter = 0; out = ones(n,1); % Vector containing 1-Y.*(X*w) while 1 iter = iter + 1; if iter > opt.iter_max_Newton; warning(sprintf(['Maximum number of Newton steps reached.' ... 'Try larger lambda'])); break; end; [obj, grad, sv] = obj_fun_linear(w,Y,lambda,out); % Compute the Newton direction either exactly or by linear CG if opt.lin_cg % Advantage of linear CG when using sparse input: the Hessian is never % computed explicitly. [step, foo, relres] = minres(@hess_vect_mult, -grad,... opt.cg_prec,opt.cg_it,[],[],[],sv,lambda); else Xsv = X(sv,:); hess = lambda*diag([ones(d,1); 0]) + ... % Hessian [[Xsv'*Xsv sum(Xsv,1)']; [sum(Xsv) length(sv)]]; step = - hess \ grad; % Newton direction end; % Do an exact line search [t,out] = line_search_linear(w,step,out,Y,lambda); w = w + t*step; fprintf(['Iter = %d, Obj = %f, Nb of sv = %d, Newton decr = %.3f, ' ... 'Line search = %.3f'],iter,obj,length(sv),-step'*grad/2,t); if opt.lin_cg fprintf(', Lin CG acc = %.4f \n',relres); else fprintf(' \n'); end; if -step'*grad < opt.prec * obj % Stop when the Newton decrement is small enough break; end; end; function [w, obj] = primal_svm_linear_cg(Y,lambda,opt) % ----------------------------------------------------- % Train a linear SVM using nonlinear conjugate gradient % ----------------------------------------------------- global X; [n,d] = size(X); w = zeros(d+1,1); % The last component of w is b. iter = 0; out = ones(n,1); % Vector containing 1-Y.*(X*w) go = [X'*Y; sum(Y)]; % -gradient at w=0 s = go; % The first search direction is given by the gradient while 1 iter = iter + 1; if iter > opt.cg_it * min(n,d) warning(sprintf(['Maximum number of CG iterations reached. ' ... 'Try larger lambda'])); break; end; % Do an exact line search [t,out] = line_search_linear(w,s,out,Y,lambda); w = w + t*s; % Compute the new gradient [obj, gn] = obj_fun_linear(w,Y,lambda,out); gn=-gn; fprintf('Iter = %d, Obj = %f, Norm of grad = %.3f \n',iter,obj,norm(gn)); % Stop when the relative decrease in the objective function is small if t*s'*go < opt.prec*obj, break; end; % Flecher-Reeves update. Change 0 in 1 for Polack-Ribiere be = (gn'*gn - 0*gn'*go) / (go'*go); s = be*s+gn; go = gn; end; function [obj, grad, sv] = obj_fun_linear(w,Y,lambda,out) % Compute the objective function, its gradient and the set of support vectors % Out is supposed to contain 1-Y.*(X*w) global X out = max(0,out); w0 = w; w0(end) = 0; % Do not penalize b obj = sum(out.^2)/2 + lambda*w0'*w0/2; % L2 penalization of the errors grad = lambda*w0 - [((out.*Y)'*X)'; sum(out.*Y)]; % Gradient sv = find(out>0); function y = hess_vect_mult(w,sv,lambda) % Compute the Hessian times a given vector x. % hess = lambda*diag([ones(d-1,1); 0]) + (X(sv,:)'*X(sv,:)); global X y = lambda*w; y(end) = 0; z = (X*w(1:end-1)+w(end)); % Computing X(sv,:)*x takes more time in Matlab :-( zz = zeros(length(z),1); zz(sv)=z(sv); y = y + [(zz'*X)'; sum(zz)]; function [t,out] = line_search_linear(w,d,out,Y,lambda) % From the current solution w, do a line search in the direction d by % 1D Newton minimization global X t = 0; % Precompute some dots products Xd = X*d(1:end-1)+d(end); wd = lambda * w(1:end-1)'*d(1:end-1); dd = lambda * d(1:end-1)'*d(1:end-1); while 1 out2 = out - t*(Y.*Xd); % The new outputs after a step of length t sv = find(out2>0); g = wd + t*dd - (out2(sv).*Y(sv))'*Xd(sv); % The gradient (along the line) h = dd + Xd(sv)'*Xd(sv); % The second derivative (along the line) t = t - g/h; % Take the 1D Newton step. Note that if d was an exact Newton % direction, t is 1 after the first iteration. if g^2/h < 1e-10, break; end; % fprintf('%f %f\n',t,g^2/h) end; out = out2; function [beta,obj] = primal_svm_nonlinear(Y,lambda,opt) % ----------------------------------- % Train a non-linear SVM using Newton % ----------------------------------- global K training = find(Y); % The points with 0 are ignored. n = length(training); % The real number of training points if n>=1000 % Train a subset first perm = randperm(n); ind = training(perm(1:round(.75*n))); % Take a random subset of size n/4 Y2 = Y; Y2(ind) = 0; beta = primal_svm_nonlinear(Y2,lambda,opt); sv = find(beta(1:end-1)~=0); Kb = K(training,sv)*beta(sv); % Kb will always contains K times the current beta else sv = training; beta = zeros(length(Y)+1,1); % The last component of beta is b. Kb = zeros(n,1); end; iter = 0; % If the set of support vectors has changed, we need to reiterate. while 1 old_sv = sv; % Computing the objective function out = 1 - Y(training) .* (Kb+beta(end)); sv = training(out > 0); obj = (lambda*beta(training)'*Kb + sum(max(0,out).^2)) / 2; iter = iter + 1; % If the set of support vectors doesn't change, we can't improve anymore if (iter > 1) & isempty(setxor(sv,old_sv)), break; end; if iter > opt.iter_max_Newton warning(sprintf(['Maximum number of Newton steps reached. ' ... 'Try larger lambda'])); break; end; H = K(sv,sv) + lambda*eye(length(sv)); cte_for_b = mean(diag(K)); H(end+1,:) = cte_for_b; % To take the bias into account H(:,end+1) = cte_for_b; % The actual value of this constant does not matter. H(end,end) = 0; % For numerical reasons, take it of the order of K. % Beta_new would be the new vevtor beta is the full Newton step is taken beta_new = zeros(length(Y)+1,1); if opt.lin_cg [beta_new([sv; end]), foo1, relres] = minres(H,[Y(sv);0],opt.cg_prec,opt.cg_it); else beta_new([sv; end]) = H\[Y(sv);0]; end; beta_new(end) = beta_new(end) * cte_for_b; % Do line search, but with a preference for a full Newton step step = beta_new - beta; [t, Kb] = line_search_nonlinear(step([training; end]),Kb,beta(end),Y,lambda,1); beta = beta + t*step; fprintf('n = %d, iter = %d, obj = %f, nb of sv = %d, line srch = %.4f',... [n iter obj length(sv) t]); if opt.lin_cg fprintf(', Lin CG acc = %.4f \n',relres); else fprintf(' \n'); end; end; sol = beta; function [beta, obj] = primal_svm_nonlinear_cg(Y,lambda,opt) % ----------------------------------------------------- % Train a linear SVM using nonlinear conjugate gradient % ----------------------------------------------------- global K; n = length(K); beta = zeros(n+1,1); % The last component of beta is b. iter = 0; Kb = zeros(n,1); % Kb will always contains K times the current beta go = [Y; sum(Y)]; % go = -gradient at beta=0 s = go; % Initial search direction Kgo = [K*Y; sum(Y)]; % We use the preconditioner [[K 0]; [0 1]] Ks = Kgo(1:end-1); % Ks will always contain K*s(1:end-1) while 1 iter = iter + 1; if iter > opt.cg_it * n warning(sprintf(['Maximum number of CG iterations reached. ' ... 'Try larger lambda'])); break; end; % Do an exact line search [t,Kb] = line_search_nonlinear(s,Kb,beta(end),Y,lambda,0,Ks); beta = beta + t*s; % Compute new gradient and objective. % Note that the gradient is already "divided" by the preconditioner [obj, grad] = obj_fun_nonlinear(beta,Y,lambda,Kb); gn = -grad; fprintf('Iter = %d, Obj = %f, Norm grad = %f \n',iter,obj,norm(gn)); % Stop when the relative decrease in the objective function is small if t*s'*Kgo < opt.prec*obj, break; end; Kgn = [K*gn(1:end-1); gn(end)]; % Multiply by the preconditioner % -> Kgn is the real gradient % Flecher-Reeves update. Change 0 in 1 for Polack-Ribiere be = (Kgn'*gn - 0*Kgn'*go) / (Kgo'*go); % be = (gn'*gn - gn'*go) / (go'*go); s = be*s+gn; Ks = be*Ks + Kgn(1:end-1); go = gn; Kgo = Kgn; end; function [t, Kb] = line_search_nonlinear(step,Kb,b,Y,lambda,fullstep,Ks) % Given the current solution (as given by Kb), do a line sesrch in % direction step. First try to take a full step if fullstep = 1. global K; training = find(Y~=0); act = find(step(1:end-1)); % The set of points for which beta change if nargin<7 Ks = K(training,training(act))*step(act); end; Kss = step(act)'*Ks(act); % Precompute some dot products Kbs = step(act)'*Kb(act); t = 0; Y = Y(training); % Compute the objective function for t=1 out = 1-Y.*(Kb+b+Ks+step(end)); sv = out>0; obj1 = (lambda*(2*Kbs+Kss)+sum(out(sv).^2))/2; while 1 out = 1-Y.*(Kb+b+t*(Ks+step(end))); sv = out>0; % The objective function and the first derivative (along the line) obj = (lambda*(2*t*Kbs+t^2*Kss)+sum(out(sv).^2))/2; g = lambda * (Kbs+t*Kss) - (Ks(sv)'+step(end))*(Y(sv).*out(sv)); if fullstep & (t==0) & (obj-obj1 > -0.2*g) % First check t=1: if it works, keep it -> sparser solution t = 1; break; end; % The second derivative (along the line) h = lambda*Kss + norm(Ks(sv)+step(end))^2; % fprintf('%d %f %f %f\n',length(find(sv)),t,obj,g^2/h); % Take the 1D Newton step t = t - g/h; if g^2/h < 1e-10, break; end; end; Kb = Kb + t*Ks; function [obj, grad] = obj_fun_nonlinear(beta,Y,lambda,Kb) global K; out = Kb+beta(end); sv = find(Y.*out < 1); % Objective function... obj = (lambda*beta(1:end-1)'*Kb + sum((1-Y(sv).*out(sv)).^2)) / 2; % ... and preconditioned gradient grad = [lambda*beta(1:end-1); sum(out(sv)-Y(sv))]; grad(sv) = grad(sv) + (out(sv)-Y(sv)); % To compute the real gradient, one would have to execute the following line % grad = [K*grad(1:end-1); grad(end)];