function w = median_regression(X, Y, lambda, k, estMethod, Sigma)
%{
Median regression: Linear regression for heavy tails.
Implements the regression algorithm from Hsu & Sabato,
"Heavy-tailed Regression with a Generalized Median-of-means", ICML 2014.
Output: a linear predictor w that hopes to have small
squared loss when predicting a label of a d-dimensional vector x by x'*w.
Input:
X is a matrix m x d of training examples, where m is number of examples and d is the dimension. Every row is one training example.
Y is a 1 x m vector of labels for the training examples.
lambda is a non-negative scalar, defining regularization.
k is a scalar which defines the number of parts the sample will be split into. Theoretically, k = O(log(1/delta)). In practice,
set k to a number larger than 2, and smaller than m/d.
If k does not divide m, mod(m,k) samples will remain unused.
estMethod determines how the covariance matrix will be estimated:
estMethod = 1: estimated once for the entire sample (good when X are not too heavy tailed)
estMethod = 2: estimated using the full random procedure (good when X are very heavy tailed)
estMethod = 3: Not estimated; in this case the last argument "Sigma" will
be used instead, where Sigma needs to be a d x d matrix. This is sometimes useful for testing purposes.
Sigma: If estMethod = 3, a d x d matrix, should be the covariance of the
distribution. If estMethod is anything else, Sigma is not used and may be omitted.
Code by Sivan Sabato.
Release date: June 19th, 2014.
%}
%randomizing the order of the examples before splitting them into k groups
[m,d] = size(X);
perm = randperm(m);
X = X(perm,:);
Y = Y(perm);
smallm = floor(m/k); %group size; the trailing mod(m,k) samples go unused
W = zeros(k,d); %row i will hold the ridge-regression solution of group i
for i = 1:k
    idx = (smallm*(i-1)+1):(smallm*i);
    z = myregression(X(idx,:), Y(idx), lambda);
    W(i,:) = z';
end
switch estMethod
    case {1, 3}
        %Cases 1 and 3 share the same distance formula; they differ only in
        %where the covariance estimate Sigma comes from.
        if (estMethod == 1)
            Sigma = X'*X; %a single covariance estimate from the entire sample
        end
        M = Sigma + lambda * eye(d);
        covariances = W*M*W'; %entry (i,j) is w_i'*M*w_j
        %(w_i - w_j)'M(w_i - w_j) = w_i'Mw_i + w_j'Mw_j - 2*w_i'Mw_j
        sqnorms = diag(covariances)*ones(1,k);
        distances = sqnorms + sqnorms' - 2*covariances;
    case 2
        %reusing the sample to estimate the distances: this gives similar
        %theoretical guarantees to the ones in the paper.
        numsplits = k-1;
        X = X(randperm(m),:); %not sure this is needed, but can't hurt
        smallm = floor(m/numsplits);
        %preallocated; named "covs" rather than "cov" so the builtin cov()
        %is not shadowed
        covs = zeros(d,d,numsplits);
        distances = zeros(k,k);
        for i = 1:numsplits
            Xsmall = X((smallm*(i-1)+1):(smallm*i),:);
            covs(:,:,i) = Xsmall' * Xsmall;
        end
        %create distance matrix. Every row of the matrix has
        %independent estimations within the row, but between rows there
        %are dependencies.
        for i = 1:k
            for j = 1:k
                if (j == i)
                    distances(i,j) = 0;
                    continue;
                end
                %pick a covariance estimate independent of both w_i and w_j:
                %skip split i by shifting the index for j > i
                if (j < i)
                    numcov = j;
                else
                    numcov = j-1;
                end
                dw = W(i,:) - W(j,:);
                distances(i,j) = dw*(covs(:,:,numcov) + lambda * eye(d))*dw';
            end
        end
    otherwise
        error('No such estimation method\n');
end
%the generalized median-of-means step: keep the candidate whose median
%distance to the other candidates is smallest
index = generalized_median(distances);
w = W(index,:)';
end
function [ w, emploss ] = myregression(X, Y, lambda)
%Ridge regression on one group of examples.
%X is m x d (one example per row); Y holds the m labels and may be either
%1 x m (as documented for the caller) or m x 1.
%lambda is the non-negative regularization coefficient.
%Returns w, the d x 1 regularized least-squares solution, and emploss,
%the mean squared training loss of w on (X, Y).
[m,d] = size(X);
%Reshape the labels to a column: the normal equations below need m x 1.
%The original code errored out on the documented 1 x m row shape.
Y = Y(:);
covmat = (X' * X)/m + lambda * eye(d); %d x d regularized second-moment matrix
cor = (X' * Y)/m; %d x 1 (the original "%1 x d" comment was wrong)
%pinv tolerates a singular covmat (e.g. when the group has fewer than d
%examples and lambda = 0)
w = pinv(covmat)*cor;
emploss = sum((X*w - Y).^2)/m;
end
function index = generalized_median(distances)
%Generalized median selection: return the index of the column whose
%median entry is smallest.
%distances is a k x t matrix (t can be k, or smaller if it is a
%random sample from the distances); column j collects the estimated
%distances from candidate j to the other candidates.
columnMedians = median(distances); %1 x t vector of per-column medians
[~, index] = min(columnMedians);
end