function [inputs, cost, n_expand, n_eval] = ...
    BFS(X, y, costfunction, use_compound, epsilon, k, initial, varargin)
% BFS   Variable selection with best-first search
% [inputs, cost, n_expand, n_eval] = BFS(X, y, costfunction, ...
%                      use_compound, epsilon, k, initial, varargin)
%
% Operators in the state space are addition and deletion of a single
% variable, and optionally also compound operators. Operators correspond
% to edges in the graph. Nodes are sets of variables.
%
% Mandatory parameters:
%   X              Nxd data matrix (d is dimension, N is number of samples)
%   y              Nx1 output vector
%   costfunction   Handle to a cost function c=costfunction(X,y,par...),
%                  called with the selected columns of X, the output y
%                  and any extra arguments given after 'initial'.
%
% Optional parameters:
%   use_compound   1 if compound operators are to be used, 0 if not.
%                  Compound operators affect the inclusion or exclusion
%                  of several variables at the same time, and often make
%                  the search faster. Default: 1.
%   epsilon        A new node is better than the current best node if
%                  (1+epsilon) * new_cost < current_best. In other words,
%                  the new node is penalized. Epsilon should only be
%                  given non-negative values. Negative values are
%                  corrected to the default value. Default: 0.001.
%   k              Limit for stale search. If the best node has remained
%                  the same for the last k node expansions, the search
%                  terminates. Default: 5.
%   initial        Initial set of variables (variable indices).
%                  Default: the empty set ([]).
%
%   Finally, a number of arguments can (or must) be given to the cost
%   function (depends on the cost function). These are given after all
%   other arguments.
%
%   If any of the optional parameters is specified, all the previous ones
%   must also be specified. Use [] for a default value.
%
% Return values:
%   inputs         Chosen input variables (vector of indices 1--d)
%   cost           Cost corresponding to the chosen variables
%   n_expand       Number of nodes expanded during the search
%   n_eval         Number of nodes evaluated (number of calls to the cost
%                  function)
%
% Examples (when KNN is a cost function with optional parameter kmax):
%   [inputs,cost] = BFS(X, y ,@KNN)
%      Default parameters
%   [inputs,cost] = BFS(X, y ,@KNN, [], [], [], [], 5)
%      Default parameters for BFS, kmax=5 is given to KNN
%   [inputs,cost] = BFS(X, y ,@KNN, 0)
%      Disable compound operators
%   [inputs,cost,n_expand,n_eval] = BFS(X, y ,@KNN, [], [], 10, 1:size(X,2))
%      Start search from full set of variables, with stale search limit
%      raised to 10. Also stores number of nodes expanded and number of
%      states evaluated.

default_compound = 1;
default_epsilon = 0.001;
default_initial = [];
default_k = 5;

% Fall back to defaults for omitted or malformed ([] / non-scalar) options.
if nargin < 7
  initial = default_initial;
end
if nargin < 6 || length(k) ~= 1
  k = default_k;
end
if nargin < 5 || length(epsilon) ~= 1 || epsilon < 0
  epsilon = default_epsilon;
end
if nargin < 4 || length(use_compound) ~= 1
  use_compound = default_compound;
end

d = size(X, 2);

% Each state is stored as a logical row vector, where true means that the
% corresponding feature is selected.
initial_b = false(1, d);
initial_b(initial) = true;

% Index of the first row of matrix M equal to row vector r ([] if none).
find_row = @(M, r) find(all(bsxfun(@eq, M, r), 2), 1);

% Open and closed node lists: logical matrices with one state per row.
% 'costs' and 'closed_costs' hold the cost of the corresponding rows.
open_list = initial_b;
closed = false(0, d);
closed_costs = zeros(0, 1);

best = initial_b;

% Initialize the best (smallest) cost with the cost of the initial state.
cost = costfunction(X(:,initial_b), y, varargin{:});
n_eval = 1;
n_expand = 0;

% Costs of all nodes in the open list
costs = cost;

% Keep expanding nodes until none are left or stale search terminates
stale_counter = 0;
while size(open_list, 1) > 0

  % Move the open node with minimal cost to the closed list.
  [cost_selected, idx_selected] = min(costs);
  selected = open_list(idx_selected,:);
  closed = [closed; selected];
  closed_costs = [closed_costs; cost_selected];
  open_list(idx_selected,:) = [];
  costs(idx_selected) = [];

  % Update information about best node, if necessary. The candidate is
  % penalized by (1+epsilon), so it must be clearly better to reset the
  % stale counter.
  if (1+epsilon)*cost_selected < cost
    best = selected;
    cost = cost_selected;
    stale_counter = 0;
  else
    stale_counter = stale_counter + 1;
  end

  % Expand the node, giving its children.
  n_expand = n_expand + 1;

  costs_children = [];
  open_children = false(0, d);
  affected_variable = [];

  % Simple operators: addition or deletion of a single feature. Evaluate
  % each child that is in neither the open nor the closed list.
  for i = 1:d
    child = selected;
    child(i) = ~selected(i);

    if isempty(find_row(open_list, child)) && ...
          isempty(find_row(closed, child))
      open_children = [open_children; child];
      affected_variable = [affected_variable; i];
      costs_children = ...
          [costs_children; costfunction(X(:,child), y, varargin{:})];
      n_eval = n_eval + 1;
    end
  end

  % OPTIONAL: Create compound operators and corresponding child nodes
  if use_compound && length(affected_variable) > 1

    % Simple operators (those accepted above) are added to the compound
    % operator in the order of increasing child cost.
    [sorted_costs, order] = sort(costs_children);
    affected_variable = affected_variable(order);

    cost_of_compound = inf;
    compound_node = selected;
    this_var = affected_variable(1);
    compound_node(this_var) = ~selected(this_var);
    costs_compound = [];
    open_compound = false(0, d);

    for i = 2:length(affected_variable)
      this_var = affected_variable(i);
      compound_node(this_var) = ~selected(this_var);

      % Use the stored cost if the node is already known; otherwise
      % evaluate it and queue it for the open list. (Comparing against a
      % leftover cost of some other node would make the stopping rule
      % below meaningless.)
      idx_open = find_row(open_list, compound_node);
      idx_closed = find_row(closed, compound_node);
      if ~isempty(idx_open)
        compound_cost = costs(idx_open);
      elseif ~isempty(idx_closed)
        compound_cost = closed_costs(idx_closed);
      else
        compound_cost = costfunction(X(:,compound_node), y, varargin{:});
        open_compound = [open_compound; compound_node];
        costs_compound = [costs_compound; compound_cost];
        n_eval = n_eval + 1;
      end

      % Create compound nodes as long as they keep getting better.
      if compound_cost < cost_of_compound
        cost_of_compound = compound_cost;
      else
        break;
      end
    end

    % The compound nodes are added to the open list only here, so that
    % the membership tests above run against the smaller list.
    open_list = [open_list; open_compound];
    costs = [costs; costs_compound];
  end
  % -- Compound nodes created

  % The simple child nodes are likewise added to the open list only here.
  open_list = [open_list; open_children];
  costs = [costs; costs_children];

  % Stop once the best node has not changed for k expansions.
  if stale_counter >= k
    break;
  end

end

inputs = find(best);
