%% DEMO: Signal energy, frames and clustering
%
% Consider an audio signal x[n] which is several
% seconds long. Fourier analysis requires that
% the signal is stationary. Therefore we divide
% the signal into small frames or windows, whose
% length is typically 10 ms .. 100 ms. We assume
% that in that frame the signal is stationary.
%
% Analyzing signal x[n] in small frames/windows
% is a key element in audio/speech signal processing.
%
% In this demo we will compute signal energy
% in small frames or windows. Energy can be
% used as a feature for analyzing signal.
%
% Energy for L-length signal x:
%
%$E = \sum_{i=0}^{L-1} (x_i)^2$
%
% Then we draw the spectrogram, that is
% a visualization of short-time Fourier spectrum.
% A small frame
% T-61.3010, Jukka Parviainen, 11.3.2008
%% Read audio file
% You can read in any audio file (speech, music, ...)
audiofile = 'kiisseli.wav';
[x, fs] = wavread(audiofile);
soundsc(x, fs);
%% Stereo -> Mono
% If stereo sound, x is matrix with two columns.
% Now we want to be sure that there's only one column.
if (size(x, 2) == 2)
x = x(:,1);
disp('NOTE! Only left channel');
end;
%% Plot the waveform
t = [0 : length(x)-1]/fs;
figure(1); clf;
plot(t, x);
grid on;
xlabel('time (s)');
title(['Signal x[n] -- ' audiofile]);
xMax = max(x); % needed for later use
%% Compute energy
% Frame/window length L in samples.
% Energy components are saved in E.
% There is one value for each frame.
L = 1024;
Lsec = L/fs;
disp(['Window/frame size L = ' num2str(L) ' = ' num2str(Lsec) 's']);
E = zeros(ceil(length(x)/L),1);
ind = 0;
for k = 1 : L : length(x)-L
ind = ind+1;
E(ind) = sum(x(k:k+L-1).^2)/L;
end;
%% Plot energy curve
% We draw the energy curve with red color ([R G B])
% in the same figure with waveform. Therefore
% we first scale energy.
ScE = xMax*E/max(E);
figure(1); hold on;
nSE = length(ScE);
p = plot([Lsec/2 : Lsec : nSE*Lsec], ScE,'o-');
set(p, 'LineWidth', 2, 'Color', [1 0 0]);
legend({'Signal x[n]','Scaled frame energy'});
%% BREAKPOINT #1
%
% - How many samples are there in the signal x? 'whos'
% - How many frames are there? 'whos'
% - What can be see about red curve?
%% Draw spectrogram
% Default values can be read from 'help spectrogram'.
% Now we use frame size of L, overlap of L/4,
% and number of FFT is 32, which gives 32/2+1 positive
% frequencies in range 0 .. fs/2.
%
% In the figure x-axis is time, y-axis is frequency and
% z-axis shows how strong a certain frequency element
% (cosines at those frequencies) is at a certain time moment
% (time frame). Colors in z-axis are in decibel scale.
%
% caxis can be used to re-scale colors.
figure(2);
overlap = L/4;
NFFT = 32;
spectrogram(x, L, overlap, NFFT, fs, 'yaxis');
colorbar;
title(['Spectrogram -- ' audiofile]);
% caxis([-120 -75]); colorbar
%% Compute spectrogram values
% Note that S here is in linear scale whereas
% the spectrogram figure previously was automatically in
% logarithmic scale.
%
% Here we will start with overlap=0.
%
% Matrix S: MxN, M bands (frequency), N frames (time)
overlap = 0;
NFFT = 32;
disp(['There are ' num2str(NFFT/2+1) ' frequency bands']);
S = spectrogram(x, L, overlap, NFFT);
% If you want to visualize matrix S:
% imagesc(abs(flipud(S)));
% imagesc(10*log10(abs(flipud(S))));
%% BREAKPOINT 2
%
% - What can be see in spectrogram?
% - How large is size of matrix S?
%% Data for clustering
% Collect first data -- now only spectral components.
% Other possible features: energy, delta-energy, ...
D = [abs(S')];
%D = [10*log10(abs(S'))];
%% Cluster data using k-means
% First we choose how many clusters we want to have.
% Start, e.g., kC = 3.
%
% 'kmeans' clusters frames into kC different
% clusters (groups). The algorithm is stochastic,
% because initial positions of cluster centroids
% are random. In other words, each time there can
% be different clusters.
kC = 3;
idx = kmeans(D, kC); % k-means clustering
%% Visualize clusters
% Cluster label is shown in x-axis in spectrogram
XL = cell(length(idx),1);
for k = 1 : length(idx)
XL{k} = idx(k);
end;
figure(2);
xlim = get(gca,'XLim');
dummy = linspace(xlim(1),xlim(2),length(idx));
set(gca,'XTick', dummy, 'XTickLabel', XL);
xlabel('Index label for cluster / Time');
%% Listen to each cluster
% 'Cind' is a cell array that contains index numbers of
% frames associated to each cluster.
% 'CSig' is a cell vector that contains audio frames of
% each cluster.
%
% In the end you can listen to each cluster.
Cind = cell(kC,1);
for k = 1 : kC
Cind{k} = find(idx == k);
end;
CSig = cell(kC,1);
for k = 1 : kC
dummy = [];
for m = 1 : length(Cind{k})
iStart = L*(Cind{k}(m)-1)+1;
iStop = iStart+L-1;
dummy = [dummy; x(iStart:iStop)];
end;
CSig{k} = dummy;
end;
for k = 1 : kC
disp(['Listen to cluster #' num2str(k) '... press any key to continue']);
soundsc(CSig{k}, fs);
%mysoundsc(CSig{k}, fs); % linux
%wavwrite(CSig{k}, fs, 8, ['dummy' num2str(k) '.wav']); % to a file
pause;
end;
%% BREAKPOINT 3
%
% - Does the clustering work as it should be:
% group frames according their spectral bands?
% - Does the clustering make any sense?
% - k-means is stochastic algorithm -- run the code
% several times. How stable are the clusters?
% - Test with other audio files, similar or different results?
% - If not working, what is wrong: data, features, clustering, ...?