\documentclass[12pt, epsf]{article}
\usepackage[finnish]{babel}   %% suomalainen tavutus
\usepackage[latin1]{inputenc} %% 8-bittiset tiedostot
\usepackage[T1]{fontenc}      %% 8-bittiset fontit
\usepackage[dvips]{epsfig}
\textheight 22cm
\textwidth 16cm
\headheight -1.7 cm
%\footheight 0.1cm
\oddsidemargin 3mm
\topskip 1mm
\parskip 0.5cm
\parindent 1cm
\baselineskip 6mm
\begin{document}
%\input epsf
\hfill Janne Nikkilä \today
\section{Exercises 1}
\subsection{a)}
\begin{itemize}
\item Nonprob: heuristic scoring
\item Nonprob: dynamic programming
\item Prob: pairwise HMMs: graph
\item Prob: full model (also end and start states)
\item parameters to estimate:p(x,y),p(x,-),p(-,y),transition P:s, end transition
\item Prob: in parameter estimation, training seqs needed
\item Prob: algorithms used in the estimation(Viterbi, B-W, forward)
\item Prob: algorithms used in the search of the most probable path ie. the probability of the given seq
\end{itemize}
\subsection{b)}
\begin{itemize}
\item heuristic easy to implement and fast
\item seq alignment can be presentd as FSA and this can be modified to eq. prob. model
\item in heur. 
\item probabilistic stand on the firmer ground
\item possible to use {\it a priori} knowledge
\item prob.model allow the analysis of reliability of the alignment
\end{itemize}
\subsection{c)}
\begin{itemize}
\item MC: state can be deduced from the outcome
\item MC ex: finding structural groups (i.e. is this seq. a codon), locating CpG islands
\item HMM ex: find all codons in this seq., alignment
\item some explanation when to use Mc and when to use HMM
\end{itemize}
\subsection{d)}
\begin{itemize}
\item can be assumed that the transition happens before emission or vice versa
\item emission first: P(\{cola,lem\})=0.025
\item transition first: P(\{cola,lem\})=0.0935
\item Markov model: the state can be reasoned from the outcome
\end{itemize}
\subsection{e)}
\begin{itemize}
\item idea is to consider the probability $\tau$ of the transition to the end
  state, which is same for all sequencences of any length
\item probability of the all seqs of length L is $P=\tau(1-\tau)^{L-1}$ and this must be summed over all lengths
\end{itemize}
\subsection{f)}
\begin{itemize}
\item FSA: deterministic, parameters by hand, only one 'best'
  solution, fast, and easy
\item: PHMM: probabilistic, with forward-algorithm knowledge of the
  goodness of all the possible solutions, Viterbi may be inferior or
  the same as FSA:s, parameters from the data, complicated
\end{itemize}
\subsection{g)}
\begin{itemize}
\item used in multiple alignment
\item PSSM: length predetermined, simple HMM(transition prob 1), no
  gaps or insertions, defines a prob.distribution for each symbol,
  symbols are assumed to be independent of each other
\item PHMM: can be derived from the PSSM by adding the silent and
  insert states, pair HMM conditioned to emit certain seq, differs
  from HMM by having no backward transitions, can be used to model
  position specific information
\end{itemize}
\subsection{h)}
\begin{itemize}
\item minimizing the rounding errors (taking the logs separately of
  all factors) yields 9 seqs, otherwise result is 10 seqs
\item both american and english billions are accepted
\end{itemize}
\subsection{i)}
\begin{itemize}
\item Feng-Doolittle, CLUSTALW, Barton-Stenberg, Profile-HMM
\item ideas of all, some relevant points from each one
\item zero tolerance for too long answers
\end{itemize}
\subsection{j)}
\begin{itemize}
\item it is possible to sequence only hundreds of base pairs at time,
  while the lengths of the gene sequences are from thousands to
  millions of base pairs -> fragment assembly
\item this is because gel phosporese-method
\item it is also important to know the location of the sequenced
  fragment in the chromosome -> physical mapping by markers (examples
  of these)
\end{itemize}
\subsection{k)}
\begin{itemize}
\item for example: length 56, with one error; length 50,51, no errors
\item if a algorithm implemented, higher grade
\end{itemize}
\subsection{l)}
\begin{itemize}
\item trivial answer: restriction sites are sequences of certain
  order, restriction enzymes are sensitive to certain sites and it can
  be assumed that there is one to one correspondence between enzymes
  and sites;thus, if two different enzymes are used in the DDP, sites
  cannot coincide exactly
\item if some relevant discussion presented, higher grade
\end{itemize}
\subsection{m)}
\begin{itemize}
\item binary matrices admit the perfect phylogeny iff for each pair
  (mark P) of columns the sets of 1's are disjoint (mark Pd) or another is a
  subset of another (mark Ps)
\item C1P for columns means that all 1's in each column are consequtive
\item task: show that, if the former, the latter follows
\item when perfect phylogeny, rows having 1's in a set of Ps-columns
  can be ordered without changing the order of 1's in any other
  columns
\item thus, every set of columns with the property Ps can be ordered
  resulting in the matrix with consequtive 1's in each column, that is
  with C1P for columns
\item algorithm could for example be to first take those rows with 1's
  in the first column, move them to the top of the matrix. Then take
  the rows having the 1's in the first column which is not in the
  previous group, move below the previous group, and so on until there
  are no unordered rows
\end{itemize}
\subsection{n)}
\begin{itemize}
%\item applying Markov-property we get
%  $\sum_{b(t)}P(a(s+t)|b(t),c(0))P(b(t)|c(0))=\sum{b(t)}P(a(s+t)|b(t))P(b(t)|c(0))$
\item For every term in the sum
  $P(a(s+t)|b(t),c(0))P(b(t)|c(0))$=$\frac{P(a(s+t),b(t),c(0))}{P(b(t),c(0))}*\frac{P(b(t),c(0))}{P(c(0))}=P(a(s+t),b(0)|c(0))$
\item Summing over all $b(t)$ we get
  $\sum_{b(t)}P(a(s+t),b(t)|c(0))=P(a(s+t)|c(0))$, which was required
  to be proven
\item lhs: Markov-property gives
  $\sum_{b(t)}P(a(s+t)|b(t),c(0))P(b(t)|c(0))=\sum_{b(t)}P(a(s+t)|b(t))P(b(t)|c(0)$
  and from this we get with stationarity
  $\sum_{b(t)}P(a(s+t)|b(t))P(b(t)|c(0)=\sum_{b(t)}P(a|b,s)P(b|c,t)$
\item rhs with stationarity gives : $P(a(s+t)|c(0))=P(a|c,s+t)$ and
  thus multiplicativity holds by definition
\item multiplicativity here means that the final substitution matrix is
  achieved by multiplying intermediate substitution matrixes
\end{itemize}
\subsection{o)}
\begin{itemize}
\item derivate P in DEKM p. 199 and solve zeroes
\end{itemize}

\section{Exercises 2}
\subsection{a)}
\begin{itemize}
\item if shown by example: 3
\item if more methematical approach: 5
\end{itemize}
\subsection{b)}
\begin{itemize}
\item correct example: 3
\item if some additional relevant note about i.e. length of the seq: 5
\end{itemize}
\subsection{c)}
\begin{itemize}
\item both emission first and transition first approaches accepted
\item derivation of the transmission probabilities
\item correct idea in the sthocastic grammar
\end{itemize}
\subsection{d)}
\begin{itemize}
\item replace $\delta(i,j)$ with $s(i,j)$ 
\item proved that algorithm finds tha maximal scoring: 5
\end{itemize}
\subsection{e)}
\begin{itemize}
\item PAMs and WACs are used to model the probability of mutations
\item PAM's estimated from the seqs, WACs estimated from the seqs and
  the chemical environment
\item if environment is well known and measured WACs useful,
  otherwise may produce additional noise
\item PAMs simpler
\item some discussion assumed in the answer 
\end{itemize}
\subsection{f)}
\begin{itemize}
\item the objective function is a sum of terms which take into account
  the scoring of the individual alignment to core templates, pairwise,
  triplewise and so on
\item all of the terms are not usually used (when, why)
\end{itemize}
\subsection{g)}
\begin{itemize}
\item the goal of finding out the gene regulatory network is _very_ important
\item the problem is far from trivial
\item it is biologically sensible to assume that one gene is either
  inhibitor or activator, but not both
\item also, it is sensible to assume that every gene needs to be both
  activated and inhibited, to regulate the functions of the cell better
\item formulation of the problem as a graph-problem is also
  intuitively clear
\end{itemize}
\subsection{h)}
\begin{itemize}
\item $\tau$-function is a _combination_ of the neighborhood-function
  $h$ multiplied and the learning rate parameter $\alpha$
\item $f_i(N)$ is the modelvector $m(i)$ of the indexed mapnode N at
  iteration $i$
\item other notation trivial 
\end{itemize}

\end{document}




