Version 4.1.5
Main Page | Class Hierarchy | Class List | File List | Class Members | Related Pages

PhasedMarkov.h

Go to the documentation of this file.
00001 /* seqpp/PhasedMarkov.h
00002  *
00003  * Copyright (C) 2003 Laboratoire Statistique & Génome
00004  *
00005  * This program is free software; you can redistribute it and/or modify
00006  * it under the terms of the GNU General Public License as published by
00007  * the Free Software Foundation; either version 2 of the License, or (at
00008  * your option) any later version.
00009  *
00010  * This program is distributed in the hope that it will be useful, but
00011  * WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013  * General Public License for more details.
00014  *
00015  * You should have received a copy of the GNU General Public License
00016  * along with this program; if not, write to the Free Software
00017  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
00018  */
00019 
00028 #ifndef SEQPP_PHASEDMARKOV_H
00029 #define SEQPP_PHASEDMARKOV_H
00030 
00031 using namespace std;
00032 #include <seqpp/SequenceSet.h>
00033 #include <seqpp/arnoldi.h>
00034 #include <vector>
00035 #include <string>
00036 
00037 #include <gsl/gsl_sf_gamma.h>
00038 
00063 class PhasedMarkov
00064 {
00065 
00066 public :
00067 
00069   /*
00070     \param markov_file description of an input model in a file (generated by print method) 
00071     \param calc_rank calculus of the convergence rank if true
00072     Exple of file:
00073     \code
00074     # 4 <- Order of the phased Markov chain<br>
00075     # 3 <- Phase<br>
00076     # 4 <- Alphabet size<br>
00077     # Phase n°: 0<br>
00078     # 10 steps <- Convergence to the stationnary distribution (!!Optionnal!!)<br>
00079     # Transition matrix:<br>
00080     ...<br>
00081     # Stationnary Probability: (!!Optionnal!!)<<br>
00082     ...
00083     \endcode
00084   */
00085   PhasedMarkov(  const string & markov_file,
00086                  bool calc_rank = false  );
00087  
00089 
00096   PhasedMarkov( const SequenceSet & seqset,
00097                 short phase, short initial_phase = 0,
00098                 bool calc_rank = false,
00099                 const string& prior_alpha_file = string() );
00100   
00102 
00109   PhasedMarkov( const Sequence & seq,
00110                 short phase, short initial_phase = 0,
00111                 bool calc_rank = false,
00112                 const string& prior_alpha_file = string() );
00113 
00115   PhasedMarkov( const PhasedMarkov & phm );
00116 
00118   PhasedMarkov();
00119 
00121 
00129   PhasedMarkov( short size, short order, short phase, bool alloc=true,
00130                 const string& prior_alpha_file = string() );
00131 
00133 
00138   PhasedMarkov(const PhasedMarkov &M1, const PhasedMarkov &M2, const float p);
00139 
00141 
00149   PhasedMarkov(const SequenceSet & seqset, 
00150                const vector<int> &Indseq, 
00151                short phase, short initial_phase = 0,
00152                bool calc_rank = false,
00153                const string& prior_alpha_file = string() ); 
00154 
00156 
00179   PhasedMarkov( const gsl_rng * r,
00180                 short size, short order, 
00181                 short phase, 
00182                 bool calc_rank = false );
00183 
00185 
00194   PhasedMarkov( unsigned long * * count,
00195                 short size, short order,
00196                 short phase, short initial_phase = 0,
00197                 bool calc_rank = false,
00198                 const string& prior_alpha_file = string() );
00199 
00201   virtual ~PhasedMarkov();
00202   
00203   //---------------------------------------
00205 
00214   template <class TSeq>
00215     void estimate( const TSeq & tseq,
00216                    short phase, short initial_phase, 
00217                    unsigned long beg, unsigned long end,
00218                    bool calc_rank = false, bool count_again = true )
00219     {
00220       if (count_again)
00221         tseq.clear_count();  
00222       tseq.count_p_occurencies(_phase,initial_phase,beg,end);
00223       
00224       estimate( tseq.get_p_count(), true, calc_rank );
00225     } 
00226 
00227 
00229 
00233   void estimate( const string& count_file, bool calc_rank= false )
00234   {
00235     unsigned long ** count = new unsigned long*[_phase];
00236     for (short p=0; p<_phase; p++ )
00237       count[p] = new unsigned long[_nPi];    
00238 
00239     if (!load_count( count_file, count )){
00240       cerr<<"Bad count file"<<endl;
00241       exit(1);
00242     }
00243       
00244     estimate( count, false, calc_rank );
00245 
00246     for (short p=0; p<_phase; p++ )
00247       delete[] count[p];
00248     delete [] count;    
00249   }
00250 
00252 
00257   void estimate( unsigned long * * count, bool decal_required, bool calc_rank= false );
00258 
00259 
00260   //---------------------------------------
00262   const double ** markov_matrices( ) const{
00263     return const_cast< const double** >(_Pis);
00264   }
00266   const double * markov_matrix( short numphase ) const{
00267     if ( numphase<_phase )
00268       return _Pis[numphase];
00269     else 
00270       return NULL;
00271   }
00272   
00274 
00293   void draw_markov_matrices(const  gsl_rng * r);
00294 
00295 
00297   inline virtual void new_markov_matrices();
00299   inline virtual void free_markov_matrices();
00300 
00302   double total_variation( const PhasedMarkov & M );
00303 
00304 
00305   //---------------------------------------
00307   /*
00308     \param force "true" to force the calculation, even if the laws already exist. Default => "false". 
00309   */ 
00310   void compute_stat_laws( bool force = false ); 
00312   const double * stat_law( short numphase = 0 ) const{
00313     if ( numphase<_phase )
00314       return _Mus[numphase];
00315     else 
00316       return NULL;
00317   }
00319   inline void free_stat_laws();
00320   
00322   /*
00323     \param MuInit initial law, must have allocated
00324     \seqset set of sequences
00325   */
00326   void compute_init_law( double * MuInit, const SequenceSet & seqset ) const;
00327 
00328 
00329   //---------------------------------------
00331   virtual int compute_rank();   
00333   virtual long nb_parameters() const{
00334     return  _nb_param;
00335   }
00336 
00337 
00338   //---------------------------------------
00340   void link_to_translator( const Translator & trans ){
00341     _trans = &trans;
00342   }
00344 
00349   double proba_c( const string & word, 
00350                   Coder & coder, short numphase=0 ) const;
00352 
00357   double proba( const string & word, 
00358                 Coder & coder, short numphase=0 ) const;
00359 
00360 
00362 
00367   double proba_c( const vector<short> & word, 
00368                   Coder & coder, short numphase=0 ) const;
00370 
00375   double proba( const vector<short> & word, 
00376                 Coder & coder, short numphase=0 ) const;
00377 
00378 
00380 
00386   double proba_c( long word, int lw=-1, long jump=-1, short numphase=0 ) const;
00388 
00394   double proba( long word, int lw=-1, long jump=-1, short numphase=0 ) const;
00395 
00396   
00398 
00404   double proba_c(const long * seq, long tbeg, long tend, short numphase=0) const;
00405 
00407 
00413   double proba(const long * seq, long tbeg, long tend, short numphase=0) const;
00414  
00415  
00416   //---------------------------------------
00418 
00423   double log_likelihood( const SequenceSet & seqset,
00424                          short initial_phase = 0, short numphase=-1 ) const; 
00425 
00427 
00436   double log_ratio_likelihood( const SequenceSet & seqset,
00437                                const PhasedMarkov &M,
00438                                short initial_phase1=0,                               
00439                                short initial_phase2=0 ) const ; 
00440 
00441 
00443 
00448   double log_likelihood( const Sequence & seq,
00449                          short initial_phase = 0, short numphase=-1 ) const; 
00450 
00452 
00461   double log_ratio_likelihood( const Sequence & seq,
00462                                const PhasedMarkov &M,
00463                                short initial_phase1=0,                               
00464                                short initial_phase2=0 ) const ;
00465   
00467 
00471   template <class TSeq>
00472     double BIC( const TSeq & tseq, short initial_phase=0 ) const{   
00473     //BIC = -2*loglikelihood + nbparam*log(length)
00474     return( -2*log_likelihood( tseq, initial_phase ) + _nb_param*log( (double)tseq.tell_length() ) );
00475   }
00476 
00477  
00479 
00483   template <class TSeq>
00484     double AIC( const TSeq & tseq, short initial_phase=0 ) const{
00485     //AIC = -2*loglikelihood + 2*nbparam
00486     return( -2*log_likelihood( tseq, initial_phase )  + 2*_nb_param );
00487   }
00488 
00489 
00490   //---------------------------------------
00492 
00499    template <class TSeq1, class TSeq2>
00500    double post_log_likelihood( const TSeq1& tseq_train, const TSeq2& tseq_eval,
00501                                bool force=false,
00502                                short initial_phase_train = 0, short initial_phase_eval = 0 )
00503    {
00504     double LogLik_te = 0., LogLik_t = 0.,sum1=0., sum2=0.;
00505     short p, u;
00506     long j, word;
00507 
00508     if (( tseq_train.tell_order() != _order )||( tseq_eval.tell_order() != _order )){
00509       cerr << "Order incompatibility beetween  sequence and model " <<endl;
00510       exit(-1);
00511     }
00512 
00513     if (_prior_alpha.size() == 0){
00514        cerr << "Prior input required for post_log_likelihood." <<endl;
00515       exit(-1);
00516     }
00517       
00518     // A FAIRE attention seq > order...
00519 
00520     tseq_train.count_p_occurencies( _phase, initial_phase_train );
00521     tseq_eval.count_p_occurencies(_phase, initial_phase_eval);
00522        
00523     unsigned long * countp_t, * countp_e;
00524     long jump = tseq_train.tell_jump();
00525 
00526    
00527     vector<double> lng1; 
00528     lng1.reserve(_size);
00529     for (p=0; p<_phase; p++){
00530       
00531       double sum = 0;
00532       for (u=0; u<_size; u++){
00533         lng1[u] = gsl_sf_lngamma(_prior_alpha[p][u]);
00534         sum += _prior_alpha[p][u];
00535       }
00536       double lng2 = gsl_sf_lngamma(sum);
00537       
00538       countp_t = (tseq_train.get_p_count())[p] + jump;
00539       countp_e = (tseq_eval.get_p_count())[p] + jump;
00540 
00541         for(j = 0; j < _nMu; j++){      
00542           sum1=0., sum2=0.;
00543           word = j*_size;
00544           for (u=0; u<_size; u++){
00545             sum1 = countp_t[word+u] + countp_e[word+u] + _prior_alpha[p][u];
00546             LogLik_te += gsl_sf_lngamma(sum1) - lng1[u];
00547             sum2 += sum1;
00548           }
00549           LogLik_te += lng2 - gsl_sf_lngamma(sum2);
00550         } 
00551     }
00552     
00553     if ((_postloglike>0)||(force==true)){
00554       for (p=0; p<_phase; p++){
00555 
00556         double sum = 0;
00557         for (u=0; u<_size; u++){
00558           lng1[u] = gsl_sf_lngamma(_prior_alpha[p][u]);
00559           sum += _prior_alpha[p][u];
00560         }
00561         double lng2 = gsl_sf_lngamma(sum);
00562         
00563         countp_t = (tseq_train.get_p_count())[p] + jump;
00564         for(j = 0; j < _nMu; j++){      
00565           sum1=0., sum2=0.;
00566           word = j*_size;
00567           for (u=0; u<_size; u++){
00568             sum1 = countp_t[word+u] + _prior_alpha[p][u];
00569             LogLik_t += gsl_sf_lngamma(sum1) - lng1[u];
00570             sum2 += sum1;
00571           }
00572           LogLik_t += lng2 - gsl_sf_lngamma(sum2);
00573         }       
00574       }
00575       _postloglike = LogLik_t;
00576     }
00577     
00578     return(LogLik_te - _postloglike);
00579    }
00580 
00581   
00582    
00583    //---------------------------------------
00585 
00606    inline void print( const string &FileOut );
00607   
00609   void print(ofstream &Out) const;
00610 
00611   //---------------------------------------
00613   int tell_size() const { return(_size);};
00615   int tell_rank() const { return(_rank);};
00617   int tell_order() const { return(_order);};
00619   int tell_phase() const { return(_phase); }
00620 
00621   //---------------------------------------
00623   int nMu() const { return(_nMu); }
00625   int nPi() const { return(_nPi); }
00626 
00628 
00632   double Pi(int index, int p=0) const { 
00633     return(_Pis[p][index]); 
00634   }
00636 
00640   double & operator() (int index, int p=0){ 
00641     return(_Pis[p][index]); 
00642   }
00643 
00645 
00649   double Mu(int index, int p=0) const { 
00650     return(_Mus[p][index]); 
00651   }  
00653   bool isPis() const { return(_Pis != NULL);};
00655   bool isMus()  const { return(_Mus != NULL);};
00656 
00657 
00659   inline short nextPhase(short p) const;
00661   inline short prevPhase(short p) const;
00662   //--------------------------------------- 
00663 
00665   bool Stochasticity();
00666 
00668 
00672   void file_to_count( const string& src_file, unsigned long ** dest_count);
00673    
00674  protected :
00675   
00677   short _phase;
00678 
00680   double **_Pis;
00682   double **_containers; 
00683 
00685   double **_Mus;   
00686   
00688   short _size; 
00689   
00691   short _order;
00692   
00694   long _nPi; 
00695 
00697   long _nMu; 
00698  
00700   long _nb_param;
00701 
00703   int _rank;
00704 
00706   long _jump;
00707 
00709   short *_nextPhase;
00710 
00712   short *_prevPhase;
00713 
00715   const Translator * _trans;
00716 
00718   double _postloglike;
00719 
00721   vector< vector<double> > _prior_alpha;
00722 
00723 
00724   inline void FreeNextPhase();
00725   inline void FreePrevPhase();
00726 
00728   bool isNextPhase()  const { return(_nextPhase != NULL);};
00730   bool isPrevPhase()  const { return(_prevPhase != NULL);}; 
00731 
00732   bool load_count( const string & count_file, unsigned long** data );
00733 
00734   bool load_prior_alpha( const string & prior_alpha_file );
00735 };
00736 
00737 
00738 inline short PhasedMarkov::nextPhase(short p) const
00739 {
00740   return (_nextPhase[p]);
00741 }
00742 
00743 inline short PhasedMarkov::prevPhase(short p) const
00744 {
00745   return (_prevPhase[p]);
00746 }
00747 
00748 inline void PhasedMarkov::new_markov_matrices()
00749 {   
00750   if (!_Pis){
00751     _containers = new double*[_phase];
00752     _Pis = new double*[_phase];
00753     for (short p=0; p<_phase; p++){
00754       _containers[p] = new double[_nPi];
00755       _Pis[p] = _containers[p];
00756     }
00757   }
00758 }
00759 
00760 inline void PhasedMarkov::free_markov_matrices()
00761 {
00762   if (_Pis){
00763     for (int p=0;p<_phase;p++)
00764       delete []_containers[p];
00765     delete []_containers;
00766     
00767     delete []_Pis;
00768   }
00769   _Pis = NULL;
00770   _containers = NULL;
00771 }
00772 
00773 inline void PhasedMarkov::free_stat_laws()
00774 {
00775   if (_Mus)
00776     {
00777       for (int p=0;p<_phase;p++)
00778         delete []_Mus[p];
00779       delete []_Mus;
00780     }
00781   _Mus = NULL;
00782 }
00783 
00784 inline void PhasedMarkov::FreeNextPhase()
00785 {
00786   if (isNextPhase())
00787     delete [] _nextPhase;
00788   _nextPhase = NULL;
00789 }
00790 
00791 inline void PhasedMarkov::FreePrevPhase()
00792 {
00793   if (isPrevPhase())
00794     delete [] _prevPhase;
00795   _prevPhase = NULL;
00796 }
00797 
00798 
00799 inline void PhasedMarkov::print(const string & FileOut )
00800 {
00801   ofstream Out(FileOut.c_str());
00802   print(Out);
00803 }
00804 
00805 #endif/*SEQPP_PHASEDMARKOV_H*/
00806 



Download seq++ 4.1.5
Download previous versions
Statistique & Genome Home


Generated on Thu Aug 4 20:33:12 2005 for seqpp by doxygen 1.3.9.1