00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00028 #ifndef SEQPP_PHASEDMARKOV_H
00029 #define SEQPP_PHASEDMARKOV_H
00030
00031 using namespace std;
00032 #include <seqpp/SequenceSet.h>
00033 #include <seqpp/arnoldi.h>
00034 #include <vector>
00035 #include <string>
00036
00037 #include <gsl/gsl_sf_gamma.h>
00038
00063 class PhasedMarkov
00064 {
00065
00066 public :
00067
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085 PhasedMarkov( const string & markov_file,
00086 bool calc_rank = false );
00087
00089
00096 PhasedMarkov( const SequenceSet & seqset,
00097 short phase, short initial_phase = 0,
00098 bool calc_rank = false,
00099 const string& prior_alpha_file = string() );
00100
00102
00109 PhasedMarkov( const Sequence & seq,
00110 short phase, short initial_phase = 0,
00111 bool calc_rank = false,
00112 const string& prior_alpha_file = string() );
00113
00115 PhasedMarkov( const PhasedMarkov & phm );
00116
00118 PhasedMarkov();
00119
00121
00129 PhasedMarkov( short size, short order, short phase, bool alloc=true,
00130 const string& prior_alpha_file = string() );
00131
00133
00138 PhasedMarkov(const PhasedMarkov &M1, const PhasedMarkov &M2, const float p);
00139
00141
00149 PhasedMarkov(const SequenceSet & seqset,
00150 const vector<int> &Indseq,
00151 short phase, short initial_phase = 0,
00152 bool calc_rank = false,
00153 const string& prior_alpha_file = string() );
00154
00156
00179 PhasedMarkov( const gsl_rng * r,
00180 short size, short order,
00181 short phase,
00182 bool calc_rank = false );
00183
00185
00194 PhasedMarkov( unsigned long * * count,
00195 short size, short order,
00196 short phase, short initial_phase = 0,
00197 bool calc_rank = false,
00198 const string& prior_alpha_file = string() );
00199
00201 virtual ~PhasedMarkov();
00202
00203
00205
00214 template <class TSeq>
00215 void estimate( const TSeq & tseq,
00216 short phase, short initial_phase,
00217 unsigned long beg, unsigned long end,
00218 bool calc_rank = false, bool count_again = true )
00219 {
00220 if (count_again)
00221 tseq.clear_count();
00222 tseq.count_p_occurencies(_phase,initial_phase,beg,end);
00223
00224 estimate( tseq.get_p_count(), true, calc_rank );
00225 }
00226
00227
00229
00233 void estimate( const string& count_file, bool calc_rank= false )
00234 {
00235 unsigned long ** count = new unsigned long*[_phase];
00236 for (short p=0; p<_phase; p++ )
00237 count[p] = new unsigned long[_nPi];
00238
00239 if (!load_count( count_file, count )){
00240 cerr<<"Bad count file"<<endl;
00241 exit(1);
00242 }
00243
00244 estimate( count, false, calc_rank );
00245
00246 for (short p=0; p<_phase; p++ )
00247 delete[] count[p];
00248 delete [] count;
00249 }
00250
00252
00257 void estimate( unsigned long * * count, bool decal_required, bool calc_rank= false );
00258
00259
00260
00262 const double ** markov_matrices( ) const{
00263 return const_cast< const double** >(_Pis);
00264 }
00266 const double * markov_matrix( short numphase ) const{
00267 if ( numphase<_phase )
00268 return _Pis[numphase];
00269 else
00270 return NULL;
00271 }
00272
00274
00293 void draw_markov_matrices(const gsl_rng * r);
00294
00295
00297 inline virtual void new_markov_matrices();
00299 inline virtual void free_markov_matrices();
00300
00302 double total_variation( const PhasedMarkov & M );
00303
00304
00305
00307
00308
00309
00310 void compute_stat_laws( bool force = false );
00312 const double * stat_law( short numphase = 0 ) const{
00313 if ( numphase<_phase )
00314 return _Mus[numphase];
00315 else
00316 return NULL;
00317 }
00319 inline void free_stat_laws();
00320
00322
00323
00324
00325
00326 void compute_init_law( double * MuInit, const SequenceSet & seqset ) const;
00327
00328
00329
00331 virtual int compute_rank();
00333 virtual long nb_parameters() const{
00334 return _nb_param;
00335 }
00336
00337
00338
00340 void link_to_translator( const Translator & trans ){
00341 _trans = &trans;
00342 }
00344
00349 double proba_c( const string & word,
00350 Coder & coder, short numphase=0 ) const;
00352
00357 double proba( const string & word,
00358 Coder & coder, short numphase=0 ) const;
00359
00360
00362
00367 double proba_c( const vector<short> & word,
00368 Coder & coder, short numphase=0 ) const;
00370
00375 double proba( const vector<short> & word,
00376 Coder & coder, short numphase=0 ) const;
00377
00378
00380
00386 double proba_c( long word, int lw=-1, long jump=-1, short numphase=0 ) const;
00388
00394 double proba( long word, int lw=-1, long jump=-1, short numphase=0 ) const;
00395
00396
00398
00404 double proba_c(const long * seq, long tbeg, long tend, short numphase=0) const;
00405
00407
00413 double proba(const long * seq, long tbeg, long tend, short numphase=0) const;
00414
00415
00416
00418
00423 double log_likelihood( const SequenceSet & seqset,
00424 short initial_phase = 0, short numphase=-1 ) const;
00425
00427
00436 double log_ratio_likelihood( const SequenceSet & seqset,
00437 const PhasedMarkov &M,
00438 short initial_phase1=0,
00439 short initial_phase2=0 ) const ;
00440
00441
00443
00448 double log_likelihood( const Sequence & seq,
00449 short initial_phase = 0, short numphase=-1 ) const;
00450
00452
00461 double log_ratio_likelihood( const Sequence & seq,
00462 const PhasedMarkov &M,
00463 short initial_phase1=0,
00464 short initial_phase2=0 ) const ;
00465
00467
00471 template <class TSeq>
00472 double BIC( const TSeq & tseq, short initial_phase=0 ) const{
00473
00474 return( -2*log_likelihood( tseq, initial_phase ) + _nb_param*log( (double)tseq.tell_length() ) );
00475 }
00476
00477
00479
00483 template <class TSeq>
00484 double AIC( const TSeq & tseq, short initial_phase=0 ) const{
00485
00486 return( -2*log_likelihood( tseq, initial_phase ) + 2*_nb_param );
00487 }
00488
00489
00490
00492
00499 template <class TSeq1, class TSeq2>
00500 double post_log_likelihood( const TSeq1& tseq_train, const TSeq2& tseq_eval,
00501 bool force=false,
00502 short initial_phase_train = 0, short initial_phase_eval = 0 )
00503 {
00504 double LogLik_te = 0., LogLik_t = 0.,sum1=0., sum2=0.;
00505 short p, u;
00506 long j, word;
00507
00508 if (( tseq_train.tell_order() != _order )||( tseq_eval.tell_order() != _order )){
00509 cerr << "Order incompatibility beetween sequence and model " <<endl;
00510 exit(-1);
00511 }
00512
00513 if (_prior_alpha.size() == 0){
00514 cerr << "Prior input required for post_log_likelihood." <<endl;
00515 exit(-1);
00516 }
00517
00518
00519
00520 tseq_train.count_p_occurencies( _phase, initial_phase_train );
00521 tseq_eval.count_p_occurencies(_phase, initial_phase_eval);
00522
00523 unsigned long * countp_t, * countp_e;
00524 long jump = tseq_train.tell_jump();
00525
00526
00527 vector<double> lng1;
00528 lng1.reserve(_size);
00529 for (p=0; p<_phase; p++){
00530
00531 double sum = 0;
00532 for (u=0; u<_size; u++){
00533 lng1[u] = gsl_sf_lngamma(_prior_alpha[p][u]);
00534 sum += _prior_alpha[p][u];
00535 }
00536 double lng2 = gsl_sf_lngamma(sum);
00537
00538 countp_t = (tseq_train.get_p_count())[p] + jump;
00539 countp_e = (tseq_eval.get_p_count())[p] + jump;
00540
00541 for(j = 0; j < _nMu; j++){
00542 sum1=0., sum2=0.;
00543 word = j*_size;
00544 for (u=0; u<_size; u++){
00545 sum1 = countp_t[word+u] + countp_e[word+u] + _prior_alpha[p][u];
00546 LogLik_te += gsl_sf_lngamma(sum1) - lng1[u];
00547 sum2 += sum1;
00548 }
00549 LogLik_te += lng2 - gsl_sf_lngamma(sum2);
00550 }
00551 }
00552
00553 if ((_postloglike>0)||(force==true)){
00554 for (p=0; p<_phase; p++){
00555
00556 double sum = 0;
00557 for (u=0; u<_size; u++){
00558 lng1[u] = gsl_sf_lngamma(_prior_alpha[p][u]);
00559 sum += _prior_alpha[p][u];
00560 }
00561 double lng2 = gsl_sf_lngamma(sum);
00562
00563 countp_t = (tseq_train.get_p_count())[p] + jump;
00564 for(j = 0; j < _nMu; j++){
00565 sum1=0., sum2=0.;
00566 word = j*_size;
00567 for (u=0; u<_size; u++){
00568 sum1 = countp_t[word+u] + _prior_alpha[p][u];
00569 LogLik_t += gsl_sf_lngamma(sum1) - lng1[u];
00570 sum2 += sum1;
00571 }
00572 LogLik_t += lng2 - gsl_sf_lngamma(sum2);
00573 }
00574 }
00575 _postloglike = LogLik_t;
00576 }
00577
00578 return(LogLik_te - _postloglike);
00579 }
00580
00581
00582
00583
00585
00606 inline void print( const string &FileOut );
00607
00609 void print(ofstream &Out) const;
00610
00611
00613 int tell_size() const { return(_size);};
00615 int tell_rank() const { return(_rank);};
00617 int tell_order() const { return(_order);};
00619 int tell_phase() const { return(_phase); }
00620
00621
00623 int nMu() const { return(_nMu); }
00625 int nPi() const { return(_nPi); }
00626
00628
00632 double Pi(int index, int p=0) const {
00633 return(_Pis[p][index]);
00634 }
00636
00640 double & operator() (int index, int p=0){
00641 return(_Pis[p][index]);
00642 }
00643
00645
00649 double Mu(int index, int p=0) const {
00650 return(_Mus[p][index]);
00651 }
00653 bool isPis() const { return(_Pis != NULL);};
00655 bool isMus() const { return(_Mus != NULL);};
00656
00657
00659 inline short nextPhase(short p) const;
00661 inline short prevPhase(short p) const;
00662
00663
00665 bool Stochasticity();
00666
00668
00672 void file_to_count( const string& src_file, unsigned long ** dest_count);
00673
00674 protected :
00675
00677 short _phase;
00678
00680 double **_Pis;
00682 double **_containers;
00683
00685 double **_Mus;
00686
00688 short _size;
00689
00691 short _order;
00692
00694 long _nPi;
00695
00697 long _nMu;
00698
00700 long _nb_param;
00701
00703 int _rank;
00704
00706 long _jump;
00707
00709 short *_nextPhase;
00710
00712 short *_prevPhase;
00713
00715 const Translator * _trans;
00716
00718 double _postloglike;
00719
00721 vector< vector<double> > _prior_alpha;
00722
00723
00724 inline void FreeNextPhase();
00725 inline void FreePrevPhase();
00726
00728 bool isNextPhase() const { return(_nextPhase != NULL);};
00730 bool isPrevPhase() const { return(_prevPhase != NULL);};
00731
00732 bool load_count( const string & count_file, unsigned long** data );
00733
00734 bool load_prior_alpha( const string & prior_alpha_file );
00735 };
00736
00737
00738 inline short PhasedMarkov::nextPhase(short p) const
00739 {
00740 return (_nextPhase[p]);
00741 }
00742
00743 inline short PhasedMarkov::prevPhase(short p) const
00744 {
00745 return (_prevPhase[p]);
00746 }
00747
00748 inline void PhasedMarkov::new_markov_matrices()
00749 {
00750 if (!_Pis){
00751 _containers = new double*[_phase];
00752 _Pis = new double*[_phase];
00753 for (short p=0; p<_phase; p++){
00754 _containers[p] = new double[_nPi];
00755 _Pis[p] = _containers[p];
00756 }
00757 }
00758 }
00759
00760 inline void PhasedMarkov::free_markov_matrices()
00761 {
00762 if (_Pis){
00763 for (int p=0;p<_phase;p++)
00764 delete []_containers[p];
00765 delete []_containers;
00766
00767 delete []_Pis;
00768 }
00769 _Pis = NULL;
00770 _containers = NULL;
00771 }
00772
00773 inline void PhasedMarkov::free_stat_laws()
00774 {
00775 if (_Mus)
00776 {
00777 for (int p=0;p<_phase;p++)
00778 delete []_Mus[p];
00779 delete []_Mus;
00780 }
00781 _Mus = NULL;
00782 }
00783
00784 inline void PhasedMarkov::FreeNextPhase()
00785 {
00786 if (isNextPhase())
00787 delete [] _nextPhase;
00788 _nextPhase = NULL;
00789 }
00790
00791 inline void PhasedMarkov::FreePrevPhase()
00792 {
00793 if (isPrevPhase())
00794 delete [] _prevPhase;
00795 _prevPhase = NULL;
00796 }
00797
00798
00799 inline void PhasedMarkov::print(const string & FileOut )
00800 {
00801 ofstream Out(FileOut.c_str());
00802 print(Out);
00803 }
00804
00805 #endif
00806