# This file is part of ModPipe, Copyright 1997-2020 Andrej Sali
#
# ModPipe is free software: you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ModPipe.  If not, see <http://www.gnu.org/licenses/>.

package PLLib::HHSuiteUtils;
require Exporter;
@ISA    = qw(Exporter);
@EXPORT = qw( CheckHHSuiteDB RunHHMakeModel);
             

use strict;
use PLLib::Utils;
use Cwd;
use PLLib::Sequence;
use PLLib::MD5Utils;
use PLLib::ModPy;
use PLLib::Modeller;
use MPLib::MPUtils;
use PLLib::Alignment;
use MPLib::Version;
use MPLib::Binaries;
use MPLib::Serialize;
use File::Basename;
use MPLib::MPInit;
use PLLib::ModProfile;

sub CheckHHSuiteDB {

  # Check that the files making up an HHSuite database exist on disk.
  #
  # Arguments:
  #   $suitedb   - database path prefix (directory plus base name); the
  #                per-file suffixes are appended to it
  #   $hhprogram - "hhblits" or "hhsearch"
  #
  # Returns 1 when every expected file exists. Returns nothing (undef /
  # empty list) on a wrong argument count, an unknown program, or as soon
  # as one expected file is missing.
  #
  # Note: the set of suffixes to check is chosen from the database name.
  # When the name matches no known pattern (neither "uniprot" nor "pdb"
  # for hhblits, not "uniprot" for hhsearch) nothing is checked and 1 is
  # returned.

  #--- Get subroutine name
  my $subname = GetSubrName();

  #--- Check arguments
  my $nargs = 2;

  unless (scalar(@_) == $nargs){
     print "${subname}__E> Insufficient arguments\n" ;
     print "${subname}__E> Expected: $nargs, Given: ".scalar(@_)."\n" ;
     return;
  }

  #-- reassign the variables
  my ($suitedb, $hhprogram) = @_;

  # -- Select the file suffixes expected for this program / database
  my @ext;
  if ($hhprogram eq "hhblits") {
      # uniprot and pdb databases consist of the same set of files
      if ($suitedb =~ /uniprot|pdb/) {
         @ext = qw( .cs219 .cs219.sizes _hhm_db _hhm_db.index
                        _a3m_db _a3m_db.index );
      }
  } elsif ($hhprogram eq "hhsearch") {
      if ($suitedb =~ /uniprot/) {
         @ext = qw( _hhm_db );
      }
  } else {
    warn "${subname}__E> Unknown program $hhprogram\n";
    return;
  }

  # -- Directory and base name are the same for every suffix
  my $dbdir = dirname($suitedb);
  my $dbnam = basename($suitedb);

  foreach my $ext ( @ext ) {

    my $dbpath = "${dbdir}/${dbnam}${ext}";

    # -- Check if file exists
    unless ( -e $dbpath ){
      warn "${subname}__E> Could not find ${hhprogram} database file:\n";
      warn "${subname}__E>    Directory: ${dbdir}\n";
      warn "${subname}__E>    Path     : ${dbpath}\n";
      warn "${subname}__E>    Filename : ${dbnam}${ext}\n";
      return;
    }
  }

  # -- Return
  return 1;
}

sub RunHHMakeModel {

  # Build PIR alignments from an HHSuite result file at a series of
  # E-value cutoffs and turn every distinct alignment into a Hit object.
  #
  # Arguments:
  #   $seqid      - target sequence id; used to name the alignment files
  #                 and stored in the Sequence object of each hit
  #   $evaluehits - largest E-value cutoff to try
  #   $hhr        - HHSuite result file, passed to hhmakemodel with -i
  #   $mode       - stored as fold_assignment_method in each Hit
  #   $pdbdir     - passed to hhmakemodel with -d
  #
  # Returns a reference to an array of Hit objects (possibly empty), or
  # nothing on a wrong argument count or when the hhmakemodel script
  # cannot be found.
  #
  # Side effects: sets $ENV{HHLIB}, runs hhmakemodel through the shell,
  # writes "<seqid>-<n>-hhblits.pir" files into the current directory and
  # copies each new alignment to "<alidir>/<md5>.ali".
  # NOTE(review): AliResidues.txt is read from the current directory after
  # every hhmakemodel run; presumably hhmakemodel_modpipe.pl writes it -
  # confirm.

  #--- Get subroutine name
  my $subname = GetSubrName();

  #--- Check arguments
  my $nargs = 5;

  unless (scalar(@_) == $nargs){
     print "${subname}__E> Insufficient arguments\n" ;
     print "${subname}__E> Expected: $nargs, Given: ".scalar(@_)."\n" ;
     return;
  }

  #-- reassign the variables
  my ($seqid, $evaluehits, $hhr, $mode, $pdbdir) = @_;
  my $outhits = [];

  my ($hhlib, $hhmakemodel) = GetHHMakeModel('hhmakemodel_modpipe.pl');

  unless ( -e $hhmakemodel ) {
     print "${subname}__D> file not found: ${hhmakemodel} \n";
     return;
  }

  my $alidir = AliDirMP( $seqid );
  MakeDirSys($alidir);
  my $aliext = "hhblits.pir";

  # Cutoffs to try: 0 first, then the powers of ten (1, 0.1, 0.01, ...)
  # that do not exceed $evaluehits, at most eleven of them. The same
  # value is used for both the -e and the -p option of hhmakemodel.
  my @evalues = (0);
  my $npowers = 0;
  for (my $i=0; $i<30; $i++) {
      my $mult = (0.1)**$i;
      if (($mult <= $evaluehits) && ($npowers <= 10)) {
         push @evalues, $mult;
         $npowers++;
      }
  }

  # get large profile to use for statistics estimation
  my $reference_alignment = "$seqid-reference-$aliext";

  $ENV{'HHLIB'} = $hhlib;
  my $status = system("$hhmakemodel -i $hhr -e 1 -v 0 -p 1 -pir $reference_alignment -d $pdbdir ");
  if ($status != 0) {
     warn "${subname}__W> hhmakemodel returned status $status for $reference_alignment\n";
  }

  my ($chisqrd, $ksstat) = Get_Hit_statistics('hhblits', $seqid, $evaluehits, $reference_alignment);

  # Alignment ids seen so far. This must live outside the loop, otherwise
  # identical alignments produced at different cutoffs are never detected.
  my %seen_alignment;
  my $count = 0;

  foreach my $evalue (@evalues) {
     $count++;
     my $ali = "$seqid-$count-$aliext";
     $status = system("$hhmakemodel -i $hhr -e $evalue -v 0 -p $evalue -pir $ali -d $pdbdir");
     if ($status != 0) {
        warn "${subname}__W> hhmakemodel returned status $status for $ali\n";
     }
     my @strcodes = StructureCodes($ali);
     my @tmpl;

     my ($seq, $max_prob);
     my $aln = InitAlignment();

     unless (@strcodes) {
        warn "${subname}__M> No structures in alignment for E-value and P-value ${evalue}.\n";
        next;
     }

     my $gapcode  = $strcodes[0];
     my $align_id = AliMD5($ali);

     # Ids are MD5 strings, so they are compared as strings (hash keys),
     # not numerically.
     if ($seen_alignment{$align_id}++) {
        warn "${subname}__M> Duplicate alignment skipped - E-value ${evalue}: $align_id ". join(" ",@strcodes)."\n";
        next;
     }

     print "${subname}__C> Added new alignment $align_id\n";
     my ($gap_percentage, $maxseq_identity) = GetPercGaps( $ali, "PIR", $gapcode );

     CopyFile($ali, "$alidir/${align_id}.ali");

     # Parse the tab-separated residue summary:
     #   SEQ     <length> <begin> <end>   target sequence
     #   PROB    <value>
     #   EVALUE  <value>
     #   <tag>   <pdb_chain> <first> <last> <prob>   one line per template
     my $pdb_residues = "AliResidues.txt";
     my ($target_length, $target_beg, $target_end);
     if (open(my $resfh, '<', $pdb_residues)) {
        while (my $line = <$resfh>) {
            chomp $line;
            next if $line =~ /^\s*$/;
            my @result = split(/\t/,$line);
            if ($result[0] eq "SEQ") {
               (undef, $target_length, $target_beg, $target_end) = @result;
               # Create Sequence object (for outhits)
               $seq = Sequence -> new(id => $seqid, length =>$target_length);
            } elsif ($result[0] eq "PROB" ) {
               $max_prob = $result[1];
            } elsif ($result[0] eq "EVALUE") {
               my $evalue_alignment = $result[1];
               # chi-squared / KS values come from the reference alignment
               $aln = Alignment->new(score_chi_squared => $chisqrd,
                     score_ks => $ksstat,
                     evalue => $evalue_alignment,
                     gap_percentage => $gap_percentage,
                     id => $align_id);
            } else {
               my (undef, $pdb_chain, $first, $last) = @result;

               # assemble Template object; $pdb_chain is read as a
               # four-character code, one separator, then the chain id
               my $tmpl = Template->new(code   => substr($pdb_chain,0,4),
                                        chain  => substr($pdb_chain,5,1),
                                        region => [$first,$last],
                                        sequence_identity => $maxseq_identity);
               push @tmpl, $tmpl;
            }
        }
        close($resfh);
     } else {
        warn "${subname}__E> Could not open $pdb_residues: $!\n";
     }

     if (!defined($seq) || !defined($max_prob) || !defined($aln)
         || !@tmpl) {
         print "${subname}__E> AliResidues.txt incomplete\n" ;
     }
     # NOTE(review): the hit is still recorded when AliResidues.txt is
     # incomplete, as before; confirm whether it should be skipped instead.
     my $outhit = Hit->new(sequence  => $seq,
                           alignment => $aln,
                           region    => [$target_beg, $target_end],
                           templates => \@tmpl,
                           fold_assignment_method => $mode,
                           highest_sequence_identity => $maxseq_identity);
     push @$outhits, $outhit;
     warn "${subname}__M> Alignment created - E-value ${evalue}: ". join(" ",@strcodes)."\n";
  }

  return ($outhits);
}

sub Get_Hit_statistics {

  # Obtain chi-squared and KS scores for the hits of one target sequence.
  #
  # Arguments:
  #   $hhprogram  - program name, forwarded to Estimate_Hit_statistics
  #   $seqid      - target sequence id
  #   $evaluehits - E-value cutoff, forwarded to Estimate_Hit_statistics
  #   $ali        - PIR alignment whose structure entries select the
  #                 profiles used for the first estimate
  #
  # Strategy:
  #   1. build a profile list / PSSM file restricted to the chains found
  #      in $ali and estimate against that
  #   2. if that yields no usable pair, estimate against the complete
  #      list and PSSM database named in $init::xprflist and
  #      $init::xprfpssmdb (CPU intense)
  #   3. if that fails too, fall back to 0.5 / 0.5
  #
  # Returns ($chi2, $ksscore), or nothing on a wrong argument count.

  #--- Get subroutine name
  my $subname = GetSubrName();

  #--- Check arguments
  my $nargs = 4;

  if (scalar(@_) != $nargs) {
     print "${subname}__E> Insufficient arguments\n" ;
     print "${subname}__E> Expected: $nargs\n" ;
     return;
  }

  my ($hhprogram, $seqid, $evaluehits, $ali) = @_;

  # Reduced profile set: only the chains that occur in the alignment
  my @pdb_chains = ExtractChainsPIR($ali);
  my ($prflist, $pssmfile) = MakePrfList($init::xprflist, $init::xprfpssmdb, \@pdb_chains);

  # Candidate databases in order of preference: reduced set, then full set
  my @databases = (
     [ $prflist,         $pssmfile          ],
     [ $init::xprflist,  $init::xprfpssmdb  ],
  );

  my ($chi2, $ksscore);
  foreach my $database (@databases) {
     # The sequence id doubles as the alignment argument
     ($chi2, $ksscore) = Estimate_Hit_statistics($hhprogram, $seqid, $evaluehits, $seqid, @$database);
     last if ($chi2 && $ksscore);
  }

  return ($chi2, $ksscore) if ($chi2 && $ksscore);

  print "${subname}__W> Couldn't estimate Hit statistics, using default values\n";
  return (0.5, 0.5);
}
   
sub Estimate_Hit_statistics {

  # Estimate chi-squared and KS statistics by writing a MODELLER script
  # with WRTPY_PPSCAN, running it, and reading the figures back from the
  # MODELLER log with ProfileStatInfo.
  #
  # Arguments:
  #   $hhprogram  - program name (accepted for interface compatibility;
  #                 not used by the scan itself)
  #   $seqid      - target sequence id; names the run ("<seqid>-stat")
  #   $evaluehits - E-value cutoff handed to the scan
  #   $ali        - input file. When it equals $seqid the input is the
  #                 FASTA file "<seqid>.fsa", otherwise $ali is used as a
  #                 PIR file. Either way it is looked up in the current
  #                 directory.
  #   $prflist    - profile list file
  #   $pssmfile   - PSSM database file (text format)
  #
  # Returns ($chi2low, $kstatlow) with thousands separators removed.
  # Returns nothing on a wrong argument count, when the PSSM file is
  # missing or empty, or when ProfileStatInfo reports no iteration count.
  # Dies when the script cannot be written, MODELLER fails, or the log
  # file is missing or unreadable.

  #--- Get subroutine name
  my $subname = GetSubrName();

  #--- Check arguments
  my $nargs = 6;

  unless (scalar(@_) == $nargs){
     print "${subname}__E> Insufficient arguments\n" ;
     print "${subname}__E> Expected: $nargs, Given: ".scalar(@_)."\n" ;
     return;
  }

  my ($hhprogram, $seqid, $evaluehits, $ali, $prflist, $pssmfile) = @_;

  # Don't try to give Modeller an empty PSSM file
  return unless (defined($pssmfile) && -s $pssmfile);

  # --- Input format: plain sequence (FASTA) or alignment (PIR)
  my $prfformat;
  if ($ali eq $seqid) {
      $prfformat = "FASTA";
      $ali = "$ali.fsa";
  } else {
      $prfformat = "PIR";
  }

  # --- Scan parameters
  my $pssmdbfmt   = "TEXT";
  my $rrfile      = 'blosum62.sim.mat';
  my $ccmatoffset = -100;
  my $matoffset   = -450;
  my $gap_o       = -700;
  my $gap_e       = -70;
  my $pssmwght    = 'HH1';
  my $scoretype   = 'CCMAT';
  my $scrstat     = 'True';
  my $summfile    = 'estimation_summary.txt';
  my $scrfile     = 'OFF';

  # --- Script and log names
  my $runname = "${seqid}-stat";
  my $cwd     = cwd();
  my $pynam   = "$cwd/${runname}.py";
  my $lognam  = "${runname}.log";

  # --- Write the python file
  my $pyfh = OpenNewFile( $pynam ) or die "Can't open $pynam";

  WRTPY_PPSCAN($pyfh, "$cwd/$ali", $prfformat, $pssmfile, $pssmdbfmt, $prflist,
               $matoffset, $ccmatoffset, $rrfile, $gap_o, $gap_e, $scrstat, 'True',
               $scrfile, $pssmwght, $evaluehits, $seqid, $summfile, $scoretype,
               1);
  close( $pyfh );

  # --- Run MODELLER for PPSCAN
  RunModeller($pynam)
     or die "${subname}__E> MODELLER failed when estimating hit statistics: $seqid, $ali\n";

  (-e $lognam) or
  die "${subname}__E> Could not find MODELLER log file: $lognam\n";

  # --- Read the statistics back from the log
  require IO::File;
  my $fh_log = IO::File->new($lognam, 'r')
     or die "${subname}__E> Could not open MODELLER log file: $lognam: $!\n";

  # -- Check divergence, iterations etc
  my ($iter, $chi2low, $chi2high, $kstatlow, $kstathigh, $div)
       = ProfileStatInfo($fh_log);

  # Close before any return so the handle is released on every path
  close($fh_log);

  return unless defined($iter);

  $chi2low  =~ s/,//g;
  $kstatlow =~ s/,//g;
  return($chi2low, $kstatlow);

}

sub ExtractChainsPIR {

   # Collect the PDB chain codes of all structure entries in a PIR file.
   #
   # Argument:
   #   $ali - path of the PIR alignment file
   #
   # Every line starting with "structure" is split on ':', e.g.
   #   structureX:1u7i:9:A:127:A:H
   # The second field (PDB code, lower-cased) and the fourth field (chain,
   # upper-cased) are concatenated, giving "1u7iA" for the line above.
   #
   # Returns the list of chain codes in file order. The list is empty
   # when the file has no structure lines or cannot be opened; the latter
   # case is reported with a warning.

   my $ali = shift @_;
   my @pdb_chains;

   unless (defined($ali)) {
      warn "ExtractChainsPIR__W> No alignment file given\n";
      return @pdb_chains;
   }

   open(my $alifh, '<', $ali) or do {
      warn "ExtractChainsPIR__W> Could not open alignment file $ali: $!\n";
      return @pdb_chains;
   };

   while (my $line = <$alifh>) {
      next unless $line =~ /^structure/;

      # Strip the line ending so a chain in the last field stays clean
      chomp $line;

      my (undef, $pdb_code, undef, $pdb_chain) = split(/:/, $line);
      $pdb_code  = '' unless defined($pdb_code);
      $pdb_chain = '' unless defined($pdb_chain);
      push @pdb_chains, lc($pdb_code).uc($pdb_chain);
   }
   close($alifh);

   return (@pdb_chains);

}

sub MakePrfList {

   # Write reduced copies of a profile list and its PSSM database that
   # contain only the entries for the given chains.
   #
   # Arguments:
   #   $xprflist   - profile list file, one path per line
   #   $xpssmfile  - PSSM database: records that start with a path line
   #                 and end with a line beginning with "//"
   #   $pdb_chains - reference to an array of chain codes (e.g. "1u7iA")
   #
   # An entry is kept when the first five characters of the last
   # '/'-separated component of its path equal one of the chain codes.
   #
   # Output goes to "reference_profiles.list" and
   # "reference_profiles.pssm" in the current directory (overwritten).
   #
   # Returns ($prflist, $pssmfile), the names of the two files written.
   # Dies when a file cannot be opened or an output file cannot be closed.

   my ($xprflist, $xpssmfile, $pdb_chains) = @_;

   my $prflist  = "reference_profiles.list";
   my $pssmfile = "reference_profiles.pssm";

   my %found = map { $_ => 1 } @{ $pdb_chains || [] };

   # Chain code of a path line: first five characters of the file name
   my $chain_of = sub {
      my @fields = split(/\//, $_[0]);
      my $name   = pop @fields;
      return defined($name) ? substr($name, 0, 5) : '';
   };

   # Open both inputs before creating any output file
   open(my $list_in, '<', $xprflist)  or die "Can't open $xprflist: $!";
   open(my $pssm_in, '<', $xpssmfile) or die "Can't open $xpssmfile: $!";

   # --- Profile list: copy the lines of the wanted chains
   open(my $list_out, '>', $prflist) or die "Can't open $prflist for writing: $!";
   while (my $line = <$list_in>) {
      print {$list_out} $line if $found{ $chain_of->($line) };
   }
   close($list_in);
   close($list_out) or die "Can't close $prflist: $!";

   # --- PSSM database: copy whole records of the wanted chains
   open(my $pssm_out, '>', $pssmfile) or die "Can't open $pssmfile for writing: $!";
   my $at_record_start = 1;   # next line is the path line of a record
   my $keep            = 0;   # current record belongs to a wanted chain
   while (my $line = <$pssm_in>) {
      if ($at_record_start) {
         # first line - gives the path info - chains position
         $at_record_start = 0;
         $keep = $found{ $chain_of->($line) } ? 1 : 0;
         print {$pssm_out} $line if $keep;
      } elsif (substr($line, 0, 2) eq "//") {
         # record terminator; it belongs to the record it closes
         print {$pssm_out} $line if $keep;
         $at_record_start = 1;
         $keep            = 0;
      } elsif ($keep) {
         print {$pssm_out} $line;
      }
   }
   close($pssm_in);
   close($pssm_out) or die "Can't close $pssmfile: $!";

   return ($prflist, $pssmfile);
}

