#!/usr/bin/python
# This file is part of ModPipe, Copyright 1997-2020 Andrej Sali
#
# ModPipe is free software: you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ModPipe.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import print_function
from optparse import OptionParser
import modpipe.fold_assignment
import modpipe.version
import modeller
import time
import os
import sys

def get_options():
    """Parse the command line of the sequence-sequence scan.

    Registers the script-specific ``--sequence_format`` option, then the
    options added by add_common_options(), and parses ``sys.argv``.
    Exactly three positional arguments are required; otherwise the parser
    reports a usage error.

    Returns a ``(target_code, seqfile, database, opts)`` tuple, where
    ``opts`` has already been passed through set_common_defaults().
    """
    usage_text = """
 Find sequence-sequence hits by scanning a target sequence against a database.

 Usage: %prog [options] target_code sequence database

 sequence is a file containing target sequences (target_code is the one code
 in that file to use), and database the sequence database.

 Run `%prog -h` for help information
"""
    cmdline = OptionParser(usage=usage_text,
                           version=modpipe.version.message())

    # Option specific to this script; everything else is registered by
    # add_common_options().
    cmdline.add_option("--sequence_format", dest="seqfmt",
                       metavar="FORMAT",
                       type="choice", choices=('FASTA', 'PIR'),
                       help="""Format of the input sequence file
                              (FASTA, PIR, default FASTA)""",
                       default="FASTA")
    add_common_options(cmdline, 'seqseq', "BUILDP (Seq-Seq)")

    options, positional = cmdline.parse_args()
    if len(positional) != 3:
        cmdline.error("You must specify the target code, sequence "
                      "and database file names")
    target_code, seqfile, database = positional

    # Fill in the defaults that depend on the target code
    set_common_defaults(options, target_code, cmdline, 'seqseq')

    return target_code, seqfile, database, options


def add_common_options(parser, suffix, customtag):
    """Register the database-scan options on an optparse parser.

    parser    -- the OptionParser to add the options to.
    suffix    -- short tag (e.g. 'seqseq') interpolated into the help text
                 describing the default output file and directory names.
    customtag -- default value (and documented default) of --custom_tag.

    Options whose default is None here (--output_file, --output_alidir,
    --hitfile_name, --alignment_basename) get their real defaults later in
    set_common_defaults(), because those depend on the target code.
    """
    # Import the submodule explicitly: the file-level imports only load
    # modpipe.fold_assignment and modpipe.version, so relying on
    # modpipe.pdbutils being an attribute of the package would only work if
    # some other module happened to import it first.
    import modpipe.pdbutils

    parser.add_option("--db_format", dest="dbfmt", metavar="FORMAT",
                      type="choice", choices=('FASTA', 'PIR', 'BINARY'),
                      help="""Format of the sequence database
                              (FASTA, PIR, BINARY, default BINARY)""",
                      default="BINARY")
    parser.add_option("--e_value", dest="evalue",
                      metavar="FLOAT", type="float",
                      help="""E-Value threshold to use to report hits
                              against the database (default 1.0)""",
                      default=1.0)
    parser.add_option("--output_scorefile", dest="score_file",
                      metavar='FILE', type="string",
                      help="""If specified individual scores are written
                              out to this file.""", default=None)
    parser.add_option("--set_score_statistics", dest="score_statistics",
                      metavar='BOOL', type="choice", choices=('ON', 'OFF'),
                      help="""Flag to trigger the calculation of E-values.
                              Set it to OFF when there are only a few
                              sequences in the database. Default: ON""",
                      default='ON')
    parser.add_option("--output_file", dest="outfile",
                      metavar='FILE', type="string",
                      help="""Name of the output file to write out the
                              alignments from the database scan. It will
                              be in the MODELLER profile format.
                              Default: <target_code>-%s.prf""" % suffix,
                      default=None)
    parser.add_option("--substitution_matrix", dest="matrix",
                      metavar='FILE', type="string",
                      help="""Substitution matrix (default
                              ${LIB}/blosum62.sim.mat)""",
                      default="${LIB}/blosum62.sim.mat")
    parser.add_option("--matrix_offset", dest="matrix_offset",
                      metavar='FLOAT', type="float",
                      help="""Value used to offset the alignment matrix.
                              It is correlated with the gap costs. Best
                              left at the default value (default -450.0)""",
                      default=-450.0)
    parser.add_option("--gap_open_cost", dest="gap_open",
                      metavar='FLOAT', type="float",
                      help="""Cost to open a gap. Best left at the
                              default. Default: -500""",
                      default=-500.0)
    parser.add_option("--gap_extend_cost", dest="gap_extension",
                      metavar='FLOAT', type="float",
                      help="""Cost to extend a gap. Best left at the
                              default value. Default: -50""",
                      default=-50.0)
    parser.add_option("--pssm_weighting_scheme", dest="pssm_weighting",
                      metavar='PSSM', type="choice", choices=('HH0', 'HH1'),
                      help="""Weighting scheme to use in the calculation
                              of the PSSM. Only two different implementations
                              of the Henikoff & Henikkoff scheme are
                              currently available - HH0 or HH1. Default: HH1""",
                      default='HH1')
    parser.add_option("--output_alidir", dest="alidir",
                      metavar='DIR', type="string",
                      help="""Directory to store the output alignments.
                              Default: <target_code>-%s-ali""" % suffix,
                      default=None)
    parser.add_option("--hitfile_name", dest="hitsfile",
                      metavar='FILE', type="string",
                      help="""File to store the details of the hits.
                              Default: <target_code>-%s.hits""" % suffix,
                      default=None)
    parser.add_option("--alignment_basename", dest="alnbase",
                      metavar='FILE', type="string",
                      help="""Basename for the output alignments. The
                              actual alignment files will be based on this
                              variable appended with a serial number.
                              Default: <target_code>-%s""" % suffix,
                      default=None)
    parser.add_option("--custom_tag", dest="customtag",
                      metavar='STRING', type="string",
                      help="""This tag will be written out in the alignment
                              file in the ninth column of the output PIR
                              files. This is useful to identify all the
                              alignments from a particular run.
                              Default: %s""" % customtag,
                      default=customtag)
    parser.add_option("--native_pdb", dest="natpdb",
                      metavar='CODE', type="string",
                      help="""The PDB code of the target sequence. A
                              convenience feature that can be used when
                              benchmarking with PDB sequences. The input
                              should be in PDB 4-letter format (e.g. 1abc).
                              This also requires the --native_chn option.""",
                      default=None)
    # NOTE(review): unlike the other ON/OFF flags this one is a free-form
    # string (no 'choice' validation); left unchanged because the consumer
    # of opts.template_fast is not visible here.
    parser.add_option("--template_fast", dest="template_fast",
                      help="[OFF],ON. Skip Chi value calculation for fast template based calculations",
                      default="OFF")
    parser.add_option("--native_chn", dest="natchn",
                      metavar='CHN', type="string",
                      help="The chain ID of the target PDB (see --native_pdb)",
                      default=None)
    parser.add_option("--native_type", dest="nattyp",
                      metavar='TYPE', type="choice",
                      choices=('sequence', 'structure'),
                      help="""Type for target sequence in the PIR alignment.
                              Again a convenience feature when benchmarking.
                              Set it to 'sequence' (default) or 'structure',
                              according to whether you want to use the
                              resulting alignment for model building or
                              some structure calculation resp.""",
                      default='sequence')
    parser.add_option("--set_assume_pdb", dest="assumepdb",
                      metavar='BOOL', type="choice", choices=('ON', 'OFF'),
                      help="""Flag to assume the database sequences are
                              from PDB. In which case, the program will
                              take the sequence codes to be the standard
                              5-letter PDB representation and will write
                              out modeling alignments. Default: ON""",
                      default='ON')
    # The repository default is looked up at option-registration time and
    # is both shown in the help text and used as the option default.
    pdb = modpipe.pdbutils.get_pdb_repository()
    parser.add_option("--pdb_repository", dest="pdbrep",
                      metavar='DIR', type="string",
                      help="""The PDB repositories to look for coordinate
                              files. Should exist if --set_assume_pdb = ON.
                              Default: """ + pdb,
                      default=pdb)
    parser.add_option("-v", "--verbose", dest="verbose",
                      action="count", default=0,
                      help="Be verbose (repeat the option for extra output)")


def set_common_defaults(opts, target_code, parser, suffix):
    """Fill in option defaults that depend on the target code.

    opts        -- parsed option values; modified in place.
    target_code -- code of the target sequence, used as the file name prefix.
    parser      -- option parser, used only to report a usage error.
    suffix      -- short tag (e.g. 'seqseq') appended to the default names.

    Unset (or empty) output names become ``<target_code>-<suffix>`` plus an
    extension, a missing/placeholder chain ID becomes a single space, and
    the ON/OFF string in ``opts.assumepdb`` is converted to a bool.
    Calls ``parser.error`` if the target type is 'structure' but no native
    PDB code was given.
    """
    if not opts.outfile:
        opts.outfile = target_code + '-%s.prf' % suffix
    if not opts.alidir:
        opts.alidir = target_code + '-%s-ali' % suffix
    if not opts.hitsfile:
        opts.hitsfile = target_code + '-%s.hits' % suffix
    if not opts.alnbase:
        opts.alnbase = target_code + '-' + suffix
    # None, '' and '_' all mean "no chain ID"; normalise to a blank
    if opts.natchn in (None, '', '_'):
        opts.natchn = ' '
    opts.assumepdb = (opts.assumepdb == 'ON')

    # Make sure that natpdb is specified if nattyp is 'structure'
    if opts.nattyp == 'structure' and opts.natpdb is None:
        # The two literals previously joined without a space, producing
        # "targetsequence" in the message.
        parser.error("Native PDB code should be specified if target "
                     "sequence type is 'structure'")


def set_modeller_verbosity(verbose):
    """Select Modeller's log level from the -v count.

    0 selects ``modeller.log.none``, 1 selects ``modeller.log.minimal``
    and any other value selects ``modeller.log.verbose``.
    """
    if verbose == 0:
        set_level = modeller.log.none
    elif verbose == 1:
        set_level = modeller.log.minimal
    else:
        set_level = modeller.log.verbose
    set_level()


def report_hits_summary(hits, target_code, sequence, database):
    """Print a short summary of the scan and stop if nothing was found.

    Writes the number of hits, the input sequence file and the database
    name to standard output.  When ``hits`` is empty a warning is printed
    and the process exits with status 0.
    """
    tag = "HitsSeqSeq.py"
    hit_count = len(hits)

    messages = (
        ("__M> Found %d hits for %s (Seq-Seq)", (hit_count, target_code)),
        ("__M>       Input   : %s", sequence),
        ("__M>       Database: %s", database),
    )
    for template, values in messages:
        print(tag + template % values)

    if hit_count >= 1:
        return

    # Nothing to process: warn and stop here
    print(tag + "__W> No hits against template database")
    sys.exit(0)


def report_hit(fh_hit, alnfile, hit, seqid, perc_gaps, source):
    """Report one hit on standard output and in the hits file.

    fh_hit    -- open file object for the hits file.
    alnfile   -- alignment file name, appended after '#' in the hits file.
    hit       -- object providing ``template``, ``target``, ``chi2`` and
                 ``kstat`` attributes.
    seqid     -- sequence identity, printed as an integer.
    perc_gaps -- gap percentage, printed as an integer.
    source    -- label that prefixes both output lines.

    The same fields go to both destinations; the hits-file line uses '|'
    between every field and additionally records the alignment file name.
    """
    tmpl = hit.template
    tgt = hit.target

    # Per-template summary that ends both report lines
    summary = "%5d|%-5s %5s %5s %5d;" % (
        seqid, tmpl.code, tmpl.range[0][0], tmpl.range[1][0], seqid)

    fields = (source, tgt.code, tgt.db_length, tmpl.target_start,
              tmpl.target_stop, tmpl.code, hit.chi2, hit.kstat,
              tmpl.evalue, perc_gaps, 1, summary)

    screen_fmt = "%s: %40s %5d %5s %5s|%40s|%8.4f|%8.4f|%10.2g|%3d|%5d %s"
    file_fmt = "%s: %40s|%5d|%5s|%5s|%40s|%8.4f|%8.4f|%10.2g|%3d|%5d|%s #%s"

    print(screen_fmt % fields)
    print(file_fmt % (fields + (alnfile,)), file=fh_hit)

def get_perc_gaps(aln):
    """Return the number of gapped columns as a percentage of aligned ones.

    aln -- alignment whose first two entries (``aln[0]`` and ``aln[1]``)
           are compared; ``aln.positions`` yields the columns and
           ``pos.get_residue(seq)`` is falsy where ``seq`` has a gap.

    A column counts as aligned when both sequences have a residue there and
    as gapped otherwise.  The result is ``100 * gapped / aligned`` and can
    therefore exceed 100.  If no column is aligned the ratio is undefined;
    instead of raising ZeroDivisionError the function returns 100.0 when
    there are gapped columns and 0.0 when the alignment has no columns.
    """
    seq0 = aln[0]
    seq1 = aln[1]
    numali = 0
    numgapped = 0
    for pos in aln.positions:
        r0 = pos.get_residue(seq0)
        r1 = pos.get_residue(seq1)
        if r0 and r1:
            numali += 1
        else:
            numgapped += 1
    if numali == 0:
        # Avoid dividing by zero for empty or fully gapped alignments
        return 100.0 if numgapped else 0.0
    return (100.0 * numgapped) / float(numali)


def process_hits(env, hits, target_code, opts, source):
    """Write one alignment file per hit and record each in the hits file.

    env         -- unused here; kept for interface compatibility.
    hits        -- sequence of hit objects providing
                   ``get_modeling_alignment()``.
    target_code -- unused here; kept for interface compatibility.
    opts        -- option values; ``hitsfile``, ``alidir`` and ``alnbase``
                   are read.
    source      -- label passed through to report_hit().

    The hits file is (re)created even when ``hits`` is empty, in which case
    an error is printed and nothing else is written.  Hits for which
    ``get_modeling_alignment()`` returns a false value are skipped.
    Alignment files are numbered consecutively from 1 within ``opts.alidir``.
    """
    sub = "main::ProcessAliBPSS"
    # Use a context manager so the hits file is flushed and closed on every
    # exit path, including the early return and any exception in the loop.
    with open(opts.hitsfile, 'w') as fh_hit:
        # Ensure there is at least one template to process
        if len(hits) <= 0:
            print(sub + "__E> There are no templates to process")
            return
        print(sub + "__M> Total number of templates to process: %d"
              % len(hits))

        # Write the header into the file with time
        print(time.strftime('# Produced by ModPipe: %a %b %d %H:%M:%S %Y'),
              file=fh_hit)

        if not os.path.exists(opts.alidir):
            os.makedirs(opts.alidir)
        alicnt = 0
        for hit in hits:
            a = hit.get_modeling_alignment()
            if not a:
                continue
            alicnt += 1
            alnfile = os.path.join(opts.alidir,
                                   "%s_%04d.ali" % (opts.alnbase, alicnt))
            a.write(file=alnfile)
            print(sub + "__M> Created alignment: %s" % alnfile)
            perc_gaps = get_perc_gaps(a)
            # Round the sequence identity to the nearest integer
            seqid = int(a[0].get_sequence_identity(a[1]) + 0.5)
            report_hit(fh_hit, alnfile, hit, seqid, perc_gaps, source)


def main():
    """Run the sequence-sequence scan described by the command line.

    Parses the options, configures Modeller logging and the PDB directory,
    searches the database for the target and writes out the hits.  The run
    ends inside report_hits_summary() when no hits are found.
    """
    code, seq_path, db_path, options = get_options()
    set_modeller_verbosity(options.verbose)

    # Modeller environment, pointed at the configured PDB repository
    env = modeller.environ()
    env.io.atom_files_directory = options.pdbrep

    scanner = modpipe.fold_assignment.SequenceSequence(env, db_path, options)
    found = scanner.search(code, seq_path)

    report_hits_summary(found, code, seq_path, db_path)
    process_hits(env, found, code, options, 'SEQSEQ')


# Run the scan only when this file is executed as a script, so that
# importing it does not parse the command line.
if __name__ == '__main__':
    main()
