#!/usr/bin/python
# This file is part of ModPipe, Copyright 1997-2020 Andrej Sali
#
# ModPipe is free software: you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ModPipe.  If not, see <http://www.gnu.org/licenses/>.

from modeller import *
from optparse import OptionParser
import modpipe.version
import sys, os

def main(argv=None):
    """Cluster the sequences in a PIR/FASTA file by sequence identity.

    Parses and validates the command-line options, loads the input
    sequences into a MODELLER SequenceDB and calls its filter() method,
    passing the names of the clusters (group) file and the
    representatives (code) file given on the command line.

    argv is an optional list of command-line arguments (without the
    program name). When it is None, optparse falls back to sys.argv[1:],
    so calling main() with no arguments behaves as before.

    Prints the help text and exits with status 1 when no input file is
    given; other invalid options are reported through parser.error().
    """

    # Parse command line options
    parser = OptionParser(version=modpipe.version.message())

    # Set defaults
    parser.set_usage("""
 This script takes an ascii file with protein sequences in PIR or
 FASTA format and clusters them by sequence identity at a specified
 threshold

 Usage: %prog [options]

 Run `%prog -h` for help information
 """)

    parser.set_defaults(inpfile='',
                        sformat='FASTA',
                        seqidcut=95,
                        grpfile='seqfilter.grp',
                        codfile='seqfilter.cod')

    # Populate options list
    parser.add_option("-i", "--input_file",
                 dest="inpfile",
                 type='string',
                 help="""File containing the sequences to be clustered.
                      This is a mandatory option.""",
                 metavar="FILE")
    parser.add_option("-f", "--sequence_format",
                 dest="sformat",
                 type='string',
                 help="""Format of the sequence file. Acceptable formats
                      are PIR or FASTA.""",
                 metavar="FORMAT")
    parser.add_option("-t", "--seqid_threshold",
                 dest="seqidcut",
                 type='int',
                 help="""The sequence identity threshold for clustering the
                      sequences.""",
                 metavar="CUTOFF")
    parser.add_option("-c", "--output_grp_file",
                 dest="grpfile",
                 help="""Name of the file to write out the clusters.""",
                 metavar="FILE")
    parser.add_option("-r", "--output_cod_file",
                 dest="codfile",
                 help="""Name of the file to write out the representatives.""",
                 metavar="FILE")

    # Check mandatory options (positional arguments are not used)
    opts, _args = parser.parse_args(argv)

    # No input file at all: show the full help rather than a terse error
    if not opts.inpfile:
        parser.print_help()
        sys.exit(1)

    # An input file was named but cannot be read from: report which one
    if not os.path.isfile(opts.inpfile):
        parser.error("Input file '%s' does not exist or is not a "
                     "regular file" % opts.inpfile)

    # The format name is matched case-insensitively
    opts.sformat = opts.sformat.lower()
    if opts.sformat not in ('fasta', 'pir'):
        parser.error("""Sequence format can only be FASTA or PIR.""")

    # -- Initialize some modeller stuff
    log.verbose()
    env = Environ()

    # NOTE(review): minmax_db_seq_len presumably restricts the database to
    # sequences of 30-3000 residues -- confirm against the MODELLER manual.
    sdb = SequenceDB(env,
                seq_database_file=opts.inpfile,
                seq_database_format=opts.sformat,
                chains_list='all',
                minmax_db_seq_len=[30, 3000],
                clean_sequences=True)

    # Cluster at the requested identity threshold; the group and code
    # output file names come straight from the command-line options.
    sdb.filter(rr_file='${LIB}/blosum62.sim.mat',
               gap_penalties_1d=[-500, -50],
               matrix_offset=-450,
               seqid_cut=opts.seqidcut,
               max_diff_res=30,
               output_grp_file=opts.grpfile,
               output_cod_file=opts.codfile)

# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
